summary refs log tree commit diff stats
path: root/src/rocksdb/db/blob
diff options
context:
space:
mode:
Diffstat (limited to 'src/rocksdb/db/blob')
-rw-r--r--src/rocksdb/db/blob/blob_constants.h16
-rw-r--r--src/rocksdb/db/blob/blob_contents.cc90
-rw-r--r--src/rocksdb/db/blob/blob_contents.h56
-rw-r--r--src/rocksdb/db/blob/blob_counting_iterator.h146
-rw-r--r--src/rocksdb/db/blob/blob_counting_iterator_test.cc327
-rw-r--r--src/rocksdb/db/blob/blob_fetcher.cc34
-rw-r--r--src/rocksdb/db/blob/blob_fetcher.h37
-rw-r--r--src/rocksdb/db/blob/blob_file_addition.cc156
-rw-r--r--src/rocksdb/db/blob/blob_file_addition.h67
-rw-r--r--src/rocksdb/db/blob/blob_file_addition_test.cc211
-rw-r--r--src/rocksdb/db/blob/blob_file_builder.cc446
-rw-r--r--src/rocksdb/db/blob/blob_file_builder.h112
-rw-r--r--src/rocksdb/db/blob/blob_file_builder_test.cc680
-rw-r--r--src/rocksdb/db/blob/blob_file_cache.cc102
-rw-r--r--src/rocksdb/db/blob/blob_file_cache.h52
-rw-r--r--src/rocksdb/db/blob/blob_file_cache_test.cc269
-rw-r--r--src/rocksdb/db/blob/blob_file_completion_callback.h101
-rw-r--r--src/rocksdb/db/blob/blob_file_garbage.cc134
-rw-r--r--src/rocksdb/db/blob/blob_file_garbage.h57
-rw-r--r--src/rocksdb/db/blob/blob_file_garbage_test.cc174
-rw-r--r--src/rocksdb/db/blob/blob_file_meta.cc62
-rw-r--r--src/rocksdb/db/blob/blob_file_meta.h170
-rw-r--r--src/rocksdb/db/blob/blob_file_reader.cc610
-rw-r--r--src/rocksdb/db/blob/blob_file_reader.h108
-rw-r--r--src/rocksdb/db/blob/blob_file_reader_test.cc1024
-rw-r--r--src/rocksdb/db/blob/blob_garbage_meter.cc100
-rw-r--r--src/rocksdb/db/blob/blob_garbage_meter.h102
-rw-r--r--src/rocksdb/db/blob/blob_garbage_meter_test.cc197
-rw-r--r--src/rocksdb/db/blob/blob_index.h187
-rw-r--r--src/rocksdb/db/blob/blob_log_format.cc143
-rw-r--r--src/rocksdb/db/blob/blob_log_format.h164
-rw-r--r--src/rocksdb/db/blob/blob_log_sequential_reader.cc134
-rw-r--r--src/rocksdb/db/blob/blob_log_sequential_reader.h83
-rw-r--r--src/rocksdb/db/blob/blob_log_writer.cc178
-rw-r--r--src/rocksdb/db/blob/blob_log_writer.h83
-rw-r--r--src/rocksdb/db/blob/blob_read_request.h58
-rw-r--r--src/rocksdb/db/blob/blob_source.cc488
-rw-r--r--src/rocksdb/db/blob/blob_source.h153
-rw-r--r--src/rocksdb/db/blob/blob_source_test.cc1624
-rw-r--r--src/rocksdb/db/blob/db_blob_basic_test.cc1789
-rw-r--r--src/rocksdb/db/blob/db_blob_compaction_test.cc913
-rw-r--r--src/rocksdb/db/blob/db_blob_corruption_test.cc82
-rw-r--r--src/rocksdb/db/blob/db_blob_index_test.cc602
-rw-r--r--src/rocksdb/db/blob/prefetch_buffer_collection.cc21
-rw-r--r--src/rocksdb/db/blob/prefetch_buffer_collection.h38
45 files changed, 12380 insertions, 0 deletions
diff --git a/src/rocksdb/db/blob/blob_constants.h b/src/rocksdb/db/blob/blob_constants.h
new file mode 100644
index 000000000..a5d09ac76
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_constants.h
@@ -0,0 +1,16 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cstdint>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+constexpr uint64_t kInvalidBlobFileNumber = 0;
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_contents.cc b/src/rocksdb/db/blob/blob_contents.cc
new file mode 100644
index 000000000..9015609e7
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_contents.cc
@@ -0,0 +1,90 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_contents.h"
+
+#include <cassert>
+
+#include "cache/cache_entry_roles.h"
+#include "cache/cache_helpers.h"
+#include "port/malloc.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+std::unique_ptr<BlobContents> BlobContents::Create(
+ CacheAllocationPtr&& allocation, size_t size) {
+ return std::unique_ptr<BlobContents>(
+ new BlobContents(std::move(allocation), size));
+}
+
+size_t BlobContents::ApproximateMemoryUsage() const {
+ size_t usage = 0;
+
+ if (allocation_) {
+ MemoryAllocator* const allocator = allocation_.get_deleter().allocator;
+
+ if (allocator) {
+ usage += allocator->UsableSize(allocation_.get(), data_.size());
+ } else {
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+ usage += malloc_usable_size(allocation_.get());
+#else
+ usage += data_.size();
+#endif
+ }
+ }
+
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+ usage += malloc_usable_size(const_cast<BlobContents*>(this));
+#else
+ usage += sizeof(*this);
+#endif
+
+ return usage;
+}
+
+size_t BlobContents::SizeCallback(void* obj) {
+ assert(obj);
+
+ return static_cast<const BlobContents*>(obj)->size();
+}
+
+Status BlobContents::SaveToCallback(void* from_obj, size_t from_offset,
+ size_t length, void* out) {
+ assert(from_obj);
+
+ const BlobContents* buf = static_cast<const BlobContents*>(from_obj);
+ assert(buf->size() >= from_offset + length);
+
+ memcpy(out, buf->data().data() + from_offset, length);
+
+ return Status::OK();
+}
+
+Cache::CacheItemHelper* BlobContents::GetCacheItemHelper() {
+ static Cache::CacheItemHelper cache_helper(
+ &SizeCallback, &SaveToCallback,
+ GetCacheEntryDeleterForRole<BlobContents, CacheEntryRole::kBlobValue>());
+
+ return &cache_helper;
+}
+
+Status BlobContents::CreateCallback(CacheAllocationPtr&& allocation,
+ const void* buf, size_t size,
+ void** out_obj, size_t* charge) {
+ assert(allocation);
+
+ memcpy(allocation.get(), buf, size);
+
+ std::unique_ptr<BlobContents> obj = Create(std::move(allocation), size);
+ BlobContents* const contents = obj.release();
+
+ *out_obj = contents;
+ *charge = contents->ApproximateMemoryUsage();
+
+ return Status::OK();
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_contents.h b/src/rocksdb/db/blob/blob_contents.h
new file mode 100644
index 000000000..9b7c5b969
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_contents.h
@@ -0,0 +1,56 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <memory>
+
+#include "memory/memory_allocator.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/rocksdb_namespace.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// A class representing a single uncompressed value read from a blob file.
+class BlobContents {
+ public:
+ static std::unique_ptr<BlobContents> Create(CacheAllocationPtr&& allocation,
+ size_t size);
+
+ BlobContents(const BlobContents&) = delete;
+ BlobContents& operator=(const BlobContents&) = delete;
+
+ BlobContents(BlobContents&&) = default;
+ BlobContents& operator=(BlobContents&&) = default;
+
+ ~BlobContents() = default;
+
+ const Slice& data() const { return data_; }
+ size_t size() const { return data_.size(); }
+
+ size_t ApproximateMemoryUsage() const;
+
+ // Callbacks for secondary cache
+ static size_t SizeCallback(void* obj);
+
+ static Status SaveToCallback(void* from_obj, size_t from_offset,
+ size_t length, void* out);
+
+ static Cache::CacheItemHelper* GetCacheItemHelper();
+
+ static Status CreateCallback(CacheAllocationPtr&& allocation, const void* buf,
+ size_t size, void** out_obj, size_t* charge);
+
+ private:
+ BlobContents(CacheAllocationPtr&& allocation, size_t size)
+ : allocation_(std::move(allocation)), data_(allocation_.get(), size) {}
+
+ CacheAllocationPtr allocation_;
+ Slice data_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_counting_iterator.h b/src/rocksdb/db/blob/blob_counting_iterator.h
new file mode 100644
index 000000000..de549afa2
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_counting_iterator.h
@@ -0,0 +1,146 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cassert>
+
+#include "db/blob/blob_garbage_meter.h"
+#include "rocksdb/rocksdb_namespace.h"
+#include "rocksdb/status.h"
+#include "table/internal_iterator.h"
+#include "test_util/sync_point.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// An internal iterator that passes each key-value encountered to
+// BlobGarbageMeter as inflow in order to measure the total number and size of
+// blobs in the compaction input on a per-blob file basis.
+class BlobCountingIterator : public InternalIterator {
+ public:
+ BlobCountingIterator(InternalIterator* iter,
+ BlobGarbageMeter* blob_garbage_meter)
+ : iter_(iter), blob_garbage_meter_(blob_garbage_meter) {
+ assert(iter_);
+ assert(blob_garbage_meter_);
+
+ UpdateAndCountBlobIfNeeded();
+ }
+
+ bool Valid() const override { return iter_->Valid() && status_.ok(); }
+
+ void SeekToFirst() override {
+ iter_->SeekToFirst();
+ UpdateAndCountBlobIfNeeded();
+ }
+
+ void SeekToLast() override {
+ iter_->SeekToLast();
+ UpdateAndCountBlobIfNeeded();
+ }
+
+ void Seek(const Slice& target) override {
+ iter_->Seek(target);
+ UpdateAndCountBlobIfNeeded();
+ }
+
+ void SeekForPrev(const Slice& target) override {
+ iter_->SeekForPrev(target);
+ UpdateAndCountBlobIfNeeded();
+ }
+
+ void Next() override {
+ assert(Valid());
+
+ iter_->Next();
+ UpdateAndCountBlobIfNeeded();
+ }
+
+ bool NextAndGetResult(IterateResult* result) override {
+ assert(Valid());
+
+ const bool res = iter_->NextAndGetResult(result);
+ UpdateAndCountBlobIfNeeded();
+ return res;
+ }
+
+ void Prev() override {
+ assert(Valid());
+
+ iter_->Prev();
+ UpdateAndCountBlobIfNeeded();
+ }
+
+ Slice key() const override {
+ assert(Valid());
+ return iter_->key();
+ }
+
+ Slice user_key() const override {
+ assert(Valid());
+ return iter_->user_key();
+ }
+
+ Slice value() const override {
+ assert(Valid());
+ return iter_->value();
+ }
+
+ Status status() const override { return status_; }
+
+ bool PrepareValue() override {
+ assert(Valid());
+ return iter_->PrepareValue();
+ }
+
+ bool MayBeOutOfLowerBound() override {
+ assert(Valid());
+ return iter_->MayBeOutOfLowerBound();
+ }
+
+ IterBoundCheck UpperBoundCheckResult() override {
+ assert(Valid());
+ return iter_->UpperBoundCheckResult();
+ }
+
+ void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override {
+ iter_->SetPinnedItersMgr(pinned_iters_mgr);
+ }
+
+ bool IsKeyPinned() const override {
+ assert(Valid());
+ return iter_->IsKeyPinned();
+ }
+
+ bool IsValuePinned() const override {
+ assert(Valid());
+ return iter_->IsValuePinned();
+ }
+
+ Status GetProperty(std::string prop_name, std::string* prop) override {
+ return iter_->GetProperty(prop_name, prop);
+ }
+
+ private:
+ void UpdateAndCountBlobIfNeeded() {
+ assert(!iter_->Valid() || iter_->status().ok());
+
+ if (!iter_->Valid()) {
+ status_ = iter_->status();
+ return;
+ }
+
+ TEST_SYNC_POINT(
+ "BlobCountingIterator::UpdateAndCountBlobIfNeeded:ProcessInFlow");
+
+ status_ = blob_garbage_meter_->ProcessInFlow(key(), value());
+ }
+
+ InternalIterator* iter_;
+ BlobGarbageMeter* blob_garbage_meter_;
+ Status status_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_counting_iterator_test.cc b/src/rocksdb/db/blob/blob_counting_iterator_test.cc
new file mode 100644
index 000000000..c7bbc8f58
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_counting_iterator_test.cc
@@ -0,0 +1,327 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_counting_iterator.h"
+
+#include <string>
+#include <vector>
+
+#include "db/blob/blob_garbage_meter.h"
+#include "db/blob/blob_index.h"
+#include "db/blob/blob_log_format.h"
+#include "db/dbformat.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/vector_iterator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+void CheckInFlow(const BlobGarbageMeter& blob_garbage_meter,
+ uint64_t blob_file_number, uint64_t count, uint64_t bytes) {
+ const auto& flows = blob_garbage_meter.flows();
+
+ const auto it = flows.find(blob_file_number);
+ if (it == flows.end()) {
+ ASSERT_EQ(count, 0);
+ ASSERT_EQ(bytes, 0);
+ return;
+ }
+
+ const auto& in = it->second.GetInFlow();
+
+ ASSERT_EQ(in.GetCount(), count);
+ ASSERT_EQ(in.GetBytes(), bytes);
+}
+
+TEST(BlobCountingIteratorTest, CountBlobs) {
+ // Note: the input consists of three key-values: two are blob references to
+ // different blob files, while the third one is a plain value.
+ constexpr char user_key0[] = "key0";
+ constexpr char user_key1[] = "key1";
+ constexpr char user_key2[] = "key2";
+
+ const std::vector<std::string> keys{
+ test::KeyStr(user_key0, 1, kTypeBlobIndex),
+ test::KeyStr(user_key1, 2, kTypeBlobIndex),
+ test::KeyStr(user_key2, 3, kTypeValue)};
+
+ constexpr uint64_t first_blob_file_number = 4;
+ constexpr uint64_t first_offset = 1000;
+ constexpr uint64_t first_size = 2000;
+
+ std::string first_blob_index;
+ BlobIndex::EncodeBlob(&first_blob_index, first_blob_file_number, first_offset,
+ first_size, kNoCompression);
+
+ constexpr uint64_t second_blob_file_number = 6;
+ constexpr uint64_t second_offset = 2000;
+ constexpr uint64_t second_size = 4000;
+
+ std::string second_blob_index;
+ BlobIndex::EncodeBlob(&second_blob_index, second_blob_file_number,
+ second_offset, second_size, kNoCompression);
+
+ const std::vector<std::string> values{first_blob_index, second_blob_index,
+ "raw_value"};
+
+ assert(keys.size() == values.size());
+
+ VectorIterator input(keys, values);
+ BlobGarbageMeter blob_garbage_meter;
+
+ BlobCountingIterator blob_counter(&input, &blob_garbage_meter);
+
+ constexpr uint64_t first_expected_bytes =
+ first_size +
+ BlobLogRecord::CalculateAdjustmentForRecordHeader(sizeof(user_key0) - 1);
+ constexpr uint64_t second_expected_bytes =
+ second_size +
+ BlobLogRecord::CalculateAdjustmentForRecordHeader(sizeof(user_key1) - 1);
+
+ // Call SeekToFirst and iterate forward
+ blob_counter.SeekToFirst();
+ ASSERT_TRUE(blob_counter.Valid());
+ ASSERT_OK(blob_counter.status());
+ ASSERT_EQ(blob_counter.key(), keys[0]);
+ ASSERT_EQ(blob_counter.user_key(), user_key0);
+ ASSERT_EQ(blob_counter.value(), values[0]);
+ CheckInFlow(blob_garbage_meter, first_blob_file_number, 1,
+ first_expected_bytes);
+ CheckInFlow(blob_garbage_meter, second_blob_file_number, 0, 0);
+
+ blob_counter.Next();
+ ASSERT_TRUE(blob_counter.Valid());
+ ASSERT_OK(blob_counter.status());
+ ASSERT_EQ(blob_counter.key(), keys[1]);
+ ASSERT_EQ(blob_counter.user_key(), user_key1);
+ ASSERT_EQ(blob_counter.value(), values[1]);
+ CheckInFlow(blob_garbage_meter, first_blob_file_number, 1,
+ first_expected_bytes);
+ CheckInFlow(blob_garbage_meter, second_blob_file_number, 1,
+ second_expected_bytes);
+
+ blob_counter.Next();
+ ASSERT_TRUE(blob_counter.Valid());
+ ASSERT_OK(blob_counter.status());
+ ASSERT_EQ(blob_counter.key(), keys[2]);
+ ASSERT_EQ(blob_counter.user_key(), user_key2);
+ ASSERT_EQ(blob_counter.value(), values[2]);
+ CheckInFlow(blob_garbage_meter, first_blob_file_number, 1,
+ first_expected_bytes);
+ CheckInFlow(blob_garbage_meter, second_blob_file_number, 1,
+ second_expected_bytes);
+
+ blob_counter.Next();
+ ASSERT_FALSE(blob_counter.Valid());
+ ASSERT_OK(blob_counter.status());
+ CheckInFlow(blob_garbage_meter, first_blob_file_number, 1,
+ first_expected_bytes);
+ CheckInFlow(blob_garbage_meter, second_blob_file_number, 1,
+ second_expected_bytes);
+
+ // Do it again using NextAndGetResult
+ blob_counter.SeekToFirst();
+ ASSERT_TRUE(blob_counter.Valid());
+ ASSERT_OK(blob_counter.status());
+ ASSERT_EQ(blob_counter.key(), keys[0]);
+ ASSERT_EQ(blob_counter.user_key(), user_key0);
+ ASSERT_EQ(blob_counter.value(), values[0]);
+ CheckInFlow(blob_garbage_meter, first_blob_file_number, 2,
+ 2 * first_expected_bytes);
+ CheckInFlow(blob_garbage_meter, second_blob_file_number, 1,
+ second_expected_bytes);
+
+ {
+ IterateResult result;
+ ASSERT_TRUE(blob_counter.NextAndGetResult(&result));
+ ASSERT_EQ(result.key, keys[1]);
+ ASSERT_EQ(blob_counter.user_key(), user_key1);
+ ASSERT_TRUE(blob_counter.Valid());
+ ASSERT_OK(blob_counter.status());
+ ASSERT_EQ(blob_counter.key(), keys[1]);
+ ASSERT_EQ(blob_counter.value(), values[1]);
+ CheckInFlow(blob_garbage_meter, first_blob_file_number, 2,
+ 2 * first_expected_bytes);
+ CheckInFlow(blob_garbage_meter, second_blob_file_number, 2,
+ 2 * second_expected_bytes);
+ }
+
+ {
+ IterateResult result;
+ ASSERT_TRUE(blob_counter.NextAndGetResult(&result));
+ ASSERT_EQ(result.key, keys[2]);
+ ASSERT_EQ(blob_counter.user_key(), user_key2);
+ ASSERT_TRUE(blob_counter.Valid());
+ ASSERT_OK(blob_counter.status());
+ ASSERT_EQ(blob_counter.key(), keys[2]);
+ ASSERT_EQ(blob_counter.value(), values[2]);
+ CheckInFlow(blob_garbage_meter, first_blob_file_number, 2,
+ 2 * first_expected_bytes);
+ CheckInFlow(blob_garbage_meter, second_blob_file_number, 2,
+ 2 * second_expected_bytes);
+ }
+
+ {
+ IterateResult result;
+ ASSERT_FALSE(blob_counter.NextAndGetResult(&result));
+ ASSERT_FALSE(blob_counter.Valid());
+ ASSERT_OK(blob_counter.status());
+ CheckInFlow(blob_garbage_meter, first_blob_file_number, 2,
+ 2 * first_expected_bytes);
+ CheckInFlow(blob_garbage_meter, second_blob_file_number, 2,
+ 2 * second_expected_bytes);
+ }
+
+ // Call SeekToLast and iterate backward
+ blob_counter.SeekToLast();
+ ASSERT_TRUE(blob_counter.Valid());
+ ASSERT_OK(blob_counter.status());
+ ASSERT_EQ(blob_counter.key(), keys[2]);
+ ASSERT_EQ(blob_counter.user_key(), user_key2);
+ ASSERT_EQ(blob_counter.value(), values[2]);
+ CheckInFlow(blob_garbage_meter, first_blob_file_number, 2,
+ 2 * first_expected_bytes);
+ CheckInFlow(blob_garbage_meter, second_blob_file_number, 2,
+ 2 * second_expected_bytes);
+
+ blob_counter.Prev();
+ ASSERT_TRUE(blob_counter.Valid());
+ ASSERT_OK(blob_counter.status());
+ ASSERT_EQ(blob_counter.key(), keys[1]);
+ ASSERT_EQ(blob_counter.user_key(), user_key1);
+ ASSERT_EQ(blob_counter.value(), values[1]);
+ CheckInFlow(blob_garbage_meter, first_blob_file_number, 2,
+ 2 * first_expected_bytes);
+ CheckInFlow(blob_garbage_meter, second_blob_file_number, 3,
+ 3 * second_expected_bytes);
+
+ blob_counter.Prev();
+ ASSERT_TRUE(blob_counter.Valid());
+ ASSERT_OK(blob_counter.status());
+ ASSERT_EQ(blob_counter.key(), keys[0]);
+ ASSERT_EQ(blob_counter.user_key(), user_key0);
+ ASSERT_EQ(blob_counter.value(), values[0]);
+ CheckInFlow(blob_garbage_meter, first_blob_file_number, 3,
+ 3 * first_expected_bytes);
+ CheckInFlow(blob_garbage_meter, second_blob_file_number, 3,
+ 3 * second_expected_bytes);
+
+ blob_counter.Prev();
+ ASSERT_FALSE(blob_counter.Valid());
+ ASSERT_OK(blob_counter.status());
+ CheckInFlow(blob_garbage_meter, first_blob_file_number, 3,
+ 3 * first_expected_bytes);
+ CheckInFlow(blob_garbage_meter, second_blob_file_number, 3,
+ 3 * second_expected_bytes);
+
+ // Call Seek for all keys (plus one that's greater than all of them)
+ blob_counter.Seek(keys[0]);
+ ASSERT_TRUE(blob_counter.Valid());
+ ASSERT_OK(blob_counter.status());
+ ASSERT_EQ(blob_counter.key(), keys[0]);
+ ASSERT_EQ(blob_counter.user_key(), user_key0);
+ ASSERT_EQ(blob_counter.value(), values[0]);
+ CheckInFlow(blob_garbage_meter, first_blob_file_number, 4,
+ 4 * first_expected_bytes);
+ CheckInFlow(blob_garbage_meter, second_blob_file_number, 3,
+ 3 * second_expected_bytes);
+
+ blob_counter.Seek(keys[1]);
+ ASSERT_TRUE(blob_counter.Valid());
+ ASSERT_OK(blob_counter.status());
+ ASSERT_EQ(blob_counter.key(), keys[1]);
+ ASSERT_EQ(blob_counter.user_key(), user_key1);
+ ASSERT_EQ(blob_counter.value(), values[1]);
+ CheckInFlow(blob_garbage_meter, first_blob_file_number, 4,
+ 4 * first_expected_bytes);
+ CheckInFlow(blob_garbage_meter, second_blob_file_number, 4,
+ 4 * second_expected_bytes);
+
+ blob_counter.Seek(keys[2]);
+ ASSERT_TRUE(blob_counter.Valid());
+ ASSERT_OK(blob_counter.status());
+ ASSERT_EQ(blob_counter.key(), keys[2]);
+ ASSERT_EQ(blob_counter.user_key(), user_key2);
+ ASSERT_EQ(blob_counter.value(), values[2]);
+ CheckInFlow(blob_garbage_meter, first_blob_file_number, 4,
+ 4 * first_expected_bytes);
+ CheckInFlow(blob_garbage_meter, second_blob_file_number, 4,
+ 4 * second_expected_bytes);
+
+ blob_counter.Seek("zzz");
+ ASSERT_FALSE(blob_counter.Valid());
+ ASSERT_OK(blob_counter.status());
+ CheckInFlow(blob_garbage_meter, first_blob_file_number, 4,
+ 4 * first_expected_bytes);
+ CheckInFlow(blob_garbage_meter, second_blob_file_number, 4,
+ 4 * second_expected_bytes);
+
+ // Call SeekForPrev for all keys (plus one that's less than all of them)
+ blob_counter.SeekForPrev("aaa");
+ ASSERT_FALSE(blob_counter.Valid());
+ ASSERT_OK(blob_counter.status());
+ CheckInFlow(blob_garbage_meter, first_blob_file_number, 4,
+ 4 * first_expected_bytes);
+ CheckInFlow(blob_garbage_meter, second_blob_file_number, 4,
+ 4 * second_expected_bytes);
+
+ blob_counter.SeekForPrev(keys[0]);
+ ASSERT_TRUE(blob_counter.Valid());
+ ASSERT_OK(blob_counter.status());
+ ASSERT_EQ(blob_counter.key(), keys[0]);
+ ASSERT_EQ(blob_counter.user_key(), user_key0);
+ ASSERT_EQ(blob_counter.value(), values[0]);
+ CheckInFlow(blob_garbage_meter, first_blob_file_number, 5,
+ 5 * first_expected_bytes);
+ CheckInFlow(blob_garbage_meter, second_blob_file_number, 4,
+ 4 * second_expected_bytes);
+
+ blob_counter.SeekForPrev(keys[1]);
+ ASSERT_TRUE(blob_counter.Valid());
+ ASSERT_OK(blob_counter.status());
+ ASSERT_EQ(blob_counter.key(), keys[1]);
+ ASSERT_EQ(blob_counter.user_key(), user_key1);
+ ASSERT_EQ(blob_counter.value(), values[1]);
+ CheckInFlow(blob_garbage_meter, first_blob_file_number, 5,
+ 5 * first_expected_bytes);
+ CheckInFlow(blob_garbage_meter, second_blob_file_number, 5,
+ 5 * second_expected_bytes);
+
+ blob_counter.SeekForPrev(keys[2]);
+ ASSERT_TRUE(blob_counter.Valid());
+ ASSERT_OK(blob_counter.status());
+ ASSERT_EQ(blob_counter.key(), keys[2]);
+ ASSERT_EQ(blob_counter.user_key(), user_key2);
+ ASSERT_EQ(blob_counter.value(), values[2]);
+ CheckInFlow(blob_garbage_meter, first_blob_file_number, 5,
+ 5 * first_expected_bytes);
+ CheckInFlow(blob_garbage_meter, second_blob_file_number, 5,
+ 5 * second_expected_bytes);
+}
+
+TEST(BlobCountingIteratorTest, CorruptBlobIndex) {
+ const std::vector<std::string> keys{
+ test::KeyStr("user_key", 1, kTypeBlobIndex)};
+ const std::vector<std::string> values{"i_am_not_a_blob_index"};
+
+ assert(keys.size() == values.size());
+
+ VectorIterator input(keys, values);
+ BlobGarbageMeter blob_garbage_meter;
+
+ BlobCountingIterator blob_counter(&input, &blob_garbage_meter);
+
+ blob_counter.SeekToFirst();
+ ASSERT_FALSE(blob_counter.Valid());
+ ASSERT_NOK(blob_counter.status());
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/blob/blob_fetcher.cc b/src/rocksdb/db/blob/blob_fetcher.cc
new file mode 100644
index 000000000..124429f93
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_fetcher.cc
@@ -0,0 +1,34 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_fetcher.h"
+
+#include "db/version_set.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+Status BlobFetcher::FetchBlob(const Slice& user_key,
+ const Slice& blob_index_slice,
+ FilePrefetchBuffer* prefetch_buffer,
+ PinnableSlice* blob_value,
+ uint64_t* bytes_read) const {
+ assert(version_);
+
+ return version_->GetBlob(read_options_, user_key, blob_index_slice,
+ prefetch_buffer, blob_value, bytes_read);
+}
+
+Status BlobFetcher::FetchBlob(const Slice& user_key,
+ const BlobIndex& blob_index,
+ FilePrefetchBuffer* prefetch_buffer,
+ PinnableSlice* blob_value,
+ uint64_t* bytes_read) const {
+ assert(version_);
+
+ return version_->GetBlob(read_options_, user_key, blob_index, prefetch_buffer,
+ blob_value, bytes_read);
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_fetcher.h b/src/rocksdb/db/blob/blob_fetcher.h
new file mode 100644
index 000000000..8aeaf965d
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_fetcher.h
@@ -0,0 +1,37 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "rocksdb/options.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Version;
+class Slice;
+class FilePrefetchBuffer;
+class PinnableSlice;
+class BlobIndex;
+
+// A thin wrapper around the blob retrieval functionality of Version.
+class BlobFetcher {
+ public:
+ BlobFetcher(const Version* version, const ReadOptions& read_options)
+ : version_(version), read_options_(read_options) {}
+
+ Status FetchBlob(const Slice& user_key, const Slice& blob_index_slice,
+ FilePrefetchBuffer* prefetch_buffer,
+ PinnableSlice* blob_value, uint64_t* bytes_read) const;
+
+ Status FetchBlob(const Slice& user_key, const BlobIndex& blob_index,
+ FilePrefetchBuffer* prefetch_buffer,
+ PinnableSlice* blob_value, uint64_t* bytes_read) const;
+
+ private:
+ const Version* version_;
+ ReadOptions read_options_;
+};
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_file_addition.cc b/src/rocksdb/db/blob/blob_file_addition.cc
new file mode 100644
index 000000000..71b1bb7fc
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_file_addition.cc
@@ -0,0 +1,156 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_file_addition.h"
+
+#include <ostream>
+#include <sstream>
+
+#include "logging/event_logger.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+#include "test_util/sync_point.h"
+#include "util/coding.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Tags for custom fields. Note that these get persisted in the manifest,
+// so existing tags should not be modified.
+enum BlobFileAddition::CustomFieldTags : uint32_t {
+ kEndMarker,
+
+ // Add forward compatible fields here
+
+ /////////////////////////////////////////////////////////////////////
+
+ kForwardIncompatibleMask = 1 << 6,
+
+ // Add forward incompatible fields here
+};
+
+void BlobFileAddition::EncodeTo(std::string* output) const {
+ PutVarint64(output, blob_file_number_);
+ PutVarint64(output, total_blob_count_);
+ PutVarint64(output, total_blob_bytes_);
+ PutLengthPrefixedSlice(output, checksum_method_);
+ PutLengthPrefixedSlice(output, checksum_value_);
+
+ // Encode any custom fields here. The format to use is a Varint32 tag (see
+ // CustomFieldTags above) followed by a length prefixed slice. Unknown custom
+ // fields will be ignored during decoding unless they're in the forward
+ // incompatible range.
+
+ TEST_SYNC_POINT_CALLBACK("BlobFileAddition::EncodeTo::CustomFields", output);
+
+ PutVarint32(output, kEndMarker);
+}
+
+Status BlobFileAddition::DecodeFrom(Slice* input) {
+ constexpr char class_name[] = "BlobFileAddition";
+
+ if (!GetVarint64(input, &blob_file_number_)) {
+ return Status::Corruption(class_name, "Error decoding blob file number");
+ }
+
+ if (!GetVarint64(input, &total_blob_count_)) {
+ return Status::Corruption(class_name, "Error decoding total blob count");
+ }
+
+ if (!GetVarint64(input, &total_blob_bytes_)) {
+ return Status::Corruption(class_name, "Error decoding total blob bytes");
+ }
+
+ Slice checksum_method;
+ if (!GetLengthPrefixedSlice(input, &checksum_method)) {
+ return Status::Corruption(class_name, "Error decoding checksum method");
+ }
+ checksum_method_ = checksum_method.ToString();
+
+ Slice checksum_value;
+ if (!GetLengthPrefixedSlice(input, &checksum_value)) {
+ return Status::Corruption(class_name, "Error decoding checksum value");
+ }
+ checksum_value_ = checksum_value.ToString();
+
+ while (true) {
+ uint32_t custom_field_tag = 0;
+ if (!GetVarint32(input, &custom_field_tag)) {
+ return Status::Corruption(class_name, "Error decoding custom field tag");
+ }
+
+ if (custom_field_tag == kEndMarker) {
+ break;
+ }
+
+ if (custom_field_tag & kForwardIncompatibleMask) {
+ return Status::Corruption(
+ class_name, "Forward incompatible custom field encountered");
+ }
+
+ Slice custom_field_value;
+ if (!GetLengthPrefixedSlice(input, &custom_field_value)) {
+ return Status::Corruption(class_name,
+ "Error decoding custom field value");
+ }
+ }
+
+ return Status::OK();
+}
+
+std::string BlobFileAddition::DebugString() const {
+ std::ostringstream oss;
+
+ oss << *this;
+
+ return oss.str();
+}
+
+std::string BlobFileAddition::DebugJSON() const {
+ JSONWriter jw;
+
+ jw << *this;
+
+ jw.EndObject();
+
+ return jw.Get();
+}
+
+bool operator==(const BlobFileAddition& lhs, const BlobFileAddition& rhs) {
+ return lhs.GetBlobFileNumber() == rhs.GetBlobFileNumber() &&
+ lhs.GetTotalBlobCount() == rhs.GetTotalBlobCount() &&
+ lhs.GetTotalBlobBytes() == rhs.GetTotalBlobBytes() &&
+ lhs.GetChecksumMethod() == rhs.GetChecksumMethod() &&
+ lhs.GetChecksumValue() == rhs.GetChecksumValue();
+}
+
+bool operator!=(const BlobFileAddition& lhs, const BlobFileAddition& rhs) {
+ return !(lhs == rhs);
+}
+
+std::ostream& operator<<(std::ostream& os,
+ const BlobFileAddition& blob_file_addition) {
+ os << "blob_file_number: " << blob_file_addition.GetBlobFileNumber()
+ << " total_blob_count: " << blob_file_addition.GetTotalBlobCount()
+ << " total_blob_bytes: " << blob_file_addition.GetTotalBlobBytes()
+ << " checksum_method: " << blob_file_addition.GetChecksumMethod()
+ << " checksum_value: "
+ << Slice(blob_file_addition.GetChecksumValue()).ToString(/* hex */ true);
+
+ return os;
+}
+
+JSONWriter& operator<<(JSONWriter& jw,
+ const BlobFileAddition& blob_file_addition) {
+ jw << "BlobFileNumber" << blob_file_addition.GetBlobFileNumber()
+ << "TotalBlobCount" << blob_file_addition.GetTotalBlobCount()
+ << "TotalBlobBytes" << blob_file_addition.GetTotalBlobBytes()
+ << "ChecksumMethod" << blob_file_addition.GetChecksumMethod()
+ << "ChecksumValue"
+ << Slice(blob_file_addition.GetChecksumValue()).ToString(/* hex */ true);
+
+ return jw;
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_file_addition.h b/src/rocksdb/db/blob/blob_file_addition.h
new file mode 100644
index 000000000..43b1a0bcb
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_file_addition.h
@@ -0,0 +1,67 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cassert>
+#include <cstdint>
+#include <iosfwd>
+#include <string>
+
+#include "db/blob/blob_constants.h"
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class JSONWriter;
+class Slice;
+class Status;
+
+class BlobFileAddition {
+ public:
+ BlobFileAddition() = default;
+
+ BlobFileAddition(uint64_t blob_file_number, uint64_t total_blob_count,
+ uint64_t total_blob_bytes, std::string checksum_method,
+ std::string checksum_value)
+ : blob_file_number_(blob_file_number),
+ total_blob_count_(total_blob_count),
+ total_blob_bytes_(total_blob_bytes),
+ checksum_method_(std::move(checksum_method)),
+ checksum_value_(std::move(checksum_value)) {
+ assert(checksum_method_.empty() == checksum_value_.empty());
+ }
+
+ uint64_t GetBlobFileNumber() const { return blob_file_number_; }
+ uint64_t GetTotalBlobCount() const { return total_blob_count_; }
+ uint64_t GetTotalBlobBytes() const { return total_blob_bytes_; }
+ const std::string& GetChecksumMethod() const { return checksum_method_; }
+ const std::string& GetChecksumValue() const { return checksum_value_; }
+
+ void EncodeTo(std::string* output) const;
+ Status DecodeFrom(Slice* input);
+
+ std::string DebugString() const;
+ std::string DebugJSON() const;
+
+ private:
+ enum CustomFieldTags : uint32_t;
+
+ uint64_t blob_file_number_ = kInvalidBlobFileNumber;
+ uint64_t total_blob_count_ = 0;
+ uint64_t total_blob_bytes_ = 0;
+ std::string checksum_method_;
+ std::string checksum_value_;
+};
+
+bool operator==(const BlobFileAddition& lhs, const BlobFileAddition& rhs);
+bool operator!=(const BlobFileAddition& lhs, const BlobFileAddition& rhs);
+
+std::ostream& operator<<(std::ostream& os,
+ const BlobFileAddition& blob_file_addition);
+JSONWriter& operator<<(JSONWriter& jw,
+ const BlobFileAddition& blob_file_addition);
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_file_addition_test.cc b/src/rocksdb/db/blob/blob_file_addition_test.cc
new file mode 100644
index 000000000..64cb0a9d6
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_file_addition_test.cc
@@ -0,0 +1,211 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_file_addition.h"
+
+#include <cstdint>
+#include <cstring>
+#include <string>
+
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "util/coding.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class BlobFileAdditionTest : public testing::Test {
+ public:
+ static void TestEncodeDecode(const BlobFileAddition& blob_file_addition) {
+ std::string encoded;
+ blob_file_addition.EncodeTo(&encoded);
+
+ BlobFileAddition decoded;
+ Slice input(encoded);
+ ASSERT_OK(decoded.DecodeFrom(&input));
+
+ ASSERT_EQ(blob_file_addition, decoded);
+ }
+};
+
+TEST_F(BlobFileAdditionTest, Empty) {
+ BlobFileAddition blob_file_addition;
+
+ ASSERT_EQ(blob_file_addition.GetBlobFileNumber(), kInvalidBlobFileNumber);
+ ASSERT_EQ(blob_file_addition.GetTotalBlobCount(), 0);
+ ASSERT_EQ(blob_file_addition.GetTotalBlobBytes(), 0);
+ ASSERT_TRUE(blob_file_addition.GetChecksumMethod().empty());
+ ASSERT_TRUE(blob_file_addition.GetChecksumValue().empty());
+
+ TestEncodeDecode(blob_file_addition);
+}
+
+TEST_F(BlobFileAdditionTest, NonEmpty) {
+ constexpr uint64_t blob_file_number = 123;
+ constexpr uint64_t total_blob_count = 2;
+ constexpr uint64_t total_blob_bytes = 123456;
+ const std::string checksum_method("SHA1");
+ const std::string checksum_value(
+ "\xbd\xb7\xf3\x4a\x59\xdf\xa1\x59\x2c\xe7\xf5\x2e\x99\xf9\x8c\x57\x0c\x52"
+ "\x5c\xbd");
+
+ BlobFileAddition blob_file_addition(blob_file_number, total_blob_count,
+ total_blob_bytes, checksum_method,
+ checksum_value);
+
+ ASSERT_EQ(blob_file_addition.GetBlobFileNumber(), blob_file_number);
+ ASSERT_EQ(blob_file_addition.GetTotalBlobCount(), total_blob_count);
+ ASSERT_EQ(blob_file_addition.GetTotalBlobBytes(), total_blob_bytes);
+ ASSERT_EQ(blob_file_addition.GetChecksumMethod(), checksum_method);
+ ASSERT_EQ(blob_file_addition.GetChecksumValue(), checksum_value);
+
+ TestEncodeDecode(blob_file_addition);
+}
+
+TEST_F(BlobFileAdditionTest, DecodeErrors) {
+ std::string str;
+ Slice slice(str);
+
+ BlobFileAddition blob_file_addition;
+
+ {
+ const Status s = blob_file_addition.DecodeFrom(&slice);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(std::strstr(s.getState(), "blob file number"));
+ }
+
+ constexpr uint64_t blob_file_number = 123;
+ PutVarint64(&str, blob_file_number);
+ slice = str;
+
+ {
+ const Status s = blob_file_addition.DecodeFrom(&slice);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(std::strstr(s.getState(), "total blob count"));
+ }
+
+ constexpr uint64_t total_blob_count = 4567;
+ PutVarint64(&str, total_blob_count);
+ slice = str;
+
+ {
+ const Status s = blob_file_addition.DecodeFrom(&slice);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(std::strstr(s.getState(), "total blob bytes"));
+ }
+
+ constexpr uint64_t total_blob_bytes = 12345678;
+ PutVarint64(&str, total_blob_bytes);
+ slice = str;
+
+ {
+ const Status s = blob_file_addition.DecodeFrom(&slice);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(std::strstr(s.getState(), "checksum method"));
+ }
+
+ constexpr char checksum_method[] = "SHA1";
+ PutLengthPrefixedSlice(&str, checksum_method);
+ slice = str;
+
+ {
+ const Status s = blob_file_addition.DecodeFrom(&slice);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(std::strstr(s.getState(), "checksum value"));
+ }
+
+ constexpr char checksum_value[] =
+ "\xbd\xb7\xf3\x4a\x59\xdf\xa1\x59\x2c\xe7\xf5\x2e\x99\xf9\x8c\x57\x0c\x52"
+ "\x5c\xbd";
+ PutLengthPrefixedSlice(&str, checksum_value);
+ slice = str;
+
+ {
+ const Status s = blob_file_addition.DecodeFrom(&slice);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(std::strstr(s.getState(), "custom field tag"));
+ }
+
+ constexpr uint32_t custom_tag = 2;
+ PutVarint32(&str, custom_tag);
+ slice = str;
+
+ {
+ const Status s = blob_file_addition.DecodeFrom(&slice);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(std::strstr(s.getState(), "custom field value"));
+ }
+}
+
+TEST_F(BlobFileAdditionTest, ForwardCompatibleCustomField) {
+ SyncPoint::GetInstance()->SetCallBack(
+ "BlobFileAddition::EncodeTo::CustomFields", [&](void* arg) {
+ std::string* output = static_cast<std::string*>(arg);
+
+ constexpr uint32_t forward_compatible_tag = 2;
+ PutVarint32(output, forward_compatible_tag);
+
+ PutLengthPrefixedSlice(output, "deadbeef");
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ constexpr uint64_t blob_file_number = 678;
+ constexpr uint64_t total_blob_count = 9999;
+ constexpr uint64_t total_blob_bytes = 100000000;
+ const std::string checksum_method("CRC32");
+ const std::string checksum_value("\x3d\x87\xff\x57");
+
+ BlobFileAddition blob_file_addition(blob_file_number, total_blob_count,
+ total_blob_bytes, checksum_method,
+ checksum_value);
+
+ TestEncodeDecode(blob_file_addition);
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_F(BlobFileAdditionTest, ForwardIncompatibleCustomField) {
+ SyncPoint::GetInstance()->SetCallBack(
+ "BlobFileAddition::EncodeTo::CustomFields", [&](void* arg) {
+ std::string* output = static_cast<std::string*>(arg);
+
+ constexpr uint32_t forward_incompatible_tag = (1 << 6) + 1;
+ PutVarint32(output, forward_incompatible_tag);
+
+ PutLengthPrefixedSlice(output, "foobar");
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ constexpr uint64_t blob_file_number = 456;
+ constexpr uint64_t total_blob_count = 100;
+ constexpr uint64_t total_blob_bytes = 2000000;
+ const std::string checksum_method("CRC32B");
+ const std::string checksum_value("\x6d\xbd\xf2\x3a");
+
+ BlobFileAddition blob_file_addition(blob_file_number, total_blob_count,
+ total_blob_bytes, checksum_method,
+ checksum_value);
+
+ std::string encoded;
+ blob_file_addition.EncodeTo(&encoded);
+
+ BlobFileAddition decoded_blob_file_addition;
+ Slice input(encoded);
+ const Status s = decoded_blob_file_addition.DecodeFrom(&input);
+
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(std::strstr(s.getState(), "Forward incompatible"));
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/blob/blob_file_builder.cc b/src/rocksdb/db/blob/blob_file_builder.cc
new file mode 100644
index 000000000..5e0e7f6cb
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_file_builder.cc
@@ -0,0 +1,446 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_file_builder.h"
+
+#include <cassert>
+
+#include "db/blob/blob_contents.h"
+#include "db/blob/blob_file_addition.h"
+#include "db/blob/blob_file_completion_callback.h"
+#include "db/blob/blob_index.h"
+#include "db/blob/blob_log_format.h"
+#include "db/blob/blob_log_writer.h"
+#include "db/event_helpers.h"
+#include "db/version_set.h"
+#include "file/filename.h"
+#include "file/read_write_util.h"
+#include "file/writable_file_writer.h"
+#include "logging/logging.h"
+#include "options/cf_options.h"
+#include "options/options_helper.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+#include "test_util/sync_point.h"
+#include "trace_replay/io_tracer.h"
+#include "util/compression.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+BlobFileBuilder::BlobFileBuilder(
+ VersionSet* versions, FileSystem* fs,
+ const ImmutableOptions* immutable_options,
+ const MutableCFOptions* mutable_cf_options, const FileOptions* file_options,
+ std::string db_id, std::string db_session_id, int job_id,
+ uint32_t column_family_id, const std::string& column_family_name,
+ Env::IOPriority io_priority, Env::WriteLifeTimeHint write_hint,
+ const std::shared_ptr<IOTracer>& io_tracer,
+ BlobFileCompletionCallback* blob_callback,
+ BlobFileCreationReason creation_reason,
+ std::vector<std::string>* blob_file_paths,
+ std::vector<BlobFileAddition>* blob_file_additions)
+ : BlobFileBuilder([versions]() { return versions->NewFileNumber(); }, fs,
+ immutable_options, mutable_cf_options, file_options,
+ db_id, db_session_id, job_id, column_family_id,
+ column_family_name, io_priority, write_hint, io_tracer,
+ blob_callback, creation_reason, blob_file_paths,
+ blob_file_additions) {}
+
+BlobFileBuilder::BlobFileBuilder(
+ std::function<uint64_t()> file_number_generator, FileSystem* fs,
+ const ImmutableOptions* immutable_options,
+ const MutableCFOptions* mutable_cf_options, const FileOptions* file_options,
+ std::string db_id, std::string db_session_id, int job_id,
+ uint32_t column_family_id, const std::string& column_family_name,
+ Env::IOPriority io_priority, Env::WriteLifeTimeHint write_hint,
+ const std::shared_ptr<IOTracer>& io_tracer,
+ BlobFileCompletionCallback* blob_callback,
+ BlobFileCreationReason creation_reason,
+ std::vector<std::string>* blob_file_paths,
+ std::vector<BlobFileAddition>* blob_file_additions)
+ : file_number_generator_(std::move(file_number_generator)),
+ fs_(fs),
+ immutable_options_(immutable_options),
+ min_blob_size_(mutable_cf_options->min_blob_size),
+ blob_file_size_(mutable_cf_options->blob_file_size),
+ blob_compression_type_(mutable_cf_options->blob_compression_type),
+ prepopulate_blob_cache_(mutable_cf_options->prepopulate_blob_cache),
+ file_options_(file_options),
+ db_id_(std::move(db_id)),
+ db_session_id_(std::move(db_session_id)),
+ job_id_(job_id),
+ column_family_id_(column_family_id),
+ column_family_name_(column_family_name),
+ io_priority_(io_priority),
+ write_hint_(write_hint),
+ io_tracer_(io_tracer),
+ blob_callback_(blob_callback),
+ creation_reason_(creation_reason),
+ blob_file_paths_(blob_file_paths),
+ blob_file_additions_(blob_file_additions),
+ blob_count_(0),
+ blob_bytes_(0) {
+ assert(file_number_generator_);
+ assert(fs_);
+ assert(immutable_options_);
+ assert(file_options_);
+ assert(blob_file_paths_);
+ assert(blob_file_paths_->empty());
+ assert(blob_file_additions_);
+ assert(blob_file_additions_->empty());
+}
+
+BlobFileBuilder::~BlobFileBuilder() = default;
+
+Status BlobFileBuilder::Add(const Slice& key, const Slice& value,
+ std::string* blob_index) {
+ assert(blob_index);
+ assert(blob_index->empty());
+
+ if (value.size() < min_blob_size_) {
+ return Status::OK();
+ }
+
+ {
+ const Status s = OpenBlobFileIfNeeded();
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ Slice blob = value;
+ std::string compressed_blob;
+
+ {
+ const Status s = CompressBlobIfNeeded(&blob, &compressed_blob);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ uint64_t blob_file_number = 0;
+ uint64_t blob_offset = 0;
+
+ {
+ const Status s =
+ WriteBlobToFile(key, blob, &blob_file_number, &blob_offset);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ {
+ const Status s = CloseBlobFileIfNeeded();
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ {
+ const Status s =
+ PutBlobIntoCacheIfNeeded(value, blob_file_number, blob_offset);
+ if (!s.ok()) {
+ ROCKS_LOG_WARN(immutable_options_->info_log,
+ "Failed to pre-populate the blob into blob cache: %s",
+ s.ToString().c_str());
+ }
+ }
+
+ BlobIndex::EncodeBlob(blob_index, blob_file_number, blob_offset, blob.size(),
+ blob_compression_type_);
+
+ return Status::OK();
+}
+
+Status BlobFileBuilder::Finish() {
+ if (!IsBlobFileOpen()) {
+ return Status::OK();
+ }
+
+ return CloseBlobFile();
+}
+
+bool BlobFileBuilder::IsBlobFileOpen() const { return !!writer_; }
+
+Status BlobFileBuilder::OpenBlobFileIfNeeded() {
+ if (IsBlobFileOpen()) {
+ return Status::OK();
+ }
+
+ assert(!blob_count_);
+ assert(!blob_bytes_);
+
+ assert(file_number_generator_);
+ const uint64_t blob_file_number = file_number_generator_();
+
+ assert(immutable_options_);
+ assert(!immutable_options_->cf_paths.empty());
+ std::string blob_file_path =
+ BlobFileName(immutable_options_->cf_paths.front().path, blob_file_number);
+
+ if (blob_callback_) {
+ blob_callback_->OnBlobFileCreationStarted(
+ blob_file_path, column_family_name_, job_id_, creation_reason_);
+ }
+
+ std::unique_ptr<FSWritableFile> file;
+
+ {
+ assert(file_options_);
+ Status s = NewWritableFile(fs_, blob_file_path, &file, *file_options_);
+
+ TEST_SYNC_POINT_CALLBACK(
+ "BlobFileBuilder::OpenBlobFileIfNeeded:NewWritableFile", &s);
+
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ // Note: files get added to blob_file_paths_ right after the open, so they
+ // can be cleaned up upon failure. Contrast this with blob_file_additions_,
+ // which only contains successfully written files.
+ assert(blob_file_paths_);
+ blob_file_paths_->emplace_back(std::move(blob_file_path));
+
+ assert(file);
+ file->SetIOPriority(io_priority_);
+ file->SetWriteLifeTimeHint(write_hint_);
+ FileTypeSet tmp_set = immutable_options_->checksum_handoff_file_types;
+ Statistics* const statistics = immutable_options_->stats;
+ std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
+ std::move(file), blob_file_paths_->back(), *file_options_,
+ immutable_options_->clock, io_tracer_, statistics,
+ immutable_options_->listeners,
+ immutable_options_->file_checksum_gen_factory.get(),
+ tmp_set.Contains(FileType::kBlobFile), false));
+
+ constexpr bool do_flush = false;
+
+ std::unique_ptr<BlobLogWriter> blob_log_writer(new BlobLogWriter(
+ std::move(file_writer), immutable_options_->clock, statistics,
+ blob_file_number, immutable_options_->use_fsync, do_flush));
+
+ constexpr bool has_ttl = false;
+ constexpr ExpirationRange expiration_range;
+
+ BlobLogHeader header(column_family_id_, blob_compression_type_, has_ttl,
+ expiration_range);
+
+ {
+ Status s = blob_log_writer->WriteHeader(header);
+
+ TEST_SYNC_POINT_CALLBACK(
+ "BlobFileBuilder::OpenBlobFileIfNeeded:WriteHeader", &s);
+
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ writer_ = std::move(blob_log_writer);
+
+ assert(IsBlobFileOpen());
+
+ return Status::OK();
+}
+
+Status BlobFileBuilder::CompressBlobIfNeeded(
+ Slice* blob, std::string* compressed_blob) const {
+ assert(blob);
+ assert(compressed_blob);
+ assert(compressed_blob->empty());
+ assert(immutable_options_);
+
+ if (blob_compression_type_ == kNoCompression) {
+ return Status::OK();
+ }
+
+ CompressionOptions opts;
+ CompressionContext context(blob_compression_type_);
+ constexpr uint64_t sample_for_compression = 0;
+
+ CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(),
+ blob_compression_type_, sample_for_compression);
+
+ constexpr uint32_t compression_format_version = 2;
+
+ bool success = false;
+
+ {
+ StopWatch stop_watch(immutable_options_->clock, immutable_options_->stats,
+ BLOB_DB_COMPRESSION_MICROS);
+ success =
+ CompressData(*blob, info, compression_format_version, compressed_blob);
+ }
+
+ if (!success) {
+ return Status::Corruption("Error compressing blob");
+ }
+
+ *blob = Slice(*compressed_blob);
+
+ return Status::OK();
+}
+
+Status BlobFileBuilder::WriteBlobToFile(const Slice& key, const Slice& blob,
+ uint64_t* blob_file_number,
+ uint64_t* blob_offset) {
+ assert(IsBlobFileOpen());
+ assert(blob_file_number);
+ assert(blob_offset);
+
+ uint64_t key_offset = 0;
+
+ Status s = writer_->AddRecord(key, blob, &key_offset, blob_offset);
+
+ TEST_SYNC_POINT_CALLBACK("BlobFileBuilder::WriteBlobToFile:AddRecord", &s);
+
+ if (!s.ok()) {
+ return s;
+ }
+
+ *blob_file_number = writer_->get_log_number();
+
+ ++blob_count_;
+ blob_bytes_ += BlobLogRecord::kHeaderSize + key.size() + blob.size();
+
+ return Status::OK();
+}
+
+Status BlobFileBuilder::CloseBlobFile() {
+ assert(IsBlobFileOpen());
+
+ BlobLogFooter footer;
+ footer.blob_count = blob_count_;
+
+ std::string checksum_method;
+ std::string checksum_value;
+
+ Status s = writer_->AppendFooter(footer, &checksum_method, &checksum_value);
+
+ TEST_SYNC_POINT_CALLBACK("BlobFileBuilder::WriteBlobToFile:AppendFooter", &s);
+
+ if (!s.ok()) {
+ return s;
+ }
+
+ const uint64_t blob_file_number = writer_->get_log_number();
+
+ if (blob_callback_) {
+ s = blob_callback_->OnBlobFileCompleted(
+ blob_file_paths_->back(), column_family_name_, job_id_,
+ blob_file_number, creation_reason_, s, checksum_value, checksum_method,
+ blob_count_, blob_bytes_);
+ }
+
+ assert(blob_file_additions_);
+ blob_file_additions_->emplace_back(blob_file_number, blob_count_, blob_bytes_,
+ std::move(checksum_method),
+ std::move(checksum_value));
+
+ assert(immutable_options_);
+ ROCKS_LOG_INFO(immutable_options_->logger,
+ "[%s] [JOB %d] Generated blob file #%" PRIu64 ": %" PRIu64
+ " total blobs, %" PRIu64 " total bytes",
+ column_family_name_.c_str(), job_id_, blob_file_number,
+ blob_count_, blob_bytes_);
+
+ writer_.reset();
+ blob_count_ = 0;
+ blob_bytes_ = 0;
+
+ return s;
+}
+
+Status BlobFileBuilder::CloseBlobFileIfNeeded() {
+ assert(IsBlobFileOpen());
+
+ const WritableFileWriter* const file_writer = writer_->file();
+ assert(file_writer);
+
+ if (file_writer->GetFileSize() < blob_file_size_) {
+ return Status::OK();
+ }
+
+ return CloseBlobFile();
+}
+
+void BlobFileBuilder::Abandon(const Status& s) {
+ if (!IsBlobFileOpen()) {
+ return;
+ }
+ if (blob_callback_) {
+ // BlobFileBuilder::Abandon() is called because of error while writing to
+ // Blob files. So we can ignore the below error.
+ blob_callback_
+ ->OnBlobFileCompleted(blob_file_paths_->back(), column_family_name_,
+ job_id_, writer_->get_log_number(),
+ creation_reason_, s, "", "", blob_count_,
+ blob_bytes_)
+ .PermitUncheckedError();
+ }
+
+ writer_.reset();
+ blob_count_ = 0;
+ blob_bytes_ = 0;
+}
+
+Status BlobFileBuilder::PutBlobIntoCacheIfNeeded(const Slice& blob,
+ uint64_t blob_file_number,
+ uint64_t blob_offset) const {
+ Status s = Status::OK();
+
+ auto blob_cache = immutable_options_->blob_cache;
+ auto statistics = immutable_options_->statistics.get();
+ bool warm_cache =
+ prepopulate_blob_cache_ == PrepopulateBlobCache::kFlushOnly &&
+ creation_reason_ == BlobFileCreationReason::kFlush;
+
+ if (blob_cache && warm_cache) {
+ const OffsetableCacheKey base_cache_key(db_id_, db_session_id_,
+ blob_file_number);
+ const CacheKey cache_key = base_cache_key.WithOffset(blob_offset);
+ const Slice key = cache_key.AsSlice();
+
+ const Cache::Priority priority = Cache::Priority::BOTTOM;
+
+ // Objects to be put into the cache have to be heap-allocated and
+ // self-contained, i.e. own their contents. The Cache has to be able to
+ // take unique ownership of them.
+ CacheAllocationPtr allocation =
+ AllocateBlock(blob.size(), blob_cache->memory_allocator());
+ memcpy(allocation.get(), blob.data(), blob.size());
+ std::unique_ptr<BlobContents> buf =
+ BlobContents::Create(std::move(allocation), blob.size());
+
+ Cache::CacheItemHelper* const cache_item_helper =
+ BlobContents::GetCacheItemHelper();
+ assert(cache_item_helper);
+
+ if (immutable_options_->lowest_used_cache_tier ==
+ CacheTier::kNonVolatileBlockTier) {
+ s = blob_cache->Insert(key, buf.get(), cache_item_helper,
+ buf->ApproximateMemoryUsage(),
+ nullptr /* cache_handle */, priority);
+ } else {
+ s = blob_cache->Insert(key, buf.get(), buf->ApproximateMemoryUsage(),
+ cache_item_helper->del_cb,
+ nullptr /* cache_handle */, priority);
+ }
+
+ if (s.ok()) {
+ RecordTick(statistics, BLOB_DB_CACHE_ADD);
+ RecordTick(statistics, BLOB_DB_CACHE_BYTES_WRITE, buf->size());
+ buf.release();
+ } else {
+ RecordTick(statistics, BLOB_DB_CACHE_ADD_FAILURES);
+ }
+ }
+
+ return s;
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_file_builder.h b/src/rocksdb/db/blob/blob_file_builder.h
new file mode 100644
index 000000000..8e7aab502
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_file_builder.h
@@ -0,0 +1,112 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <cinttypes>
+#include <functional>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "rocksdb/advanced_options.h"
+#include "rocksdb/compression_type.h"
+#include "rocksdb/env.h"
+#include "rocksdb/rocksdb_namespace.h"
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class VersionSet;
+class FileSystem;
+class SystemClock;
+struct ImmutableOptions;
+struct MutableCFOptions;
+struct FileOptions;
+class BlobFileAddition;
+class Status;
+class Slice;
+class BlobLogWriter;
+class IOTracer;
+class BlobFileCompletionCallback;
+
+class BlobFileBuilder {
+ public:
+ BlobFileBuilder(VersionSet* versions, FileSystem* fs,
+ const ImmutableOptions* immutable_options,
+ const MutableCFOptions* mutable_cf_options,
+ const FileOptions* file_options, std::string db_id,
+ std::string db_session_id, int job_id,
+ uint32_t column_family_id,
+ const std::string& column_family_name,
+ Env::IOPriority io_priority,
+ Env::WriteLifeTimeHint write_hint,
+ const std::shared_ptr<IOTracer>& io_tracer,
+ BlobFileCompletionCallback* blob_callback,
+ BlobFileCreationReason creation_reason,
+ std::vector<std::string>* blob_file_paths,
+ std::vector<BlobFileAddition>* blob_file_additions);
+
+ BlobFileBuilder(std::function<uint64_t()> file_number_generator,
+ FileSystem* fs, const ImmutableOptions* immutable_options,
+ const MutableCFOptions* mutable_cf_options,
+ const FileOptions* file_options, std::string db_id,
+ std::string db_session_id, int job_id,
+ uint32_t column_family_id,
+ const std::string& column_family_name,
+ Env::IOPriority io_priority,
+ Env::WriteLifeTimeHint write_hint,
+ const std::shared_ptr<IOTracer>& io_tracer,
+ BlobFileCompletionCallback* blob_callback,
+ BlobFileCreationReason creation_reason,
+ std::vector<std::string>* blob_file_paths,
+ std::vector<BlobFileAddition>* blob_file_additions);
+
+ BlobFileBuilder(const BlobFileBuilder&) = delete;
+ BlobFileBuilder& operator=(const BlobFileBuilder&) = delete;
+
+ ~BlobFileBuilder();
+
+ Status Add(const Slice& key, const Slice& value, std::string* blob_index);
+ Status Finish();
+ void Abandon(const Status& s);
+
+ private:
+ bool IsBlobFileOpen() const;
+ Status OpenBlobFileIfNeeded();
+ Status CompressBlobIfNeeded(Slice* blob, std::string* compressed_blob) const;
+ Status WriteBlobToFile(const Slice& key, const Slice& blob,
+ uint64_t* blob_file_number, uint64_t* blob_offset);
+ Status CloseBlobFile();
+ Status CloseBlobFileIfNeeded();
+
+ Status PutBlobIntoCacheIfNeeded(const Slice& blob, uint64_t blob_file_number,
+ uint64_t blob_offset) const;
+
+ std::function<uint64_t()> file_number_generator_;
+ FileSystem* fs_;
+ const ImmutableOptions* immutable_options_;
+ uint64_t min_blob_size_;
+ uint64_t blob_file_size_;
+ CompressionType blob_compression_type_;
+ PrepopulateBlobCache prepopulate_blob_cache_;
+ const FileOptions* file_options_;
+ const std::string db_id_;
+ const std::string db_session_id_;
+ int job_id_;
+ uint32_t column_family_id_;
+ std::string column_family_name_;
+ Env::IOPriority io_priority_;
+ Env::WriteLifeTimeHint write_hint_;
+ std::shared_ptr<IOTracer> io_tracer_;
+ BlobFileCompletionCallback* blob_callback_;
+ BlobFileCreationReason creation_reason_;
+ std::vector<std::string>* blob_file_paths_;
+ std::vector<BlobFileAddition>* blob_file_additions_;
+ std::unique_ptr<BlobLogWriter> writer_;
+ uint64_t blob_count_;
+ uint64_t blob_bytes_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_file_builder_test.cc b/src/rocksdb/db/blob/blob_file_builder_test.cc
new file mode 100644
index 000000000..3a0feee45
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_file_builder_test.cc
@@ -0,0 +1,680 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_file_builder.h"
+
+#include <cassert>
+#include <cinttypes>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/blob/blob_file_addition.h"
+#include "db/blob/blob_index.h"
+#include "db/blob/blob_log_format.h"
+#include "db/blob/blob_log_sequential_reader.h"
+#include "env/mock_env.h"
+#include "file/filename.h"
+#include "file/random_access_file_reader.h"
+#include "options/cf_options.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_checksum.h"
+#include "rocksdb/options.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "util/compression.h"
+#include "utilities/fault_injection_env.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Deterministic file-number source for the tests below: successive calls
+// yield 2, 3, 4, ... (the counter starts at 1 and is advanced before use).
+class TestFileNumberGenerator {
+ public:
+  uint64_t operator()() {
+    next_file_number_ += 1;
+    return next_file_number_;
+  }
+
+ private:
+  uint64_t next_file_number_ = 1;
+};
+
+// Test fixture providing a mock environment plus a helper that re-reads a
+// blob file and validates its header, records, and footer against the
+// key/value pairs and blob references ("blob indexes") the builder produced.
+class BlobFileBuilderTest : public testing::Test {
+ protected:
+  BlobFileBuilderTest() {
+    mock_env_.reset(MockEnv::Create(Env::Default()));
+    fs_ = mock_env_->GetFileSystem().get();
+    clock_ = mock_env_->GetSystemClock().get();
+  }
+
+  // Sequentially reads the blob file at blob_file_path and asserts that its
+  // header/footer fields and each record match expected_key_value_pairs, and
+  // that blob_indexes[i] (as returned by BlobFileBuilder::Add) points at the
+  // offset where record i was actually found.
+  void VerifyBlobFile(uint64_t blob_file_number,
+                      const std::string& blob_file_path,
+                      uint32_t column_family_id,
+                      CompressionType blob_compression_type,
+                      const std::vector<std::pair<std::string, std::string>>&
+                          expected_key_value_pairs,
+                      const std::vector<std::string>& blob_indexes) {
+    assert(expected_key_value_pairs.size() == blob_indexes.size());
+
+    std::unique_ptr<FSRandomAccessFile> file;
+    constexpr IODebugContext* dbg = nullptr;
+    ASSERT_OK(
+        fs_->NewRandomAccessFile(blob_file_path, file_options_, &file, dbg));
+
+    std::unique_ptr<RandomAccessFileReader> file_reader(
+        new RandomAccessFileReader(std::move(file), blob_file_path, clock_));
+
+    constexpr Statistics* statistics = nullptr;
+    BlobLogSequentialReader blob_log_reader(std::move(file_reader), clock_,
+                                            statistics);
+
+    // The tests only write non-TTL blobs, so the header must carry no TTL
+    // flag and an empty expiration range.
+    BlobLogHeader header;
+    ASSERT_OK(blob_log_reader.ReadHeader(&header));
+    ASSERT_EQ(header.version, kVersion1);
+    ASSERT_EQ(header.column_family_id, column_family_id);
+    ASSERT_EQ(header.compression, blob_compression_type);
+    ASSERT_FALSE(header.has_ttl);
+    ASSERT_EQ(header.expiration_range, ExpirationRange());
+
+    for (size_t i = 0; i < expected_key_value_pairs.size(); ++i) {
+      BlobLogRecord record;
+      uint64_t blob_offset = 0;
+
+      ASSERT_OK(blob_log_reader.ReadRecord(
+          &record, BlobLogSequentialReader::kReadHeaderKeyBlob, &blob_offset));
+
+      // Check the contents of the blob file
+      const auto& expected_key_value = expected_key_value_pairs[i];
+      const auto& key = expected_key_value.first;
+      const auto& value = expected_key_value.second;
+
+      ASSERT_EQ(record.key_size, key.size());
+      ASSERT_EQ(record.value_size, value.size());
+      ASSERT_EQ(record.expiration, 0);
+      ASSERT_EQ(record.key, key);
+      ASSERT_EQ(record.value, value);
+
+      // Make sure the blob reference returned by the builder points to the
+      // right place
+      BlobIndex blob_index;
+      ASSERT_OK(blob_index.DecodeFrom(blob_indexes[i]));
+      ASSERT_FALSE(blob_index.IsInlined());
+      ASSERT_FALSE(blob_index.HasTTL());
+      ASSERT_EQ(blob_index.file_number(), blob_file_number);
+      ASSERT_EQ(blob_index.offset(), blob_offset);
+      ASSERT_EQ(blob_index.size(), value.size());
+    }
+
+    BlobLogFooter footer;
+    ASSERT_OK(blob_log_reader.ReadFooter(&footer));
+    ASSERT_EQ(footer.blob_count, expected_key_value_pairs.size());
+    ASSERT_EQ(footer.expiration_range, ExpirationRange());
+  }
+
+  std::unique_ptr<Env> mock_env_;
+  FileSystem* fs_;     // owned by mock_env_
+  SystemClock* clock_;  // owned by mock_env_
+  FileOptions file_options_;
+};
+
+TEST_F(BlobFileBuilderTest, BuildAndCheckOneFile) {
+  // Build a single blob file
+  constexpr size_t number_of_blobs = 10;
+  constexpr size_t key_size = 1;
+  constexpr size_t value_size = 4;
+  constexpr size_t value_offset = 1234;
+
+  Options options;
+  options.cf_paths.emplace_back(
+      test::PerThreadDBPath(mock_env_.get(),
+                            "BlobFileBuilderTest_BuildAndCheckOneFile"),
+      0);
+  options.enable_blob_files = true;
+  options.env = mock_env_.get();
+
+  ImmutableOptions immutable_options(options);
+  MutableCFOptions mutable_cf_options(options);
+
+  constexpr int job_id = 1;
+  constexpr uint32_t column_family_id = 123;
+  constexpr char column_family_name[] = "foobar";
+  constexpr Env::IOPriority io_priority = Env::IO_HIGH;
+  constexpr Env::WriteLifeTimeHint write_hint = Env::WLTH_MEDIUM;
+
+  std::vector<std::string> blob_file_paths;
+  std::vector<BlobFileAddition> blob_file_additions;
+
+  BlobFileBuilder builder(
+      TestFileNumberGenerator(), fs_, &immutable_options, &mutable_cf_options,
+      &file_options_, "" /*db_id*/, "" /*db_session_id*/, job_id,
+      column_family_id, column_family_name, io_priority, write_hint,
+      nullptr /*IOTracer*/, nullptr /*BlobFileCompletionCallback*/,
+      BlobFileCreationReason::kFlush, &blob_file_paths, &blob_file_additions);
+
+  std::vector<std::pair<std::string, std::string>> expected_key_value_pairs(
+      number_of_blobs);
+  std::vector<std::string> blob_indexes(number_of_blobs);
+
+  // Add ten key/value pairs; each Add must hand back a non-empty blob
+  // reference since no value is below the (default) min_blob_size.
+  for (size_t i = 0; i < number_of_blobs; ++i) {
+    auto& expected_key_value = expected_key_value_pairs[i];
+
+    auto& key = expected_key_value.first;
+    key = std::to_string(i);
+    assert(key.size() == key_size);
+
+    auto& value = expected_key_value.second;
+    value = std::to_string(i + value_offset);
+    assert(value.size() == value_size);
+
+    auto& blob_index = blob_indexes[i];
+
+    ASSERT_OK(builder.Add(key, value, &blob_index));
+    ASSERT_FALSE(blob_index.empty());
+  }
+
+  ASSERT_OK(builder.Finish());
+
+  // Check the metadata generated
+  // Note: TestFileNumberGenerator's first value is 2, hence the expected
+  // file number below.
+  constexpr uint64_t blob_file_number = 2;
+
+  ASSERT_EQ(blob_file_paths.size(), 1);
+
+  const std::string& blob_file_path = blob_file_paths[0];
+
+  ASSERT_EQ(
+      blob_file_path,
+      BlobFileName(immutable_options.cf_paths.front().path, blob_file_number));
+
+  ASSERT_EQ(blob_file_additions.size(), 1);
+
+  const auto& blob_file_addition = blob_file_additions[0];
+
+  ASSERT_EQ(blob_file_addition.GetBlobFileNumber(), blob_file_number);
+  ASSERT_EQ(blob_file_addition.GetTotalBlobCount(), number_of_blobs);
+  ASSERT_EQ(
+      blob_file_addition.GetTotalBlobBytes(),
+      number_of_blobs * (BlobLogRecord::kHeaderSize + key_size + value_size));
+
+  // Verify the contents of the new blob file as well as the blob references
+  VerifyBlobFile(blob_file_number, blob_file_path, column_family_id,
+                 kNoCompression, expected_key_value_pairs, blob_indexes);
+}
+
+TEST_F(BlobFileBuilderTest, BuildAndCheckMultipleFiles) {
+  // Build multiple blob files: file size limit is set to the size of a single
+  // value, so each blob ends up in a file of its own
+  constexpr size_t number_of_blobs = 10;
+  constexpr size_t key_size = 1;
+  constexpr size_t value_size = 10;
+  constexpr size_t value_offset = 1234567890;
+
+  Options options;
+  options.cf_paths.emplace_back(
+      test::PerThreadDBPath(mock_env_.get(),
+                            "BlobFileBuilderTest_BuildAndCheckMultipleFiles"),
+      0);
+  options.enable_blob_files = true;
+  // One value per file: a file reaches blob_file_size after a single blob.
+  options.blob_file_size = value_size;
+  options.env = mock_env_.get();
+
+  ImmutableOptions immutable_options(options);
+  MutableCFOptions mutable_cf_options(options);
+
+  constexpr int job_id = 1;
+  constexpr uint32_t column_family_id = 123;
+  constexpr char column_family_name[] = "foobar";
+  constexpr Env::IOPriority io_priority = Env::IO_HIGH;
+  constexpr Env::WriteLifeTimeHint write_hint = Env::WLTH_MEDIUM;
+
+  std::vector<std::string> blob_file_paths;
+  std::vector<BlobFileAddition> blob_file_additions;
+
+  BlobFileBuilder builder(
+      TestFileNumberGenerator(), fs_, &immutable_options, &mutable_cf_options,
+      &file_options_, "" /*db_id*/, "" /*db_session_id*/, job_id,
+      column_family_id, column_family_name, io_priority, write_hint,
+      nullptr /*IOTracer*/, nullptr /*BlobFileCompletionCallback*/,
+      BlobFileCreationReason::kFlush, &blob_file_paths, &blob_file_additions);
+
+  std::vector<std::pair<std::string, std::string>> expected_key_value_pairs(
+      number_of_blobs);
+  std::vector<std::string> blob_indexes(number_of_blobs);
+
+  for (size_t i = 0; i < number_of_blobs; ++i) {
+    auto& expected_key_value = expected_key_value_pairs[i];
+
+    auto& key = expected_key_value.first;
+    key = std::to_string(i);
+    assert(key.size() == key_size);
+
+    auto& value = expected_key_value.second;
+    value = std::to_string(i + value_offset);
+    assert(value.size() == value_size);
+
+    auto& blob_index = blob_indexes[i];
+
+    ASSERT_OK(builder.Add(key, value, &blob_index));
+    ASSERT_FALSE(blob_index.empty());
+  }
+
+  ASSERT_OK(builder.Finish());
+
+  // Check the metadata generated
+  ASSERT_EQ(blob_file_paths.size(), number_of_blobs);
+  ASSERT_EQ(blob_file_additions.size(), number_of_blobs);
+
+  for (size_t i = 0; i < number_of_blobs; ++i) {
+    // File numbers start at 2 (see TestFileNumberGenerator).
+    const uint64_t blob_file_number = i + 2;
+
+    ASSERT_EQ(blob_file_paths[i],
+              BlobFileName(immutable_options.cf_paths.front().path,
+                           blob_file_number));
+
+    const auto& blob_file_addition = blob_file_additions[i];
+
+    ASSERT_EQ(blob_file_addition.GetBlobFileNumber(), blob_file_number);
+    ASSERT_EQ(blob_file_addition.GetTotalBlobCount(), 1);
+    ASSERT_EQ(blob_file_addition.GetTotalBlobBytes(),
+              BlobLogRecord::kHeaderSize + key_size + value_size);
+  }
+
+  // Verify the contents of the new blob files as well as the blob references
+  for (size_t i = 0; i < number_of_blobs; ++i) {
+    std::vector<std::pair<std::string, std::string>> expected_key_value_pair{
+        expected_key_value_pairs[i]};
+    std::vector<std::string> blob_index{blob_indexes[i]};
+
+    VerifyBlobFile(i + 2, blob_file_paths[i], column_family_id, kNoCompression,
+                   expected_key_value_pair, blob_index);
+  }
+}
+
+TEST_F(BlobFileBuilderTest, InlinedValues) {
+  // Every value is shorter than min_blob_size, so the builder should inline
+  // all of them; no blob file and no metadata may be produced.
+  constexpr size_t number_of_blobs = 10;
+  constexpr size_t key_size = 1;
+  constexpr size_t value_size = 10;
+  constexpr size_t value_offset = 1234567890;
+
+  Options options;
+  options.cf_paths.emplace_back(
+      test::PerThreadDBPath(mock_env_.get(),
+                            "BlobFileBuilderTest_InlinedValues"),
+      0);
+  options.enable_blob_files = true;
+  options.min_blob_size = 1024;
+  options.env = mock_env_.get();
+
+  ImmutableOptions immutable_options(options);
+  MutableCFOptions mutable_cf_options(options);
+
+  constexpr int job_id = 1;
+  constexpr uint32_t column_family_id = 123;
+  constexpr char column_family_name[] = "foobar";
+  constexpr Env::IOPriority io_priority = Env::IO_HIGH;
+  constexpr Env::WriteLifeTimeHint write_hint = Env::WLTH_MEDIUM;
+
+  std::vector<std::string> blob_file_paths;
+  std::vector<BlobFileAddition> blob_file_additions;
+
+  BlobFileBuilder builder(
+      TestFileNumberGenerator(), fs_, &immutable_options, &mutable_cf_options,
+      &file_options_, "" /*db_id*/, "" /*db_session_id*/, job_id,
+      column_family_id, column_family_name, io_priority, write_hint,
+      nullptr /*IOTracer*/, nullptr /*BlobFileCompletionCallback*/,
+      BlobFileCreationReason::kFlush, &blob_file_paths, &blob_file_additions);
+
+  for (size_t blob = 0; blob < number_of_blobs; ++blob) {
+    const std::string key = std::to_string(blob);
+    assert(key.size() == key_size);
+
+    const std::string value = std::to_string(blob + value_offset);
+    assert(value.size() == value_size);
+
+    std::string blob_index;
+    ASSERT_OK(builder.Add(key, value, &blob_index));
+
+    // An inlined value must not produce a blob reference.
+    ASSERT_TRUE(blob_index.empty());
+  }
+
+  ASSERT_OK(builder.Finish());
+
+  // Nothing was written out, so no paths or file metadata should exist.
+  ASSERT_TRUE(blob_file_paths.empty());
+  ASSERT_TRUE(blob_file_additions.empty());
+}
+
+TEST_F(BlobFileBuilderTest, Compression) {
+  // Build a blob file with a compressed blob
+  if (!Snappy_Supported()) {
+    return;
+  }
+
+  constexpr size_t key_size = 1;
+  constexpr size_t value_size = 100;
+
+  Options options;
+  options.cf_paths.emplace_back(
+      test::PerThreadDBPath(mock_env_.get(), "BlobFileBuilderTest_Compression"),
+      0);
+  options.enable_blob_files = true;
+  options.blob_compression_type = kSnappyCompression;
+  options.env = mock_env_.get();
+
+  ImmutableOptions immutable_options(options);
+  MutableCFOptions mutable_cf_options(options);
+
+  constexpr int job_id = 1;
+  constexpr uint32_t column_family_id = 123;
+  constexpr char column_family_name[] = "foobar";
+  constexpr Env::IOPriority io_priority = Env::IO_HIGH;
+  constexpr Env::WriteLifeTimeHint write_hint = Env::WLTH_MEDIUM;
+
+  std::vector<std::string> blob_file_paths;
+  std::vector<BlobFileAddition> blob_file_additions;
+
+  BlobFileBuilder builder(
+      TestFileNumberGenerator(), fs_, &immutable_options, &mutable_cf_options,
+      &file_options_, "" /*db_id*/, "" /*db_session_id*/, job_id,
+      column_family_id, column_family_name, io_priority, write_hint,
+      nullptr /*IOTracer*/, nullptr /*BlobFileCompletionCallback*/,
+      BlobFileCreationReason::kFlush, &blob_file_paths, &blob_file_additions);
+
+  // A highly compressible value (100 x 'x') so Snappy actually shrinks it.
+  const std::string key("1");
+  const std::string uncompressed_value(value_size, 'x');
+
+  std::string blob_index;
+
+  ASSERT_OK(builder.Add(key, uncompressed_value, &blob_index));
+  ASSERT_FALSE(blob_index.empty());
+
+  ASSERT_OK(builder.Finish());
+
+  // Check the metadata generated
+  constexpr uint64_t blob_file_number = 2;
+
+  ASSERT_EQ(blob_file_paths.size(), 1);
+
+  const std::string& blob_file_path = blob_file_paths[0];
+
+  ASSERT_EQ(
+      blob_file_path,
+      BlobFileName(immutable_options.cf_paths.front().path, blob_file_number));
+
+  ASSERT_EQ(blob_file_additions.size(), 1);
+
+  const auto& blob_file_addition = blob_file_additions[0];
+
+  ASSERT_EQ(blob_file_addition.GetBlobFileNumber(), blob_file_number);
+  ASSERT_EQ(blob_file_addition.GetTotalBlobCount(), 1);
+
+  // Re-compress the value independently so we can compute the exact
+  // on-disk size the builder should have recorded.
+  CompressionOptions opts;
+  CompressionContext context(kSnappyCompression);
+  constexpr uint64_t sample_for_compression = 0;
+
+  CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(),
+                       kSnappyCompression, sample_for_compression);
+
+  std::string compressed_value;
+  ASSERT_TRUE(Snappy_Compress(info, uncompressed_value.data(),
+                              uncompressed_value.size(), &compressed_value));
+
+  ASSERT_EQ(blob_file_addition.GetTotalBlobBytes(),
+            BlobLogRecord::kHeaderSize + key_size + compressed_value.size());
+
+  // Verify the contents of the new blob file as well as the blob reference
+  std::vector<std::pair<std::string, std::string>> expected_key_value_pairs{
+      {key, compressed_value}};
+  std::vector<std::string> blob_indexes{blob_index};
+
+  VerifyBlobFile(blob_file_number, blob_file_path, column_family_id,
+                 kSnappyCompression, expected_key_value_pairs, blob_indexes);
+}
+
+TEST_F(BlobFileBuilderTest, CompressionError) {
+  // Simulate an error during compression
+  if (!Snappy_Supported()) {
+    return;
+  }
+
+  Options options;
+  options.cf_paths.emplace_back(
+      test::PerThreadDBPath(mock_env_.get(),
+                            "BlobFileBuilderTest_CompressionError"),
+      0);
+  options.enable_blob_files = true;
+  options.blob_compression_type = kSnappyCompression;
+  options.env = mock_env_.get();
+  ImmutableOptions immutable_options(options);
+  MutableCFOptions mutable_cf_options(options);
+
+  constexpr int job_id = 1;
+  constexpr uint32_t column_family_id = 123;
+  constexpr char column_family_name[] = "foobar";
+  constexpr Env::IOPriority io_priority = Env::IO_HIGH;
+  constexpr Env::WriteLifeTimeHint write_hint = Env::WLTH_MEDIUM;
+
+  std::vector<std::string> blob_file_paths;
+  std::vector<BlobFileAddition> blob_file_additions;
+
+  BlobFileBuilder builder(
+      TestFileNumberGenerator(), fs_, &immutable_options, &mutable_cf_options,
+      &file_options_, "" /*db_id*/, "" /*db_session_id*/, job_id,
+      column_family_id, column_family_name, io_priority, write_hint,
+      nullptr /*IOTracer*/, nullptr /*BlobFileCompletionCallback*/,
+      BlobFileCreationReason::kFlush, &blob_file_paths, &blob_file_additions);
+
+  // Force CompressData to report failure via the sync point, which the
+  // builder is expected to surface as a Corruption status.
+  SyncPoint::GetInstance()->SetCallBack("CompressData:TamperWithReturnValue",
+                                        [](void* arg) {
+                                          bool* ret = static_cast<bool*>(arg);
+                                          *ret = false;
+                                        });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  constexpr char key[] = "1";
+  constexpr char value[] = "deadbeef";
+
+  std::string blob_index;
+
+  ASSERT_TRUE(builder.Add(key, value, &blob_index).IsCorruption());
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  constexpr uint64_t blob_file_number = 2;
+
+  // The file was opened before the failure, so its path is recorded, but no
+  // BlobFileAddition metadata may be emitted for the failed file.
+  ASSERT_EQ(blob_file_paths.size(), 1);
+  ASSERT_EQ(
+      blob_file_paths[0],
+      BlobFileName(immutable_options.cf_paths.front().path, blob_file_number));
+
+  ASSERT_TRUE(blob_file_additions.empty());
+}
+
+TEST_F(BlobFileBuilderTest, Checksum) {
+  // Build a blob file with checksum
+
+  // Stub checksum generator that always reports the value "dummy"; it lets
+  // the test verify the plumbing without depending on a real checksum.
+  class DummyFileChecksumGenerator : public FileChecksumGenerator {
+   public:
+    void Update(const char* /* data */, size_t /* n */) override {}
+
+    void Finalize() override {}
+
+    std::string GetChecksum() const override { return std::string("dummy"); }
+
+    const char* Name() const override { return "DummyFileChecksum"; }
+  };
+
+  class DummyFileChecksumGenFactory : public FileChecksumGenFactory {
+   public:
+    std::unique_ptr<FileChecksumGenerator> CreateFileChecksumGenerator(
+        const FileChecksumGenContext& /* context */) override {
+      return std::unique_ptr<FileChecksumGenerator>(
+          new DummyFileChecksumGenerator);
+    }
+
+    const char* Name() const override { return "DummyFileChecksumGenFactory"; }
+  };
+
+  Options options;
+  options.cf_paths.emplace_back(
+      test::PerThreadDBPath(mock_env_.get(), "BlobFileBuilderTest_Checksum"),
+      0);
+  options.enable_blob_files = true;
+  options.file_checksum_gen_factory =
+      std::make_shared<DummyFileChecksumGenFactory>();
+  options.env = mock_env_.get();
+
+  ImmutableOptions immutable_options(options);
+  MutableCFOptions mutable_cf_options(options);
+
+  constexpr int job_id = 1;
+  constexpr uint32_t column_family_id = 123;
+  constexpr char column_family_name[] = "foobar";
+  constexpr Env::IOPriority io_priority = Env::IO_HIGH;
+  constexpr Env::WriteLifeTimeHint write_hint = Env::WLTH_MEDIUM;
+
+  std::vector<std::string> blob_file_paths;
+  std::vector<BlobFileAddition> blob_file_additions;
+
+  BlobFileBuilder builder(
+      TestFileNumberGenerator(), fs_, &immutable_options, &mutable_cf_options,
+      &file_options_, "" /*db_id*/, "" /*db_session_id*/, job_id,
+      column_family_id, column_family_name, io_priority, write_hint,
+      nullptr /*IOTracer*/, nullptr /*BlobFileCompletionCallback*/,
+      BlobFileCreationReason::kFlush, &blob_file_paths, &blob_file_additions);
+
+  const std::string key("1");
+  const std::string value("deadbeef");
+
+  std::string blob_index;
+
+  ASSERT_OK(builder.Add(key, value, &blob_index));
+  ASSERT_FALSE(blob_index.empty());
+
+  ASSERT_OK(builder.Finish());
+
+  // Check the metadata generated
+  constexpr uint64_t blob_file_number = 2;
+
+  ASSERT_EQ(blob_file_paths.size(), 1);
+
+  const std::string& blob_file_path = blob_file_paths[0];
+
+  ASSERT_EQ(
+      blob_file_path,
+      BlobFileName(immutable_options.cf_paths.front().path, blob_file_number));
+
+  ASSERT_EQ(blob_file_additions.size(), 1);
+
+  const auto& blob_file_addition = blob_file_additions[0];
+
+  ASSERT_EQ(blob_file_addition.GetBlobFileNumber(), blob_file_number);
+  ASSERT_EQ(blob_file_addition.GetTotalBlobCount(), 1);
+  ASSERT_EQ(blob_file_addition.GetTotalBlobBytes(),
+            BlobLogRecord::kHeaderSize + key.size() + value.size());
+  // The stub generator's name/value must have been propagated into the
+  // blob file metadata.
+  ASSERT_EQ(blob_file_addition.GetChecksumMethod(), "DummyFileChecksum");
+  ASSERT_EQ(blob_file_addition.GetChecksumValue(), "dummy");
+
+  // Verify the contents of the new blob file as well as the blob reference
+  std::vector<std::pair<std::string, std::string>> expected_key_value_pairs{
+      {key, value}};
+  std::vector<std::string> blob_indexes{blob_index};
+
+  VerifyBlobFile(blob_file_number, blob_file_path, column_family_id,
+                 kNoCompression, expected_key_value_pairs, blob_indexes);
+}
+
+class BlobFileBuilderIOErrorTest
+ : public testing::Test,
+ public testing::WithParamInterface<std::string> {
+ protected:
+ BlobFileBuilderIOErrorTest() : sync_point_(GetParam()) {
+ mock_env_.reset(MockEnv::Create(Env::Default()));
+ fs_ = mock_env_->GetFileSystem().get();
+ }
+
+ std::unique_ptr<Env> mock_env_;
+ FileSystem* fs_;
+ FileOptions file_options_;
+ std::string sync_point_;
+};
+
+// Run the I/O error test once per failure injection point exercised by
+// BlobFileBuilder (file creation, header write, record write, footer write).
+INSTANTIATE_TEST_CASE_P(
+    BlobFileBuilderTest, BlobFileBuilderIOErrorTest,
+    ::testing::ValuesIn(std::vector<std::string>{
+        "BlobFileBuilder::OpenBlobFileIfNeeded:NewWritableFile",
+        "BlobFileBuilder::OpenBlobFileIfNeeded:WriteHeader",
+        "BlobFileBuilder::WriteBlobToFile:AddRecord",
+        "BlobFileBuilder::WriteBlobToFile:AppendFooter"}));
+
+TEST_P(BlobFileBuilderIOErrorTest, IOError) {
+  // Simulate an I/O error during the specified step of Add()
+  // Note: blob_file_size will be set to value_size in order for the first blob
+  // to trigger close
+  constexpr size_t value_size = 8;
+
+  Options options;
+  options.cf_paths.emplace_back(
+      test::PerThreadDBPath(mock_env_.get(),
+                            "BlobFileBuilderIOErrorTest_IOError"),
+      0);
+  options.enable_blob_files = true;
+  options.blob_file_size = value_size;
+  options.env = mock_env_.get();
+
+  ImmutableOptions immutable_options(options);
+  MutableCFOptions mutable_cf_options(options);
+
+  constexpr int job_id = 1;
+  constexpr uint32_t column_family_id = 123;
+  constexpr char column_family_name[] = "foobar";
+  constexpr Env::IOPriority io_priority = Env::IO_HIGH;
+  constexpr Env::WriteLifeTimeHint write_hint = Env::WLTH_MEDIUM;
+
+  std::vector<std::string> blob_file_paths;
+  std::vector<BlobFileAddition> blob_file_additions;
+
+  BlobFileBuilder builder(
+      TestFileNumberGenerator(), fs_, &immutable_options, &mutable_cf_options,
+      &file_options_, "" /*db_id*/, "" /*db_session_id*/, job_id,
+      column_family_id, column_family_name, io_priority, write_hint,
+      nullptr /*IOTracer*/, nullptr /*BlobFileCompletionCallback*/,
+      BlobFileCreationReason::kFlush, &blob_file_paths, &blob_file_additions);
+
+  // Inject an IOError at the sync point selected by the test parameter.
+  SyncPoint::GetInstance()->SetCallBack(sync_point_, [this](void* arg) {
+    Status* const s = static_cast<Status*>(arg);
+    assert(s);
+
+    (*s) = Status::IOError(sync_point_);
+  });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  constexpr char key[] = "1";
+  constexpr char value[] = "deadbeef";
+
+  std::string blob_index;
+
+  ASSERT_TRUE(builder.Add(key, value, &blob_index).IsIOError());
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  // If the failure occurred before the file could be created, no path is
+  // recorded; otherwise the (now failed) file's path must be present.
+  if (sync_point_ == "BlobFileBuilder::OpenBlobFileIfNeeded:NewWritableFile") {
+    ASSERT_TRUE(blob_file_paths.empty());
+  } else {
+    constexpr uint64_t blob_file_number = 2;
+
+    ASSERT_EQ(blob_file_paths.size(), 1);
+    ASSERT_EQ(blob_file_paths[0],
+              BlobFileName(immutable_options.cf_paths.front().path,
+                           blob_file_number));
+  }
+
+  // No metadata may be generated for a failed blob file in any case.
+  ASSERT_TRUE(blob_file_additions.empty());
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+// Standard RocksDB unit test driver: install the stack trace handler, then
+// hand control to Google Test.
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/blob/blob_file_cache.cc b/src/rocksdb/db/blob/blob_file_cache.cc
new file mode 100644
index 000000000..1a6cdf688
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_file_cache.cc
@@ -0,0 +1,102 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_file_cache.h"
+
+#include <cassert>
+#include <memory>
+
+#include "db/blob/blob_file_reader.h"
+#include "options/cf_options.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/slice.h"
+#include "test_util/sync_point.h"
+#include "trace_replay/io_tracer.h"
+#include "util/hash.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Constructs the cache of open blob file readers. cache, immutable_options,
+// and file_options must be non-null (asserted below); the mutex stripes are
+// keyed by hashing the blob file number slice.
+BlobFileCache::BlobFileCache(Cache* cache,
+                             const ImmutableOptions* immutable_options,
+                             const FileOptions* file_options,
+                             uint32_t column_family_id,
+                             HistogramImpl* blob_file_read_hist,
+                             const std::shared_ptr<IOTracer>& io_tracer)
+    : cache_(cache),
+      mutex_(kNumberOfMutexStripes, kGetSliceNPHash64UnseededFnPtr),
+      immutable_options_(immutable_options),
+      file_options_(file_options),
+      column_family_id_(column_family_id),
+      blob_file_read_hist_(blob_file_read_hist),
+      io_tracer_(io_tracer) {
+  assert(cache_);
+  assert(immutable_options_);
+  assert(file_options_);
+}
+
+// Returns (via blob_file_reader) a cache-managed reader for the given blob
+// file, opening the file and populating the cache on a miss. A per-stripe
+// mutex plus a second lookup ensure each file is opened at most once even
+// when multiple threads miss concurrently.
+Status BlobFileCache::GetBlobFileReader(
+    uint64_t blob_file_number,
+    CacheHandleGuard<BlobFileReader>* blob_file_reader) {
+  assert(blob_file_reader);
+  assert(blob_file_reader->IsEmpty());
+
+  // The cache key is the raw bytes of the file number.
+  const Slice key = GetSlice(&blob_file_number);
+
+  assert(cache_);
+
+  // Fast path: reader already cached.
+  Cache::Handle* handle = cache_->Lookup(key);
+  if (handle) {
+    *blob_file_reader = CacheHandleGuard<BlobFileReader>(cache_, handle);
+    return Status::OK();
+  }
+
+  TEST_SYNC_POINT("BlobFileCache::GetBlobFileReader:DoubleCheck");
+
+  // Check again while holding mutex
+  MutexLock lock(mutex_.get(key));
+
+  handle = cache_->Lookup(key);
+  if (handle) {
+    *blob_file_reader = CacheHandleGuard<BlobFileReader>(cache_, handle);
+    return Status::OK();
+  }
+
+  assert(immutable_options_);
+  Statistics* const statistics = immutable_options_->stats;
+
+  RecordTick(statistics, NO_FILE_OPENS);
+
+  std::unique_ptr<BlobFileReader> reader;
+
+  {
+    assert(file_options_);
+    const Status s = BlobFileReader::Create(
+        *immutable_options_, *file_options_, column_family_id_,
+        blob_file_read_hist_, blob_file_number, io_tracer_, &reader);
+    if (!s.ok()) {
+      RecordTick(statistics, NO_FILE_ERRORS);
+      return s;
+    }
+  }
+
+  {
+    // Each open reader counts as one unit of cache charge.
+    constexpr size_t charge = 1;
+
+    const Status s = cache_->Insert(key, reader.get(), charge,
+                                    &DeleteCacheEntry<BlobFileReader>, &handle);
+    if (!s.ok()) {
+      RecordTick(statistics, NO_FILE_ERRORS);
+      return s;
+    }
+  }
+
+  // Ownership has been transferred to the cache; its deleter will free the
+  // reader when the entry is evicted.
+  reader.release();
+
+  *blob_file_reader = CacheHandleGuard<BlobFileReader>(cache_, handle);
+
+  return Status::OK();
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_file_cache.h b/src/rocksdb/db/blob/blob_file_cache.h
new file mode 100644
index 000000000..8eec05f18
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_file_cache.h
@@ -0,0 +1,52 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cinttypes>
+
+#include "cache/cache_helpers.h"
+#include "rocksdb/rocksdb_namespace.h"
+#include "util/mutexlock.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Cache;
+struct ImmutableOptions;
+struct FileOptions;
+class HistogramImpl;
+class Status;
+class BlobFileReader;
+class Slice;
+class IOTracer;
+
+// Caches open BlobFileReader instances keyed by blob file number, so repeated
+// reads of the same blob file share a single open reader.
+class BlobFileCache {
+ public:
+  BlobFileCache(Cache* cache, const ImmutableOptions* immutable_options,
+                const FileOptions* file_options, uint32_t column_family_id,
+                HistogramImpl* blob_file_read_hist,
+                const std::shared_ptr<IOTracer>& io_tracer);
+
+  BlobFileCache(const BlobFileCache&) = delete;
+  BlobFileCache& operator=(const BlobFileCache&) = delete;
+
+  // Looks up (or opens and caches) the reader for blob_file_number;
+  // blob_file_reader must be a non-null, empty guard on entry.
+  Status GetBlobFileReader(uint64_t blob_file_number,
+                           CacheHandleGuard<BlobFileReader>* blob_file_reader);
+
+ private:
+  Cache* cache_;
+  // Note: mutex_ below is used to guard against multiple threads racing to open
+  // the same file.
+  Striped<port::Mutex, Slice> mutex_;
+  const ImmutableOptions* immutable_options_;
+  const FileOptions* file_options_;
+  uint32_t column_family_id_;
+  HistogramImpl* blob_file_read_hist_;
+  std::shared_ptr<IOTracer> io_tracer_;
+
+  // Number of mutex stripes (128); keys are hashed onto a stripe.
+  static constexpr size_t kNumberOfMutexStripes = 1 << 7;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_file_cache_test.cc b/src/rocksdb/db/blob/blob_file_cache_test.cc
new file mode 100644
index 000000000..d3a61b3c5
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_file_cache_test.cc
@@ -0,0 +1,269 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_file_cache.h"
+
+#include <cassert>
+#include <string>
+
+#include "db/blob/blob_log_format.h"
+#include "db/blob/blob_log_writer.h"
+#include "env/mock_env.h"
+#include "file/filename.h"
+#include "file/read_write_util.h"
+#include "file/writable_file_writer.h"
+#include "options/cf_options.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/options.h"
+#include "rocksdb/statistics.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+// Creates a test blob file with a single blob in it.
+// The file is written under immutable_options.cf_paths.front() with the
+// standard blob file name for blob_file_number, and carries a valid header,
+// one key/blob record, and a footer (blob_count == 1).
+void WriteBlobFile(uint32_t column_family_id,
+ const ImmutableOptions& immutable_options,
+ uint64_t blob_file_number) {
+ assert(!immutable_options.cf_paths.empty());
+
+ const std::string blob_file_path =
+ BlobFileName(immutable_options.cf_paths.front().path, blob_file_number);
+
+ std::unique_ptr<FSWritableFile> file;
+ ASSERT_OK(NewWritableFile(immutable_options.fs.get(), blob_file_path, &file,
+ FileOptions()));
+
+ std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
+ std::move(file), blob_file_path, FileOptions(), immutable_options.clock));
+
+ constexpr Statistics* statistics = nullptr;
+ constexpr bool use_fsync = false;
+ constexpr bool do_flush = false;
+
+ BlobLogWriter blob_log_writer(std::move(file_writer), immutable_options.clock,
+ statistics, blob_file_number, use_fsync,
+ do_flush);
+
+ constexpr bool has_ttl = false;
+ constexpr ExpirationRange expiration_range;
+
+ // Header records the owning column family and kNoCompression, so readers
+ // opened with a different column family id would reject the file.
+ BlobLogHeader header(column_family_id, kNoCompression, has_ttl,
+ expiration_range);
+
+ ASSERT_OK(blob_log_writer.WriteHeader(header));
+
+ constexpr char key[] = "key";
+ constexpr char blob[] = "blob";
+
+ // NOTE(review): compressed_blob is never used in this helper (the record is
+ // written uncompressed); candidate for removal upstream.
+ std::string compressed_blob;
+
+ uint64_t key_offset = 0;
+ uint64_t blob_offset = 0;
+
+ ASSERT_OK(blob_log_writer.AddRecord(key, blob, &key_offset, &blob_offset));
+
+ BlobLogFooter footer;
+ footer.blob_count = 1;
+ footer.expiration_range = expiration_range;
+
+ std::string checksum_method;
+ std::string checksum_value;
+
+ // AppendFooter finalizes the file and returns the file checksum
+ // (unused by these tests).
+ ASSERT_OK(
+ blob_log_writer.AppendFooter(footer, &checksum_method, &checksum_value));
+}
+
+} // anonymous namespace
+
+// Test fixture: provides a MockEnv-backed Env so blob files are written to
+// an in-memory filesystem rather than the real one.
+class BlobFileCacheTest : public testing::Test {
+ protected:
+ BlobFileCacheTest() { mock_env_.reset(MockEnv::Create(Env::Default())); }
+
+ std::unique_ptr<Env> mock_env_;
+};
+
+// Happy path: the first lookup opens the file (NO_FILE_OPENS == 1); the
+// second lookup is served from the cache (counter unchanged) and both guards
+// point at the same reader instance.
+TEST_F(BlobFileCacheTest, GetBlobFileReader) {
+ Options options;
+ options.env = mock_env_.get();
+ options.statistics = CreateDBStatistics();
+ options.cf_paths.emplace_back(
+ test::PerThreadDBPath(mock_env_.get(),
+ "BlobFileCacheTest_GetBlobFileReader"),
+ 0);
+ options.enable_blob_files = true;
+
+ constexpr uint32_t column_family_id = 1;
+ ImmutableOptions immutable_options(options);
+ constexpr uint64_t blob_file_number = 123;
+
+ WriteBlobFile(column_family_id, immutable_options, blob_file_number);
+
+ constexpr size_t capacity = 10;
+ std::shared_ptr<Cache> backing_cache = NewLRUCache(capacity);
+
+ FileOptions file_options;
+ constexpr HistogramImpl* blob_file_read_hist = nullptr;
+
+ BlobFileCache blob_file_cache(backing_cache.get(), &immutable_options,
+ &file_options, column_family_id,
+ blob_file_read_hist, nullptr /*IOTracer*/);
+
+ // First try: reader should be opened and put in cache
+ CacheHandleGuard<BlobFileReader> first;
+
+ ASSERT_OK(blob_file_cache.GetBlobFileReader(blob_file_number, &first));
+ ASSERT_NE(first.GetValue(), nullptr);
+ ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1);
+ ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 0);
+
+ // Second try: reader should be served from cache
+ CacheHandleGuard<BlobFileReader> second;
+
+ ASSERT_OK(blob_file_cache.GetBlobFileReader(blob_file_number, &second));
+ ASSERT_NE(second.GetValue(), nullptr);
+ ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1);
+ ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 0);
+
+ ASSERT_EQ(first.GetValue(), second.GetValue());
+}
+
+// Race simulation: a sync-point callback fires inside the first
+// GetBlobFileReader call (at the double-check after acquiring the striped
+// mutex) and performs a second lookup re-entrantly. Only one file open may
+// occur and both lookups must yield the same cached reader.
+TEST_F(BlobFileCacheTest, GetBlobFileReader_Race) {
+ Options options;
+ options.env = mock_env_.get();
+ options.statistics = CreateDBStatistics();
+ options.cf_paths.emplace_back(
+ test::PerThreadDBPath(mock_env_.get(),
+ "BlobFileCacheTest_GetBlobFileReader_Race"),
+ 0);
+ options.enable_blob_files = true;
+
+ constexpr uint32_t column_family_id = 1;
+ ImmutableOptions immutable_options(options);
+ constexpr uint64_t blob_file_number = 123;
+
+ WriteBlobFile(column_family_id, immutable_options, blob_file_number);
+
+ constexpr size_t capacity = 10;
+ std::shared_ptr<Cache> backing_cache = NewLRUCache(capacity);
+
+ FileOptions file_options;
+ constexpr HistogramImpl* blob_file_read_hist = nullptr;
+
+ BlobFileCache blob_file_cache(backing_cache.get(), &immutable_options,
+ &file_options, column_family_id,
+ blob_file_read_hist, nullptr /*IOTracer*/);
+
+ CacheHandleGuard<BlobFileReader> first;
+ CacheHandleGuard<BlobFileReader> second;
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "BlobFileCache::GetBlobFileReader:DoubleCheck", [&](void* /* arg */) {
+ // Disabling sync points to prevent infinite recursion
+ SyncPoint::GetInstance()->DisableProcessing();
+
+ ASSERT_OK(blob_file_cache.GetBlobFileReader(blob_file_number, &second));
+ ASSERT_NE(second.GetValue(), nullptr);
+ ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1);
+ ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 0);
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(blob_file_cache.GetBlobFileReader(blob_file_number, &first));
+ ASSERT_NE(first.GetValue(), nullptr);
+ ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1);
+ ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 0);
+
+ ASSERT_EQ(first.GetValue(), second.GetValue());
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+// Error path: looking up a blob file that was never written must surface an
+// IOError, leave the out-guard empty, and bump both the open-attempt and
+// file-error tickers.
+TEST_F(BlobFileCacheTest, GetBlobFileReader_IOError) {
+ Options options;
+ options.env = mock_env_.get();
+ options.statistics = CreateDBStatistics();
+ options.cf_paths.emplace_back(
+ test::PerThreadDBPath(mock_env_.get(),
+ "BlobFileCacheTest_GetBlobFileReader_IOError"),
+ 0);
+ options.enable_blob_files = true;
+
+ constexpr size_t capacity = 10;
+ std::shared_ptr<Cache> backing_cache = NewLRUCache(capacity);
+
+ ImmutableOptions immutable_options(options);
+ FileOptions file_options;
+ constexpr uint32_t column_family_id = 1;
+ constexpr HistogramImpl* blob_file_read_hist = nullptr;
+
+ BlobFileCache blob_file_cache(backing_cache.get(), &immutable_options,
+ &file_options, column_family_id,
+ blob_file_read_hist, nullptr /*IOTracer*/);
+
+ // Note: there is no blob file with the below number
+ constexpr uint64_t blob_file_number = 123;
+
+ CacheHandleGuard<BlobFileReader> reader;
+
+ ASSERT_TRUE(
+ blob_file_cache.GetBlobFileReader(blob_file_number, &reader).IsIOError());
+ ASSERT_EQ(reader.GetValue(), nullptr);
+ ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1);
+ ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 1);
+}
+
+// Cache-full path: with a zero-capacity strict-limit LRU cache, the file can
+// be opened but the reader cannot be inserted into the cache; the call must
+// report a memory-limit error and leave the out-guard empty.
+TEST_F(BlobFileCacheTest, GetBlobFileReader_CacheFull) {
+ Options options;
+ options.env = mock_env_.get();
+ options.statistics = CreateDBStatistics();
+ options.cf_paths.emplace_back(
+ test::PerThreadDBPath(mock_env_.get(),
+ "BlobFileCacheTest_GetBlobFileReader_CacheFull"),
+ 0);
+ options.enable_blob_files = true;
+
+ constexpr uint32_t column_family_id = 1;
+ ImmutableOptions immutable_options(options);
+ constexpr uint64_t blob_file_number = 123;
+
+ WriteBlobFile(column_family_id, immutable_options, blob_file_number);
+
+ constexpr size_t capacity = 0;
+ constexpr int num_shard_bits = -1; // determined automatically
+ constexpr bool strict_capacity_limit = true;
+ std::shared_ptr<Cache> backing_cache =
+ NewLRUCache(capacity, num_shard_bits, strict_capacity_limit);
+
+ FileOptions file_options;
+ constexpr HistogramImpl* blob_file_read_hist = nullptr;
+
+ BlobFileCache blob_file_cache(backing_cache.get(), &immutable_options,
+ &file_options, column_family_id,
+ blob_file_read_hist, nullptr /*IOTracer*/);
+
+ // Insert into cache should fail since it has zero capacity and
+ // strict_capacity_limit is set
+ CacheHandleGuard<BlobFileReader> reader;
+
+ ASSERT_TRUE(blob_file_cache.GetBlobFileReader(blob_file_number, &reader)
+ .IsMemoryLimit());
+ ASSERT_EQ(reader.GetValue(), nullptr);
+ ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1);
+ ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 1);
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+// Standard RocksDB test entry point: install the stack-trace handler for
+// better crash reports, then run all registered gtest cases.
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/blob/blob_file_completion_callback.h b/src/rocksdb/db/blob/blob_file_completion_callback.h
new file mode 100644
index 000000000..ffe65a0ff
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_file_completion_callback.h
@@ -0,0 +1,101 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#include "db/error_handler.h"
+#include "db/event_helpers.h"
+#include "file/sst_file_manager_impl.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Callback bundle invoked around blob file creation: notifies EventListeners
+// when creation starts, and on completion reports the new file to the
+// SstFileManager (non-LITE builds), escalates space-limit violations to the
+// ErrorHandler, and logs/notifies the finished-creation event.
+// All constructor pointers are borrowed and must outlive this object.
+class BlobFileCompletionCallback {
+ public:
+ BlobFileCompletionCallback(
+ SstFileManager* sst_file_manager, InstrumentedMutex* mutex,
+ ErrorHandler* error_handler, EventLogger* event_logger,
+ const std::vector<std::shared_ptr<EventListener>>& listeners,
+ const std::string& dbname)
+ : event_logger_(event_logger), listeners_(listeners), dbname_(dbname) {
+#ifndef ROCKSDB_LITE
+ sst_file_manager_ = sst_file_manager;
+ mutex_ = mutex;
+ error_handler_ = error_handler;
+#else
+ // LITE builds have no SstFileManager/ErrorHandler integration; silence
+ // unused-parameter warnings.
+ (void)sst_file_manager;
+ (void)mutex;
+ (void)error_handler;
+#endif // ROCKSDB_LITE
+ }
+
+ // Fires the "blob file creation started" listener notification
+ // (no-op in LITE builds).
+ void OnBlobFileCreationStarted(const std::string& file_name,
+ const std::string& column_family_name,
+ int job_id,
+ BlobFileCreationReason creation_reason) {
+#ifndef ROCKSDB_LITE
+ // Notify the listeners.
+ EventHelpers::NotifyBlobFileCreationStarted(listeners_, dbname_,
+ column_family_name, file_name,
+ job_id, creation_reason);
+#else
+ (void)file_name;
+ (void)column_family_name;
+ (void)job_id;
+ (void)creation_reason;
+#endif
+ }
+
+ // Called once a blob file is fully written. Returns the SstFileManager
+ // status (OK in LITE builds); report_status takes precedence over it in the
+ // listener notification. Empty checksum fields are reported as "unknown".
+ Status OnBlobFileCompleted(const std::string& file_name,
+ const std::string& column_family_name, int job_id,
+ uint64_t file_number,
+ BlobFileCreationReason creation_reason,
+ const Status& report_status,
+ const std::string& checksum_value,
+ const std::string& checksum_method,
+ uint64_t blob_count, uint64_t blob_bytes) {
+ Status s;
+
+#ifndef ROCKSDB_LITE
+ auto sfm = static_cast<SstFileManagerImpl*>(sst_file_manager_);
+ if (sfm) {
+ // Report new blob files to SstFileManagerImpl
+ s = sfm->OnAddFile(file_name);
+ if (sfm->IsMaxAllowedSpaceReached()) {
+ // Space limit exceeded: record a background error so the DB enters
+ // its error state (taken under the DB mutex).
+ s = Status::SpaceLimit("Max allowed space was reached");
+ TEST_SYNC_POINT(
+ "BlobFileCompletionCallback::CallBack::MaxAllowedSpaceReached");
+ InstrumentedMutexLock l(mutex_);
+ error_handler_->SetBGError(s, BackgroundErrorReason::kFlush);
+ }
+ }
+#endif // !ROCKSDB_LITE
+
+ // Notify the listeners.
+ EventHelpers::LogAndNotifyBlobFileCreationFinished(
+ event_logger_, listeners_, dbname_, column_family_name, file_name,
+ job_id, file_number, creation_reason,
+ (!report_status.ok() ? report_status : s),
+ (checksum_value.empty() ? kUnknownFileChecksum : checksum_value),
+ (checksum_method.empty() ? kUnknownFileChecksumFuncName
+ : checksum_method),
+ blob_count, blob_bytes);
+ return s;
+ }
+
+ private:
+#ifndef ROCKSDB_LITE
+ SstFileManager* sst_file_manager_;
+ InstrumentedMutex* mutex_;
+ ErrorHandler* error_handler_;
+#endif // ROCKSDB_LITE
+ EventLogger* event_logger_;
+ std::vector<std::shared_ptr<EventListener>> listeners_;
+ std::string dbname_;
+};
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_file_garbage.cc b/src/rocksdb/db/blob/blob_file_garbage.cc
new file mode 100644
index 000000000..52c336f49
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_file_garbage.cc
@@ -0,0 +1,134 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_file_garbage.h"
+
+#include <ostream>
+#include <sstream>
+
+#include "logging/event_logger.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+#include "test_util/sync_point.h"
+#include "util/coding.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Tags for custom fields. Note that these get persisted in the manifest,
+// so existing tags should not be modified.
+enum BlobFileGarbage::CustomFieldTags : uint32_t {
+ // Terminates the custom-field list in the encoding (value 0).
+ kEndMarker,
+
+ // Add forward compatible fields here
+
+ /////////////////////////////////////////////////////////////////////
+
+ // Tags with this bit set must be understood by the decoder; DecodeFrom
+ // treats unknown tags in this range as corruption rather than skipping them.
+ kForwardIncompatibleMask = 1 << 6,
+
+ // Add forward incompatible fields here
+};
+
+// Appends the manifest encoding of this record to *output: three Varint64
+// fields (file number, garbage count, garbage bytes) followed by the custom
+// field section terminated by kEndMarker.
+void BlobFileGarbage::EncodeTo(std::string* output) const {
+ PutVarint64(output, blob_file_number_);
+ PutVarint64(output, garbage_blob_count_);
+ PutVarint64(output, garbage_blob_bytes_);
+
+ // Encode any custom fields here. The format to use is a Varint32 tag (see
+ // CustomFieldTags above) followed by a length prefixed slice. Unknown custom
+ // fields will be ignored during decoding unless they're in the forward
+ // incompatible range.
+
+ // Test hook: lets tests inject extra custom fields into the output.
+ TEST_SYNC_POINT_CALLBACK("BlobFileGarbage::EncodeTo::CustomFields", output);
+
+ PutVarint32(output, kEndMarker);
+}
+
+// Parses the encoding produced by EncodeTo from *input, advancing the slice.
+// Unknown forward-compatible custom fields are skipped; a tag with the
+// forward-incompatible bit set yields Corruption, as does any truncation.
+Status BlobFileGarbage::DecodeFrom(Slice* input) {
+ constexpr char class_name[] = "BlobFileGarbage";
+
+ if (!GetVarint64(input, &blob_file_number_)) {
+ return Status::Corruption(class_name, "Error decoding blob file number");
+ }
+
+ if (!GetVarint64(input, &garbage_blob_count_)) {
+ return Status::Corruption(class_name, "Error decoding garbage blob count");
+ }
+
+ if (!GetVarint64(input, &garbage_blob_bytes_)) {
+ return Status::Corruption(class_name, "Error decoding garbage blob bytes");
+ }
+
+ // Consume custom fields until the end marker.
+ while (true) {
+ uint32_t custom_field_tag = 0;
+ if (!GetVarint32(input, &custom_field_tag)) {
+ return Status::Corruption(class_name, "Error decoding custom field tag");
+ }
+
+ if (custom_field_tag == kEndMarker) {
+ break;
+ }
+
+ if (custom_field_tag & kForwardIncompatibleMask) {
+ return Status::Corruption(
+ class_name, "Forward incompatible custom field encountered");
+ }
+
+ // Unknown forward-compatible field: read and discard its value.
+ Slice custom_field_value;
+ if (!GetLengthPrefixedSlice(input, &custom_field_value)) {
+ return Status::Corruption(class_name,
+ "Error decoding custom field value");
+ }
+ }
+
+ return Status::OK();
+}
+
+// Human-readable representation; delegates to operator<< below.
+std::string BlobFileGarbage::DebugString() const {
+ std::ostringstream oss;
+
+ oss << *this;
+
+ return oss.str();
+}
+
+// JSON representation; delegates to the JSONWriter operator<< below and
+// closes the object it opens.
+std::string BlobFileGarbage::DebugJSON() const {
+ JSONWriter jw;
+
+ jw << *this;
+
+ jw.EndObject();
+
+ return jw.Get();
+}
+
+// Equality compares all three persisted fields; != is defined in terms of ==.
+bool operator==(const BlobFileGarbage& lhs, const BlobFileGarbage& rhs) {
+ return lhs.GetBlobFileNumber() == rhs.GetBlobFileNumber() &&
+ lhs.GetGarbageBlobCount() == rhs.GetGarbageBlobCount() &&
+ lhs.GetGarbageBlobBytes() == rhs.GetGarbageBlobBytes();
+}
+
+bool operator!=(const BlobFileGarbage& lhs, const BlobFileGarbage& rhs) {
+ return !(lhs == rhs);
+}
+
+// Streams the record as "key: value" pairs for DebugString/log output.
+std::ostream& operator<<(std::ostream& os,
+ const BlobFileGarbage& blob_file_garbage) {
+ os << "blob_file_number: " << blob_file_garbage.GetBlobFileNumber()
+ << " garbage_blob_count: " << blob_file_garbage.GetGarbageBlobCount()
+ << " garbage_blob_bytes: " << blob_file_garbage.GetGarbageBlobBytes();
+
+ return os;
+}
+
+// Emits the record's fields as JSON key/value pairs; the caller is
+// responsible for closing the enclosing object (see DebugJSON).
+JSONWriter& operator<<(JSONWriter& jw,
+ const BlobFileGarbage& blob_file_garbage) {
+ jw << "BlobFileNumber" << blob_file_garbage.GetBlobFileNumber()
+ << "GarbageBlobCount" << blob_file_garbage.GetGarbageBlobCount()
+ << "GarbageBlobBytes" << blob_file_garbage.GetGarbageBlobBytes();
+
+ return jw;
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_file_garbage.h b/src/rocksdb/db/blob/blob_file_garbage.h
new file mode 100644
index 000000000..6dc14ddca
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_file_garbage.h
@@ -0,0 +1,57 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cstdint>
+#include <iosfwd>
+#include <string>
+
+#include "db/blob/blob_constants.h"
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class JSONWriter;
+class Slice;
+class Status;
+
+// Manifest record describing the amount of garbage (obsolete blobs) in a
+// single blob file: the file number plus garbage blob count and byte total.
+// Copyable value type; persisted via EncodeTo/DecodeFrom (see the .cc file).
+class BlobFileGarbage {
+ public:
+ // Default-constructed records carry kInvalidBlobFileNumber and zero garbage.
+ BlobFileGarbage() = default;
+
+ BlobFileGarbage(uint64_t blob_file_number, uint64_t garbage_blob_count,
+ uint64_t garbage_blob_bytes)
+ : blob_file_number_(blob_file_number),
+ garbage_blob_count_(garbage_blob_count),
+ garbage_blob_bytes_(garbage_blob_bytes) {}
+
+ uint64_t GetBlobFileNumber() const { return blob_file_number_; }
+ uint64_t GetGarbageBlobCount() const { return garbage_blob_count_; }
+ uint64_t GetGarbageBlobBytes() const { return garbage_blob_bytes_; }
+
+ // Manifest serialization; DecodeFrom advances *input past the record.
+ void EncodeTo(std::string* output) const;
+ Status DecodeFrom(Slice* input);
+
+ std::string DebugString() const;
+ std::string DebugJSON() const;
+
+ private:
+ // Tag values for optional fields in the encoding; defined in the .cc file.
+ enum CustomFieldTags : uint32_t;
+
+ uint64_t blob_file_number_ = kInvalidBlobFileNumber;
+ uint64_t garbage_blob_count_ = 0;
+ uint64_t garbage_blob_bytes_ = 0;
+};
+
+bool operator==(const BlobFileGarbage& lhs, const BlobFileGarbage& rhs);
+bool operator!=(const BlobFileGarbage& lhs, const BlobFileGarbage& rhs);
+
+std::ostream& operator<<(std::ostream& os,
+ const BlobFileGarbage& blob_file_garbage);
+JSONWriter& operator<<(JSONWriter& jw,
+ const BlobFileGarbage& blob_file_garbage);
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_file_garbage_test.cc b/src/rocksdb/db/blob/blob_file_garbage_test.cc
new file mode 100644
index 000000000..292a8b38a
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_file_garbage_test.cc
@@ -0,0 +1,174 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_file_garbage.h"
+
+#include <cstdint>
+#include <cstring>
+#include <string>
+
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "util/coding.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Fixture providing a round-trip helper: encode a record, decode it into a
+// fresh object, and assert field-for-field equality.
+class BlobFileGarbageTest : public testing::Test {
+ public:
+ static void TestEncodeDecode(const BlobFileGarbage& blob_file_garbage) {
+ std::string encoded;
+ blob_file_garbage.EncodeTo(&encoded);
+
+ BlobFileGarbage decoded;
+ Slice input(encoded);
+ ASSERT_OK(decoded.DecodeFrom(&input));
+
+ ASSERT_EQ(blob_file_garbage, decoded);
+ }
+};
+
+// A default-constructed record has the invalid file number and zero counts,
+// and still round-trips through encode/decode.
+TEST_F(BlobFileGarbageTest, Empty) {
+ BlobFileGarbage blob_file_garbage;
+
+ ASSERT_EQ(blob_file_garbage.GetBlobFileNumber(), kInvalidBlobFileNumber);
+ ASSERT_EQ(blob_file_garbage.GetGarbageBlobCount(), 0);
+ ASSERT_EQ(blob_file_garbage.GetGarbageBlobBytes(), 0);
+
+ TestEncodeDecode(blob_file_garbage);
+}
+
+// Constructor stores the three fields verbatim and the record round-trips.
+TEST_F(BlobFileGarbageTest, NonEmpty) {
+ constexpr uint64_t blob_file_number = 123;
+ constexpr uint64_t garbage_blob_count = 1;
+ constexpr uint64_t garbage_blob_bytes = 9876;
+
+ BlobFileGarbage blob_file_garbage(blob_file_number, garbage_blob_count,
+ garbage_blob_bytes);
+
+ ASSERT_EQ(blob_file_garbage.GetBlobFileNumber(), blob_file_number);
+ ASSERT_EQ(blob_file_garbage.GetGarbageBlobCount(), garbage_blob_count);
+ ASSERT_EQ(blob_file_garbage.GetGarbageBlobBytes(), garbage_blob_bytes);
+
+ TestEncodeDecode(blob_file_garbage);
+}
+
+// Truncation coverage: grows the encoded buffer one field at a time and
+// checks that DecodeFrom reports Corruption naming the first missing field.
+TEST_F(BlobFileGarbageTest, DecodeErrors) {
+ std::string str;
+ Slice slice(str);
+
+ BlobFileGarbage blob_file_garbage;
+
+ // Empty input: fails on the blob file number.
+ {
+ const Status s = blob_file_garbage.DecodeFrom(&slice);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(std::strstr(s.getState(), "blob file number"));
+ }
+
+ constexpr uint64_t blob_file_number = 123;
+ PutVarint64(&str, blob_file_number);
+ slice = str;
+
+ {
+ const Status s = blob_file_garbage.DecodeFrom(&slice);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(std::strstr(s.getState(), "garbage blob count"));
+ }
+
+ constexpr uint64_t garbage_blob_count = 4567;
+ PutVarint64(&str, garbage_blob_count);
+ slice = str;
+
+ {
+ const Status s = blob_file_garbage.DecodeFrom(&slice);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(std::strstr(s.getState(), "garbage blob bytes"));
+ }
+
+ constexpr uint64_t garbage_blob_bytes = 12345678;
+ PutVarint64(&str, garbage_blob_bytes);
+ slice = str;
+
+ // All three fixed fields present but no end marker: fails on the tag.
+ {
+ const Status s = blob_file_garbage.DecodeFrom(&slice);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(std::strstr(s.getState(), "custom field tag"));
+ }
+
+ // A custom tag with no length-prefixed value: fails on the value.
+ constexpr uint32_t custom_tag = 2;
+ PutVarint32(&str, custom_tag);
+ slice = str;
+
+ {
+ const Status s = blob_file_garbage.DecodeFrom(&slice);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(std::strstr(s.getState(), "custom field value"));
+ }
+}
+
+// An unknown custom field whose tag is below the forward-incompatible mask
+// (injected via the EncodeTo sync point) must be skipped by the decoder, so
+// the record still round-trips.
+TEST_F(BlobFileGarbageTest, ForwardCompatibleCustomField) {
+ SyncPoint::GetInstance()->SetCallBack(
+ "BlobFileGarbage::EncodeTo::CustomFields", [&](void* arg) {
+ std::string* output = static_cast<std::string*>(arg);
+
+ constexpr uint32_t forward_compatible_tag = 2;
+ PutVarint32(output, forward_compatible_tag);
+
+ PutLengthPrefixedSlice(output, "deadbeef");
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ constexpr uint64_t blob_file_number = 678;
+ constexpr uint64_t garbage_blob_count = 9999;
+ constexpr uint64_t garbage_blob_bytes = 100000000;
+
+ BlobFileGarbage blob_file_garbage(blob_file_number, garbage_blob_count,
+ garbage_blob_bytes);
+
+ TestEncodeDecode(blob_file_garbage);
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+// An injected custom field whose tag has the forward-incompatible bit set
+// ((1 << 6) + 1) must make DecodeFrom fail with Corruption.
+TEST_F(BlobFileGarbageTest, ForwardIncompatibleCustomField) {
+ SyncPoint::GetInstance()->SetCallBack(
+ "BlobFileGarbage::EncodeTo::CustomFields", [&](void* arg) {
+ std::string* output = static_cast<std::string*>(arg);
+
+ constexpr uint32_t forward_incompatible_tag = (1 << 6) + 1;
+ PutVarint32(output, forward_incompatible_tag);
+
+ PutLengthPrefixedSlice(output, "foobar");
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ constexpr uint64_t blob_file_number = 456;
+ constexpr uint64_t garbage_blob_count = 100;
+ constexpr uint64_t garbage_blob_bytes = 2000000;
+
+ BlobFileGarbage blob_file_garbage(blob_file_number, garbage_blob_count,
+ garbage_blob_bytes);
+
+ std::string encoded;
+ blob_file_garbage.EncodeTo(&encoded);
+
+ BlobFileGarbage decoded_blob_file_addition;
+ Slice input(encoded);
+ const Status s = decoded_blob_file_addition.DecodeFrom(&input);
+
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(std::strstr(s.getState(), "Forward incompatible"));
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+// Standard RocksDB test entry point: install the stack-trace handler for
+// better crash reports, then run all registered gtest cases.
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/blob/blob_file_meta.cc b/src/rocksdb/db/blob/blob_file_meta.cc
new file mode 100644
index 000000000..4913137e5
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_file_meta.cc
@@ -0,0 +1,62 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_file_meta.h"
+
+#include <ostream>
+#include <sstream>
+
+#include "db/blob/blob_log_format.h"
+#include "rocksdb/slice.h"
+
+namespace ROCKSDB_NAMESPACE {
+// Total on-disk size of the blob file: fixed-size log header and footer plus
+// the payload bytes tracked in total_blob_bytes_.
+uint64_t SharedBlobFileMetaData::GetBlobFileSize() const {
+ return BlobLogHeader::kSize + total_blob_bytes_ + BlobLogFooter::kSize;
+}
+
+// Human-readable representation; delegates to operator<< below.
+std::string SharedBlobFileMetaData::DebugString() const {
+ std::ostringstream oss;
+ oss << (*this);
+
+ return oss.str();
+}
+
+// Streams the immutable metadata fields; the checksum value is printed as hex
+// since it is raw bytes.
+std::ostream& operator<<(std::ostream& os,
+ const SharedBlobFileMetaData& shared_meta) {
+ os << "blob_file_number: " << shared_meta.GetBlobFileNumber()
+ << " total_blob_count: " << shared_meta.GetTotalBlobCount()
+ << " total_blob_bytes: " << shared_meta.GetTotalBlobBytes()
+ << " checksum_method: " << shared_meta.GetChecksumMethod()
+ << " checksum_value: "
+ << Slice(shared_meta.GetChecksumValue()).ToString(/* hex */ true);
+
+ return os;
+}
+
+// Human-readable representation; delegates to operator<< below.
+std::string BlobFileMetaData::DebugString() const {
+ std::ostringstream oss;
+ oss << (*this);
+
+ return oss.str();
+}
+
+// Streams the shared (immutable) metadata followed by the version-specific
+// parts: the linked SST set and the garbage counters.
+std::ostream& operator<<(std::ostream& os, const BlobFileMetaData& meta) {
+ const auto& shared_meta = meta.GetSharedMeta();
+ assert(shared_meta);
+ os << (*shared_meta);
+
+ os << " linked_ssts: {";
+ for (uint64_t file_number : meta.GetLinkedSsts()) {
+ os << ' ' << file_number;
+ }
+ os << " }";
+
+ os << " garbage_blob_count: " << meta.GetGarbageBlobCount()
+ << " garbage_blob_bytes: " << meta.GetGarbageBlobBytes();
+
+ return os;
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_file_meta.h b/src/rocksdb/db/blob/blob_file_meta.h
new file mode 100644
index 000000000..d7c8a1243
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_file_meta.h
@@ -0,0 +1,170 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cassert>
+#include <iosfwd>
+#include <memory>
+#include <string>
+#include <unordered_set>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// SharedBlobFileMetaData represents the immutable part of blob files' metadata,
+// like the blob file number, total number and size of blobs, or checksum
+// method and value. There is supposed to be one object of this class per blob
+// file (shared across all versions that include the blob file in question);
+// hence, the type is neither copyable nor movable. A blob file can be marked
+// obsolete when the corresponding SharedBlobFileMetaData object is destroyed.
+
+class SharedBlobFileMetaData {
+ public:
+ // Factory without a custom deleter; the constructor is private so all
+ // instances are shared_ptr-owned.
+ static std::shared_ptr<SharedBlobFileMetaData> Create(
+ uint64_t blob_file_number, uint64_t total_blob_count,
+ uint64_t total_blob_bytes, std::string checksum_method,
+ std::string checksum_value) {
+ return std::shared_ptr<SharedBlobFileMetaData>(new SharedBlobFileMetaData(
+ blob_file_number, total_blob_count, total_blob_bytes,
+ std::move(checksum_method), std::move(checksum_value)));
+ }
+
+ // Factory with a custom deleter, e.g. to mark the blob file obsolete when
+ // the last reference goes away (see the class comment above).
+ template <typename Deleter>
+ static std::shared_ptr<SharedBlobFileMetaData> Create(
+ uint64_t blob_file_number, uint64_t total_blob_count,
+ uint64_t total_blob_bytes, std::string checksum_method,
+ std::string checksum_value, Deleter deleter) {
+ return std::shared_ptr<SharedBlobFileMetaData>(
+ new SharedBlobFileMetaData(blob_file_number, total_blob_count,
+ total_blob_bytes, std::move(checksum_method),
+ std::move(checksum_value)),
+ deleter);
+ }
+
+ // Neither copyable nor movable: one instance per blob file, shared across
+ // versions.
+ SharedBlobFileMetaData(const SharedBlobFileMetaData&) = delete;
+ SharedBlobFileMetaData& operator=(const SharedBlobFileMetaData&) = delete;
+
+ SharedBlobFileMetaData(SharedBlobFileMetaData&&) = delete;
+ SharedBlobFileMetaData& operator=(SharedBlobFileMetaData&&) = delete;
+
+ // Full on-disk size (header + payload + footer); defined in the .cc file.
+ uint64_t GetBlobFileSize() const;
+ uint64_t GetBlobFileNumber() const { return blob_file_number_; }
+ uint64_t GetTotalBlobCount() const { return total_blob_count_; }
+ uint64_t GetTotalBlobBytes() const { return total_blob_bytes_; }
+ const std::string& GetChecksumMethod() const { return checksum_method_; }
+ const std::string& GetChecksumValue() const { return checksum_value_; }
+
+ std::string DebugString() const;
+
+ private:
+ SharedBlobFileMetaData(uint64_t blob_file_number, uint64_t total_blob_count,
+ uint64_t total_blob_bytes, std::string checksum_method,
+ std::string checksum_value)
+ : blob_file_number_(blob_file_number),
+ total_blob_count_(total_blob_count),
+ total_blob_bytes_(total_blob_bytes),
+ checksum_method_(std::move(checksum_method)),
+ checksum_value_(std::move(checksum_value)) {
+ // Checksum method and value must be both present or both absent.
+ assert(checksum_method_.empty() == checksum_value_.empty());
+ }
+
+ uint64_t blob_file_number_;
+ uint64_t total_blob_count_;
+ uint64_t total_blob_bytes_;
+ std::string checksum_method_;
+ std::string checksum_value_;
+};
+
+std::ostream& operator<<(std::ostream& os,
+ const SharedBlobFileMetaData& shared_meta);
+
+// BlobFileMetaData contains the part of the metadata for blob files that can
+// vary across versions, like the amount of garbage in the blob file. In
+// addition, BlobFileMetaData objects point to and share the ownership of the
+// SharedBlobFileMetaData object for the corresponding blob file. Similarly to
+// SharedBlobFileMetaData, BlobFileMetaData are not copyable or movable. They
+// are meant to be jointly owned by the versions in which the blob file has the
+// same (immutable *and* mutable) state.
+
+class BlobFileMetaData {
+ public:
+ // File numbers of the SSTs that reference blobs in this file.
+ using LinkedSsts = std::unordered_set<uint64_t>;
+
+ // Factory; the constructor is private so all instances are
+ // shared_ptr-owned. shared_meta must be non-null and the garbage counters
+ // must not exceed the shared totals (asserted in the constructor).
+ static std::shared_ptr<BlobFileMetaData> Create(
+ std::shared_ptr<SharedBlobFileMetaData> shared_meta,
+ LinkedSsts linked_ssts, uint64_t garbage_blob_count,
+ uint64_t garbage_blob_bytes) {
+ return std::shared_ptr<BlobFileMetaData>(
+ new BlobFileMetaData(std::move(shared_meta), std::move(linked_ssts),
+ garbage_blob_count, garbage_blob_bytes));
+ }
+
+ // Neither copyable nor movable: jointly owned by the versions sharing this
+ // exact state (see the class comment above).
+ BlobFileMetaData(const BlobFileMetaData&) = delete;
+ BlobFileMetaData& operator=(const BlobFileMetaData&) = delete;
+
+ BlobFileMetaData(BlobFileMetaData&&) = delete;
+ BlobFileMetaData& operator=(BlobFileMetaData&&) = delete;
+
+ const std::shared_ptr<SharedBlobFileMetaData>& GetSharedMeta() const {
+ return shared_meta_;
+ }
+
+ // The following accessors forward to the immutable shared metadata.
+ uint64_t GetBlobFileSize() const {
+ assert(shared_meta_);
+ return shared_meta_->GetBlobFileSize();
+ }
+
+ uint64_t GetBlobFileNumber() const {
+ assert(shared_meta_);
+ return shared_meta_->GetBlobFileNumber();
+ }
+ uint64_t GetTotalBlobCount() const {
+ assert(shared_meta_);
+ return shared_meta_->GetTotalBlobCount();
+ }
+ uint64_t GetTotalBlobBytes() const {
+ assert(shared_meta_);
+ return shared_meta_->GetTotalBlobBytes();
+ }
+ const std::string& GetChecksumMethod() const {
+ assert(shared_meta_);
+ return shared_meta_->GetChecksumMethod();
+ }
+ const std::string& GetChecksumValue() const {
+ assert(shared_meta_);
+ return shared_meta_->GetChecksumValue();
+ }
+
+ const LinkedSsts& GetLinkedSsts() const { return linked_ssts_; }
+
+ // Version-specific state: how much of the file is garbage.
+ uint64_t GetGarbageBlobCount() const { return garbage_blob_count_; }
+ uint64_t GetGarbageBlobBytes() const { return garbage_blob_bytes_; }
+
+ std::string DebugString() const;
+
+ private:
+ BlobFileMetaData(std::shared_ptr<SharedBlobFileMetaData> shared_meta,
+ LinkedSsts linked_ssts, uint64_t garbage_blob_count,
+ uint64_t garbage_blob_bytes)
+ : shared_meta_(std::move(shared_meta)),
+ linked_ssts_(std::move(linked_ssts)),
+ garbage_blob_count_(garbage_blob_count),
+ garbage_blob_bytes_(garbage_blob_bytes) {
+ // Garbage can never exceed what the file actually contains.
+ assert(shared_meta_);
+ assert(garbage_blob_count_ <= shared_meta_->GetTotalBlobCount());
+ assert(garbage_blob_bytes_ <= shared_meta_->GetTotalBlobBytes());
+ }
+
+ std::shared_ptr<SharedBlobFileMetaData> shared_meta_;
+ LinkedSsts linked_ssts_;
+ uint64_t garbage_blob_count_;
+ uint64_t garbage_blob_bytes_;
+};
+
+std::ostream& operator<<(std::ostream& os, const BlobFileMetaData& meta);
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_file_reader.cc b/src/rocksdb/db/blob/blob_file_reader.cc
new file mode 100644
index 000000000..a4eabb605
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_file_reader.cc
@@ -0,0 +1,610 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_file_reader.h"
+
+#include <cassert>
+#include <string>
+
+#include "db/blob/blob_contents.h"
+#include "db/blob/blob_log_format.h"
+#include "file/file_prefetch_buffer.h"
+#include "file/filename.h"
+#include "monitoring/statistics.h"
+#include "options/cf_options.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+#include "table/multiget_context.h"
+#include "test_util/sync_point.h"
+#include "util/compression.h"
+#include "util/crc32c.h"
+#include "util/stop_watch.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Factory method: opens the blob file identified by blob_file_number, reads
+// and validates its header and footer, and on success hands back a reader
+// through *blob_file_reader. Returns a non-OK status if the file cannot be
+// opened or fails validation; *blob_file_reader is left unset in that case.
+Status BlobFileReader::Create(
+    const ImmutableOptions& immutable_options, const FileOptions& file_options,
+    uint32_t column_family_id, HistogramImpl* blob_file_read_hist,
+    uint64_t blob_file_number, const std::shared_ptr<IOTracer>& io_tracer,
+    std::unique_ptr<BlobFileReader>* blob_file_reader) {
+  assert(blob_file_reader);
+  assert(!*blob_file_reader);
+
+  uint64_t file_size = 0;
+  std::unique_ptr<RandomAccessFileReader> file_reader;
+
+  {
+    const Status s =
+        OpenFile(immutable_options, file_options, blob_file_read_hist,
+                 blob_file_number, io_tracer, &file_size, &file_reader);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  assert(file_reader);
+
+  Statistics* const statistics = immutable_options.stats;
+
+  CompressionType compression_type = kNoCompression;
+
+  {
+    // The header also tells us which compression type was used for the blobs.
+    const Status s = ReadHeader(file_reader.get(), column_family_id, statistics,
+                                &compression_type);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  {
+    const Status s = ReadFooter(file_reader.get(), file_size, statistics);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  blob_file_reader->reset(
+      new BlobFileReader(std::move(file_reader), file_size, compression_type,
+                         immutable_options.clock, statistics));
+
+  return Status::OK();
+}
+
+// Opens the blob file under the column family's first path, performs a
+// minimal size sanity check (the file must at least hold a header and a
+// footer), and wraps the file in a RandomAccessFileReader. Outputs the file
+// size via *file_size and the reader via *file_reader.
+Status BlobFileReader::OpenFile(
+    const ImmutableOptions& immutable_options, const FileOptions& file_opts,
+    HistogramImpl* blob_file_read_hist, uint64_t blob_file_number,
+    const std::shared_ptr<IOTracer>& io_tracer, uint64_t* file_size,
+    std::unique_ptr<RandomAccessFileReader>* file_reader) {
+  assert(file_size);
+  assert(file_reader);
+
+  const auto& cf_paths = immutable_options.cf_paths;
+  assert(!cf_paths.empty());
+
+  const std::string blob_file_path =
+      BlobFileName(cf_paths.front().path, blob_file_number);
+
+  FileSystem* const fs = immutable_options.fs.get();
+  assert(fs);
+
+  constexpr IODebugContext* dbg = nullptr;
+
+  {
+    TEST_SYNC_POINT("BlobFileReader::OpenFile:GetFileSize");
+
+    const Status s =
+        fs->GetFileSize(blob_file_path, IOOptions(), file_size, dbg);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  // A well-formed blob file contains at least a header and a footer.
+  if (*file_size < BlobLogHeader::kSize + BlobLogFooter::kSize) {
+    return Status::Corruption("Malformed blob file");
+  }
+
+  std::unique_ptr<FSRandomAccessFile> file;
+
+  {
+    TEST_SYNC_POINT("BlobFileReader::OpenFile:NewRandomAccessFile");
+
+    const Status s =
+        fs->NewRandomAccessFile(blob_file_path, file_opts, &file, dbg);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  assert(file);
+
+  if (immutable_options.advise_random_on_open) {
+    file->Hint(FSRandomAccessFile::kRandom);
+  }
+
+  file_reader->reset(new RandomAccessFileReader(
+      std::move(file), blob_file_path, immutable_options.clock, io_tracer,
+      immutable_options.stats, BLOB_DB_BLOB_FILE_READ_MICROS,
+      blob_file_read_hist, immutable_options.rate_limiter.get(),
+      immutable_options.listeners));
+
+  return Status::OK();
+}
+
+// Reads and validates the fixed-size header at the start of the blob file.
+// Rejects TTL blob files (this reader only handles non-TTL files) and files
+// whose column family ID does not match; on success, outputs the compression
+// type recorded in the header via *compression_type.
+Status BlobFileReader::ReadHeader(const RandomAccessFileReader* file_reader,
+                                  uint32_t column_family_id,
+                                  Statistics* statistics,
+                                  CompressionType* compression_type) {
+  assert(file_reader);
+  assert(compression_type);
+
+  Slice header_slice;
+  Buffer buf;
+  AlignedBuf aligned_buf;
+
+  {
+    TEST_SYNC_POINT("BlobFileReader::ReadHeader:ReadFromFile");
+
+    constexpr uint64_t read_offset = 0;
+    constexpr size_t read_size = BlobLogHeader::kSize;
+
+    // TODO: rate limit reading headers from blob files.
+    const Status s = ReadFromFile(file_reader, read_offset, read_size,
+                                  statistics, &header_slice, &buf, &aligned_buf,
+                                  Env::IO_TOTAL /* rate_limiter_priority */);
+    if (!s.ok()) {
+      return s;
+    }
+
+    TEST_SYNC_POINT_CALLBACK("BlobFileReader::ReadHeader:TamperWithResult",
+                             &header_slice);
+  }
+
+  BlobLogHeader header;
+
+  {
+    const Status s = header.DecodeFrom(header_slice);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  constexpr ExpirationRange no_expiration_range;
+
+  if (header.has_ttl || header.expiration_range != no_expiration_range) {
+    return Status::Corruption("Unexpected TTL blob file");
+  }
+
+  if (header.column_family_id != column_family_id) {
+    return Status::Corruption("Column family ID mismatch");
+  }
+
+  *compression_type = header.compression;
+
+  return Status::OK();
+}
+
+// Reads and validates the fixed-size footer at the end of the blob file.
+// Like ReadHeader, rejects files that carry a TTL expiration range.
+Status BlobFileReader::ReadFooter(const RandomAccessFileReader* file_reader,
+                                  uint64_t file_size, Statistics* statistics) {
+  assert(file_size >= BlobLogHeader::kSize + BlobLogFooter::kSize);
+  assert(file_reader);
+
+  Slice footer_slice;
+  Buffer buf;
+  AlignedBuf aligned_buf;
+
+  {
+    TEST_SYNC_POINT("BlobFileReader::ReadFooter:ReadFromFile");
+
+    const uint64_t read_offset = file_size - BlobLogFooter::kSize;
+    constexpr size_t read_size = BlobLogFooter::kSize;
+
+    // TODO: rate limit reading footers from blob files.
+    const Status s = ReadFromFile(file_reader, read_offset, read_size,
+                                  statistics, &footer_slice, &buf, &aligned_buf,
+                                  Env::IO_TOTAL /* rate_limiter_priority */);
+    if (!s.ok()) {
+      return s;
+    }
+
+    TEST_SYNC_POINT_CALLBACK("BlobFileReader::ReadFooter:TamperWithResult",
+                             &footer_slice);
+  }
+
+  BlobLogFooter footer;
+
+  {
+    const Status s = footer.DecodeFrom(footer_slice);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  constexpr ExpirationRange no_expiration_range;
+
+  if (footer.expiration_range != no_expiration_range) {
+    return Status::Corruption("Unexpected TTL blob file");
+  }
+
+  return Status::OK();
+}
+
+// Reads exactly read_size bytes at read_offset. For direct I/O the data lands
+// in *aligned_buf; otherwise a heap buffer (*buf) is allocated as scratch.
+// *slice points into whichever buffer was used, so both buffers must outlive
+// the slice. A short read is reported as corruption.
+Status BlobFileReader::ReadFromFile(const RandomAccessFileReader* file_reader,
+                                    uint64_t read_offset, size_t read_size,
+                                    Statistics* statistics, Slice* slice,
+                                    Buffer* buf, AlignedBuf* aligned_buf,
+                                    Env::IOPriority rate_limiter_priority) {
+  assert(slice);
+  assert(buf);
+  assert(aligned_buf);
+
+  assert(file_reader);
+
+  RecordTick(statistics, BLOB_DB_BLOB_FILE_BYTES_READ, read_size);
+
+  Status s;
+
+  if (file_reader->use_direct_io()) {
+    // Direct I/O: let the reader allocate a properly aligned buffer.
+    constexpr char* scratch = nullptr;
+
+    s = file_reader->Read(IOOptions(), read_offset, read_size, slice, scratch,
+                          aligned_buf, rate_limiter_priority);
+  } else {
+    buf->reset(new char[read_size]);
+    constexpr AlignedBuf* aligned_scratch = nullptr;
+
+    s = file_reader->Read(IOOptions(), read_offset, read_size, slice,
+                          buf->get(), aligned_scratch, rate_limiter_priority);
+  }
+
+  if (!s.ok()) {
+    return s;
+  }
+
+  if (slice->size() != read_size) {
+    return Status::Corruption("Failed to read data from blob file");
+  }
+
+  return Status::OK();
+}
+
+// Private constructor; called from Create() after the file has been opened
+// and its header/footer validated.
+BlobFileReader::BlobFileReader(
+    std::unique_ptr<RandomAccessFileReader>&& file_reader, uint64_t file_size,
+    CompressionType compression_type, SystemClock* clock,
+    Statistics* statistics)
+    : file_reader_(std::move(file_reader)),
+      file_size_(file_size),
+      compression_type_(compression_type),
+      clock_(clock),
+      statistics_(statistics) {
+  assert(file_reader_);
+}
+
+BlobFileReader::~BlobFileReader() = default;
+
+// Retrieves a single blob given its offset and (on-disk) size within the
+// file. Validates the offset range and compression type against the file's
+// metadata, optionally verifies the record checksum, and uncompresses the
+// value if needed. On success *result holds the blob contents and, if
+// bytes_read is non-null, *bytes_read reflects the number of bytes read.
+Status BlobFileReader::GetBlob(
+    const ReadOptions& read_options, const Slice& user_key, uint64_t offset,
+    uint64_t value_size, CompressionType compression_type,
+    FilePrefetchBuffer* prefetch_buffer, MemoryAllocator* allocator,
+    std::unique_ptr<BlobContents>* result, uint64_t* bytes_read) const {
+  assert(result);
+
+  const uint64_t key_size = user_key.size();
+
+  if (!IsValidBlobOffset(offset, key_size, value_size, file_size_)) {
+    return Status::Corruption("Invalid blob offset");
+  }
+
+  if (compression_type != compression_type_) {
+    return Status::Corruption("Compression type mismatch when reading blob");
+  }
+
+  // Note: if verify_checksum is set, we read the entire blob record to be able
+  // to perform the verification; otherwise, we just read the blob itself. Since
+  // the offset in BlobIndex actually points to the blob value, we need to make
+  // an adjustment in the former case.
+  const uint64_t adjustment =
+      read_options.verify_checksums
+          ? BlobLogRecord::CalculateAdjustmentForRecordHeader(key_size)
+          : 0;
+  assert(offset >= adjustment);
+
+  const uint64_t record_offset = offset - adjustment;
+  const uint64_t record_size = value_size + adjustment;
+
+  Slice record_slice;
+  Buffer buf;
+  AlignedBuf aligned_buf;
+
+  bool prefetched = false;
+
+  // Try to serve the read from the prefetch buffer first (compaction path).
+  if (prefetch_buffer) {
+    Status s;
+    constexpr bool for_compaction = true;
+
+    prefetched = prefetch_buffer->TryReadFromCache(
+        IOOptions(), file_reader_.get(), record_offset,
+        static_cast<size_t>(record_size), &record_slice, &s,
+        read_options.rate_limiter_priority, for_compaction);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  if (!prefetched) {
+    TEST_SYNC_POINT("BlobFileReader::GetBlob:ReadFromFile");
+    PERF_COUNTER_ADD(blob_read_count, 1);
+    PERF_COUNTER_ADD(blob_read_byte, record_size);
+    PERF_TIMER_GUARD(blob_read_time);
+    const Status s = ReadFromFile(file_reader_.get(), record_offset,
+                                  static_cast<size_t>(record_size), statistics_,
+                                  &record_slice, &buf, &aligned_buf,
+                                  read_options.rate_limiter_priority);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  TEST_SYNC_POINT_CALLBACK("BlobFileReader::GetBlob:TamperWithResult",
+                           &record_slice);
+
+  if (read_options.verify_checksums) {
+    const Status s = VerifyBlob(record_slice, user_key, value_size);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  // Skip past the record header (if read) to get at the value itself.
+  const Slice value_slice(record_slice.data() + adjustment, value_size);
+
+  {
+    const Status s = UncompressBlobIfNeeded(
+        value_slice, compression_type, allocator, clock_, statistics_, result);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  if (bytes_read) {
+    *bytes_read = record_size;
+  }
+
+  return Status::OK();
+}
+
+// Retrieves multiple blobs from this file with a single MultiRead call.
+// Requests must be sorted by offset by the caller. Per-request results and
+// statuses are reported through the BlobReadRequest objects in blob_reqs;
+// if bytes_read is non-null, *bytes_read accumulates the bytes read for
+// the successfully retrieved blobs.
+void BlobFileReader::MultiGetBlob(
+    const ReadOptions& read_options, MemoryAllocator* allocator,
+    autovector<std::pair<BlobReadRequest*, std::unique_ptr<BlobContents>>>&
+        blob_reqs,
+    uint64_t* bytes_read) const {
+  const size_t num_blobs = blob_reqs.size();
+  assert(num_blobs > 0);
+  assert(num_blobs <= MultiGetContext::MAX_BATCH_SIZE);
+
+#ifndef NDEBUG
+  for (size_t i = 0; i < num_blobs - 1; ++i) {
+    assert(blob_reqs[i].first->offset <= blob_reqs[i + 1].first->offset);
+  }
+#endif  // !NDEBUG
+
+  // Validate each request and build the corresponding file read requests.
+  // Note: read_reqs and adjustments only receive entries for requests that
+  // pass validation, so they can end up shorter than blob_reqs.
+  std::vector<FSReadRequest> read_reqs;
+  autovector<uint64_t> adjustments;
+  uint64_t total_len = 0;
+  read_reqs.reserve(num_blobs);
+  for (size_t i = 0; i < num_blobs; ++i) {
+    BlobReadRequest* const req = blob_reqs[i].first;
+    assert(req);
+    assert(req->user_key);
+    assert(req->status);
+
+    const size_t key_size = req->user_key->size();
+    const uint64_t offset = req->offset;
+    const uint64_t value_size = req->len;
+
+    if (!IsValidBlobOffset(offset, key_size, value_size, file_size_)) {
+      *req->status = Status::Corruption("Invalid blob offset");
+      continue;
+    }
+    if (req->compression != compression_type_) {
+      *req->status =
+          Status::Corruption("Compression type mismatch when reading a blob");
+      continue;
+    }
+
+    // If checksum verification is requested, read the record header as well
+    // so the key and CRC can be checked; see GetBlob for details.
+    const uint64_t adjustment =
+        read_options.verify_checksums
+            ? BlobLogRecord::CalculateAdjustmentForRecordHeader(key_size)
+            : 0;
+    assert(req->offset >= adjustment);
+    adjustments.push_back(adjustment);
+
+    FSReadRequest read_req = {};
+    read_req.offset = req->offset - adjustment;
+    read_req.len = req->len + adjustment;
+    read_reqs.emplace_back(read_req);
+    total_len += read_req.len;
+  }
+
+  RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_READ, total_len);
+
+  Buffer buf;
+  AlignedBuf aligned_buf;
+
+  Status s;
+  bool direct_io = file_reader_->use_direct_io();
+  if (direct_io) {
+    for (size_t i = 0; i < read_reqs.size(); ++i) {
+      read_reqs[i].scratch = nullptr;
+    }
+  } else {
+    // One shared heap buffer, carved up so each request gets its own slot.
+    buf.reset(new char[total_len]);
+    std::ptrdiff_t pos = 0;
+    for (size_t i = 0; i < read_reqs.size(); ++i) {
+      read_reqs[i].scratch = buf.get() + pos;
+      pos += read_reqs[i].len;
+    }
+  }
+  TEST_SYNC_POINT("BlobFileReader::MultiGetBlob:ReadFromFile");
+  PERF_COUNTER_ADD(blob_read_count, num_blobs);
+  PERF_COUNTER_ADD(blob_read_byte, total_len);
+  s = file_reader_->MultiRead(IOOptions(), read_reqs.data(), read_reqs.size(),
+                              direct_io ? &aligned_buf : nullptr,
+                              read_options.rate_limiter_priority);
+  if (!s.ok()) {
+    for (auto& req : read_reqs) {
+      req.status.PermitUncheckedError();
+    }
+    for (auto& blob_req : blob_reqs) {
+      BlobReadRequest* const req = blob_req.first;
+      assert(req);
+      assert(req->status);
+
+      if (!req->status->IsCorruption()) {
+        // Avoid overwriting corruption status.
+        *req->status = s;
+      }
+    }
+    return;
+  }
+
+  assert(s.ok());
+
+  uint64_t total_bytes = 0;
+  for (size_t i = 0, j = 0; i < num_blobs; ++i) {
+    BlobReadRequest* const req = blob_reqs[i].first;
+    assert(req);
+    assert(req->user_key);
+    assert(req->status);
+
+    if (!req->status->ok()) {
+      continue;
+    }
+
+    assert(j < read_reqs.size());
+    // read_reqs and adjustments are parallel arrays populated only for the
+    // requests that passed validation above, so both must be indexed with the
+    // secondary counter j: it lags behind i once any request has failed
+    // validation. (Indexing adjustments with i would read the wrong -- or an
+    // out-of-range -- element in that case.)
+    const size_t read_idx = j++;
+    auto& read_req = read_reqs[read_idx];
+    const auto& record_slice = read_req.result;
+    if (read_req.status.ok() && record_slice.size() != read_req.len) {
+      read_req.status =
+          IOStatus::Corruption("Failed to read data from blob file");
+    }
+
+    *req->status = read_req.status;
+    if (!req->status->ok()) {
+      continue;
+    }
+
+    // Verify checksums if enabled
+    if (read_options.verify_checksums) {
+      *req->status = VerifyBlob(record_slice, *req->user_key, req->len);
+      if (!req->status->ok()) {
+        continue;
+      }
+    }
+
+    // Uncompress blob if needed
+    Slice value_slice(record_slice.data() + adjustments[read_idx], req->len);
+    *req->status =
+        UncompressBlobIfNeeded(value_slice, compression_type_, allocator,
+                               clock_, statistics_, &blob_reqs[i].second);
+    if (req->status->ok()) {
+      total_bytes += record_slice.size();
+    }
+  }
+
+  if (bytes_read) {
+    *bytes_read = total_bytes;
+  }
+}
+
+// Verifies a full blob record: decodes the record header from record_slice,
+// checks that the stored key size, value size, and key bytes match the
+// caller's expectations, and finally validates the blob CRC. record_slice
+// must cover the entire record (header + key + value).
+Status BlobFileReader::VerifyBlob(const Slice& record_slice,
+                                  const Slice& user_key, uint64_t value_size) {
+  PERF_TIMER_GUARD(blob_checksum_time);
+
+  BlobLogRecord record;
+
+  const Slice header_slice(record_slice.data(), BlobLogRecord::kHeaderSize);
+
+  {
+    const Status s = record.DecodeHeaderFrom(header_slice);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  if (record.key_size != user_key.size()) {
+    return Status::Corruption("Key size mismatch when reading blob");
+  }
+
+  if (record.value_size != value_size) {
+    return Status::Corruption("Value size mismatch when reading blob");
+  }
+
+  record.key =
+      Slice(record_slice.data() + BlobLogRecord::kHeaderSize, record.key_size);
+  if (record.key != user_key) {
+    return Status::Corruption("Key mismatch when reading blob");
+  }
+
+  record.value = Slice(record.key.data() + record.key_size, value_size);
+
+  {
+    TEST_SYNC_POINT_CALLBACK("BlobFileReader::VerifyBlob:CheckBlobCRC",
+                             &record);
+
+    const Status s = record.CheckBlobCRC();
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  return Status::OK();
+}
+
+// Produces the final BlobContents from the raw value slice. For uncompressed
+// blobs, the value is copied into an allocator-backed buffer; otherwise the
+// value is decompressed (timed under BLOB_DB_DECOMPRESSION_MICROS). A failed
+// decompression is reported as corruption.
+Status BlobFileReader::UncompressBlobIfNeeded(
+    const Slice& value_slice, CompressionType compression_type,
+    MemoryAllocator* allocator, SystemClock* clock, Statistics* statistics,
+    std::unique_ptr<BlobContents>* result) {
+  assert(result);
+
+  if (compression_type == kNoCompression) {
+    CacheAllocationPtr allocation =
+        AllocateBlock(value_slice.size(), allocator);
+    memcpy(allocation.get(), value_slice.data(), value_slice.size());
+
+    *result = BlobContents::Create(std::move(allocation), value_slice.size());
+
+    return Status::OK();
+  }
+
+  UncompressionContext context(compression_type);
+  UncompressionInfo info(context, UncompressionDict::GetEmptyDict(),
+                         compression_type);
+
+  size_t uncompressed_size = 0;
+  constexpr uint32_t compression_format_version = 2;
+
+  CacheAllocationPtr output;
+
+  {
+    PERF_TIMER_GUARD(blob_decompress_time);
+    StopWatch stop_watch(clock, statistics, BLOB_DB_DECOMPRESSION_MICROS);
+    output = UncompressData(info, value_slice.data(), value_slice.size(),
+                            &uncompressed_size, compression_format_version,
+                            allocator);
+  }
+
+  TEST_SYNC_POINT_CALLBACK(
+      "BlobFileReader::UncompressBlobIfNeeded:TamperWithResult", &output);
+
+  if (!output) {
+    return Status::Corruption("Unable to uncompress blob");
+  }
+
+  *result = BlobContents::Create(std::move(output), uncompressed_size);
+
+  return Status::OK();
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_file_reader.h b/src/rocksdb/db/blob/blob_file_reader.h
new file mode 100644
index 000000000..75b756da1
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_file_reader.h
@@ -0,0 +1,108 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cinttypes>
+#include <memory>
+
+#include "db/blob/blob_read_request.h"
+#include "file/random_access_file_reader.h"
+#include "rocksdb/compression_type.h"
+#include "rocksdb/rocksdb_namespace.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Status;
+struct ImmutableOptions;
+struct FileOptions;
+class HistogramImpl;
+struct ReadOptions;
+class Slice;
+class FilePrefetchBuffer;
+class BlobContents;
+class Statistics;
+
+// Reads blobs from a single (non-TTL) blob file. Instances are created via
+// the Create() factory, which opens the file and validates its header and
+// footer. The class is immovable/non-copyable; reads are served through the
+// underlying RandomAccessFileReader.
+class BlobFileReader {
+ public:
+  // Opens the blob file and returns a reader through *reader; see the .cc
+  // file for the validation performed.
+  static Status Create(const ImmutableOptions& immutable_options,
+                       const FileOptions& file_options,
+                       uint32_t column_family_id,
+                       HistogramImpl* blob_file_read_hist,
+                       uint64_t blob_file_number,
+                       const std::shared_ptr<IOTracer>& io_tracer,
+                       std::unique_ptr<BlobFileReader>* reader);
+
+  BlobFileReader(const BlobFileReader&) = delete;
+  BlobFileReader& operator=(const BlobFileReader&) = delete;
+
+  ~BlobFileReader();
+
+  // Reads a single blob; on success *result holds the (uncompressed) blob
+  // and *bytes_read (if non-null) the number of bytes read from the file.
+  Status GetBlob(const ReadOptions& read_options, const Slice& user_key,
+                 uint64_t offset, uint64_t value_size,
+                 CompressionType compression_type,
+                 FilePrefetchBuffer* prefetch_buffer,
+                 MemoryAllocator* allocator,
+                 std::unique_ptr<BlobContents>* result,
+                 uint64_t* bytes_read) const;
+
+  // offsets must be sorted in ascending order by caller.
+  void MultiGetBlob(
+      const ReadOptions& read_options, MemoryAllocator* allocator,
+      autovector<std::pair<BlobReadRequest*, std::unique_ptr<BlobContents>>>&
+          blob_reqs,
+      uint64_t* bytes_read) const;
+
+  // Compression type recorded in the blob file's header.
+  CompressionType GetCompressionType() const { return compression_type_; }
+
+  uint64_t GetFileSize() const { return file_size_; }
+
+ private:
+  BlobFileReader(std::unique_ptr<RandomAccessFileReader>&& file_reader,
+                 uint64_t file_size, CompressionType compression_type,
+                 SystemClock* clock, Statistics* statistics);
+
+  static Status OpenFile(const ImmutableOptions& immutable_options,
+                         const FileOptions& file_opts,
+                         HistogramImpl* blob_file_read_hist,
+                         uint64_t blob_file_number,
+                         const std::shared_ptr<IOTracer>& io_tracer,
+                         uint64_t* file_size,
+                         std::unique_ptr<RandomAccessFileReader>* file_reader);
+
+  static Status ReadHeader(const RandomAccessFileReader* file_reader,
+                           uint32_t column_family_id, Statistics* statistics,
+                           CompressionType* compression_type);
+
+  static Status ReadFooter(const RandomAccessFileReader* file_reader,
+                           uint64_t file_size, Statistics* statistics);
+
+  // Heap scratch buffer used for buffered (non-direct) reads.
+  using Buffer = std::unique_ptr<char[]>;
+
+  static Status ReadFromFile(const RandomAccessFileReader* file_reader,
+                             uint64_t read_offset, size_t read_size,
+                             Statistics* statistics, Slice* slice, Buffer* buf,
+                             AlignedBuf* aligned_buf,
+                             Env::IOPriority rate_limiter_priority);
+
+  static Status VerifyBlob(const Slice& record_slice, const Slice& user_key,
+                           uint64_t value_size);
+
+  static Status UncompressBlobIfNeeded(const Slice& value_slice,
+                                       CompressionType compression_type,
+                                       MemoryAllocator* allocator,
+                                       SystemClock* clock,
+                                       Statistics* statistics,
+                                       std::unique_ptr<BlobContents>* result);
+
+  std::unique_ptr<RandomAccessFileReader> file_reader_;
+  uint64_t file_size_;
+  CompressionType compression_type_;
+  SystemClock* clock_;
+  Statistics* statistics_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_file_reader_test.cc b/src/rocksdb/db/blob/blob_file_reader_test.cc
new file mode 100644
index 000000000..03458e2b5
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_file_reader_test.cc
@@ -0,0 +1,1024 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_file_reader.h"
+
+#include <cassert>
+#include <string>
+
+#include "db/blob/blob_contents.h"
+#include "db/blob/blob_log_format.h"
+#include "db/blob/blob_log_writer.h"
+#include "env/mock_env.h"
+#include "file/filename.h"
+#include "file/read_write_util.h"
+#include "file/writable_file_writer.h"
+#include "options/cf_options.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/options.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "util/compression.h"
+#include "utilities/fault_injection_env.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+// Creates a test blob file with `num` blobs in it.
+// Writes a complete blob file (header, records, footer) containing the given
+// keys/blobs, compressing the blobs first if requested. The offset and
+// on-disk size of each blob are returned via blob_offsets/blob_sizes, which
+// must already be sized to match keys/blobs.
+void WriteBlobFile(const ImmutableOptions& immutable_options,
+                   uint32_t column_family_id, bool has_ttl,
+                   const ExpirationRange& expiration_range_header,
+                   const ExpirationRange& expiration_range_footer,
+                   uint64_t blob_file_number, const std::vector<Slice>& keys,
+                   const std::vector<Slice>& blobs, CompressionType compression,
+                   std::vector<uint64_t>& blob_offsets,
+                   std::vector<uint64_t>& blob_sizes) {
+  assert(!immutable_options.cf_paths.empty());
+  size_t num = keys.size();
+  assert(num == blobs.size());
+  assert(num == blob_offsets.size());
+  assert(num == blob_sizes.size());
+
+  const std::string blob_file_path =
+      BlobFileName(immutable_options.cf_paths.front().path, blob_file_number);
+  std::unique_ptr<FSWritableFile> file;
+  ASSERT_OK(NewWritableFile(immutable_options.fs.get(), blob_file_path, &file,
+                            FileOptions()));
+
+  std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
+      std::move(file), blob_file_path, FileOptions(), immutable_options.clock));
+
+  constexpr Statistics* statistics = nullptr;
+  constexpr bool use_fsync = false;
+  constexpr bool do_flush = false;
+
+  BlobLogWriter blob_log_writer(std::move(file_writer), immutable_options.clock,
+                                statistics, blob_file_number, use_fsync,
+                                do_flush);
+
+  BlobLogHeader header(column_family_id, compression, has_ttl,
+                       expiration_range_header);
+
+  ASSERT_OK(blob_log_writer.WriteHeader(header));
+
+  // Pre-compress the blobs if a compression type was requested; blob_sizes
+  // reflects the on-disk (possibly compressed) size in either case.
+  std::vector<std::string> compressed_blobs(num);
+  std::vector<Slice> blobs_to_write(num);
+  if (kNoCompression == compression) {
+    for (size_t i = 0; i < num; ++i) {
+      blobs_to_write[i] = blobs[i];
+      blob_sizes[i] = blobs[i].size();
+    }
+  } else {
+    CompressionOptions opts;
+    CompressionContext context(compression);
+    constexpr uint64_t sample_for_compression = 0;
+    CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(),
+                         compression, sample_for_compression);
+
+    constexpr uint32_t compression_format_version = 2;
+
+    for (size_t i = 0; i < num; ++i) {
+      ASSERT_TRUE(CompressData(blobs[i], info, compression_format_version,
+                               &compressed_blobs[i]));
+      blobs_to_write[i] = compressed_blobs[i];
+      blob_sizes[i] = compressed_blobs[i].size();
+    }
+  }
+
+  for (size_t i = 0; i < num; ++i) {
+    uint64_t key_offset = 0;
+    ASSERT_OK(blob_log_writer.AddRecord(keys[i], blobs_to_write[i], &key_offset,
+                                        &blob_offsets[i]));
+  }
+
+  BlobLogFooter footer;
+  footer.blob_count = num;
+  footer.expiration_range = expiration_range_footer;
+
+  std::string checksum_method;
+  std::string checksum_value;
+  ASSERT_OK(
+      blob_log_writer.AppendFooter(footer, &checksum_method, &checksum_value));
+}
+
+// Creates a test blob file with a single blob in it. Note: this method
+// makes it possible to test various corner cases by allowing the caller
+// to specify the contents of various blob file header/footer fields.
+void WriteBlobFile(const ImmutableOptions& immutable_options,
+                   uint32_t column_family_id, bool has_ttl,
+                   const ExpirationRange& expiration_range_header,
+                   const ExpirationRange& expiration_range_footer,
+                   uint64_t blob_file_number, const Slice& key,
+                   const Slice& blob, CompressionType compression,
+                   uint64_t* blob_offset, uint64_t* blob_size) {
+  // Delegate to the multi-blob overload with single-element vectors, then
+  // forward the resulting offset/size through the optional out-parameters.
+  std::vector<Slice> keys{key};
+  std::vector<Slice> blobs{blob};
+  std::vector<uint64_t> blob_offsets{0};
+  std::vector<uint64_t> blob_sizes{0};
+  WriteBlobFile(immutable_options, column_family_id, has_ttl,
+                expiration_range_header, expiration_range_footer,
+                blob_file_number, keys, blobs, compression, blob_offsets,
+                blob_sizes);
+  if (blob_offset) {
+    *blob_offset = blob_offsets[0];
+  }
+  if (blob_size) {
+    *blob_size = blob_sizes[0];
+  }
+}
+
+} // anonymous namespace
+
+// Test fixture that supplies a MockEnv-backed environment so tests do not
+// touch the real filesystem directly.
+class BlobFileReaderTest : public testing::Test {
+ protected:
+  BlobFileReaderTest() { mock_env_.reset(MockEnv::Create(Env::Default())); }
+  std::unique_ptr<Env> mock_env_;
+};
+
+// End-to-end coverage for BlobFileReader::GetBlob and MultiGetBlob: writes a
+// three-blob file, then exercises both the happy path (with and without
+// checksum verification) and the error paths (bad offset, wrong compression
+// type, wrong key size, wrong key, wrong value size).
+TEST_F(BlobFileReaderTest, CreateReaderAndGetBlob) {
+  Options options;
+  options.env = mock_env_.get();
+  options.cf_paths.emplace_back(
+      test::PerThreadDBPath(mock_env_.get(),
+                            "BlobFileReaderTest_CreateReaderAndGetBlob"),
+      0);
+  options.enable_blob_files = true;
+
+  ImmutableOptions immutable_options(options);
+
+  constexpr uint32_t column_family_id = 1;
+  constexpr bool has_ttl = false;
+  constexpr ExpirationRange expiration_range;
+  constexpr uint64_t blob_file_number = 1;
+  constexpr size_t num_blobs = 3;
+  const std::vector<std::string> key_strs = {"key1", "key2", "key3"};
+  const std::vector<std::string> blob_strs = {"blob1", "blob2", "blob3"};
+
+  const std::vector<Slice> keys = {key_strs[0], key_strs[1], key_strs[2]};
+  const std::vector<Slice> blobs = {blob_strs[0], blob_strs[1], blob_strs[2]};
+
+  std::vector<uint64_t> blob_offsets(keys.size());
+  std::vector<uint64_t> blob_sizes(keys.size());
+
+  WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range,
+                expiration_range, blob_file_number, keys, blobs, kNoCompression,
+                blob_offsets, blob_sizes);
+
+  constexpr HistogramImpl* blob_file_read_hist = nullptr;
+
+  std::unique_ptr<BlobFileReader> reader;
+
+  ASSERT_OK(BlobFileReader::Create(
+      immutable_options, FileOptions(), column_family_id, blob_file_read_hist,
+      blob_file_number, nullptr /*IOTracer*/, &reader));
+
+  // Make sure the blob can be retrieved with and without checksum verification
+  ReadOptions read_options;
+  read_options.verify_checksums = false;
+
+  constexpr FilePrefetchBuffer* prefetch_buffer = nullptr;
+  constexpr MemoryAllocator* allocator = nullptr;
+
+  {
+    std::unique_ptr<BlobContents> value;
+    uint64_t bytes_read = 0;
+
+    ASSERT_OK(reader->GetBlob(read_options, keys[0], blob_offsets[0],
+                              blob_sizes[0], kNoCompression, prefetch_buffer,
+                              allocator, &value, &bytes_read));
+    ASSERT_NE(value, nullptr);
+    ASSERT_EQ(value->data(), blobs[0]);
+    ASSERT_EQ(bytes_read, blob_sizes[0]);
+
+    // MultiGetBlob
+    bytes_read = 0;
+    size_t total_size = 0;
+
+    std::array<Status, num_blobs> statuses_buf;
+    std::array<BlobReadRequest, num_blobs> requests_buf;
+    autovector<std::pair<BlobReadRequest*, std::unique_ptr<BlobContents>>>
+        blob_reqs;
+
+    for (size_t i = 0; i < num_blobs; ++i) {
+      requests_buf[i] =
+          BlobReadRequest(keys[i], blob_offsets[i], blob_sizes[i],
+                          kNoCompression, nullptr, &statuses_buf[i]);
+      blob_reqs.emplace_back(&requests_buf[i], std::unique_ptr<BlobContents>());
+    }
+
+    reader->MultiGetBlob(read_options, allocator, blob_reqs, &bytes_read);
+
+    for (size_t i = 0; i < num_blobs; ++i) {
+      const auto& result = blob_reqs[i].second;
+
+      ASSERT_OK(statuses_buf[i]);
+      ASSERT_NE(result, nullptr);
+      ASSERT_EQ(result->data(), blobs[i]);
+      total_size += blob_sizes[i];
+    }
+    ASSERT_EQ(bytes_read, total_size);
+  }
+
+  read_options.verify_checksums = true;
+
+  {
+    std::unique_ptr<BlobContents> value;
+    uint64_t bytes_read = 0;
+
+    ASSERT_OK(reader->GetBlob(read_options, keys[1], blob_offsets[1],
+                              blob_sizes[1], kNoCompression, prefetch_buffer,
+                              allocator, &value, &bytes_read));
+    ASSERT_NE(value, nullptr);
+    ASSERT_EQ(value->data(), blobs[1]);
+
+    // With verification on, the record header is read too, so bytes_read
+    // includes the header adjustment on top of the blob size.
+    const uint64_t key_size = keys[1].size();
+    ASSERT_EQ(bytes_read,
+              BlobLogRecord::CalculateAdjustmentForRecordHeader(key_size) +
+                  blob_sizes[1]);
+  }
+
+  // Invalid offset (too close to start of file)
+  {
+    std::unique_ptr<BlobContents> value;
+    uint64_t bytes_read = 0;
+
+    ASSERT_TRUE(reader
+                    ->GetBlob(read_options, keys[0], blob_offsets[0] - 1,
+                              blob_sizes[0], kNoCompression, prefetch_buffer,
+                              allocator, &value, &bytes_read)
+                    .IsCorruption());
+    ASSERT_EQ(value, nullptr);
+    ASSERT_EQ(bytes_read, 0);
+  }
+
+  // Invalid offset (too close to end of file)
+  {
+    std::unique_ptr<BlobContents> value;
+    uint64_t bytes_read = 0;
+
+    ASSERT_TRUE(reader
+                    ->GetBlob(read_options, keys[2], blob_offsets[2] + 1,
+                              blob_sizes[2], kNoCompression, prefetch_buffer,
+                              allocator, &value, &bytes_read)
+                    .IsCorruption());
+    ASSERT_EQ(value, nullptr);
+    ASSERT_EQ(bytes_read, 0);
+  }
+
+  // Incorrect compression type
+  {
+    std::unique_ptr<BlobContents> value;
+    uint64_t bytes_read = 0;
+
+    ASSERT_TRUE(reader
+                    ->GetBlob(read_options, keys[0], blob_offsets[0],
+                              blob_sizes[0], kZSTD, prefetch_buffer, allocator,
+                              &value, &bytes_read)
+                    .IsCorruption());
+    ASSERT_EQ(value, nullptr);
+    ASSERT_EQ(bytes_read, 0);
+  }
+
+  // Incorrect key size
+  {
+    constexpr char shorter_key[] = "k";
+    std::unique_ptr<BlobContents> value;
+    uint64_t bytes_read = 0;
+
+    ASSERT_TRUE(reader
+                    ->GetBlob(read_options, shorter_key,
+                              blob_offsets[0] -
+                                  (keys[0].size() - sizeof(shorter_key) + 1),
+                              blob_sizes[0], kNoCompression, prefetch_buffer,
+                              allocator, &value, &bytes_read)
+                    .IsCorruption());
+    ASSERT_EQ(value, nullptr);
+    ASSERT_EQ(bytes_read, 0);
+
+    // MultiGetBlob
+    autovector<std::reference_wrapper<const Slice>> key_refs;
+    for (const auto& key_ref : keys) {
+      key_refs.emplace_back(std::cref(key_ref));
+    }
+    Slice shorter_key_slice(shorter_key, sizeof(shorter_key) - 1);
+    key_refs[1] = std::cref(shorter_key_slice);
+
+    autovector<uint64_t> offsets{
+        blob_offsets[0],
+        blob_offsets[1] - (keys[1].size() - key_refs[1].get().size()),
+        blob_offsets[2]};
+
+    std::array<Status, num_blobs> statuses_buf;
+    std::array<BlobReadRequest, num_blobs> requests_buf;
+    autovector<std::pair<BlobReadRequest*, std::unique_ptr<BlobContents>>>
+        blob_reqs;
+
+    for (size_t i = 0; i < num_blobs; ++i) {
+      requests_buf[i] =
+          BlobReadRequest(key_refs[i], offsets[i], blob_sizes[i],
+                          kNoCompression, nullptr, &statuses_buf[i]);
+      blob_reqs.emplace_back(&requests_buf[i], std::unique_ptr<BlobContents>());
+    }
+
+    reader->MultiGetBlob(read_options, allocator, blob_reqs, &bytes_read);
+
+    // Only the request with the mismatched key size should fail.
+    for (size_t i = 0; i < num_blobs; ++i) {
+      if (i == 1) {
+        ASSERT_TRUE(statuses_buf[i].IsCorruption());
+      } else {
+        ASSERT_OK(statuses_buf[i]);
+      }
+    }
+  }
+
+  // Incorrect key
+  {
+    constexpr char incorrect_key[] = "foo1";
+    std::unique_ptr<BlobContents> value;
+    uint64_t bytes_read = 0;
+
+    ASSERT_TRUE(reader
+                    ->GetBlob(read_options, incorrect_key, blob_offsets[0],
+                              blob_sizes[0], kNoCompression, prefetch_buffer,
+                              allocator, &value, &bytes_read)
+                    .IsCorruption());
+    ASSERT_EQ(value, nullptr);
+    ASSERT_EQ(bytes_read, 0);
+
+    // MultiGetBlob
+    autovector<std::reference_wrapper<const Slice>> key_refs;
+    for (const auto& key_ref : keys) {
+      key_refs.emplace_back(std::cref(key_ref));
+    }
+    Slice wrong_key_slice(incorrect_key, sizeof(incorrect_key) - 1);
+    key_refs[2] = std::cref(wrong_key_slice);
+
+    std::array<Status, num_blobs> statuses_buf;
+    std::array<BlobReadRequest, num_blobs> requests_buf;
+    autovector<std::pair<BlobReadRequest*, std::unique_ptr<BlobContents>>>
+        blob_reqs;
+
+    for (size_t i = 0; i < num_blobs; ++i) {
+      requests_buf[i] =
+          BlobReadRequest(key_refs[i], blob_offsets[i], blob_sizes[i],
+                          kNoCompression, nullptr, &statuses_buf[i]);
+      blob_reqs.emplace_back(&requests_buf[i], std::unique_ptr<BlobContents>());
+    }
+
+    reader->MultiGetBlob(read_options, allocator, blob_reqs, &bytes_read);
+
+    for (size_t i = 0; i < num_blobs; ++i) {
+      if (i == num_blobs - 1) {
+        ASSERT_TRUE(statuses_buf[i].IsCorruption());
+      } else {
+        ASSERT_OK(statuses_buf[i]);
+      }
+    }
+  }
+
+  // Incorrect value size
+  {
+    std::unique_ptr<BlobContents> value;
+    uint64_t bytes_read = 0;
+
+    ASSERT_TRUE(reader
+                    ->GetBlob(read_options, keys[1], blob_offsets[1],
+                              blob_sizes[1] + 1, kNoCompression,
+                              prefetch_buffer, allocator, &value, &bytes_read)
+                    .IsCorruption());
+    ASSERT_EQ(value, nullptr);
+    ASSERT_EQ(bytes_read, 0);
+
+    // MultiGetBlob
+    autovector<std::reference_wrapper<const Slice>> key_refs;
+    for (const auto& key_ref : keys) {
+      key_refs.emplace_back(std::cref(key_ref));
+    }
+
+    std::array<Status, num_blobs> statuses_buf;
+    std::array<BlobReadRequest, num_blobs> requests_buf;
+
+    requests_buf[0] =
+        BlobReadRequest(key_refs[0], blob_offsets[0], blob_sizes[0],
+                        kNoCompression, nullptr, &statuses_buf[0]);
+    requests_buf[1] =
+        BlobReadRequest(key_refs[1], blob_offsets[1], blob_sizes[1] + 1,
+                        kNoCompression, nullptr, &statuses_buf[1]);
+    requests_buf[2] =
+        BlobReadRequest(key_refs[2], blob_offsets[2], blob_sizes[2],
+                        kNoCompression, nullptr, &statuses_buf[2]);
+
+    autovector<std::pair<BlobReadRequest*, std::unique_ptr<BlobContents>>>
+        blob_reqs;
+
+    for (size_t i = 0; i < num_blobs; ++i) {
+      blob_reqs.emplace_back(&requests_buf[i], std::unique_ptr<BlobContents>());
+    }
+
+    reader->MultiGetBlob(read_options, allocator, blob_reqs, &bytes_read);
+
+    for (size_t i = 0; i < num_blobs; ++i) {
+      if (i != 1) {
+        ASSERT_OK(statuses_buf[i]);
+      } else {
+        ASSERT_TRUE(statuses_buf[i].IsCorruption());
+      }
+    }
+  }
+}
+
+TEST_F(BlobFileReaderTest, Malformed) {
+ // Write a blob file consisting of nothing but a header, and make sure we
+ // detect the error when we open it for reading
+
+ Options options;
+ options.env = mock_env_.get();
+ options.cf_paths.emplace_back(
+ test::PerThreadDBPath(mock_env_.get(), "BlobFileReaderTest_Malformed"),
+ 0);
+ options.enable_blob_files = true;
+
+ ImmutableOptions immutable_options(options);
+
+ constexpr uint32_t column_family_id = 1;
+ constexpr uint64_t blob_file_number = 1;
+
+ {
+ constexpr bool has_ttl = false;
+ constexpr ExpirationRange expiration_range;
+
+ const std::string blob_file_path =
+ BlobFileName(immutable_options.cf_paths.front().path, blob_file_number);
+
+ std::unique_ptr<FSWritableFile> file;
+ ASSERT_OK(NewWritableFile(immutable_options.fs.get(), blob_file_path, &file,
+ FileOptions()));
+
+ std::unique_ptr<WritableFileWriter> file_writer(
+ new WritableFileWriter(std::move(file), blob_file_path, FileOptions(),
+ immutable_options.clock));
+
+ constexpr Statistics* statistics = nullptr;
+ constexpr bool use_fsync = false;
+ constexpr bool do_flush = false;
+
+ BlobLogWriter blob_log_writer(std::move(file_writer),
+ immutable_options.clock, statistics,
+ blob_file_number, use_fsync, do_flush);
+
+ BlobLogHeader header(column_family_id, kNoCompression, has_ttl,
+ expiration_range);
+
+ ASSERT_OK(blob_log_writer.WriteHeader(header));
+ }
+
+ constexpr HistogramImpl* blob_file_read_hist = nullptr;
+
+ std::unique_ptr<BlobFileReader> reader;
+
+ ASSERT_TRUE(BlobFileReader::Create(immutable_options, FileOptions(),
+ column_family_id, blob_file_read_hist,
+ blob_file_number, nullptr /*IOTracer*/,
+ &reader)
+ .IsCorruption());
+}
+
+TEST_F(BlobFileReaderTest, TTL) {
+ Options options;
+ options.env = mock_env_.get();
+ options.cf_paths.emplace_back(
+ test::PerThreadDBPath(mock_env_.get(), "BlobFileReaderTest_TTL"), 0);
+ options.enable_blob_files = true;
+
+ ImmutableOptions immutable_options(options);
+
+ constexpr uint32_t column_family_id = 1;
+ constexpr bool has_ttl = true;
+ constexpr ExpirationRange expiration_range;
+ constexpr uint64_t blob_file_number = 1;
+ constexpr char key[] = "key";
+ constexpr char blob[] = "blob";
+
+ uint64_t blob_offset = 0;
+ uint64_t blob_size = 0;
+
+ WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range,
+ expiration_range, blob_file_number, key, blob, kNoCompression,
+ &blob_offset, &blob_size);
+
+ constexpr HistogramImpl* blob_file_read_hist = nullptr;
+
+ std::unique_ptr<BlobFileReader> reader;
+
+ ASSERT_TRUE(BlobFileReader::Create(immutable_options, FileOptions(),
+ column_family_id, blob_file_read_hist,
+ blob_file_number, nullptr /*IOTracer*/,
+ &reader)
+ .IsCorruption());
+}
+
+TEST_F(BlobFileReaderTest, ExpirationRangeInHeader) {
+ Options options;
+ options.env = mock_env_.get();
+ options.cf_paths.emplace_back(
+ test::PerThreadDBPath(mock_env_.get(),
+ "BlobFileReaderTest_ExpirationRangeInHeader"),
+ 0);
+ options.enable_blob_files = true;
+
+ ImmutableOptions immutable_options(options);
+
+ constexpr uint32_t column_family_id = 1;
+ constexpr bool has_ttl = false;
+ const ExpirationRange expiration_range_header(
+ 1, 2); // can be made constexpr when we adopt C++14
+ constexpr ExpirationRange expiration_range_footer;
+ constexpr uint64_t blob_file_number = 1;
+ constexpr char key[] = "key";
+ constexpr char blob[] = "blob";
+
+ uint64_t blob_offset = 0;
+ uint64_t blob_size = 0;
+
+ WriteBlobFile(immutable_options, column_family_id, has_ttl,
+ expiration_range_header, expiration_range_footer,
+ blob_file_number, key, blob, kNoCompression, &blob_offset,
+ &blob_size);
+
+ constexpr HistogramImpl* blob_file_read_hist = nullptr;
+
+ std::unique_ptr<BlobFileReader> reader;
+
+ ASSERT_TRUE(BlobFileReader::Create(immutable_options, FileOptions(),
+ column_family_id, blob_file_read_hist,
+ blob_file_number, nullptr /*IOTracer*/,
+ &reader)
+ .IsCorruption());
+}
+
+TEST_F(BlobFileReaderTest, ExpirationRangeInFooter) {
+ Options options;
+ options.env = mock_env_.get();
+ options.cf_paths.emplace_back(
+ test::PerThreadDBPath(mock_env_.get(),
+ "BlobFileReaderTest_ExpirationRangeInFooter"),
+ 0);
+ options.enable_blob_files = true;
+
+ ImmutableOptions immutable_options(options);
+
+ constexpr uint32_t column_family_id = 1;
+ constexpr bool has_ttl = false;
+ constexpr ExpirationRange expiration_range_header;
+ const ExpirationRange expiration_range_footer(
+ 1, 2); // can be made constexpr when we adopt C++14
+ constexpr uint64_t blob_file_number = 1;
+ constexpr char key[] = "key";
+ constexpr char blob[] = "blob";
+
+ uint64_t blob_offset = 0;
+ uint64_t blob_size = 0;
+
+ WriteBlobFile(immutable_options, column_family_id, has_ttl,
+ expiration_range_header, expiration_range_footer,
+ blob_file_number, key, blob, kNoCompression, &blob_offset,
+ &blob_size);
+
+ constexpr HistogramImpl* blob_file_read_hist = nullptr;
+
+ std::unique_ptr<BlobFileReader> reader;
+
+ ASSERT_TRUE(BlobFileReader::Create(immutable_options, FileOptions(),
+ column_family_id, blob_file_read_hist,
+ blob_file_number, nullptr /*IOTracer*/,
+ &reader)
+ .IsCorruption());
+}
+
+TEST_F(BlobFileReaderTest, IncorrectColumnFamily) {
+ Options options;
+ options.env = mock_env_.get();
+ options.cf_paths.emplace_back(
+ test::PerThreadDBPath(mock_env_.get(),
+ "BlobFileReaderTest_IncorrectColumnFamily"),
+ 0);
+ options.enable_blob_files = true;
+
+ ImmutableOptions immutable_options(options);
+
+ constexpr uint32_t column_family_id = 1;
+ constexpr bool has_ttl = false;
+ constexpr ExpirationRange expiration_range;
+ constexpr uint64_t blob_file_number = 1;
+ constexpr char key[] = "key";
+ constexpr char blob[] = "blob";
+
+ uint64_t blob_offset = 0;
+ uint64_t blob_size = 0;
+
+ WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range,
+ expiration_range, blob_file_number, key, blob, kNoCompression,
+ &blob_offset, &blob_size);
+
+ constexpr HistogramImpl* blob_file_read_hist = nullptr;
+
+ std::unique_ptr<BlobFileReader> reader;
+
+ constexpr uint32_t incorrect_column_family_id = 2;
+
+ ASSERT_TRUE(BlobFileReader::Create(immutable_options, FileOptions(),
+ incorrect_column_family_id,
+ blob_file_read_hist, blob_file_number,
+ nullptr /*IOTracer*/, &reader)
+ .IsCorruption());
+}
+
+TEST_F(BlobFileReaderTest, BlobCRCError) {
+ Options options;
+ options.env = mock_env_.get();
+ options.cf_paths.emplace_back(
+ test::PerThreadDBPath(mock_env_.get(), "BlobFileReaderTest_BlobCRCError"),
+ 0);
+ options.enable_blob_files = true;
+
+ ImmutableOptions immutable_options(options);
+
+ constexpr uint32_t column_family_id = 1;
+ constexpr bool has_ttl = false;
+ constexpr ExpirationRange expiration_range;
+ constexpr uint64_t blob_file_number = 1;
+ constexpr char key[] = "key";
+ constexpr char blob[] = "blob";
+
+ uint64_t blob_offset = 0;
+ uint64_t blob_size = 0;
+
+ WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range,
+ expiration_range, blob_file_number, key, blob, kNoCompression,
+ &blob_offset, &blob_size);
+
+ constexpr HistogramImpl* blob_file_read_hist = nullptr;
+
+ std::unique_ptr<BlobFileReader> reader;
+
+ ASSERT_OK(BlobFileReader::Create(
+ immutable_options, FileOptions(), column_family_id, blob_file_read_hist,
+ blob_file_number, nullptr /*IOTracer*/, &reader));
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "BlobFileReader::VerifyBlob:CheckBlobCRC", [](void* arg) {
+ BlobLogRecord* const record = static_cast<BlobLogRecord*>(arg);
+ assert(record);
+
+ record->blob_crc = 0xfaceb00c;
+ });
+
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ constexpr FilePrefetchBuffer* prefetch_buffer = nullptr;
+ constexpr MemoryAllocator* allocator = nullptr;
+
+ std::unique_ptr<BlobContents> value;
+ uint64_t bytes_read = 0;
+
+ ASSERT_TRUE(reader
+ ->GetBlob(ReadOptions(), key, blob_offset, blob_size,
+ kNoCompression, prefetch_buffer, allocator, &value,
+ &bytes_read)
+ .IsCorruption());
+ ASSERT_EQ(value, nullptr);
+ ASSERT_EQ(bytes_read, 0);
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_F(BlobFileReaderTest, Compression) {
+ if (!Snappy_Supported()) {
+ return;
+ }
+
+ Options options;
+ options.env = mock_env_.get();
+ options.cf_paths.emplace_back(
+ test::PerThreadDBPath(mock_env_.get(), "BlobFileReaderTest_Compression"),
+ 0);
+ options.enable_blob_files = true;
+
+ ImmutableOptions immutable_options(options);
+
+ constexpr uint32_t column_family_id = 1;
+ constexpr bool has_ttl = false;
+ constexpr ExpirationRange expiration_range;
+ constexpr uint64_t blob_file_number = 1;
+ constexpr char key[] = "key";
+ constexpr char blob[] = "blob";
+
+ uint64_t blob_offset = 0;
+ uint64_t blob_size = 0;
+
+ WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range,
+ expiration_range, blob_file_number, key, blob,
+ kSnappyCompression, &blob_offset, &blob_size);
+
+ constexpr HistogramImpl* blob_file_read_hist = nullptr;
+
+ std::unique_ptr<BlobFileReader> reader;
+
+ ASSERT_OK(BlobFileReader::Create(
+ immutable_options, FileOptions(), column_family_id, blob_file_read_hist,
+ blob_file_number, nullptr /*IOTracer*/, &reader));
+
+ // Make sure the blob can be retrieved with and without checksum verification
+ ReadOptions read_options;
+ read_options.verify_checksums = false;
+
+ constexpr FilePrefetchBuffer* prefetch_buffer = nullptr;
+ constexpr MemoryAllocator* allocator = nullptr;
+
+ {
+ std::unique_ptr<BlobContents> value;
+ uint64_t bytes_read = 0;
+
+ ASSERT_OK(reader->GetBlob(read_options, key, blob_offset, blob_size,
+ kSnappyCompression, prefetch_buffer, allocator,
+ &value, &bytes_read));
+ ASSERT_NE(value, nullptr);
+ ASSERT_EQ(value->data(), blob);
+ ASSERT_EQ(bytes_read, blob_size);
+ }
+
+ read_options.verify_checksums = true;
+
+ {
+ std::unique_ptr<BlobContents> value;
+ uint64_t bytes_read = 0;
+
+ ASSERT_OK(reader->GetBlob(read_options, key, blob_offset, blob_size,
+ kSnappyCompression, prefetch_buffer, allocator,
+ &value, &bytes_read));
+ ASSERT_NE(value, nullptr);
+ ASSERT_EQ(value->data(), blob);
+
+ constexpr uint64_t key_size = sizeof(key) - 1;
+ ASSERT_EQ(bytes_read,
+ BlobLogRecord::CalculateAdjustmentForRecordHeader(key_size) +
+ blob_size);
+ }
+}
+
+TEST_F(BlobFileReaderTest, UncompressionError) {
+ if (!Snappy_Supported()) {
+ return;
+ }
+
+ Options options;
+ options.env = mock_env_.get();
+ options.cf_paths.emplace_back(
+ test::PerThreadDBPath(mock_env_.get(),
+ "BlobFileReaderTest_UncompressionError"),
+ 0);
+ options.enable_blob_files = true;
+
+ ImmutableOptions immutable_options(options);
+
+ constexpr uint32_t column_family_id = 1;
+ constexpr bool has_ttl = false;
+ constexpr ExpirationRange expiration_range;
+ constexpr uint64_t blob_file_number = 1;
+ constexpr char key[] = "key";
+ constexpr char blob[] = "blob";
+
+ uint64_t blob_offset = 0;
+ uint64_t blob_size = 0;
+
+ WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range,
+ expiration_range, blob_file_number, key, blob,
+ kSnappyCompression, &blob_offset, &blob_size);
+
+ constexpr HistogramImpl* blob_file_read_hist = nullptr;
+
+ std::unique_ptr<BlobFileReader> reader;
+
+ ASSERT_OK(BlobFileReader::Create(
+ immutable_options, FileOptions(), column_family_id, blob_file_read_hist,
+ blob_file_number, nullptr /*IOTracer*/, &reader));
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "BlobFileReader::UncompressBlobIfNeeded:TamperWithResult", [](void* arg) {
+ CacheAllocationPtr* const output =
+ static_cast<CacheAllocationPtr*>(arg);
+ assert(output);
+
+ output->reset();
+ });
+
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ constexpr FilePrefetchBuffer* prefetch_buffer = nullptr;
+ constexpr MemoryAllocator* allocator = nullptr;
+
+ std::unique_ptr<BlobContents> value;
+ uint64_t bytes_read = 0;
+
+ ASSERT_TRUE(reader
+ ->GetBlob(ReadOptions(), key, blob_offset, blob_size,
+ kSnappyCompression, prefetch_buffer, allocator,
+ &value, &bytes_read)
+ .IsCorruption());
+ ASSERT_EQ(value, nullptr);
+ ASSERT_EQ(bytes_read, 0);
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+class BlobFileReaderIOErrorTest
+ : public testing::Test,
+ public testing::WithParamInterface<std::string> {
+ protected:
+ BlobFileReaderIOErrorTest() : sync_point_(GetParam()) {
+ mock_env_.reset(MockEnv::Create(Env::Default()));
+ fault_injection_env_.reset(new FaultInjectionTestEnv(mock_env_.get()));
+ }
+
+ std::unique_ptr<Env> mock_env_;
+ std::unique_ptr<FaultInjectionTestEnv> fault_injection_env_;
+ std::string sync_point_;
+};
+
+INSTANTIATE_TEST_CASE_P(BlobFileReaderTest, BlobFileReaderIOErrorTest,
+ ::testing::ValuesIn(std::vector<std::string>{
+ "BlobFileReader::OpenFile:GetFileSize",
+ "BlobFileReader::OpenFile:NewRandomAccessFile",
+ "BlobFileReader::ReadHeader:ReadFromFile",
+ "BlobFileReader::ReadFooter:ReadFromFile",
+ "BlobFileReader::GetBlob:ReadFromFile"}));
+
+TEST_P(BlobFileReaderIOErrorTest, IOError) {
+ // Simulates an I/O error during the specified step
+
+ Options options;
+ options.env = fault_injection_env_.get();
+ options.cf_paths.emplace_back(
+ test::PerThreadDBPath(fault_injection_env_.get(),
+ "BlobFileReaderIOErrorTest_IOError"),
+ 0);
+ options.enable_blob_files = true;
+
+ ImmutableOptions immutable_options(options);
+
+ constexpr uint32_t column_family_id = 1;
+ constexpr bool has_ttl = false;
+ constexpr ExpirationRange expiration_range;
+ constexpr uint64_t blob_file_number = 1;
+ constexpr char key[] = "key";
+ constexpr char blob[] = "blob";
+
+ uint64_t blob_offset = 0;
+ uint64_t blob_size = 0;
+
+ WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range,
+ expiration_range, blob_file_number, key, blob, kNoCompression,
+ &blob_offset, &blob_size);
+
+ SyncPoint::GetInstance()->SetCallBack(sync_point_, [this](void* /* arg */) {
+ fault_injection_env_->SetFilesystemActive(false,
+ Status::IOError(sync_point_));
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ constexpr HistogramImpl* blob_file_read_hist = nullptr;
+
+ std::unique_ptr<BlobFileReader> reader;
+
+ const Status s = BlobFileReader::Create(
+ immutable_options, FileOptions(), column_family_id, blob_file_read_hist,
+ blob_file_number, nullptr /*IOTracer*/, &reader);
+
+ const bool fail_during_create =
+ (sync_point_ != "BlobFileReader::GetBlob:ReadFromFile");
+
+ if (fail_during_create) {
+ ASSERT_TRUE(s.IsIOError());
+ } else {
+ ASSERT_OK(s);
+
+ constexpr FilePrefetchBuffer* prefetch_buffer = nullptr;
+ constexpr MemoryAllocator* allocator = nullptr;
+
+ std::unique_ptr<BlobContents> value;
+ uint64_t bytes_read = 0;
+
+ ASSERT_TRUE(reader
+ ->GetBlob(ReadOptions(), key, blob_offset, blob_size,
+ kNoCompression, prefetch_buffer, allocator,
+ &value, &bytes_read)
+ .IsIOError());
+ ASSERT_EQ(value, nullptr);
+ ASSERT_EQ(bytes_read, 0);
+ }
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+class BlobFileReaderDecodingErrorTest
+ : public testing::Test,
+ public testing::WithParamInterface<std::string> {
+ protected:
+ BlobFileReaderDecodingErrorTest() : sync_point_(GetParam()) {
+ mock_env_.reset(MockEnv::Create(Env::Default()));
+ }
+
+ std::unique_ptr<Env> mock_env_;
+ std::string sync_point_;
+};
+
+INSTANTIATE_TEST_CASE_P(BlobFileReaderTest, BlobFileReaderDecodingErrorTest,
+ ::testing::ValuesIn(std::vector<std::string>{
+ "BlobFileReader::ReadHeader:TamperWithResult",
+ "BlobFileReader::ReadFooter:TamperWithResult",
+ "BlobFileReader::GetBlob:TamperWithResult"}));
+
+TEST_P(BlobFileReaderDecodingErrorTest, DecodingError) {
+ Options options;
+ options.env = mock_env_.get();
+ options.cf_paths.emplace_back(
+ test::PerThreadDBPath(mock_env_.get(),
+ "BlobFileReaderDecodingErrorTest_DecodingError"),
+ 0);
+ options.enable_blob_files = true;
+
+ ImmutableOptions immutable_options(options);
+
+ constexpr uint32_t column_family_id = 1;
+ constexpr bool has_ttl = false;
+ constexpr ExpirationRange expiration_range;
+ constexpr uint64_t blob_file_number = 1;
+ constexpr char key[] = "key";
+ constexpr char blob[] = "blob";
+
+ uint64_t blob_offset = 0;
+ uint64_t blob_size = 0;
+
+ WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range,
+ expiration_range, blob_file_number, key, blob, kNoCompression,
+ &blob_offset, &blob_size);
+
+ SyncPoint::GetInstance()->SetCallBack(sync_point_, [](void* arg) {
+ Slice* const slice = static_cast<Slice*>(arg);
+ assert(slice);
+ assert(!slice->empty());
+
+ slice->remove_prefix(1);
+ });
+
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ constexpr HistogramImpl* blob_file_read_hist = nullptr;
+
+ std::unique_ptr<BlobFileReader> reader;
+
+ const Status s = BlobFileReader::Create(
+ immutable_options, FileOptions(), column_family_id, blob_file_read_hist,
+ blob_file_number, nullptr /*IOTracer*/, &reader);
+
+ const bool fail_during_create =
+ sync_point_ != "BlobFileReader::GetBlob:TamperWithResult";
+
+ if (fail_during_create) {
+ ASSERT_TRUE(s.IsCorruption());
+ } else {
+ ASSERT_OK(s);
+
+ constexpr FilePrefetchBuffer* prefetch_buffer = nullptr;
+ constexpr MemoryAllocator* allocator = nullptr;
+
+ std::unique_ptr<BlobContents> value;
+ uint64_t bytes_read = 0;
+
+ ASSERT_TRUE(reader
+ ->GetBlob(ReadOptions(), key, blob_offset, blob_size,
+ kNoCompression, prefetch_buffer, allocator,
+ &value, &bytes_read)
+ .IsCorruption());
+ ASSERT_EQ(value, nullptr);
+ ASSERT_EQ(bytes_read, 0);
+ }
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/blob/blob_garbage_meter.cc b/src/rocksdb/db/blob/blob_garbage_meter.cc
new file mode 100644
index 000000000..d328d7ff4
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_garbage_meter.cc
@@ -0,0 +1,100 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_garbage_meter.h"
+
+#include "db/blob/blob_index.h"
+#include "db/blob/blob_log_format.h"
+#include "db/dbformat.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+Status BlobGarbageMeter::ProcessInFlow(const Slice& key, const Slice& value) {
+ uint64_t blob_file_number = kInvalidBlobFileNumber;
+ uint64_t bytes = 0;
+
+ const Status s = Parse(key, value, &blob_file_number, &bytes);
+ if (!s.ok()) {
+ return s;
+ }
+
+ if (blob_file_number == kInvalidBlobFileNumber) {
+ return Status::OK();
+ }
+
+ flows_[blob_file_number].AddInFlow(bytes);
+
+ return Status::OK();
+}
+
+Status BlobGarbageMeter::ProcessOutFlow(const Slice& key, const Slice& value) {
+ uint64_t blob_file_number = kInvalidBlobFileNumber;
+ uint64_t bytes = 0;
+
+ const Status s = Parse(key, value, &blob_file_number, &bytes);
+ if (!s.ok()) {
+ return s;
+ }
+
+ if (blob_file_number == kInvalidBlobFileNumber) {
+ return Status::OK();
+ }
+
+ // Note: in order to measure the amount of additional garbage, we only need to
+ // track the outflow for preexisting files, i.e. those that also had inflow.
+ // (Newly written files would only have outflow.)
+ auto it = flows_.find(blob_file_number);
+ if (it == flows_.end()) {
+ return Status::OK();
+ }
+
+ it->second.AddOutFlow(bytes);
+
+ return Status::OK();
+}
+
+Status BlobGarbageMeter::Parse(const Slice& key, const Slice& value,
+ uint64_t* blob_file_number, uint64_t* bytes) {
+ assert(blob_file_number);
+ assert(*blob_file_number == kInvalidBlobFileNumber);
+ assert(bytes);
+ assert(*bytes == 0);
+
+ ParsedInternalKey ikey;
+
+ {
+ constexpr bool log_err_key = false;
+ const Status s = ParseInternalKey(key, &ikey, log_err_key);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ if (ikey.type != kTypeBlobIndex) {
+ return Status::OK();
+ }
+
+ BlobIndex blob_index;
+
+ {
+ const Status s = blob_index.DecodeFrom(value);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ if (blob_index.IsInlined() || blob_index.HasTTL()) {
+ return Status::Corruption("Unexpected TTL/inlined blob index");
+ }
+
+ *blob_file_number = blob_index.file_number();
+ *bytes =
+ blob_index.size() +
+ BlobLogRecord::CalculateAdjustmentForRecordHeader(ikey.user_key.size());
+
+ return Status::OK();
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_garbage_meter.h b/src/rocksdb/db/blob/blob_garbage_meter.h
new file mode 100644
index 000000000..a6c04b0b2
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_garbage_meter.h
@@ -0,0 +1,102 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cassert>
+#include <cstdint>
+#include <unordered_map>
+
+#include "db/blob/blob_constants.h"
+#include "rocksdb/rocksdb_namespace.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Slice;
+
+// A class that can be used to compute the amount of additional garbage
+// generated by a compaction. It parses the keys and blob references in the
+// input and output of a compaction, and aggregates the "inflow" and "outflow"
+// on a per-blob file basis. The amount of additional garbage for any given blob
+// file can then be computed by subtracting the outflow from the inflow.
+class BlobGarbageMeter {
+ public:
+ // A class to store the number and total size of blobs on a per-blob file
+ // basis.
+ class BlobStats {
+ public:
+ void Add(uint64_t bytes) {
+ ++count_;
+ bytes_ += bytes;
+ }
+ void Add(uint64_t count, uint64_t bytes) {
+ count_ += count;
+ bytes_ += bytes;
+ }
+
+ uint64_t GetCount() const { return count_; }
+ uint64_t GetBytes() const { return bytes_; }
+
+ private:
+ uint64_t count_ = 0;
+ uint64_t bytes_ = 0;
+ };
+
+ // A class to keep track of the "inflow" and the "outflow" and to compute the
+ // amount of additional garbage for a given blob file.
+ class BlobInOutFlow {
+ public:
+ void AddInFlow(uint64_t bytes) {
+ in_flow_.Add(bytes);
+ assert(IsValid());
+ }
+ void AddOutFlow(uint64_t bytes) {
+ out_flow_.Add(bytes);
+ assert(IsValid());
+ }
+
+ const BlobStats& GetInFlow() const { return in_flow_; }
+ const BlobStats& GetOutFlow() const { return out_flow_; }
+
+ bool IsValid() const {
+ return in_flow_.GetCount() >= out_flow_.GetCount() &&
+ in_flow_.GetBytes() >= out_flow_.GetBytes();
+ }
+ bool HasGarbage() const {
+ assert(IsValid());
+ return in_flow_.GetCount() > out_flow_.GetCount();
+ }
+ uint64_t GetGarbageCount() const {
+ assert(IsValid());
+ assert(HasGarbage());
+ return in_flow_.GetCount() - out_flow_.GetCount();
+ }
+ uint64_t GetGarbageBytes() const {
+ assert(IsValid());
+ assert(HasGarbage());
+ return in_flow_.GetBytes() - out_flow_.GetBytes();
+ }
+
+ private:
+ BlobStats in_flow_;
+ BlobStats out_flow_;
+ };
+
+ Status ProcessInFlow(const Slice& key, const Slice& value);
+ Status ProcessOutFlow(const Slice& key, const Slice& value);
+
+ const std::unordered_map<uint64_t, BlobInOutFlow>& flows() const {
+ return flows_;
+ }
+
+ private:
+ static Status Parse(const Slice& key, const Slice& value,
+ uint64_t* blob_file_number, uint64_t* bytes);
+
+ std::unordered_map<uint64_t, BlobInOutFlow> flows_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_garbage_meter_test.cc b/src/rocksdb/db/blob/blob_garbage_meter_test.cc
new file mode 100644
index 000000000..ba53f06f1
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_garbage_meter_test.cc
@@ -0,0 +1,197 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_garbage_meter.h"
+
+#include <string>
+#include <vector>
+
+#include "db/blob/blob_index.h"
+#include "db/blob/blob_log_format.h"
+#include "db/dbformat.h"
+#include "test_util/testharness.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+TEST(BlobGarbageMeterTest, MeasureGarbage) {
+ BlobGarbageMeter blob_garbage_meter;
+
+ struct BlobDescriptor {
+ std::string user_key;
+ uint64_t blob_file_number;
+ uint64_t offset;
+ uint64_t size;
+ CompressionType compression_type;
+ bool has_in_flow;
+ bool has_out_flow;
+
+ uint64_t GetExpectedBytes() const {
+ return size +
+ BlobLogRecord::CalculateAdjustmentForRecordHeader(user_key.size());
+ }
+ };
+
+ // Note: blob file 4 has the same inflow and outflow and hence no additional
+ // garbage. Blob file 5 has less outflow than inflow and thus it does have
+ // additional garbage. Blob file 6 is a newly written file (i.e. no inflow,
+ // only outflow) and is thus not tracked by the meter.
+ std::vector<BlobDescriptor> blobs{
+ {"key", 4, 1234, 555, kLZ4Compression, true, true},
+ {"other_key", 4, 6789, 101010, kLZ4Compression, true, true},
+ {"yet_another_key", 5, 22222, 3456, kLZ4Compression, true, true},
+ {"foo_key", 5, 77777, 8888, kLZ4Compression, true, true},
+ {"bar_key", 5, 999999, 1212, kLZ4Compression, true, false},
+ {"baz_key", 5, 1234567, 890, kLZ4Compression, true, false},
+ {"new_key", 6, 7777, 9999, kNoCompression, false, true}};
+
+ for (const auto& blob : blobs) {
+ constexpr SequenceNumber seq = 123;
+ const InternalKey key(blob.user_key, seq, kTypeBlobIndex);
+ const Slice key_slice = key.Encode();
+
+ std::string value;
+ BlobIndex::EncodeBlob(&value, blob.blob_file_number, blob.offset, blob.size,
+ blob.compression_type);
+ const Slice value_slice(value);
+
+ if (blob.has_in_flow) {
+ ASSERT_OK(blob_garbage_meter.ProcessInFlow(key_slice, value_slice));
+ }
+ if (blob.has_out_flow) {
+ ASSERT_OK(blob_garbage_meter.ProcessOutFlow(key_slice, value_slice));
+ }
+ }
+
+ const auto& flows = blob_garbage_meter.flows();
+ ASSERT_EQ(flows.size(), 2);
+
+ {
+ const auto it = flows.find(4);
+ ASSERT_NE(it, flows.end());
+
+ const auto& flow = it->second;
+
+ constexpr uint64_t expected_count = 2;
+ const uint64_t expected_bytes =
+ blobs[0].GetExpectedBytes() + blobs[1].GetExpectedBytes();
+
+ const auto& in = flow.GetInFlow();
+ ASSERT_EQ(in.GetCount(), expected_count);
+ ASSERT_EQ(in.GetBytes(), expected_bytes);
+
+ const auto& out = flow.GetOutFlow();
+ ASSERT_EQ(out.GetCount(), expected_count);
+ ASSERT_EQ(out.GetBytes(), expected_bytes);
+
+ ASSERT_TRUE(flow.IsValid());
+ ASSERT_FALSE(flow.HasGarbage());
+ }
+
+ {
+ const auto it = flows.find(5);
+ ASSERT_NE(it, flows.end());
+
+ const auto& flow = it->second;
+
+ const auto& in = flow.GetInFlow();
+
+ constexpr uint64_t expected_in_count = 4;
+ const uint64_t expected_in_bytes =
+ blobs[2].GetExpectedBytes() + blobs[3].GetExpectedBytes() +
+ blobs[4].GetExpectedBytes() + blobs[5].GetExpectedBytes();
+
+ ASSERT_EQ(in.GetCount(), expected_in_count);
+ ASSERT_EQ(in.GetBytes(), expected_in_bytes);
+
+ const auto& out = flow.GetOutFlow();
+
+ constexpr uint64_t expected_out_count = 2;
+ const uint64_t expected_out_bytes =
+ blobs[2].GetExpectedBytes() + blobs[3].GetExpectedBytes();
+
+ ASSERT_EQ(out.GetCount(), expected_out_count);
+ ASSERT_EQ(out.GetBytes(), expected_out_bytes);
+
+ ASSERT_TRUE(flow.IsValid());
+ ASSERT_TRUE(flow.HasGarbage());
+ ASSERT_EQ(flow.GetGarbageCount(), expected_in_count - expected_out_count);
+ ASSERT_EQ(flow.GetGarbageBytes(), expected_in_bytes - expected_out_bytes);
+ }
+}
+
+TEST(BlobGarbageMeterTest, PlainValue) {
+ constexpr char user_key[] = "user_key";
+ constexpr SequenceNumber seq = 123;
+
+ const InternalKey key(user_key, seq, kTypeValue);
+ const Slice key_slice = key.Encode();
+
+ constexpr char value[] = "value";
+ const Slice value_slice(value);
+
+ BlobGarbageMeter blob_garbage_meter;
+
+ ASSERT_OK(blob_garbage_meter.ProcessInFlow(key_slice, value_slice));
+ ASSERT_OK(blob_garbage_meter.ProcessOutFlow(key_slice, value_slice));
+ ASSERT_TRUE(blob_garbage_meter.flows().empty());
+}
+
+TEST(BlobGarbageMeterTest, CorruptInternalKey) {
+ constexpr char corrupt_key[] = "i_am_corrupt";
+ const Slice key_slice(corrupt_key);
+
+ constexpr char value[] = "value";
+ const Slice value_slice(value);
+
+ BlobGarbageMeter blob_garbage_meter;
+
+ ASSERT_NOK(blob_garbage_meter.ProcessInFlow(key_slice, value_slice));
+ ASSERT_NOK(blob_garbage_meter.ProcessOutFlow(key_slice, value_slice));
+}
+
+TEST(BlobGarbageMeterTest, CorruptBlobIndex) {
+ constexpr char user_key[] = "user_key";
+ constexpr SequenceNumber seq = 123;
+
+ const InternalKey key(user_key, seq, kTypeBlobIndex);
+ const Slice key_slice = key.Encode();
+
+ constexpr char value[] = "i_am_not_a_blob_index";
+ const Slice value_slice(value);
+
+ BlobGarbageMeter blob_garbage_meter;
+
+ ASSERT_NOK(blob_garbage_meter.ProcessInFlow(key_slice, value_slice));
+ ASSERT_NOK(blob_garbage_meter.ProcessOutFlow(key_slice, value_slice));
+}
+
+TEST(BlobGarbageMeterTest, InlinedTTLBlobIndex) {
+ constexpr char user_key[] = "user_key";
+ constexpr SequenceNumber seq = 123;
+
+ const InternalKey key(user_key, seq, kTypeBlobIndex);
+ const Slice key_slice = key.Encode();
+
+ constexpr uint64_t expiration = 1234567890;
+ constexpr char inlined_value[] = "inlined";
+
+ std::string value;
+ BlobIndex::EncodeInlinedTTL(&value, expiration, inlined_value);
+
+ const Slice value_slice(value);
+
+ BlobGarbageMeter blob_garbage_meter;
+
+ ASSERT_NOK(blob_garbage_meter.ProcessInFlow(key_slice, value_slice));
+ ASSERT_NOK(blob_garbage_meter.ProcessOutFlow(key_slice, value_slice));
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/blob/blob_index.h b/src/rocksdb/db/blob/blob_index.h
new file mode 100644
index 000000000..e9944d784
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_index.h
@@ -0,0 +1,187 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <sstream>
+#include <string>
+
+#include "rocksdb/compression_type.h"
+#include "util/coding.h"
+#include "util/compression.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// BlobIndex is a pointer to the blob and metadata of the blob. The index is
+// stored in base DB as ValueType::kTypeBlobIndex.
+// There are three types of blob index:
+//
+// kInlinedTTL:
+// +------+------------+---------------+
+// | type | expiration | value |
+// +------+------------+---------------+
+// | char | varint64 | variable size |
+// +------+------------+---------------+
+//
+// kBlob:
+// +------+-------------+----------+----------+-------------+
+// | type | file number | offset | size | compression |
+// +------+-------------+----------+----------+-------------+
+// | char | varint64 | varint64 | varint64 | char |
+// +------+-------------+----------+----------+-------------+
+//
+// kBlobTTL:
+// +------+------------+-------------+----------+----------+-------------+
+// | type | expiration | file number | offset | size | compression |
+// +------+------------+-------------+----------+----------+-------------+
+// | char | varint64 | varint64 | varint64 | varint64 | char |
+// +------+------------+-------------+----------+----------+-------------+
+//
+// There isn't a kInlined (without TTL) type since we can store it as a plain
+// value (i.e. ValueType::kTypeValue).
+class BlobIndex {
+ public:
+ enum class Type : unsigned char {
+ kInlinedTTL = 0,
+ kBlob = 1,
+ kBlobTTL = 2,
+ kUnknown = 3,
+ };
+
+ BlobIndex() : type_(Type::kUnknown) {}
+
+ BlobIndex(const BlobIndex&) = default;
+ BlobIndex& operator=(const BlobIndex&) = default;
+
+ bool IsInlined() const { return type_ == Type::kInlinedTTL; }
+
+ bool HasTTL() const {
+ return type_ == Type::kInlinedTTL || type_ == Type::kBlobTTL;
+ }
+
+ uint64_t expiration() const {
+ assert(HasTTL());
+ return expiration_;
+ }
+
+ const Slice& value() const {
+ assert(IsInlined());
+ return value_;
+ }
+
+ uint64_t file_number() const {
+ assert(!IsInlined());
+ return file_number_;
+ }
+
+ uint64_t offset() const {
+ assert(!IsInlined());
+ return offset_;
+ }
+
+ uint64_t size() const {
+ assert(!IsInlined());
+ return size_;
+ }
+
+ CompressionType compression() const {
+ assert(!IsInlined());
+ return compression_;
+ }
+
+ Status DecodeFrom(Slice slice) {
+ const char* kErrorMessage = "Error while decoding blob index";
+ assert(slice.size() > 0);
+ type_ = static_cast<Type>(*slice.data());
+ if (type_ >= Type::kUnknown) {
+ return Status::Corruption(kErrorMessage,
+ "Unknown blob index type: " +
+ std::to_string(static_cast<char>(type_)));
+ }
+ slice = Slice(slice.data() + 1, slice.size() - 1);
+ if (HasTTL()) {
+ if (!GetVarint64(&slice, &expiration_)) {
+ return Status::Corruption(kErrorMessage, "Corrupted expiration");
+ }
+ }
+ if (IsInlined()) {
+ value_ = slice;
+ } else {
+ if (GetVarint64(&slice, &file_number_) && GetVarint64(&slice, &offset_) &&
+ GetVarint64(&slice, &size_) && slice.size() == 1) {
+ compression_ = static_cast<CompressionType>(*slice.data());
+ } else {
+ return Status::Corruption(kErrorMessage, "Corrupted blob offset");
+ }
+ }
+ return Status::OK();
+ }
+
+ std::string DebugString(bool output_hex) const {
+ std::ostringstream oss;
+
+ if (IsInlined()) {
+ oss << "[inlined blob] value:" << value_.ToString(output_hex);
+ } else {
+ oss << "[blob ref] file:" << file_number_ << " offset:" << offset_
+ << " size:" << size_
+ << " compression: " << CompressionTypeToString(compression_);
+ }
+
+ if (HasTTL()) {
+ oss << " exp:" << expiration_;
+ }
+
+ return oss.str();
+ }
+
+ static void EncodeInlinedTTL(std::string* dst, uint64_t expiration,
+ const Slice& value) {
+ assert(dst != nullptr);
+ dst->clear();
+ dst->reserve(1 + kMaxVarint64Length + value.size());
+ dst->push_back(static_cast<char>(Type::kInlinedTTL));
+ PutVarint64(dst, expiration);
+ dst->append(value.data(), value.size());
+ }
+
+ static void EncodeBlob(std::string* dst, uint64_t file_number,
+ uint64_t offset, uint64_t size,
+ CompressionType compression) {
+ assert(dst != nullptr);
+ dst->clear();
+ dst->reserve(kMaxVarint64Length * 3 + 2);
+ dst->push_back(static_cast<char>(Type::kBlob));
+ PutVarint64(dst, file_number);
+ PutVarint64(dst, offset);
+ PutVarint64(dst, size);
+ dst->push_back(static_cast<char>(compression));
+ }
+
+ static void EncodeBlobTTL(std::string* dst, uint64_t expiration,
+ uint64_t file_number, uint64_t offset,
+ uint64_t size, CompressionType compression) {
+ assert(dst != nullptr);
+ dst->clear();
+ dst->reserve(kMaxVarint64Length * 4 + 2);
+ dst->push_back(static_cast<char>(Type::kBlobTTL));
+ PutVarint64(dst, expiration);
+ PutVarint64(dst, file_number);
+ PutVarint64(dst, offset);
+ PutVarint64(dst, size);
+ dst->push_back(static_cast<char>(compression));
+ }
+
+ private:
+ Type type_ = Type::kUnknown;
+ uint64_t expiration_ = 0;
+ Slice value_;
+ uint64_t file_number_ = 0;
+ uint64_t offset_ = 0;
+ uint64_t size_ = 0;
+ CompressionType compression_ = kNoCompression;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_log_format.cc b/src/rocksdb/db/blob/blob_log_format.cc
new file mode 100644
index 000000000..8e26281e3
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_log_format.cc
@@ -0,0 +1,143 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+
+#include "db/blob/blob_log_format.h"
+
+#include "util/coding.h"
+#include "util/crc32c.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+void BlobLogHeader::EncodeTo(std::string* dst) {
+ assert(dst != nullptr);
+ dst->clear();
+ dst->reserve(BlobLogHeader::kSize);
+ PutFixed32(dst, kMagicNumber);
+ PutFixed32(dst, version);
+ PutFixed32(dst, column_family_id);
+ unsigned char flags = (has_ttl ? 1 : 0);
+ dst->push_back(flags);
+ dst->push_back(compression);
+ PutFixed64(dst, expiration_range.first);
+ PutFixed64(dst, expiration_range.second);
+}
+
+Status BlobLogHeader::DecodeFrom(Slice src) {
+ const char* kErrorMessage = "Error while decoding blob log header";
+ if (src.size() != BlobLogHeader::kSize) {
+ return Status::Corruption(kErrorMessage,
+ "Unexpected blob file header size");
+ }
+ uint32_t magic_number;
+ unsigned char flags;
+ if (!GetFixed32(&src, &magic_number) || !GetFixed32(&src, &version) ||
+ !GetFixed32(&src, &column_family_id)) {
+ return Status::Corruption(
+ kErrorMessage,
+ "Error decoding magic number, version and column family id");
+ }
+ if (magic_number != kMagicNumber) {
+ return Status::Corruption(kErrorMessage, "Magic number mismatch");
+ }
+ if (version != kVersion1) {
+ return Status::Corruption(kErrorMessage, "Unknown header version");
+ }
+ flags = src.data()[0];
+ compression = static_cast<CompressionType>(src.data()[1]);
+ has_ttl = (flags & 1) == 1;
+ src.remove_prefix(2);
+ if (!GetFixed64(&src, &expiration_range.first) ||
+ !GetFixed64(&src, &expiration_range.second)) {
+ return Status::Corruption(kErrorMessage, "Error decoding expiration range");
+ }
+ return Status::OK();
+}
+
+void BlobLogFooter::EncodeTo(std::string* dst) {
+ assert(dst != nullptr);
+ dst->clear();
+ dst->reserve(BlobLogFooter::kSize);
+ PutFixed32(dst, kMagicNumber);
+ PutFixed64(dst, blob_count);
+ PutFixed64(dst, expiration_range.first);
+ PutFixed64(dst, expiration_range.second);
+ crc = crc32c::Value(dst->c_str(), dst->size());
+ crc = crc32c::Mask(crc);
+ PutFixed32(dst, crc);
+}
+
+Status BlobLogFooter::DecodeFrom(Slice src) {
+ const char* kErrorMessage = "Error while decoding blob log footer";
+ if (src.size() != BlobLogFooter::kSize) {
+ return Status::Corruption(kErrorMessage,
+ "Unexpected blob file footer size");
+ }
+ uint32_t src_crc = 0;
+ src_crc = crc32c::Value(src.data(), BlobLogFooter::kSize - sizeof(uint32_t));
+ src_crc = crc32c::Mask(src_crc);
+ uint32_t magic_number = 0;
+ if (!GetFixed32(&src, &magic_number) || !GetFixed64(&src, &blob_count) ||
+ !GetFixed64(&src, &expiration_range.first) ||
+ !GetFixed64(&src, &expiration_range.second) || !GetFixed32(&src, &crc)) {
+ return Status::Corruption(kErrorMessage, "Error decoding content");
+ }
+ if (magic_number != kMagicNumber) {
+ return Status::Corruption(kErrorMessage, "Magic number mismatch");
+ }
+ if (src_crc != crc) {
+ return Status::Corruption(kErrorMessage, "CRC mismatch");
+ }
+ return Status::OK();
+}
+
+void BlobLogRecord::EncodeHeaderTo(std::string* dst) {
+ assert(dst != nullptr);
+ dst->clear();
+ dst->reserve(BlobLogRecord::kHeaderSize + key.size() + value.size());
+ PutFixed64(dst, key.size());
+ PutFixed64(dst, value.size());
+ PutFixed64(dst, expiration);
+ header_crc = crc32c::Value(dst->c_str(), dst->size());
+ header_crc = crc32c::Mask(header_crc);
+ PutFixed32(dst, header_crc);
+ blob_crc = crc32c::Value(key.data(), key.size());
+ blob_crc = crc32c::Extend(blob_crc, value.data(), value.size());
+ blob_crc = crc32c::Mask(blob_crc);
+ PutFixed32(dst, blob_crc);
+}
+
+Status BlobLogRecord::DecodeHeaderFrom(Slice src) {
+ const char* kErrorMessage = "Error while decoding blob record";
+ if (src.size() != BlobLogRecord::kHeaderSize) {
+ return Status::Corruption(kErrorMessage,
+ "Unexpected blob record header size");
+ }
+ uint32_t src_crc = 0;
+ src_crc = crc32c::Value(src.data(), BlobLogRecord::kHeaderSize - 8);
+ src_crc = crc32c::Mask(src_crc);
+ if (!GetFixed64(&src, &key_size) || !GetFixed64(&src, &value_size) ||
+ !GetFixed64(&src, &expiration) || !GetFixed32(&src, &header_crc) ||
+ !GetFixed32(&src, &blob_crc)) {
+ return Status::Corruption(kErrorMessage, "Error decoding content");
+ }
+ if (src_crc != header_crc) {
+ return Status::Corruption(kErrorMessage, "Header CRC mismatch");
+ }
+ return Status::OK();
+}
+
+Status BlobLogRecord::CheckBlobCRC() const {
+ uint32_t expected_crc = 0;
+ expected_crc = crc32c::Value(key.data(), key.size());
+ expected_crc = crc32c::Extend(expected_crc, value.data(), value.size());
+ expected_crc = crc32c::Mask(expected_crc);
+ if (expected_crc != blob_crc) {
+ return Status::Corruption("Blob CRC mismatch");
+ }
+ return Status::OK();
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_log_format.h b/src/rocksdb/db/blob/blob_log_format.h
new file mode 100644
index 000000000..607db2367
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_log_format.h
@@ -0,0 +1,164 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Log format information shared by reader and writer.
+
+#pragma once
+
+#include <memory>
+#include <utility>
+
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+constexpr uint32_t kMagicNumber = 2395959; // 0x00248f37
+constexpr uint32_t kVersion1 = 1;
+
+using ExpirationRange = std::pair<uint64_t, uint64_t>;
+
+// clang-format off
+
+// Format of blob log file header (30 bytes):
+//
+// +--------------+---------+---------+-------+-------------+-------------------+
+// | magic number | version | cf id | flags | compression | expiration range |
+// +--------------+---------+---------+-------+-------------+-------------------+
+// | Fixed32 | Fixed32 | Fixed32 | char | char | Fixed64 Fixed64 |
+// +--------------+---------+---------+-------+-------------+-------------------+
+//
+// List of flags:
+// has_ttl: Whether the file contain TTL data.
+//
+// Expiration range in the header is a rough range based on
+// blob_db_options.ttl_range_secs.
+
+// clang-format on
+
struct BlobLogHeader {
  // On-disk size of the encoded header, in bytes.
  static constexpr size_t kSize = 30;

  BlobLogHeader() = default;
  BlobLogHeader(uint32_t _column_family_id, CompressionType _compression,
                bool _has_ttl, const ExpirationRange& _expiration_range)
      : column_family_id(_column_family_id),
        compression(_compression),
        has_ttl(_has_ttl),
        expiration_range(_expiration_range) {}

  uint32_t version = kVersion1;
  uint32_t column_family_id = 0;
  CompressionType compression = kNoCompression;
  bool has_ttl = false;
  ExpirationRange expiration_range;

  // Serializes the header into *dst (clears it first).
  void EncodeTo(std::string* dst);

  // Parses an encoded header; returns Corruption on any mismatch
  // (size, magic number, version, or field decoding).
  Status DecodeFrom(Slice slice);
};
+
+// clang-format off
+
+// Format of blob log file footer (32 bytes):
+//
+// +--------------+------------+-------------------+------------+
+// | magic number | blob count | expiration range | footer CRC |
+// +--------------+------------+-------------------+------------+
+// | Fixed32 | Fixed64 | Fixed64 + Fixed64 | Fixed32 |
+// +--------------+------------+-------------------+------------+
+//
+// The footer will be presented only when the blob file is properly closed.
+//
+// Unlike the same field in file header, expiration range in the footer is the
+// range of smallest and largest expiration of the data in this file.
+
+// clang-format on
+
struct BlobLogFooter {
  // On-disk size of the encoded footer, in bytes.
  static constexpr size_t kSize = 32;

  uint64_t blob_count = 0;
  ExpirationRange expiration_range = std::make_pair(0, 0);
  uint32_t crc = 0;  // masked CRC32C of the preceding footer fields

  // Serializes the footer into *dst (clears it first) and fills in `crc`.
  void EncodeTo(std::string* dst);

  // Parses an encoded footer and verifies its magic number and CRC.
  Status DecodeFrom(Slice slice);
};
+
+// clang-format off
+
+// Blob record format (32 bytes header + key + value):
+//
+// +------------+--------------+------------+------------+----------+---------+-----------+
+// | key length | value length | expiration | header CRC | blob CRC | key | value |
+// +------------+--------------+------------+------------+----------+---------+-----------+
+// | Fixed64 | Fixed64 | Fixed64 | Fixed32 | Fixed32 | key len | value len |
+// +------------+--------------+------------+------------+----------+---------+-----------+
+//
+// If file has has_ttl = false, expiration field is always 0, and the blob
+// doesn't has expiration.
+//
+// Also note that if compression is used, value is compressed value and value
+// length is compressed value length.
+//
+// Header CRC is the checksum of (key_len + val_len + expiration), while
+// blob CRC is the checksum of (key + value).
+//
+// We could use variable length encoding (Varint64) to save more space, but it
+// make reader more complicated.
+
+// clang-format on
+
struct BlobLogRecord {
  // header include fields up to blob CRC
  static constexpr size_t kHeaderSize = 32;

  // Note that the offset field of BlobIndex actually points to the blob value
  // as opposed to the start of the blob record. The following method can
  // be used to calculate the adjustment needed to read the blob record header.
  static constexpr uint64_t CalculateAdjustmentForRecordHeader(
      uint64_t key_size) {
    return key_size + kHeaderSize;
  }

  uint64_t key_size = 0;
  uint64_t value_size = 0;
  uint64_t expiration = 0;
  uint32_t header_crc = 0;  // checksum of (key_size, value_size, expiration)
  uint32_t blob_crc = 0;    // checksum of (key, value)
  Slice key;
  Slice value;
  // Optional owning buffers used when a reader materializes the key/value
  // bytes (see BlobLogSequentialReader::ReadRecord); `key`/`value` may alias
  // them.
  std::unique_ptr<char[]> key_buf;
  std::unique_ptr<char[]> value_buf;

  // Total on-disk footprint of the record: header + key + value.
  uint64_t record_size() const { return kHeaderSize + key_size + value_size; }

  // Serializes the record header into *dst (clears it first).
  void EncodeHeaderTo(std::string* dst);

  // Parses a record header and verifies the header CRC.
  Status DecodeHeaderFrom(Slice src);

  // Verifies blob_crc against the key/value currently held by the record.
  Status CheckBlobCRC() const;
};
+
+// Checks whether a blob offset is potentially valid or not.
+inline bool IsValidBlobOffset(uint64_t value_offset, uint64_t key_size,
+ uint64_t value_size, uint64_t file_size) {
+ if (value_offset <
+ BlobLogHeader::kSize + BlobLogRecord::kHeaderSize + key_size) {
+ return false;
+ }
+
+ if (value_offset + value_size + BlobLogFooter::kSize > file_size) {
+ return false;
+ }
+
+ return true;
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_log_sequential_reader.cc b/src/rocksdb/db/blob/blob_log_sequential_reader.cc
new file mode 100644
index 000000000..778725189
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_log_sequential_reader.cc
@@ -0,0 +1,134 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+
+#include "db/blob/blob_log_sequential_reader.h"
+
+#include "file/random_access_file_reader.h"
+#include "monitoring/statistics.h"
+#include "util/stop_watch.h"
+
+namespace ROCKSDB_NAMESPACE {
+
// Takes ownership of the file reader; reading always starts at offset 0
// (the file header).
BlobLogSequentialReader::BlobLogSequentialReader(
    std::unique_ptr<RandomAccessFileReader>&& file_reader, SystemClock* clock,
    Statistics* statistics)
    : file_(std::move(file_reader)),
      clock_(clock),  // used to time reads for statistics
      statistics_(statistics),
      next_byte_(0) {}

BlobLogSequentialReader::~BlobLogSequentialReader() = default;
+
// Reads `size` bytes starting at next_byte_ into *slice (backed by `buf`).
// Note that next_byte_ is advanced by the *requested* size even when the
// read fails or comes up short; a short read is reported as Corruption.
Status BlobLogSequentialReader::ReadSlice(uint64_t size, Slice* slice,
                                          char* buf) {
  assert(slice);
  assert(file_);

  StopWatch read_sw(clock_, statistics_, BLOB_DB_BLOB_FILE_READ_MICROS);
  // TODO: rate limit `BlobLogSequentialReader` reads (it appears unused?)
  Status s =
      file_->Read(IOOptions(), next_byte_, static_cast<size_t>(size), slice,
                  buf, nullptr, Env::IO_TOTAL /* rate_limiter_priority */);
  next_byte_ += size;
  if (!s.ok()) {
    return s;
  }
  RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_READ, slice->size());
  if (slice->size() != size) {
    return Status::Corruption("EOF reached while reading record");
  }
  return s;
}
+
+Status BlobLogSequentialReader::ReadHeader(BlobLogHeader* header) {
+ assert(header);
+ assert(next_byte_ == 0);
+
+ static_assert(BlobLogHeader::kSize <= sizeof(header_buf_),
+ "Buffer is smaller than BlobLogHeader::kSize");
+
+ Status s = ReadSlice(BlobLogHeader::kSize, &buffer_, header_buf_);
+ if (!s.ok()) {
+ return s;
+ }
+
+ if (buffer_.size() != BlobLogHeader::kSize) {
+ return Status::Corruption("EOF reached before file header");
+ }
+
+ return header->DecodeFrom(buffer_);
+}
+
// Reads the next record starting at next_byte_. Depending on `level`, only
// the header, the header plus the key, or the full record (including blob
// CRC verification) is materialized; skipped portions merely advance
// next_byte_ without touching the file.
Status BlobLogSequentialReader::ReadRecord(BlobLogRecord* record,
                                           ReadLevel level,
                                           uint64_t* blob_offset) {
  assert(record);
  static_assert(BlobLogRecord::kHeaderSize <= sizeof(header_buf_),
                "Buffer is smaller than BlobLogRecord::kHeaderSize");

  Status s = ReadSlice(BlobLogRecord::kHeaderSize, &buffer_, header_buf_);
  if (!s.ok()) {
    return s;
  }
  if (buffer_.size() != BlobLogRecord::kHeaderSize) {
    return Status::Corruption("EOF reached before record header");
  }

  s = record->DecodeHeaderFrom(buffer_);
  if (!s.ok()) {
    return s;
  }

  // Combined size of the key and value payload following the header.
  uint64_t kb_size = record->key_size + record->value_size;
  if (blob_offset != nullptr) {
    // At this point next_byte_ sits right after the record header, so the
    // value starts key_size bytes further in.
    *blob_offset = next_byte_ + record->key_size;
  }

  switch (level) {
    case kReadHeader:
      // Skip over the payload without reading it.
      next_byte_ += kb_size;
      break;

    case kReadHeaderKey:
      // Materialize the key into an owned buffer, then skip the value.
      record->key_buf.reset(new char[record->key_size]);
      s = ReadSlice(record->key_size, &record->key, record->key_buf.get());
      next_byte_ += record->value_size;
      break;

    case kReadHeaderKeyBlob:
      // Materialize both key and value, then verify the blob CRC.
      record->key_buf.reset(new char[record->key_size]);
      s = ReadSlice(record->key_size, &record->key, record->key_buf.get());
      if (s.ok()) {
        record->value_buf.reset(new char[record->value_size]);
        s = ReadSlice(record->value_size, &record->value,
                      record->value_buf.get());
      }
      if (s.ok()) {
        s = record->CheckBlobCRC();
      }
      break;
  }
  return s;
}
+
+Status BlobLogSequentialReader::ReadFooter(BlobLogFooter* footer) {
+ assert(footer);
+ static_assert(BlobLogFooter::kSize <= sizeof(header_buf_),
+ "Buffer is smaller than BlobLogFooter::kSize");
+
+ Status s = ReadSlice(BlobLogFooter::kSize, &buffer_, header_buf_);
+ if (!s.ok()) {
+ return s;
+ }
+
+ if (buffer_.size() != BlobLogFooter::kSize) {
+ return Status::Corruption("EOF reached before file footer");
+ }
+
+ return footer->DecodeFrom(buffer_);
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_log_sequential_reader.h b/src/rocksdb/db/blob/blob_log_sequential_reader.h
new file mode 100644
index 000000000..98afa8518
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_log_sequential_reader.h
@@ -0,0 +1,83 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#pragma once
+
+#include <memory>
+
+#include "db/blob/blob_log_format.h"
+#include "rocksdb/slice.h"
+
+#define MAX_HEADER_SIZE(a, b, c) (a > b ? (a > c ? a : c) : (b > c ? b : c))
+
+namespace ROCKSDB_NAMESPACE {
+
+class RandomAccessFileReader;
+class Env;
+class Statistics;
+class Status;
+class SystemClock;
+
+/**
+ * BlobLogSequentialReader is a general purpose log stream reader
+ * implementation. The actual job of reading from the device is implemented by
+ * the RandomAccessFileReader interface.
+ *
+ * Please see BlobLogWriter for details on the file and record layout.
+ */
+
class BlobLogSequentialReader {
 public:
  // How much of each record ReadRecord() should materialize.
  enum ReadLevel {
    kReadHeader,        // decode the record header only; skip key and value
    kReadHeaderKey,     // additionally read the key; skip the value
    kReadHeaderKeyBlob, // read everything and verify the blob CRC
  };

  // Create a reader that will return log records from "*file_reader".
  BlobLogSequentialReader(std::unique_ptr<RandomAccessFileReader>&& file_reader,
                          SystemClock* clock, Statistics* statistics);

  // No copying allowed
  BlobLogSequentialReader(const BlobLogSequentialReader&) = delete;
  BlobLogSequentialReader& operator=(const BlobLogSequentialReader&) = delete;

  ~BlobLogSequentialReader();

  // Reads and decodes the file header; must be called at position 0.
  Status ReadHeader(BlobLogHeader* header);

  // Read the next record into *record. Returns OK on success and a non-OK
  // status (e.g. Corruption on short reads or CRC mismatch) otherwise. The
  // contents filled in *record will only be valid until the next mutating
  // operation on this reader.
  // If blob_offset is non-null, return offset of the blob through it.
  Status ReadRecord(BlobLogRecord* record, ReadLevel level = kReadHeader,
                    uint64_t* blob_offset = nullptr);

  // Reads and decodes the file footer at the current position.
  Status ReadFooter(BlobLogFooter* footer);

  // Rewinds the reader to the beginning of the file.
  void ResetNextByte() { next_byte_ = 0; }

  // Returns the file offset the next read will start at.
  uint64_t GetNextByte() const { return next_byte_; }

 private:
  // Reads `size` bytes at the current position into *slice, backed by `buf`.
  Status ReadSlice(uint64_t size, Slice* slice, char* buf);

  const std::unique_ptr<RandomAccessFileReader> file_;
  SystemClock* clock_;

  Statistics* statistics_;

  Slice buffer_;
  // Scratch buffer sized for the largest of the three fixed-size headers.
  char header_buf_[MAX_HEADER_SIZE(BlobLogHeader::kSize, BlobLogFooter::kSize,
                                   BlobLogRecord::kHeaderSize)];

  // which byte to read next
  uint64_t next_byte_;
};
+
+} // namespace ROCKSDB_NAMESPACE
+
+#undef MAX_HEADER_SIZE \ No newline at end of file
diff --git a/src/rocksdb/db/blob/blob_log_writer.cc b/src/rocksdb/db/blob/blob_log_writer.cc
new file mode 100644
index 000000000..9dbac7f25
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_log_writer.cc
@@ -0,0 +1,178 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_log_writer.h"
+
+#include <cstdint>
+#include <string>
+
+#include "db/blob/blob_log_format.h"
+#include "file/writable_file_writer.h"
+#include "monitoring/statistics.h"
+#include "rocksdb/system_clock.h"
+#include "test_util/sync_point.h"
+#include "util/coding.h"
+#include "util/stop_watch.h"
+
+namespace ROCKSDB_NAMESPACE {
+
// Takes ownership of the destination file writer. A non-zero `boffset`
// indicates writing resumes at that file offset (presumably a file whose
// header was already written — WriteHeader() asserts block_offset_ == 0).
BlobLogWriter::BlobLogWriter(std::unique_ptr<WritableFileWriter>&& dest,
                             SystemClock* clock, Statistics* statistics,
                             uint64_t log_number, bool use_fs, bool do_flush,
                             uint64_t boffset)
    : dest_(std::move(dest)),
      clock_(clock),
      statistics_(statistics),
      log_number_(log_number),
      block_offset_(boffset),
      use_fsync_(use_fs),
      do_flush_(do_flush),  // whether to Flush() after each append
      last_elem_type_(kEtNone) {}

BlobLogWriter::~BlobLogWriter() = default;
+
+Status BlobLogWriter::Sync() {
+ TEST_SYNC_POINT("BlobLogWriter::Sync");
+
+ StopWatch sync_sw(clock_, statistics_, BLOB_DB_BLOB_FILE_SYNC_MICROS);
+ Status s = dest_->Sync(use_fsync_);
+ RecordTick(statistics_, BLOB_DB_BLOB_FILE_SYNCED);
+ return s;
+}
+
+Status BlobLogWriter::WriteHeader(BlobLogHeader& header) {
+ assert(block_offset_ == 0);
+ assert(last_elem_type_ == kEtNone);
+ std::string str;
+ header.EncodeTo(&str);
+
+ Status s = dest_->Append(Slice(str));
+ if (s.ok()) {
+ block_offset_ += str.size();
+ if (do_flush_) {
+ s = dest_->Flush();
+ }
+ }
+ last_elem_type_ = kEtFileHdr;
+ RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_WRITTEN,
+ BlobLogHeader::kSize);
+ return s;
+}
+
// Writes the footer, syncs and closes the file. On a fully successful close,
// optionally reports the file checksum method/value through the
// out-parameters (both must be null or both non-null). Unless a previous
// write error was already recorded, dest_ is released regardless of the
// outcome, so the writer must not be used afterwards.
Status BlobLogWriter::AppendFooter(BlobLogFooter& footer,
                                   std::string* checksum_method,
                                   std::string* checksum_value) {
  assert(block_offset_ != 0);
  assert(last_elem_type_ == kEtFileHdr || last_elem_type_ == kEtRecord);

  std::string str;
  footer.EncodeTo(&str);

  Status s;
  if (dest_->seen_error()) {
    // A previous append already failed; skip the footer and report an error
    // without attempting a clean close.
    s.PermitUncheckedError();
    return Status::IOError("Seen Error. Skip closing.");
  } else {
    s = dest_->Append(Slice(str));
    if (s.ok()) {
      block_offset_ += str.size();

      s = Sync();

      if (s.ok()) {
        s = dest_->Close();

        if (s.ok()) {
          // Checksum out-params must be provided (or omitted) together.
          assert(!!checksum_method == !!checksum_value);

          if (checksum_method) {
            assert(checksum_method->empty());

            // Only report a checksum if the writer has a real checksum
            // generator configured.
            std::string method = dest_->GetFileChecksumFuncName();
            if (method != kUnknownFileChecksumFuncName) {
              *checksum_method = std::move(method);
            }
          }
          if (checksum_value) {
            assert(checksum_value->empty());

            std::string value = dest_->GetFileChecksum();
            if (value != kUnknownFileChecksum) {
              *checksum_value = std::move(value);
            }
          }
        }
      }
    }

    dest_.reset();
  }

  // Updated even on failure, mirroring the append attempt.
  last_elem_type_ = kEtFileFooter;
  RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_WRITTEN,
             BlobLogFooter::kSize);
  return s;
}
+
+Status BlobLogWriter::AddRecord(const Slice& key, const Slice& val,
+ uint64_t expiration, uint64_t* key_offset,
+ uint64_t* blob_offset) {
+ assert(block_offset_ != 0);
+ assert(last_elem_type_ == kEtFileHdr || last_elem_type_ == kEtRecord);
+
+ std::string buf;
+ ConstructBlobHeader(&buf, key, val, expiration);
+
+ Status s = EmitPhysicalRecord(buf, key, val, key_offset, blob_offset);
+ return s;
+}
+
+Status BlobLogWriter::AddRecord(const Slice& key, const Slice& val,
+ uint64_t* key_offset, uint64_t* blob_offset) {
+ assert(block_offset_ != 0);
+ assert(last_elem_type_ == kEtFileHdr || last_elem_type_ == kEtRecord);
+
+ std::string buf;
+ ConstructBlobHeader(&buf, key, val, 0);
+
+ Status s = EmitPhysicalRecord(buf, key, val, key_offset, blob_offset);
+ return s;
+}
+
// Builds the encoded record header for (key, val, expiration) by filling a
// temporary BlobLogRecord and serializing its header portion into *buf.
void BlobLogWriter::ConstructBlobHeader(std::string* buf, const Slice& key,
                                        const Slice& val, uint64_t expiration) {
  BlobLogRecord record;
  record.key = key;
  record.value = val;
  record.expiration = expiration;
  record.EncodeHeaderTo(buf);
}
+
// Appends the pre-encoded record header followed by the key and the value,
// and reports through the out-parameters where the key and the value start
// within the file.
Status BlobLogWriter::EmitPhysicalRecord(const std::string& headerbuf,
                                         const Slice& key, const Slice& val,
                                         uint64_t* key_offset,
                                         uint64_t* blob_offset) {
  StopWatch write_sw(clock_, statistics_, BLOB_DB_BLOB_FILE_WRITE_MICROS);
  Status s = dest_->Append(Slice(headerbuf));
  if (s.ok()) {
    s = dest_->Append(key);
  }
  if (s.ok()) {
    s = dest_->Append(val);
  }
  if (do_flush_ && s.ok()) {
    s = dest_->Flush();
  }

  // NOTE(review): the offsets, block_offset_ and statistics are updated even
  // when an append failed; callers should only rely on the out-parameters
  // when the returned status is OK.
  *key_offset = block_offset_ + BlobLogRecord::kHeaderSize;
  *blob_offset = *key_offset + key.size();
  block_offset_ = *blob_offset + val.size();
  last_elem_type_ = kEtRecord;
  RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_WRITTEN,
             BlobLogRecord::kHeaderSize + key.size() + val.size());
  return s;
}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_log_writer.h b/src/rocksdb/db/blob/blob_log_writer.h
new file mode 100644
index 000000000..c1f9f31ad
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_log_writer.h
@@ -0,0 +1,83 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <string>
+
+#include "db/blob/blob_log_format.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/status.h"
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class WritableFileWriter;
+class SystemClock;
+/**
+ * BlobLogWriter is the blob log stream writer. It provides an append-only
+ * abstraction for writing blob data.
+ *
+ *
+ * Look at blob_db_format.h to see the details of the record formats.
+ */
+
class BlobLogWriter {
 public:
  // Create a writer that will append data to "*dest".
  // "*dest" must be initially empty.
  // "*dest" must remain live while this BlobLogWriter is in use.
  BlobLogWriter(std::unique_ptr<WritableFileWriter>&& dest, SystemClock* clock,
                Statistics* statistics, uint64_t log_number, bool use_fsync,
                bool do_flush, uint64_t boffset = 0);
  // No copying allowed
  BlobLogWriter(const BlobLogWriter&) = delete;
  BlobLogWriter& operator=(const BlobLogWriter&) = delete;

  ~BlobLogWriter();

  // Encodes the record header for (key, val, expiration) into *buf.
  static void ConstructBlobHeader(std::string* buf, const Slice& key,
                                  const Slice& val, uint64_t expiration);

  // Appends a record without TTL; reports the file offsets of the key and
  // the value through the out-parameters.
  Status AddRecord(const Slice& key, const Slice& val, uint64_t* key_offset,
                   uint64_t* blob_offset);

  // Appends a record with the given expiration.
  Status AddRecord(const Slice& key, const Slice& val, uint64_t expiration,
                   uint64_t* key_offset, uint64_t* blob_offset);

  // Low-level append of a pre-encoded header plus key/value payload.
  Status EmitPhysicalRecord(const std::string& headerbuf, const Slice& key,
                            const Slice& val, uint64_t* key_offset,
                            uint64_t* blob_offset);

  // Writes the footer, syncs and closes the file; optionally reports the
  // file checksum method/value (pass both or neither).
  Status AppendFooter(BlobLogFooter& footer, std::string* checksum_method,
                      std::string* checksum_value);

  // Writes the file header; must be the first write.
  Status WriteHeader(BlobLogHeader& header);

  WritableFileWriter* file() { return dest_.get(); }

  const WritableFileWriter* file() const { return dest_.get(); }

  uint64_t get_log_number() const { return log_number_; }

  // Syncs the underlying file (fsync vs fdatasync per use_fsync_).
  Status Sync();

 private:
  std::unique_ptr<WritableFileWriter> dest_;
  SystemClock* clock_;
  Statistics* statistics_;
  uint64_t log_number_;
  uint64_t block_offset_;  // Current offset in block
  bool use_fsync_;
  bool do_flush_;  // flush after each append

 public:
  // Tracks the last element written, used to enforce header/record/footer
  // ordering via asserts.
  enum ElemType { kEtNone, kEtFileHdr, kEtRecord, kEtFileFooter };
  ElemType last_elem_type_;
};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_read_request.h b/src/rocksdb/db/blob/blob_read_request.h
new file mode 100644
index 000000000..f9668ca2e
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_read_request.h
@@ -0,0 +1,58 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cinttypes>
+
+#include "rocksdb/compression_type.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// A read Blob request structure for use in BlobSource::MultiGetBlob and
+// BlobFileReader::MultiGetBlob.
struct BlobReadRequest {
  // User key to lookup the paired blob
  const Slice* user_key = nullptr;

  // File offset in bytes
  uint64_t offset = 0;

  // Length to read in bytes
  size_t len = 0;

  // Blob compression type
  CompressionType compression = kNoCompression;

  // Output parameter set by MultiGetBlob() to point to the data buffer, and
  // the number of valid bytes
  PinnableSlice* result = nullptr;

  // Status of read
  Status* status = nullptr;

  // All pointer members are non-owning; the caller keeps the key, result
  // slice and status alive for the duration of the multi-get call.
  BlobReadRequest(const Slice& _user_key, uint64_t _offset, size_t _len,
                  CompressionType _compression, PinnableSlice* _result,
                  Status* _status)
      : user_key(&_user_key),
        offset(_offset),
        len(_len),
        compression(_compression),
        result(_result),
        status(_status) {}

  BlobReadRequest() = default;
  BlobReadRequest(const BlobReadRequest& other) = default;
  BlobReadRequest& operator=(const BlobReadRequest& other) = default;
};
+
+using BlobFileReadRequests =
+ std::tuple<uint64_t /* file_number */, uint64_t /* file_size */,
+ autovector<BlobReadRequest>>;
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_source.cc b/src/rocksdb/db/blob/blob_source.cc
new file mode 100644
index 000000000..bfade2507
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_source.cc
@@ -0,0 +1,488 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_source.h"
+
+#include <cassert>
+#include <string>
+
+#include "cache/cache_reservation_manager.h"
+#include "cache/charged_cache.h"
+#include "db/blob/blob_contents.h"
+#include "db/blob/blob_file_reader.h"
+#include "db/blob/blob_log_format.h"
+#include "monitoring/statistics.h"
+#include "options/cf_options.h"
+#include "table/get_context.h"
+#include "table/multiget_context.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+BlobSource::BlobSource(const ImmutableOptions* immutable_options,
+ const std::string& db_id,
+ const std::string& db_session_id,
+ BlobFileCache* blob_file_cache)
+ : db_id_(db_id),
+ db_session_id_(db_session_id),
+ statistics_(immutable_options->statistics.get()),
+ blob_file_cache_(blob_file_cache),
+ blob_cache_(immutable_options->blob_cache),
+ lowest_used_cache_tier_(immutable_options->lowest_used_cache_tier) {
+#ifndef ROCKSDB_LITE
+ auto bbto =
+ immutable_options->table_factory->GetOptions<BlockBasedTableOptions>();
+ if (bbto &&
+ bbto->cache_usage_options.options_overrides.at(CacheEntryRole::kBlobCache)
+ .charged == CacheEntryRoleOptions::Decision::kEnabled) {
+ blob_cache_ = std::make_shared<ChargedCache>(immutable_options->blob_cache,
+ bbto->block_cache);
+ }
+#endif // ROCKSDB_LITE
+}
+
+BlobSource::~BlobSource() = default;
+
+Status BlobSource::GetBlobFromCache(
+ const Slice& cache_key, CacheHandleGuard<BlobContents>* cached_blob) const {
+ assert(blob_cache_);
+ assert(!cache_key.empty());
+ assert(cached_blob);
+ assert(cached_blob->IsEmpty());
+
+ Cache::Handle* cache_handle = nullptr;
+ cache_handle = GetEntryFromCache(cache_key);
+ if (cache_handle != nullptr) {
+ *cached_blob =
+ CacheHandleGuard<BlobContents>(blob_cache_.get(), cache_handle);
+
+ assert(cached_blob->GetValue());
+
+ PERF_COUNTER_ADD(blob_cache_hit_count, 1);
+ RecordTick(statistics_, BLOB_DB_CACHE_HIT);
+ RecordTick(statistics_, BLOB_DB_CACHE_BYTES_READ,
+ cached_blob->GetValue()->size());
+
+ return Status::OK();
+ }
+
+ RecordTick(statistics_, BLOB_DB_CACHE_MISS);
+
+ return Status::NotFound("Blob not found in cache");
+}
+
+Status BlobSource::PutBlobIntoCache(
+ const Slice& cache_key, std::unique_ptr<BlobContents>* blob,
+ CacheHandleGuard<BlobContents>* cached_blob) const {
+ assert(blob_cache_);
+ assert(!cache_key.empty());
+ assert(blob);
+ assert(*blob);
+ assert(cached_blob);
+ assert(cached_blob->IsEmpty());
+
+ Cache::Handle* cache_handle = nullptr;
+ const Status s = InsertEntryIntoCache(cache_key, blob->get(),
+ (*blob)->ApproximateMemoryUsage(),
+ &cache_handle, Cache::Priority::BOTTOM);
+ if (s.ok()) {
+ blob->release();
+
+ assert(cache_handle != nullptr);
+ *cached_blob =
+ CacheHandleGuard<BlobContents>(blob_cache_.get(), cache_handle);
+
+ assert(cached_blob->GetValue());
+
+ RecordTick(statistics_, BLOB_DB_CACHE_ADD);
+ RecordTick(statistics_, BLOB_DB_CACHE_BYTES_WRITE,
+ cached_blob->GetValue()->size());
+
+ } else {
+ RecordTick(statistics_, BLOB_DB_CACHE_ADD_FAILURES);
+ }
+
+ return s;
+}
+
+Cache::Handle* BlobSource::GetEntryFromCache(const Slice& key) const {
+ Cache::Handle* cache_handle = nullptr;
+
+ if (lowest_used_cache_tier_ == CacheTier::kNonVolatileBlockTier) {
+ Cache::CreateCallback create_cb =
+ [allocator = blob_cache_->memory_allocator()](
+ const void* buf, size_t size, void** out_obj,
+ size_t* charge) -> Status {
+ return BlobContents::CreateCallback(AllocateBlock(size, allocator), buf,
+ size, out_obj, charge);
+ };
+
+ cache_handle = blob_cache_->Lookup(key, BlobContents::GetCacheItemHelper(),
+ create_cb, Cache::Priority::BOTTOM,
+ true /* wait_for_cache */, statistics_);
+ } else {
+ cache_handle = blob_cache_->Lookup(key, statistics_);
+ }
+
+ return cache_handle;
+}
+
+void BlobSource::PinCachedBlob(CacheHandleGuard<BlobContents>* cached_blob,
+ PinnableSlice* value) {
+ assert(cached_blob);
+ assert(cached_blob->GetValue());
+ assert(value);
+
+ // To avoid copying the cached blob into the buffer provided by the
+ // application, we can simply transfer ownership of the cache handle to
+ // the target PinnableSlice. This has the potential to save a lot of
+ // CPU, especially with large blob values.
+
+ value->Reset();
+
+ constexpr Cleanable* cleanable = nullptr;
+ value->PinSlice(cached_blob->GetValue()->data(), cleanable);
+
+ cached_blob->TransferTo(value);
+}
+
+void BlobSource::PinOwnedBlob(std::unique_ptr<BlobContents>* owned_blob,
+ PinnableSlice* value) {
+ assert(owned_blob);
+ assert(*owned_blob);
+ assert(value);
+
+ BlobContents* const blob = owned_blob->release();
+ assert(blob);
+
+ value->Reset();
+ value->PinSlice(
+ blob->data(),
+ [](void* arg1, void* /* arg2 */) {
+ delete static_cast<BlobContents*>(arg1);
+ },
+ blob, nullptr);
+}
+
+Status BlobSource::InsertEntryIntoCache(const Slice& key, BlobContents* value,
+ size_t charge,
+ Cache::Handle** cache_handle,
+ Cache::Priority priority) const {
+ Status s;
+
+ Cache::CacheItemHelper* const cache_item_helper =
+ BlobContents::GetCacheItemHelper();
+ assert(cache_item_helper);
+
+ if (lowest_used_cache_tier_ == CacheTier::kNonVolatileBlockTier) {
+ s = blob_cache_->Insert(key, value, cache_item_helper, charge, cache_handle,
+ priority);
+ } else {
+ s = blob_cache_->Insert(key, value, charge, cache_item_helper->del_cb,
+ cache_handle, priority);
+ }
+
+ return s;
+}
+
+Status BlobSource::GetBlob(const ReadOptions& read_options,
+ const Slice& user_key, uint64_t file_number,
+ uint64_t offset, uint64_t file_size,
+ uint64_t value_size,
+ CompressionType compression_type,
+ FilePrefetchBuffer* prefetch_buffer,
+ PinnableSlice* value, uint64_t* bytes_read) {
+ assert(value);
+
+ Status s;
+
+ const CacheKey cache_key = GetCacheKey(file_number, file_size, offset);
+
+ CacheHandleGuard<BlobContents> blob_handle;
+
+ // First, try to get the blob from the cache
+ //
+ // If blob cache is enabled, we'll try to read from it.
+ if (blob_cache_) {
+ Slice key = cache_key.AsSlice();
+ s = GetBlobFromCache(key, &blob_handle);
+ if (s.ok()) {
+ PinCachedBlob(&blob_handle, value);
+
+ // For consistency, the size of on-disk (possibly compressed) blob record
+ // is assigned to bytes_read.
+ uint64_t adjustment =
+ read_options.verify_checksums
+ ? BlobLogRecord::CalculateAdjustmentForRecordHeader(
+ user_key.size())
+ : 0;
+ assert(offset >= adjustment);
+
+ uint64_t record_size = value_size + adjustment;
+ if (bytes_read) {
+ *bytes_read = record_size;
+ }
+ return s;
+ }
+ }
+
+ assert(blob_handle.IsEmpty());
+
+ const bool no_io = read_options.read_tier == kBlockCacheTier;
+ if (no_io) {
+ s = Status::Incomplete("Cannot read blob(s): no disk I/O allowed");
+ return s;
+ }
+
+ // Can't find the blob from the cache. Since I/O is allowed, read from the
+ // file.
+ std::unique_ptr<BlobContents> blob_contents;
+
+ {
+ CacheHandleGuard<BlobFileReader> blob_file_reader;
+ s = blob_file_cache_->GetBlobFileReader(file_number, &blob_file_reader);
+ if (!s.ok()) {
+ return s;
+ }
+
+ assert(blob_file_reader.GetValue());
+
+ if (compression_type != blob_file_reader.GetValue()->GetCompressionType()) {
+ return Status::Corruption("Compression type mismatch when reading blob");
+ }
+
+ MemoryAllocator* const allocator = (blob_cache_ && read_options.fill_cache)
+ ? blob_cache_->memory_allocator()
+ : nullptr;
+
+ uint64_t read_size = 0;
+ s = blob_file_reader.GetValue()->GetBlob(
+ read_options, user_key, offset, value_size, compression_type,
+ prefetch_buffer, allocator, &blob_contents, &read_size);
+ if (!s.ok()) {
+ return s;
+ }
+ if (bytes_read) {
+ *bytes_read = read_size;
+ }
+ }
+
+ if (blob_cache_ && read_options.fill_cache) {
+ // If filling cache is allowed and a cache is configured, try to put the
+ // blob to the cache.
+ Slice key = cache_key.AsSlice();
+ s = PutBlobIntoCache(key, &blob_contents, &blob_handle);
+ if (!s.ok()) {
+ return s;
+ }
+
+ PinCachedBlob(&blob_handle, value);
+ } else {
+ PinOwnedBlob(&blob_contents, value);
+ }
+
+ assert(s.ok());
+ return s;
+}
+
+void BlobSource::MultiGetBlob(const ReadOptions& read_options,
+ autovector<BlobFileReadRequests>& blob_reqs,
+ uint64_t* bytes_read) {
+ assert(blob_reqs.size() > 0);
+
+ uint64_t total_bytes_read = 0;
+ uint64_t bytes_read_in_file = 0;
+
+ for (auto& [file_number, file_size, blob_reqs_in_file] : blob_reqs) {
+ // sort blob_reqs_in_file by file offset.
+ std::sort(
+ blob_reqs_in_file.begin(), blob_reqs_in_file.end(),
+ [](const BlobReadRequest& lhs, const BlobReadRequest& rhs) -> bool {
+ return lhs.offset < rhs.offset;
+ });
+
+ MultiGetBlobFromOneFile(read_options, file_number, file_size,
+ blob_reqs_in_file, &bytes_read_in_file);
+
+ total_bytes_read += bytes_read_in_file;
+ }
+
+ if (bytes_read) {
+ *bytes_read = total_bytes_read;
+ }
+}
+
+void BlobSource::MultiGetBlobFromOneFile(const ReadOptions& read_options,
+ uint64_t file_number,
+ uint64_t /*file_size*/,
+ autovector<BlobReadRequest>& blob_reqs,
+ uint64_t* bytes_read) {
+ const size_t num_blobs = blob_reqs.size();
+ assert(num_blobs > 0);
+ assert(num_blobs <= MultiGetContext::MAX_BATCH_SIZE);
+
+#ifndef NDEBUG
+ for (size_t i = 0; i < num_blobs - 1; ++i) {
+ assert(blob_reqs[i].offset <= blob_reqs[i + 1].offset);
+ }
+#endif // !NDEBUG
+
+ using Mask = uint64_t;
+ Mask cache_hit_mask = 0;
+
+ uint64_t total_bytes = 0;
+ const OffsetableCacheKey base_cache_key(db_id_, db_session_id_, file_number);
+
+ if (blob_cache_) {
+ size_t cached_blob_count = 0;
+ for (size_t i = 0; i < num_blobs; ++i) {
+ auto& req = blob_reqs[i];
+
+ CacheHandleGuard<BlobContents> blob_handle;
+ const CacheKey cache_key = base_cache_key.WithOffset(req.offset);
+ const Slice key = cache_key.AsSlice();
+
+ const Status s = GetBlobFromCache(key, &blob_handle);
+
+ if (s.ok()) {
+ assert(req.status);
+ *req.status = s;
+
+ PinCachedBlob(&blob_handle, req.result);
+
+ // Update the counter for the number of valid blobs read from the cache.
+ ++cached_blob_count;
+
+ // For consistency, the size of each on-disk (possibly compressed) blob
+ // record is accumulated to total_bytes.
+ uint64_t adjustment =
+ read_options.verify_checksums
+ ? BlobLogRecord::CalculateAdjustmentForRecordHeader(
+ req.user_key->size())
+ : 0;
+ assert(req.offset >= adjustment);
+ total_bytes += req.len + adjustment;
+ cache_hit_mask |= (Mask{1} << i); // cache hit
+ }
+ }
+
+ // All blobs were read from the cache.
+ if (cached_blob_count == num_blobs) {
+ if (bytes_read) {
+ *bytes_read = total_bytes;
+ }
+ return;
+ }
+ }
+
+ const bool no_io = read_options.read_tier == kBlockCacheTier;
+ if (no_io) {
+ for (size_t i = 0; i < num_blobs; ++i) {
+ if (!(cache_hit_mask & (Mask{1} << i))) {
+ BlobReadRequest& req = blob_reqs[i];
+ assert(req.status);
+
+ *req.status =
+ Status::Incomplete("Cannot read blob(s): no disk I/O allowed");
+ }
+ }
+ return;
+ }
+
+ {
+    // Find the rest of the blobs from the file since I/O is allowed.
+ autovector<std::pair<BlobReadRequest*, std::unique_ptr<BlobContents>>>
+ _blob_reqs;
+ uint64_t _bytes_read = 0;
+
+ for (size_t i = 0; i < num_blobs; ++i) {
+ if (!(cache_hit_mask & (Mask{1} << i))) {
+ _blob_reqs.emplace_back(&blob_reqs[i], std::unique_ptr<BlobContents>());
+ }
+ }
+
+ CacheHandleGuard<BlobFileReader> blob_file_reader;
+ Status s =
+ blob_file_cache_->GetBlobFileReader(file_number, &blob_file_reader);
+ if (!s.ok()) {
+ for (size_t i = 0; i < _blob_reqs.size(); ++i) {
+ BlobReadRequest* const req = _blob_reqs[i].first;
+ assert(req);
+ assert(req->status);
+
+ *req->status = s;
+ }
+ return;
+ }
+
+ assert(blob_file_reader.GetValue());
+
+ MemoryAllocator* const allocator = (blob_cache_ && read_options.fill_cache)
+ ? blob_cache_->memory_allocator()
+ : nullptr;
+
+ blob_file_reader.GetValue()->MultiGetBlob(read_options, allocator,
+ _blob_reqs, &_bytes_read);
+
+ if (blob_cache_ && read_options.fill_cache) {
+ // If filling cache is allowed and a cache is configured, try to put
+ // the blob(s) to the cache.
+ for (auto& [req, blob_contents] : _blob_reqs) {
+ assert(req);
+
+ if (req->status->ok()) {
+ CacheHandleGuard<BlobContents> blob_handle;
+ const CacheKey cache_key = base_cache_key.WithOffset(req->offset);
+ const Slice key = cache_key.AsSlice();
+ s = PutBlobIntoCache(key, &blob_contents, &blob_handle);
+ if (!s.ok()) {
+ *req->status = s;
+ } else {
+ PinCachedBlob(&blob_handle, req->result);
+ }
+ }
+ }
+ } else {
+ for (auto& [req, blob_contents] : _blob_reqs) {
+ assert(req);
+
+ if (req->status->ok()) {
+ PinOwnedBlob(&blob_contents, req->result);
+ }
+ }
+ }
+
+ total_bytes += _bytes_read;
+ if (bytes_read) {
+ *bytes_read = total_bytes;
+ }
+ }
+}
+
+bool BlobSource::TEST_BlobInCache(uint64_t file_number, uint64_t file_size,
+ uint64_t offset, size_t* charge) const {
+ const CacheKey cache_key = GetCacheKey(file_number, file_size, offset);
+ const Slice key = cache_key.AsSlice();
+
+ CacheHandleGuard<BlobContents> blob_handle;
+ const Status s = GetBlobFromCache(key, &blob_handle);
+
+ if (s.ok() && blob_handle.GetValue() != nullptr) {
+ if (charge) {
+ const Cache* const cache = blob_handle.GetCache();
+ assert(cache);
+
+ Cache::Handle* const handle = blob_handle.GetCacheHandle();
+ assert(handle);
+
+ *charge = cache->GetUsage(handle);
+ }
+
+ return true;
+ }
+
+ return false;
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_source.h b/src/rocksdb/db/blob/blob_source.h
new file mode 100644
index 000000000..2ed296eeb
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_source.h
@@ -0,0 +1,153 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cinttypes>
+#include <memory>
+
+#include "cache/cache_helpers.h"
+#include "cache/cache_key.h"
+#include "db/blob/blob_file_cache.h"
+#include "db/blob/blob_read_request.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/rocksdb_namespace.h"
+#include "table/block_based/cachable_entry.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct ImmutableOptions;
+class Status;
+class FilePrefetchBuffer;
+class Slice;
+class BlobContents;
+
+// BlobSource is a class that provides universal access to blobs, regardless of
+// whether they are in the blob cache, secondary cache, or (remote) storage.
+// Depending on user settings, it always fetches blobs from multi-tier cache
+// and storage with minimal cost.
+class BlobSource {
+ public:
+ BlobSource(const ImmutableOptions* immutable_options,
+ const std::string& db_id, const std::string& db_session_id,
+ BlobFileCache* blob_file_cache);
+
+ BlobSource(const BlobSource&) = delete;
+ BlobSource& operator=(const BlobSource&) = delete;
+
+ ~BlobSource();
+
+ // Read a blob from the underlying cache or one blob file.
+ //
+ // If successful, returns ok and sets "*value" to the newly retrieved
+ // uncompressed blob. If there was an error while fetching the blob, sets
+ // "*value" to empty and returns a non-ok status.
+ //
+ // Note: For consistency, whether the blob is found in the cache or on disk,
+ // sets "*bytes_read" to the size of on-disk (possibly compressed) blob
+ // record.
+ Status GetBlob(const ReadOptions& read_options, const Slice& user_key,
+ uint64_t file_number, uint64_t offset, uint64_t file_size,
+ uint64_t value_size, CompressionType compression_type,
+ FilePrefetchBuffer* prefetch_buffer, PinnableSlice* value,
+ uint64_t* bytes_read);
+
+ // Read multiple blobs from the underlying cache or blob file(s).
+ //
+ // If successful, returns ok and sets "result" in the elements of "blob_reqs"
+ // to the newly retrieved uncompressed blobs. If there was an error while
+// fetching one of the blobs, sets its "result" to empty and sets its
+// corresponding "status" to a non-ok status.
+ //
+ // Note:
+ // - The main difference between this function and MultiGetBlobFromOneFile is
+ // that this function can read multiple blobs from multiple blob files.
+ //
+ // - For consistency, whether the blob is found in the cache or on disk, sets
+ // "*bytes_read" to the total size of on-disk (possibly compressed) blob
+ // records.
+ void MultiGetBlob(const ReadOptions& read_options,
+ autovector<BlobFileReadRequests>& blob_reqs,
+ uint64_t* bytes_read);
+
+ // Read multiple blobs from the underlying cache or one blob file.
+ //
+ // If successful, returns ok and sets "result" in the elements of "blob_reqs"
+ // to the newly retrieved uncompressed blobs. If there was an error while
+// fetching one of the blobs, sets its "result" to empty and sets its
+// corresponding "status" to a non-ok status.
+ //
+ // Note:
+ // - The main difference between this function and MultiGetBlob is that this
+ // function is only used for the case where the demanded blobs are stored in
+ // one blob file. MultiGetBlob will call this function multiple times if the
+ // demanded blobs are stored in multiple blob files.
+ //
+ // - For consistency, whether the blob is found in the cache or on disk, sets
+ // "*bytes_read" to the total size of on-disk (possibly compressed) blob
+ // records.
+ void MultiGetBlobFromOneFile(const ReadOptions& read_options,
+ uint64_t file_number, uint64_t file_size,
+ autovector<BlobReadRequest>& blob_reqs,
+ uint64_t* bytes_read);
+
+ inline Status GetBlobFileReader(
+ uint64_t blob_file_number,
+ CacheHandleGuard<BlobFileReader>* blob_file_reader) {
+ return blob_file_cache_->GetBlobFileReader(blob_file_number,
+ blob_file_reader);
+ }
+
+ inline Cache* GetBlobCache() const { return blob_cache_.get(); }
+
+ bool TEST_BlobInCache(uint64_t file_number, uint64_t file_size,
+ uint64_t offset, size_t* charge = nullptr) const;
+
+ private:
+ Status GetBlobFromCache(const Slice& cache_key,
+ CacheHandleGuard<BlobContents>* cached_blob) const;
+
+ Status PutBlobIntoCache(const Slice& cache_key,
+ std::unique_ptr<BlobContents>* blob,
+ CacheHandleGuard<BlobContents>* cached_blob) const;
+
+ static void PinCachedBlob(CacheHandleGuard<BlobContents>* cached_blob,
+ PinnableSlice* value);
+
+ static void PinOwnedBlob(std::unique_ptr<BlobContents>* owned_blob,
+ PinnableSlice* value);
+
+ Cache::Handle* GetEntryFromCache(const Slice& key) const;
+
+ Status InsertEntryIntoCache(const Slice& key, BlobContents* value,
+ size_t charge, Cache::Handle** cache_handle,
+ Cache::Priority priority) const;
+
+ inline CacheKey GetCacheKey(uint64_t file_number, uint64_t /*file_size*/,
+ uint64_t offset) const {
+ OffsetableCacheKey base_cache_key(db_id_, db_session_id_, file_number);
+ return base_cache_key.WithOffset(offset);
+ }
+
+ const std::string& db_id_;
+ const std::string& db_session_id_;
+
+ Statistics* statistics_;
+
+  // A cache to store blob file readers.
+ BlobFileCache* blob_file_cache_;
+
+ // A cache to store uncompressed blobs.
+ std::shared_ptr<Cache> blob_cache_;
+
+  // The control option of how the cache tiers will be used. Currently rocksdb
+  // supports block/blob cache (volatile tier) and secondary cache (this tier
+ // isn't strictly speaking a non-volatile tier since the compressed cache in
+ // this tier is in volatile memory).
+ const CacheTier lowest_used_cache_tier_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_source_test.cc b/src/rocksdb/db/blob/blob_source_test.cc
new file mode 100644
index 000000000..a85ed8646
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_source_test.cc
@@ -0,0 +1,1624 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_source.h"
+
+#include <cassert>
+#include <cstdint>
+#include <cstdio>
+#include <memory>
+#include <string>
+
+#include "cache/charged_cache.h"
+#include "cache/compressed_secondary_cache.h"
+#include "db/blob/blob_contents.h"
+#include "db/blob/blob_file_cache.h"
+#include "db/blob/blob_file_reader.h"
+#include "db/blob/blob_log_format.h"
+#include "db/blob/blob_log_writer.h"
+#include "db/db_test_util.h"
+#include "file/filename.h"
+#include "file/read_write_util.h"
+#include "options/cf_options.h"
+#include "rocksdb/options.h"
+#include "util/compression.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+// Creates a test blob file with `num` blobs in it.
+void WriteBlobFile(const ImmutableOptions& immutable_options,
+ uint32_t column_family_id, bool has_ttl,
+ const ExpirationRange& expiration_range_header,
+ const ExpirationRange& expiration_range_footer,
+ uint64_t blob_file_number, const std::vector<Slice>& keys,
+ const std::vector<Slice>& blobs, CompressionType compression,
+ std::vector<uint64_t>& blob_offsets,
+ std::vector<uint64_t>& blob_sizes) {
+ assert(!immutable_options.cf_paths.empty());
+ size_t num = keys.size();
+ assert(num == blobs.size());
+ assert(num == blob_offsets.size());
+ assert(num == blob_sizes.size());
+
+ const std::string blob_file_path =
+ BlobFileName(immutable_options.cf_paths.front().path, blob_file_number);
+ std::unique_ptr<FSWritableFile> file;
+ ASSERT_OK(NewWritableFile(immutable_options.fs.get(), blob_file_path, &file,
+ FileOptions()));
+
+ std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
+ std::move(file), blob_file_path, FileOptions(), immutable_options.clock));
+
+ constexpr Statistics* statistics = nullptr;
+ constexpr bool use_fsync = false;
+ constexpr bool do_flush = false;
+
+ BlobLogWriter blob_log_writer(std::move(file_writer), immutable_options.clock,
+ statistics, blob_file_number, use_fsync,
+ do_flush);
+
+ BlobLogHeader header(column_family_id, compression, has_ttl,
+ expiration_range_header);
+
+ ASSERT_OK(blob_log_writer.WriteHeader(header));
+
+ std::vector<std::string> compressed_blobs(num);
+ std::vector<Slice> blobs_to_write(num);
+ if (kNoCompression == compression) {
+ for (size_t i = 0; i < num; ++i) {
+ blobs_to_write[i] = blobs[i];
+ blob_sizes[i] = blobs[i].size();
+ }
+ } else {
+ CompressionOptions opts;
+ CompressionContext context(compression);
+ constexpr uint64_t sample_for_compression = 0;
+ CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(),
+ compression, sample_for_compression);
+
+ constexpr uint32_t compression_format_version = 2;
+
+ for (size_t i = 0; i < num; ++i) {
+ ASSERT_TRUE(CompressData(blobs[i], info, compression_format_version,
+ &compressed_blobs[i]));
+ blobs_to_write[i] = compressed_blobs[i];
+ blob_sizes[i] = compressed_blobs[i].size();
+ }
+ }
+
+ for (size_t i = 0; i < num; ++i) {
+ uint64_t key_offset = 0;
+ ASSERT_OK(blob_log_writer.AddRecord(keys[i], blobs_to_write[i], &key_offset,
+ &blob_offsets[i]));
+ }
+
+ BlobLogFooter footer;
+ footer.blob_count = num;
+ footer.expiration_range = expiration_range_footer;
+
+ std::string checksum_method;
+ std::string checksum_value;
+ ASSERT_OK(
+ blob_log_writer.AppendFooter(footer, &checksum_method, &checksum_value));
+}
+
+} // anonymous namespace
+
+class BlobSourceTest : public DBTestBase {
+ protected:
+ public:
+ explicit BlobSourceTest()
+ : DBTestBase("blob_source_test", /*env_do_fsync=*/true) {
+ options_.env = env_;
+ options_.enable_blob_files = true;
+ options_.create_if_missing = true;
+
+ LRUCacheOptions co;
+ co.capacity = 8 << 20;
+ co.num_shard_bits = 2;
+ co.metadata_charge_policy = kDontChargeCacheMetadata;
+ co.high_pri_pool_ratio = 0.2;
+ co.low_pri_pool_ratio = 0.2;
+ options_.blob_cache = NewLRUCache(co);
+ options_.lowest_used_cache_tier = CacheTier::kVolatileTier;
+
+ assert(db_->GetDbIdentity(db_id_).ok());
+ assert(db_->GetDbSessionId(db_session_id_).ok());
+ }
+
+ Options options_;
+ std::string db_id_;
+ std::string db_session_id_;
+};
+
+TEST_F(BlobSourceTest, GetBlobsFromCache) {
+ options_.cf_paths.emplace_back(
+ test::PerThreadDBPath(env_, "BlobSourceTest_GetBlobsFromCache"), 0);
+
+ options_.statistics = CreateDBStatistics();
+ Statistics* statistics = options_.statistics.get();
+ assert(statistics);
+
+ DestroyAndReopen(options_);
+
+ ImmutableOptions immutable_options(options_);
+
+ constexpr uint32_t column_family_id = 1;
+ constexpr bool has_ttl = false;
+ constexpr ExpirationRange expiration_range;
+ constexpr uint64_t blob_file_number = 1;
+ constexpr size_t num_blobs = 16;
+
+ std::vector<std::string> key_strs;
+ std::vector<std::string> blob_strs;
+
+ for (size_t i = 0; i < num_blobs; ++i) {
+ key_strs.push_back("key" + std::to_string(i));
+ blob_strs.push_back("blob" + std::to_string(i));
+ }
+
+ std::vector<Slice> keys;
+ std::vector<Slice> blobs;
+
+ uint64_t file_size = BlobLogHeader::kSize;
+ for (size_t i = 0; i < num_blobs; ++i) {
+ keys.push_back({key_strs[i]});
+ blobs.push_back({blob_strs[i]});
+ file_size += BlobLogRecord::kHeaderSize + keys[i].size() + blobs[i].size();
+ }
+ file_size += BlobLogFooter::kSize;
+
+ std::vector<uint64_t> blob_offsets(keys.size());
+ std::vector<uint64_t> blob_sizes(keys.size());
+
+ WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range,
+ expiration_range, blob_file_number, keys, blobs, kNoCompression,
+ blob_offsets, blob_sizes);
+
+ constexpr size_t capacity = 1024;
+ std::shared_ptr<Cache> backing_cache =
+ NewLRUCache(capacity); // Blob file cache
+
+ FileOptions file_options;
+ constexpr HistogramImpl* blob_file_read_hist = nullptr;
+
+ std::unique_ptr<BlobFileCache> blob_file_cache =
+ std::make_unique<BlobFileCache>(
+ backing_cache.get(), &immutable_options, &file_options,
+ column_family_id, blob_file_read_hist, nullptr /*IOTracer*/);
+
+ BlobSource blob_source(&immutable_options, db_id_, db_session_id_,
+ blob_file_cache.get());
+
+ ReadOptions read_options;
+ read_options.verify_checksums = true;
+
+ constexpr FilePrefetchBuffer* prefetch_buffer = nullptr;
+
+ {
+ // GetBlob
+ std::vector<PinnableSlice> values(keys.size());
+ uint64_t bytes_read = 0;
+ uint64_t blob_bytes = 0;
+ uint64_t total_bytes = 0;
+
+ read_options.fill_cache = false;
+ get_perf_context()->Reset();
+
+ for (size_t i = 0; i < num_blobs; ++i) {
+ ASSERT_FALSE(blob_source.TEST_BlobInCache(blob_file_number, file_size,
+ blob_offsets[i]));
+
+ ASSERT_OK(blob_source.GetBlob(read_options, keys[i], blob_file_number,
+ blob_offsets[i], file_size, blob_sizes[i],
+ kNoCompression, prefetch_buffer, &values[i],
+ &bytes_read));
+ ASSERT_EQ(values[i], blobs[i]);
+ ASSERT_TRUE(values[i].IsPinned());
+ ASSERT_EQ(bytes_read,
+ BlobLogRecord::kHeaderSize + keys[i].size() + blob_sizes[i]);
+
+ ASSERT_FALSE(blob_source.TEST_BlobInCache(blob_file_number, file_size,
+ blob_offsets[i]));
+ total_bytes += bytes_read;
+ }
+
+ // Retrieved the blob cache num_blobs * 3 times via TEST_BlobInCache,
+ // GetBlob, and TEST_BlobInCache.
+ ASSERT_EQ((int)get_perf_context()->blob_cache_hit_count, 0);
+ ASSERT_EQ((int)get_perf_context()->blob_read_count, num_blobs);
+ ASSERT_EQ((int)get_perf_context()->blob_read_byte, total_bytes);
+ ASSERT_GE((int)get_perf_context()->blob_checksum_time, 0);
+ ASSERT_EQ((int)get_perf_context()->blob_decompress_time, 0);
+
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_MISS), num_blobs * 3);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_HIT), 0);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_ADD), 0);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_READ), 0);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_WRITE), 0);
+
+ read_options.fill_cache = true;
+ blob_bytes = 0;
+ total_bytes = 0;
+ get_perf_context()->Reset();
+ statistics->Reset().PermitUncheckedError();
+
+ for (size_t i = 0; i < num_blobs; ++i) {
+ ASSERT_FALSE(blob_source.TEST_BlobInCache(blob_file_number, file_size,
+ blob_offsets[i]));
+
+ ASSERT_OK(blob_source.GetBlob(read_options, keys[i], blob_file_number,
+ blob_offsets[i], file_size, blob_sizes[i],
+ kNoCompression, prefetch_buffer, &values[i],
+ &bytes_read));
+ ASSERT_EQ(values[i], blobs[i]);
+ ASSERT_TRUE(values[i].IsPinned());
+ ASSERT_EQ(bytes_read,
+ BlobLogRecord::kHeaderSize + keys[i].size() + blob_sizes[i]);
+
+ blob_bytes += blob_sizes[i];
+ total_bytes += bytes_read;
+ ASSERT_EQ((int)get_perf_context()->blob_cache_hit_count, i);
+ ASSERT_EQ((int)get_perf_context()->blob_read_count, i + 1);
+ ASSERT_EQ((int)get_perf_context()->blob_read_byte, total_bytes);
+
+ ASSERT_TRUE(blob_source.TEST_BlobInCache(blob_file_number, file_size,
+ blob_offsets[i]));
+
+ ASSERT_EQ((int)get_perf_context()->blob_cache_hit_count, i + 1);
+ ASSERT_EQ((int)get_perf_context()->blob_read_count, i + 1);
+ ASSERT_EQ((int)get_perf_context()->blob_read_byte, total_bytes);
+ }
+
+ ASSERT_EQ((int)get_perf_context()->blob_cache_hit_count, num_blobs);
+ ASSERT_EQ((int)get_perf_context()->blob_read_count, num_blobs);
+ ASSERT_EQ((int)get_perf_context()->blob_read_byte, total_bytes);
+
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_MISS), num_blobs * 2);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_HIT), num_blobs);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_ADD), num_blobs);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_READ), blob_bytes);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_WRITE),
+ blob_bytes);
+
+ read_options.fill_cache = true;
+ total_bytes = 0;
+ blob_bytes = 0;
+ get_perf_context()->Reset();
+ statistics->Reset().PermitUncheckedError();
+
+ for (size_t i = 0; i < num_blobs; ++i) {
+ ASSERT_TRUE(blob_source.TEST_BlobInCache(blob_file_number, file_size,
+ blob_offsets[i]));
+
+ ASSERT_OK(blob_source.GetBlob(read_options, keys[i], blob_file_number,
+ blob_offsets[i], file_size, blob_sizes[i],
+ kNoCompression, prefetch_buffer, &values[i],
+ &bytes_read));
+ ASSERT_EQ(values[i], blobs[i]);
+ ASSERT_TRUE(values[i].IsPinned());
+ ASSERT_EQ(bytes_read,
+ BlobLogRecord::kHeaderSize + keys[i].size() + blob_sizes[i]);
+
+ ASSERT_TRUE(blob_source.TEST_BlobInCache(blob_file_number, file_size,
+ blob_offsets[i]));
+ total_bytes += bytes_read; // on-disk blob record size
+ blob_bytes += blob_sizes[i]; // cached blob value size
+ }
+
+ // Retrieved the blob cache num_blobs * 3 times via TEST_BlobInCache,
+ // GetBlob, and TEST_BlobInCache.
+ ASSERT_EQ((int)get_perf_context()->blob_cache_hit_count, num_blobs * 3);
+ ASSERT_EQ((int)get_perf_context()->blob_read_count, 0); // without i/o
+ ASSERT_EQ((int)get_perf_context()->blob_read_byte, 0); // without i/o
+
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_MISS), 0);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_HIT), num_blobs * 3);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_ADD), 0);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_READ),
+ blob_bytes * 3);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_WRITE), 0);
+
+ // Cache-only GetBlob
+ read_options.read_tier = ReadTier::kBlockCacheTier;
+ total_bytes = 0;
+ blob_bytes = 0;
+ get_perf_context()->Reset();
+ statistics->Reset().PermitUncheckedError();
+
+ for (size_t i = 0; i < num_blobs; ++i) {
+ ASSERT_TRUE(blob_source.TEST_BlobInCache(blob_file_number, file_size,
+ blob_offsets[i]));
+
+ ASSERT_OK(blob_source.GetBlob(read_options, keys[i], blob_file_number,
+ blob_offsets[i], file_size, blob_sizes[i],
+ kNoCompression, prefetch_buffer, &values[i],
+ &bytes_read));
+ ASSERT_EQ(values[i], blobs[i]);
+ ASSERT_TRUE(values[i].IsPinned());
+ ASSERT_EQ(bytes_read,
+ BlobLogRecord::kHeaderSize + keys[i].size() + blob_sizes[i]);
+
+ ASSERT_TRUE(blob_source.TEST_BlobInCache(blob_file_number, file_size,
+ blob_offsets[i]));
+ total_bytes += bytes_read;
+ blob_bytes += blob_sizes[i];
+ }
+
+ // Retrieved the blob cache num_blobs * 3 times via TEST_BlobInCache,
+ // GetBlob, and TEST_BlobInCache.
+ ASSERT_EQ((int)get_perf_context()->blob_cache_hit_count, num_blobs * 3);
+ ASSERT_EQ((int)get_perf_context()->blob_read_count, 0); // without i/o
+ ASSERT_EQ((int)get_perf_context()->blob_read_byte, 0); // without i/o
+
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_MISS), 0);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_HIT), num_blobs * 3);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_ADD), 0);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_READ),
+ blob_bytes * 3);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_WRITE), 0);
+ }
+
+ options_.blob_cache->EraseUnRefEntries();
+
+ {
+ // Cache-only GetBlob
+ std::vector<PinnableSlice> values(keys.size());
+ uint64_t bytes_read = 0;
+
+ read_options.read_tier = ReadTier::kBlockCacheTier;
+ read_options.fill_cache = true;
+ get_perf_context()->Reset();
+ statistics->Reset().PermitUncheckedError();
+
+ for (size_t i = 0; i < num_blobs; ++i) {
+ ASSERT_FALSE(blob_source.TEST_BlobInCache(blob_file_number, file_size,
+ blob_offsets[i]));
+
+ ASSERT_TRUE(blob_source
+ .GetBlob(read_options, keys[i], blob_file_number,
+ blob_offsets[i], file_size, blob_sizes[i],
+ kNoCompression, prefetch_buffer, &values[i],
+ &bytes_read)
+ .IsIncomplete());
+ ASSERT_TRUE(values[i].empty());
+ ASSERT_FALSE(values[i].IsPinned());
+ ASSERT_EQ(bytes_read, 0);
+
+ ASSERT_FALSE(blob_source.TEST_BlobInCache(blob_file_number, file_size,
+ blob_offsets[i]));
+ }
+
+ // Retrieved the blob cache num_blobs * 3 times via TEST_BlobInCache,
+ // GetBlob, and TEST_BlobInCache.
+ ASSERT_EQ((int)get_perf_context()->blob_cache_hit_count, 0);
+ ASSERT_EQ((int)get_perf_context()->blob_read_count, 0);
+ ASSERT_EQ((int)get_perf_context()->blob_read_byte, 0);
+
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_MISS), num_blobs * 3);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_HIT), 0);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_ADD), 0);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_READ), 0);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_WRITE), 0);
+ }
+
+ {
+ // GetBlob from non-existing file
+ std::vector<PinnableSlice> values(keys.size());
+ uint64_t bytes_read = 0;
+ uint64_t file_number = 100; // non-existing file
+
+ read_options.read_tier = ReadTier::kReadAllTier;
+ read_options.fill_cache = true;
+ get_perf_context()->Reset();
+ statistics->Reset().PermitUncheckedError();
+
+ for (size_t i = 0; i < num_blobs; ++i) {
+ ASSERT_FALSE(blob_source.TEST_BlobInCache(file_number, file_size,
+ blob_offsets[i]));
+
+ ASSERT_TRUE(blob_source
+ .GetBlob(read_options, keys[i], file_number,
+ blob_offsets[i], file_size, blob_sizes[i],
+ kNoCompression, prefetch_buffer, &values[i],
+ &bytes_read)
+ .IsIOError());
+ ASSERT_TRUE(values[i].empty());
+ ASSERT_FALSE(values[i].IsPinned());
+ ASSERT_EQ(bytes_read, 0);
+
+ ASSERT_FALSE(blob_source.TEST_BlobInCache(file_number, file_size,
+ blob_offsets[i]));
+ }
+
+ // Retrieved the blob cache num_blobs * 3 times via TEST_BlobInCache,
+ // GetBlob, and TEST_BlobInCache.
+ ASSERT_EQ((int)get_perf_context()->blob_cache_hit_count, 0);
+ ASSERT_EQ((int)get_perf_context()->blob_read_count, 0);
+ ASSERT_EQ((int)get_perf_context()->blob_read_byte, 0);
+
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_MISS), num_blobs * 3);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_HIT), 0);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_ADD), 0);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_READ), 0);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_WRITE), 0);
+ }
+}
+
+// Verifies end-to-end reads of Snappy-compressed blobs through BlobSource:
+// the first pass reads from the blob file (decompressing on the way in) and
+// fills the blob cache with the uncompressed values; the second, cache-only
+// pass is then served entirely from the cache with no decompression work.
+TEST_F(BlobSourceTest, GetCompressedBlobs) {
+ if (!Snappy_Supported()) {
+ return;
+ }
+
+ const CompressionType compression = kSnappyCompression;
+
+ options_.cf_paths.emplace_back(
+ test::PerThreadDBPath(env_, "BlobSourceTest_GetCompressedBlobs"), 0);
+
+ DestroyAndReopen(options_);
+
+ ImmutableOptions immutable_options(options_);
+
+ constexpr uint32_t column_family_id = 1;
+ constexpr bool has_ttl = false;
+ constexpr ExpirationRange expiration_range;
+ constexpr size_t num_blobs = 256;
+
+ std::vector<std::string> key_strs;
+ std::vector<std::string> blob_strs;
+
+ for (size_t i = 0; i < num_blobs; ++i) {
+ key_strs.push_back("key" + std::to_string(i));
+ blob_strs.push_back("blob" + std::to_string(i));
+ }
+
+ std::vector<Slice> keys;
+ std::vector<Slice> blobs;
+
+ for (size_t i = 0; i < num_blobs; ++i) {
+ keys.push_back({key_strs[i]});
+ blobs.push_back({blob_strs[i]});
+ }
+
+ // Per-record offsets/compressed sizes; populated by WriteBlobFile below.
+ std::vector<uint64_t> blob_offsets(keys.size());
+ std::vector<uint64_t> blob_sizes(keys.size());
+
+ constexpr size_t capacity = 1024;
+ auto backing_cache = NewLRUCache(capacity); // Blob file cache
+
+ FileOptions file_options;
+ std::unique_ptr<BlobFileCache> blob_file_cache =
+ std::make_unique<BlobFileCache>(
+ backing_cache.get(), &immutable_options, &file_options,
+ column_family_id, nullptr /*HistogramImpl*/, nullptr /*IOTracer*/);
+
+ BlobSource blob_source(&immutable_options, db_id_, db_session_id_,
+ blob_file_cache.get());
+
+ ReadOptions read_options;
+ read_options.verify_checksums = true;
+
+ uint64_t bytes_read = 0;
+ std::vector<PinnableSlice> values(keys.size());
+
+ {
+ // Snappy Compression
+ const uint64_t file_number = 1;
+
+ read_options.read_tier = ReadTier::kReadAllTier;
+
+ WriteBlobFile(immutable_options, column_family_id, has_ttl,
+ expiration_range, expiration_range, file_number, keys, blobs,
+ compression, blob_offsets, blob_sizes);
+
+ CacheHandleGuard<BlobFileReader> blob_file_reader;
+ ASSERT_OK(blob_source.GetBlobFileReader(file_number, &blob_file_reader));
+ ASSERT_NE(blob_file_reader.GetValue(), nullptr);
+
+ const uint64_t file_size = blob_file_reader.GetValue()->GetFileSize();
+ ASSERT_EQ(blob_file_reader.GetValue()->GetCompressionType(), compression);
+
+ // Sanity check: every record was actually compressed, i.e. its on-disk
+ // size differs from the raw value size.
+ for (size_t i = 0; i < num_blobs; ++i) {
+ ASSERT_NE(blobs[i].size() /*uncompressed size*/,
+ blob_sizes[i] /*compressed size*/);
+ }
+
+ // First pass: cold cache, reads hit the blob file and fill the cache.
+ read_options.fill_cache = true;
+ read_options.read_tier = ReadTier::kReadAllTier;
+ get_perf_context()->Reset();
+
+ for (size_t i = 0; i < num_blobs; ++i) {
+ ASSERT_FALSE(blob_source.TEST_BlobInCache(file_number, file_size,
+ blob_offsets[i]));
+ ASSERT_OK(blob_source.GetBlob(read_options, keys[i], file_number,
+ blob_offsets[i], file_size, blob_sizes[i],
+ compression, nullptr /*prefetch_buffer*/,
+ &values[i], &bytes_read));
+ ASSERT_EQ(values[i], blobs[i] /*uncompressed blob*/);
+ ASSERT_NE(values[i].size(), blob_sizes[i] /*compressed size*/);
+ ASSERT_EQ(bytes_read,
+ BlobLogRecord::kHeaderSize + keys[i].size() + blob_sizes[i]);
+
+ ASSERT_TRUE(blob_source.TEST_BlobInCache(file_number, file_size,
+ blob_offsets[i]));
+ }
+
+ // NOTE(review): >= 0 is trivially true for this counter; presumably this
+ // was meant to be ASSERT_GT to prove decompression happened -- confirm.
+ ASSERT_GE((int)get_perf_context()->blob_decompress_time, 0);
+
+ // Second pass: cache-only tier; values must come from the (uncompressed)
+ // blob cache, so no decompression time may accrue (checked below).
+ read_options.read_tier = ReadTier::kBlockCacheTier;
+ get_perf_context()->Reset();
+
+ for (size_t i = 0; i < num_blobs; ++i) {
+ ASSERT_TRUE(blob_source.TEST_BlobInCache(file_number, file_size,
+ blob_offsets[i]));
+
+ // Compressed blob size is passed in GetBlob
+ ASSERT_OK(blob_source.GetBlob(read_options, keys[i], file_number,
+ blob_offsets[i], file_size, blob_sizes[i],
+ compression, nullptr /*prefetch_buffer*/,
+ &values[i], &bytes_read));
+ ASSERT_EQ(values[i], blobs[i] /*uncompressed blob*/);
+ ASSERT_NE(values[i].size(), blob_sizes[i] /*compressed size*/);
+ ASSERT_EQ(bytes_read,
+ BlobLogRecord::kHeaderSize + keys[i].size() + blob_sizes[i]);
+
+ ASSERT_TRUE(blob_source.TEST_BlobInCache(file_number, file_size,
+ blob_offsets[i]));
+ }
+
+ ASSERT_EQ((int)get_perf_context()->blob_decompress_time, 0);
+ }
+}
+
+// Exercises BlobSource::MultiGetBlob across several blob files at once: the
+// same key/blob set is written to two files, batch-read (filling the blob
+// cache), and then re-read together with a request against a non-existent
+// file to check that per-request statuses and cache statistics stay correct.
+TEST_F(BlobSourceTest, MultiGetBlobsFromMultiFiles) {
+ options_.cf_paths.emplace_back(
+ test::PerThreadDBPath(env_, "BlobSourceTest_MultiGetBlobsFromMultiFiles"),
+ 0);
+
+ options_.statistics = CreateDBStatistics();
+ Statistics* statistics = options_.statistics.get();
+ assert(statistics);
+
+ DestroyAndReopen(options_);
+
+ ImmutableOptions immutable_options(options_);
+
+ constexpr uint32_t column_family_id = 1;
+ constexpr bool has_ttl = false;
+ constexpr ExpirationRange expiration_range;
+ constexpr uint64_t blob_files = 2;
+ constexpr size_t num_blobs = 32;
+
+ std::vector<std::string> key_strs;
+ std::vector<std::string> blob_strs;
+
+ for (size_t i = 0; i < num_blobs; ++i) {
+ key_strs.push_back("key" + std::to_string(i));
+ blob_strs.push_back("blob" + std::to_string(i));
+ }
+
+ std::vector<Slice> keys;
+ std::vector<Slice> blobs;
+
+ // Compute the expected file size (header + all records + footer) as well
+ // as the cumulative value bytes, used for byte-count assertions below.
+ uint64_t file_size = BlobLogHeader::kSize;
+ uint64_t blob_value_bytes = 0;
+ for (size_t i = 0; i < num_blobs; ++i) {
+ keys.push_back({key_strs[i]});
+ blobs.push_back({blob_strs[i]});
+ blob_value_bytes += blobs[i].size();
+ file_size += BlobLogRecord::kHeaderSize + keys[i].size() + blobs[i].size();
+ }
+ file_size += BlobLogFooter::kSize;
+ const uint64_t blob_records_bytes =
+ file_size - BlobLogHeader::kSize - BlobLogFooter::kSize;
+
+ std::vector<uint64_t> blob_offsets(keys.size());
+ std::vector<uint64_t> blob_sizes(keys.size());
+
+ {
+ // Write key/blob pairs to multiple blob files.
+ for (size_t i = 0; i < blob_files; ++i) {
+ const uint64_t file_number = i + 1;
+ WriteBlobFile(immutable_options, column_family_id, has_ttl,
+ expiration_range, expiration_range, file_number, keys,
+ blobs, kNoCompression, blob_offsets, blob_sizes);
+ }
+ }
+
+ constexpr size_t capacity = 10;
+ std::shared_ptr<Cache> backing_cache =
+ NewLRUCache(capacity); // Blob file cache
+
+ FileOptions file_options;
+ constexpr HistogramImpl* blob_file_read_hist = nullptr;
+
+ std::unique_ptr<BlobFileCache> blob_file_cache =
+ std::make_unique<BlobFileCache>(
+ backing_cache.get(), &immutable_options, &file_options,
+ column_family_id, blob_file_read_hist, nullptr /*IOTracer*/);
+
+ BlobSource blob_source(&immutable_options, db_id_, db_session_id_,
+ blob_file_cache.get());
+
+ ReadOptions read_options;
+ read_options.verify_checksums = true;
+
+ uint64_t bytes_read = 0;
+
+ {
+ // MultiGetBlob
+ read_options.fill_cache = true;
+ read_options.read_tier = ReadTier::kReadAllTier;
+
+ // Build one batched request per file; all requests in a file share the
+ // same offsets/sizes since the files have identical contents.
+ autovector<BlobFileReadRequests> blob_reqs;
+ std::array<autovector<BlobReadRequest>, blob_files> blob_reqs_in_file;
+ std::array<PinnableSlice, num_blobs * blob_files> value_buf;
+ std::array<Status, num_blobs * blob_files> statuses_buf;
+
+ for (size_t i = 0; i < blob_files; ++i) {
+ const uint64_t file_number = i + 1;
+ for (size_t j = 0; j < num_blobs; ++j) {
+ blob_reqs_in_file[i].emplace_back(
+ keys[j], blob_offsets[j], blob_sizes[j], kNoCompression,
+ &value_buf[i * num_blobs + j], &statuses_buf[i * num_blobs + j]);
+ }
+ blob_reqs.emplace_back(file_number, file_size, blob_reqs_in_file[i]);
+ }
+
+ get_perf_context()->Reset();
+ statistics->Reset().PermitUncheckedError();
+
+ blob_source.MultiGetBlob(read_options, blob_reqs, &bytes_read);
+
+ for (size_t i = 0; i < blob_files; ++i) {
+ const uint64_t file_number = i + 1;
+ for (size_t j = 0; j < num_blobs; ++j) {
+ ASSERT_OK(statuses_buf[i * num_blobs + j]);
+ ASSERT_EQ(value_buf[i * num_blobs + j], blobs[j]);
+ ASSERT_TRUE(blob_source.TEST_BlobInCache(file_number, file_size,
+ blob_offsets[j]));
+ }
+ }
+
+ // Retrieved all blobs from 2 blob files twice via MultiGetBlob and
+ // TEST_BlobInCache.
+ ASSERT_EQ((int)get_perf_context()->blob_cache_hit_count,
+ num_blobs * blob_files);
+ ASSERT_EQ((int)get_perf_context()->blob_read_count,
+ num_blobs * blob_files); // blocking i/o
+ ASSERT_EQ((int)get_perf_context()->blob_read_byte,
+ blob_records_bytes * blob_files); // blocking i/o
+ ASSERT_GE((int)get_perf_context()->blob_checksum_time, 0);
+ ASSERT_EQ((int)get_perf_context()->blob_decompress_time, 0);
+
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_MISS),
+ num_blobs * blob_files); // MultiGetBlob
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_HIT),
+ num_blobs * blob_files); // TEST_BlobInCache
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_ADD),
+ num_blobs * blob_files); // MultiGetBlob
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_READ),
+ blob_value_bytes * blob_files); // TEST_BlobInCache
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_WRITE),
+ blob_value_bytes * blob_files); // MultiGetBlob
+
+ get_perf_context()->Reset();
+ statistics->Reset().PermitUncheckedError();
+
+ autovector<BlobReadRequest> fake_blob_reqs_in_file;
+ std::array<PinnableSlice, num_blobs> fake_value_buf;
+ std::array<Status, num_blobs> fake_statuses_buf;
+
+ const uint64_t fake_file_number = 100;
+ for (size_t i = 0; i < num_blobs; ++i) {
+ fake_blob_reqs_in_file.emplace_back(
+ keys[i], blob_offsets[i], blob_sizes[i], kNoCompression,
+ &fake_value_buf[i], &fake_statuses_buf[i]);
+ }
+
+ // Add a fake multi-get blob request.
+ blob_reqs.emplace_back(fake_file_number, file_size, fake_blob_reqs_in_file);
+
+ blob_source.MultiGetBlob(read_options, blob_reqs, &bytes_read);
+
+ // Check the real blob read requests.
+ for (size_t i = 0; i < blob_files; ++i) {
+ const uint64_t file_number = i + 1;
+ for (size_t j = 0; j < num_blobs; ++j) {
+ ASSERT_OK(statuses_buf[i * num_blobs + j]);
+ ASSERT_EQ(value_buf[i * num_blobs + j], blobs[j]);
+ ASSERT_TRUE(blob_source.TEST_BlobInCache(file_number, file_size,
+ blob_offsets[j]));
+ }
+ }
+
+ // Check the fake blob request: the file does not exist, so every request
+ // must fail with an I/O error and nothing may be cached under it.
+ for (size_t i = 0; i < num_blobs; ++i) {
+ ASSERT_TRUE(fake_statuses_buf[i].IsIOError());
+ ASSERT_TRUE(fake_value_buf[i].empty());
+ ASSERT_FALSE(blob_source.TEST_BlobInCache(fake_file_number, file_size,
+ blob_offsets[i]));
+ }
+
+ // Retrieved all blobs from 3 blob files (including the fake one) twice
+ // via MultiGetBlob and TEST_BlobInCache.
+ ASSERT_EQ((int)get_perf_context()->blob_cache_hit_count,
+ num_blobs * blob_files * 2);
+ ASSERT_EQ((int)get_perf_context()->blob_read_count,
+ 0); // blocking i/o
+ ASSERT_EQ((int)get_perf_context()->blob_read_byte,
+ 0); // blocking i/o
+ ASSERT_GE((int)get_perf_context()->blob_checksum_time, 0);
+ ASSERT_EQ((int)get_perf_context()->blob_decompress_time, 0);
+
+ // Fake blob requests: MultiGetBlob and TEST_BlobInCache
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_MISS), num_blobs * 2);
+ // Real blob requests: MultiGetBlob and TEST_BlobInCache
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_HIT),
+ num_blobs * blob_files * 2);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_ADD), 0);
+ // Real blob requests: MultiGetBlob and TEST_BlobInCache
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_READ),
+ blob_value_bytes * blob_files * 2);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_WRITE), 0);
+ }
+}
+
+// Exercises BlobSource::MultiGetBlobFromOneFile in four phases: (1) batch-read
+// half of the blobs (even indices) from the file and verify cache fills, (2)
+// fetch the odd-indexed rest via GetBlob, then batch-read everything
+// cache-only, (3) after evicting the cache, verify cache-only reads return
+// Incomplete, and (4) verify reads against a non-existent file return IOError.
+TEST_F(BlobSourceTest, MultiGetBlobsFromCache) {
+ options_.cf_paths.emplace_back(
+ test::PerThreadDBPath(env_, "BlobSourceTest_MultiGetBlobsFromCache"), 0);
+
+ options_.statistics = CreateDBStatistics();
+ Statistics* statistics = options_.statistics.get();
+ assert(statistics);
+
+ DestroyAndReopen(options_);
+
+ ImmutableOptions immutable_options(options_);
+
+ constexpr uint32_t column_family_id = 1;
+ constexpr bool has_ttl = false;
+ constexpr ExpirationRange expiration_range;
+ constexpr uint64_t blob_file_number = 1;
+ constexpr size_t num_blobs = 16;
+
+ std::vector<std::string> key_strs;
+ std::vector<std::string> blob_strs;
+
+ for (size_t i = 0; i < num_blobs; ++i) {
+ key_strs.push_back("key" + std::to_string(i));
+ blob_strs.push_back("blob" + std::to_string(i));
+ }
+
+ std::vector<Slice> keys;
+ std::vector<Slice> blobs;
+
+ // Expected file size: header + every record (header, key, value) + footer.
+ uint64_t file_size = BlobLogHeader::kSize;
+ for (size_t i = 0; i < num_blobs; ++i) {
+ keys.push_back({key_strs[i]});
+ blobs.push_back({blob_strs[i]});
+ file_size += BlobLogRecord::kHeaderSize + keys[i].size() + blobs[i].size();
+ }
+ file_size += BlobLogFooter::kSize;
+
+ std::vector<uint64_t> blob_offsets(keys.size());
+ std::vector<uint64_t> blob_sizes(keys.size());
+
+ WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range,
+ expiration_range, blob_file_number, keys, blobs, kNoCompression,
+ blob_offsets, blob_sizes);
+
+ constexpr size_t capacity = 10;
+ std::shared_ptr<Cache> backing_cache =
+ NewLRUCache(capacity); // Blob file cache
+
+ FileOptions file_options;
+ constexpr HistogramImpl* blob_file_read_hist = nullptr;
+
+ std::unique_ptr<BlobFileCache> blob_file_cache =
+ std::make_unique<BlobFileCache>(
+ backing_cache.get(), &immutable_options, &file_options,
+ column_family_id, blob_file_read_hist, nullptr /*IOTracer*/);
+
+ BlobSource blob_source(&immutable_options, db_id_, db_session_id_,
+ blob_file_cache.get());
+
+ ReadOptions read_options;
+ read_options.verify_checksums = true;
+
+ constexpr FilePrefetchBuffer* prefetch_buffer = nullptr;
+
+ {
+ // MultiGetBlobFromOneFile
+ uint64_t bytes_read = 0;
+ std::array<Status, num_blobs> statuses_buf;
+ std::array<PinnableSlice, num_blobs> value_buf;
+ autovector<BlobReadRequest> blob_reqs;
+
+ for (size_t i = 0; i < num_blobs; i += 2) { // even index
+ blob_reqs.emplace_back(keys[i], blob_offsets[i], blob_sizes[i],
+ kNoCompression, &value_buf[i], &statuses_buf[i]);
+ ASSERT_FALSE(blob_source.TEST_BlobInCache(blob_file_number, file_size,
+ blob_offsets[i]));
+ }
+
+ read_options.fill_cache = true;
+ read_options.read_tier = ReadTier::kReadAllTier;
+ get_perf_context()->Reset();
+ statistics->Reset().PermitUncheckedError();
+
+ // Get half of blobs
+ blob_source.MultiGetBlobFromOneFile(read_options, blob_file_number,
+ file_size, blob_reqs, &bytes_read);
+
+ // Even-indexed entries were requested: they must succeed and land in the
+ // cache. Odd-indexed entries were not requested: their slices stay empty
+ // and nothing is cached for them.
+ uint64_t fs_read_bytes = 0;
+ uint64_t ca_read_bytes = 0;
+ for (size_t i = 0; i < num_blobs; ++i) {
+ if (i % 2 == 0) {
+ ASSERT_OK(statuses_buf[i]);
+ ASSERT_EQ(value_buf[i], blobs[i]);
+ ASSERT_TRUE(value_buf[i].IsPinned());
+ fs_read_bytes +=
+ blob_sizes[i] + keys[i].size() + BlobLogRecord::kHeaderSize;
+ ASSERT_TRUE(blob_source.TEST_BlobInCache(blob_file_number, file_size,
+ blob_offsets[i]));
+ ca_read_bytes += blob_sizes[i];
+ } else {
+ statuses_buf[i].PermitUncheckedError();
+ ASSERT_TRUE(value_buf[i].empty());
+ ASSERT_FALSE(value_buf[i].IsPinned());
+ ASSERT_FALSE(blob_source.TEST_BlobInCache(blob_file_number, file_size,
+ blob_offsets[i]));
+ }
+ }
+
+ constexpr int num_even_blobs = num_blobs / 2;
+ ASSERT_EQ((int)get_perf_context()->blob_cache_hit_count, num_even_blobs);
+ ASSERT_EQ((int)get_perf_context()->blob_read_count,
+ num_even_blobs); // blocking i/o
+ ASSERT_EQ((int)get_perf_context()->blob_read_byte,
+ fs_read_bytes); // blocking i/o
+ ASSERT_GE((int)get_perf_context()->blob_checksum_time, 0);
+ ASSERT_EQ((int)get_perf_context()->blob_decompress_time, 0);
+
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_MISS), num_blobs);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_HIT), num_even_blobs);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_ADD), num_even_blobs);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_READ),
+ ca_read_bytes);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_WRITE),
+ ca_read_bytes);
+
+ // Get the rest of blobs
+ for (size_t i = 1; i < num_blobs; i += 2) { // odd index
+ ASSERT_FALSE(blob_source.TEST_BlobInCache(blob_file_number, file_size,
+ blob_offsets[i]));
+
+ ASSERT_OK(blob_source.GetBlob(read_options, keys[i], blob_file_number,
+ blob_offsets[i], file_size, blob_sizes[i],
+ kNoCompression, prefetch_buffer,
+ &value_buf[i], &bytes_read));
+ ASSERT_EQ(value_buf[i], blobs[i]);
+ ASSERT_TRUE(value_buf[i].IsPinned());
+ ASSERT_EQ(bytes_read,
+ BlobLogRecord::kHeaderSize + keys[i].size() + blob_sizes[i]);
+
+ ASSERT_TRUE(blob_source.TEST_BlobInCache(blob_file_number, file_size,
+ blob_offsets[i]));
+ }
+
+ // Cache-only MultiGetBlobFromOneFile
+ read_options.read_tier = ReadTier::kBlockCacheTier;
+ get_perf_context()->Reset();
+ statistics->Reset().PermitUncheckedError();
+
+ blob_reqs.clear();
+ for (size_t i = 0; i < num_blobs; ++i) {
+ blob_reqs.emplace_back(keys[i], blob_offsets[i], blob_sizes[i],
+ kNoCompression, &value_buf[i], &statuses_buf[i]);
+ }
+
+ blob_source.MultiGetBlobFromOneFile(read_options, blob_file_number,
+ file_size, blob_reqs, &bytes_read);
+
+ uint64_t blob_bytes = 0;
+ for (size_t i = 0; i < num_blobs; ++i) {
+ ASSERT_OK(statuses_buf[i]);
+ ASSERT_EQ(value_buf[i], blobs[i]);
+ ASSERT_TRUE(value_buf[i].IsPinned());
+ ASSERT_TRUE(blob_source.TEST_BlobInCache(blob_file_number, file_size,
+ blob_offsets[i]));
+ blob_bytes += blob_sizes[i];
+ }
+
+ // Retrieved the blob cache num_blobs * 2 times via GetBlob and
+ // TEST_BlobInCache.
+ ASSERT_EQ((int)get_perf_context()->blob_cache_hit_count, num_blobs * 2);
+ ASSERT_EQ((int)get_perf_context()->blob_read_count, 0); // blocking i/o
+ ASSERT_EQ((int)get_perf_context()->blob_read_byte, 0); // blocking i/o
+ ASSERT_GE((int)get_perf_context()->blob_checksum_time, 0);
+ ASSERT_EQ((int)get_perf_context()->blob_decompress_time, 0);
+
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_MISS), 0);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_HIT), num_blobs * 2);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_ADD), 0);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_READ),
+ blob_bytes * 2);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_WRITE), 0);
+ }
+
+ // Empty the blob cache so the next phase starts cold again.
+ options_.blob_cache->EraseUnRefEntries();
+
+ {
+ // Cache-only MultiGetBlobFromOneFile
+ uint64_t bytes_read = 0;
+ read_options.read_tier = ReadTier::kBlockCacheTier;
+
+ std::array<Status, num_blobs> statuses_buf;
+ std::array<PinnableSlice, num_blobs> value_buf;
+ autovector<BlobReadRequest> blob_reqs;
+
+ for (size_t i = 0; i < num_blobs; i++) {
+ blob_reqs.emplace_back(keys[i], blob_offsets[i], blob_sizes[i],
+ kNoCompression, &value_buf[i], &statuses_buf[i]);
+ ASSERT_FALSE(blob_source.TEST_BlobInCache(blob_file_number, file_size,
+ blob_offsets[i]));
+ }
+
+ get_perf_context()->Reset();
+ statistics->Reset().PermitUncheckedError();
+
+ blob_source.MultiGetBlobFromOneFile(read_options, blob_file_number,
+ file_size, blob_reqs, &bytes_read);
+
+ // Cache-only reads with an empty cache must report Incomplete without
+ // touching the file or populating the cache.
+ for (size_t i = 0; i < num_blobs; ++i) {
+ ASSERT_TRUE(statuses_buf[i].IsIncomplete());
+ ASSERT_TRUE(value_buf[i].empty());
+ ASSERT_FALSE(value_buf[i].IsPinned());
+ ASSERT_FALSE(blob_source.TEST_BlobInCache(blob_file_number, file_size,
+ blob_offsets[i]));
+ }
+
+ ASSERT_EQ((int)get_perf_context()->blob_cache_hit_count, 0);
+ ASSERT_EQ((int)get_perf_context()->blob_read_count, 0); // blocking i/o
+ ASSERT_EQ((int)get_perf_context()->blob_read_byte, 0); // blocking i/o
+ ASSERT_EQ((int)get_perf_context()->blob_checksum_time, 0);
+ ASSERT_EQ((int)get_perf_context()->blob_decompress_time, 0);
+
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_MISS), num_blobs * 2);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_HIT), 0);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_ADD), 0);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_READ), 0);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_WRITE), 0);
+ }
+
+ {
+ // MultiGetBlobFromOneFile from non-existing file
+ uint64_t bytes_read = 0;
+ uint64_t non_existing_file_number = 100;
+ read_options.read_tier = ReadTier::kReadAllTier;
+
+ std::array<Status, num_blobs> statuses_buf;
+ std::array<PinnableSlice, num_blobs> value_buf;
+ autovector<BlobReadRequest> blob_reqs;
+
+ for (size_t i = 0; i < num_blobs; i++) {
+ blob_reqs.emplace_back(keys[i], blob_offsets[i], blob_sizes[i],
+ kNoCompression, &value_buf[i], &statuses_buf[i]);
+ ASSERT_FALSE(blob_source.TEST_BlobInCache(non_existing_file_number,
+ file_size, blob_offsets[i]));
+ }
+
+ get_perf_context()->Reset();
+ statistics->Reset().PermitUncheckedError();
+
+ blob_source.MultiGetBlobFromOneFile(read_options, non_existing_file_number,
+ file_size, blob_reqs, &bytes_read);
+
+ // A missing file surfaces as IOError for every request in the batch.
+ for (size_t i = 0; i < num_blobs; ++i) {
+ ASSERT_TRUE(statuses_buf[i].IsIOError());
+ ASSERT_TRUE(value_buf[i].empty());
+ ASSERT_FALSE(value_buf[i].IsPinned());
+ ASSERT_FALSE(blob_source.TEST_BlobInCache(non_existing_file_number,
+ file_size, blob_offsets[i]));
+ }
+
+ ASSERT_EQ((int)get_perf_context()->blob_cache_hit_count, 0);
+ ASSERT_EQ((int)get_perf_context()->blob_read_count, 0); // blocking i/o
+ ASSERT_EQ((int)get_perf_context()->blob_read_byte, 0); // blocking i/o
+ ASSERT_EQ((int)get_perf_context()->blob_checksum_time, 0);
+ ASSERT_EQ((int)get_perf_context()->blob_decompress_time, 0);
+
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_MISS), num_blobs * 2);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_HIT), 0);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_ADD), 0);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_READ), 0);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_WRITE), 0);
+ }
+}
+
+// Test fixture that layers a deliberately tiny primary (LRU) blob cache over
+// a compressed secondary cache, so that evictions demote entries to the
+// secondary tier and the tests can observe promotion/demotion behavior.
+class BlobSecondaryCacheTest : public DBTestBase {
+ public:
+ BlobSecondaryCacheTest()
+ : DBTestBase("blob_secondary_cache_test", /*env_do_fsync=*/true) {
+ options_.env = env_;
+ options_.enable_blob_files = true;
+ options_.create_if_missing = true;
+
+ // Set a small cache capacity to evict entries from the cache, and to test
+ // that secondary cache is used properly.
+ lru_cache_opts_.capacity = 1024;
+ lru_cache_opts_.num_shard_bits = 0;
+ lru_cache_opts_.strict_capacity_limit = true;
+ lru_cache_opts_.metadata_charge_policy = kDontChargeCacheMetadata;
+ lru_cache_opts_.high_pri_pool_ratio = 0.2;
+ lru_cache_opts_.low_pri_pool_ratio = 0.2;
+
+ secondary_cache_opts_.capacity = 8 << 20; // 8 MB
+ secondary_cache_opts_.num_shard_bits = 0;
+ secondary_cache_opts_.metadata_charge_policy =
+ kDefaultCacheMetadataChargePolicy;
+
+ // Read blobs from the secondary cache if they are not in the primary cache
+ options_.lowest_used_cache_tier = CacheTier::kNonVolatileBlockTier;
+
+ // Fetch the ids outside of assert(): the previous code asserted directly
+ // on the side-effecting calls, so in NDEBUG builds (where assert compiles
+ // away) db_id_/db_session_id_ were never populated.
+ Status s = db_->GetDbIdentity(db_id_);
+ assert(s.ok());
+ s.PermitUncheckedError();
+ s = db_->GetDbSessionId(db_session_id_);
+ assert(s.ok());
+ s.PermitUncheckedError();
+ }
+
+ // Base DB options shared by the tests; tests add their own cache settings.
+ Options options_;
+
+ // Primary (volatile) tier options and the compressed secondary tier ones.
+ LRUCacheOptions lru_cache_opts_;
+ CompressedSecondaryCacheOptions secondary_cache_opts_;
+
+ // Identity of the opened DB; used to build blob cache keys in the tests.
+ std::string db_id_;
+ std::string db_session_id_;
+};
+
+TEST_F(BlobSecondaryCacheTest, GetBlobsFromSecondaryCache) {
+ if (!Snappy_Supported()) {
+ return;
+ }
+
+ secondary_cache_opts_.compression_type = kSnappyCompression;
+ lru_cache_opts_.secondary_cache =
+ NewCompressedSecondaryCache(secondary_cache_opts_);
+ options_.blob_cache = NewLRUCache(lru_cache_opts_);
+
+ options_.cf_paths.emplace_back(
+ test::PerThreadDBPath(
+ env_, "BlobSecondaryCacheTest_GetBlobsFromSecondaryCache"),
+ 0);
+
+ options_.statistics = CreateDBStatistics();
+ Statistics* statistics = options_.statistics.get();
+ assert(statistics);
+
+ DestroyAndReopen(options_);
+
+ ImmutableOptions immutable_options(options_);
+
+ constexpr uint32_t column_family_id = 1;
+ constexpr bool has_ttl = false;
+ constexpr ExpirationRange expiration_range;
+ constexpr uint64_t file_number = 1;
+
+ Random rnd(301);
+
+ std::vector<std::string> key_strs{"key0", "key1"};
+ std::vector<std::string> blob_strs{rnd.RandomString(512),
+ rnd.RandomString(768)};
+
+ std::vector<Slice> keys{key_strs[0], key_strs[1]};
+ std::vector<Slice> blobs{blob_strs[0], blob_strs[1]};
+
+ std::vector<uint64_t> blob_offsets(keys.size());
+ std::vector<uint64_t> blob_sizes(keys.size());
+
+ WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range,
+ expiration_range, file_number, keys, blobs, kNoCompression,
+ blob_offsets, blob_sizes);
+
+ constexpr size_t capacity = 1024;
+ std::shared_ptr<Cache> backing_cache = NewLRUCache(capacity);
+
+ FileOptions file_options;
+ constexpr HistogramImpl* blob_file_read_hist = nullptr;
+
+ std::unique_ptr<BlobFileCache> blob_file_cache(new BlobFileCache(
+ backing_cache.get(), &immutable_options, &file_options, column_family_id,
+ blob_file_read_hist, nullptr /*IOTracer*/));
+
+ BlobSource blob_source(&immutable_options, db_id_, db_session_id_,
+ blob_file_cache.get());
+
+ CacheHandleGuard<BlobFileReader> file_reader;
+ ASSERT_OK(blob_source.GetBlobFileReader(file_number, &file_reader));
+ ASSERT_NE(file_reader.GetValue(), nullptr);
+ const uint64_t file_size = file_reader.GetValue()->GetFileSize();
+ ASSERT_EQ(file_reader.GetValue()->GetCompressionType(), kNoCompression);
+
+ ReadOptions read_options;
+ read_options.verify_checksums = true;
+
+ auto blob_cache = options_.blob_cache;
+ auto secondary_cache = lru_cache_opts_.secondary_cache;
+
+ Cache::CreateCallback create_cb = [](const void* buf, size_t size,
+ void** out_obj,
+ size_t* charge) -> Status {
+ CacheAllocationPtr allocation(new char[size]);
+
+ return BlobContents::CreateCallback(std::move(allocation), buf, size,
+ out_obj, charge);
+ };
+
+ {
+ // GetBlob
+ std::vector<PinnableSlice> values(keys.size());
+
+ read_options.fill_cache = true;
+ get_perf_context()->Reset();
+
+ // key0 should be filled to the primary cache from the blob file.
+ ASSERT_OK(blob_source.GetBlob(read_options, keys[0], file_number,
+ blob_offsets[0], file_size, blob_sizes[0],
+ kNoCompression, nullptr /* prefetch_buffer */,
+ &values[0], nullptr /* bytes_read */));
+ // Release cache handle
+ values[0].Reset();
+
+ // key0 should be evicted and key0's dummy item is inserted into secondary
+ // cache. key1 should be filled to the primary cache from the blob file.
+ ASSERT_OK(blob_source.GetBlob(read_options, keys[1], file_number,
+ blob_offsets[1], file_size, blob_sizes[1],
+ kNoCompression, nullptr /* prefetch_buffer */,
+ &values[1], nullptr /* bytes_read */));
+
+ // Release cache handle
+ values[1].Reset();
+
+ // key0 should be filled to the primary cache from the blob file. key1
+ // should be evicted and key1's dummy item is inserted into secondary cache.
+ ASSERT_OK(blob_source.GetBlob(read_options, keys[0], file_number,
+ blob_offsets[0], file_size, blob_sizes[0],
+ kNoCompression, nullptr /* prefetch_buffer */,
+ &values[0], nullptr /* bytes_read */));
+ ASSERT_EQ(values[0], blobs[0]);
+ ASSERT_TRUE(
+ blob_source.TEST_BlobInCache(file_number, file_size, blob_offsets[0]));
+
+ // Release cache handle
+ values[0].Reset();
+
+ // key0 should be evicted and is inserted into secondary cache.
+ // key1 should be filled to the primary cache from the blob file.
+ ASSERT_OK(blob_source.GetBlob(read_options, keys[1], file_number,
+ blob_offsets[1], file_size, blob_sizes[1],
+ kNoCompression, nullptr /* prefetch_buffer */,
+ &values[1], nullptr /* bytes_read */));
+ ASSERT_EQ(values[1], blobs[1]);
+ ASSERT_TRUE(
+ blob_source.TEST_BlobInCache(file_number, file_size, blob_offsets[1]));
+
+ // Release cache handle
+ values[1].Reset();
+
+ OffsetableCacheKey base_cache_key(db_id_, db_session_id_, file_number);
+
+ // blob_cache here only looks at the primary cache since we didn't provide
+ // the cache item helper for the secondary cache. However, since key0 is
+ // demoted to the secondary cache, we shouldn't be able to find it in the
+ // primary cache.
+ {
+ CacheKey cache_key = base_cache_key.WithOffset(blob_offsets[0]);
+ const Slice key0 = cache_key.AsSlice();
+ auto handle0 = blob_cache->Lookup(key0, statistics);
+ ASSERT_EQ(handle0, nullptr);
+
+ // key0's item should be in the secondary cache.
+ bool is_in_sec_cache = false;
+ auto sec_handle0 =
+ secondary_cache->Lookup(key0, create_cb, true,
+ /*advise_erase=*/true, is_in_sec_cache);
+ ASSERT_FALSE(is_in_sec_cache);
+ ASSERT_NE(sec_handle0, nullptr);
+ ASSERT_TRUE(sec_handle0->IsReady());
+ auto value = static_cast<BlobContents*>(sec_handle0->Value());
+ ASSERT_NE(value, nullptr);
+ ASSERT_EQ(value->data(), blobs[0]);
+ delete value;
+
+ // key0 doesn't exist in the blob cache although key0's dummy
+ // item exist in the secondary cache.
+ ASSERT_FALSE(blob_source.TEST_BlobInCache(file_number, file_size,
+ blob_offsets[0]));
+ }
+
+ // key1 should exists in the primary cache. key1's dummy item exists
+ // in the secondary cache.
+ {
+ CacheKey cache_key = base_cache_key.WithOffset(blob_offsets[1]);
+ const Slice key1 = cache_key.AsSlice();
+ auto handle1 = blob_cache->Lookup(key1, statistics);
+ ASSERT_NE(handle1, nullptr);
+ blob_cache->Release(handle1);
+
+ bool is_in_sec_cache = false;
+ auto sec_handle1 =
+ secondary_cache->Lookup(key1, create_cb, true,
+ /*advise_erase=*/true, is_in_sec_cache);
+ ASSERT_FALSE(is_in_sec_cache);
+ ASSERT_EQ(sec_handle1, nullptr);
+
+ ASSERT_TRUE(blob_source.TEST_BlobInCache(file_number, file_size,
+ blob_offsets[1]));
+ }
+
+ {
+ // fetch key0 from the blob file to the primary cache.
+ // key1 is evicted and inserted into the secondary cache.
+ ASSERT_OK(blob_source.GetBlob(
+ read_options, keys[0], file_number, blob_offsets[0], file_size,
+ blob_sizes[0], kNoCompression, nullptr /* prefetch_buffer */,
+ &values[0], nullptr /* bytes_read */));
+ ASSERT_EQ(values[0], blobs[0]);
+
+ // Release cache handle
+ values[0].Reset();
+
+ // key0 should be in the primary cache.
+ CacheKey cache_key0 = base_cache_key.WithOffset(blob_offsets[0]);
+ const Slice key0 = cache_key0.AsSlice();
+ auto handle0 = blob_cache->Lookup(key0, statistics);
+ ASSERT_NE(handle0, nullptr);
+ auto value = static_cast<BlobContents*>(blob_cache->Value(handle0));
+ ASSERT_NE(value, nullptr);
+ ASSERT_EQ(value->data(), blobs[0]);
+ blob_cache->Release(handle0);
+
+ // key1 is not in the primary cache and is in the secondary cache.
+ CacheKey cache_key1 = base_cache_key.WithOffset(blob_offsets[1]);
+ const Slice key1 = cache_key1.AsSlice();
+ auto handle1 = blob_cache->Lookup(key1, statistics);
+ ASSERT_EQ(handle1, nullptr);
+
+ // erase key0 from the primary cache.
+ blob_cache->Erase(key0);
+ handle0 = blob_cache->Lookup(key0, statistics);
+ ASSERT_EQ(handle0, nullptr);
+
+    // key1 promotion should succeed due to the primary cache being empty. We
+    // didn't call the secondary cache's Lookup() here, because it would remove
+    // the key but wouldn't be able to promote the key to the primary cache.
+ // Instead we use the end-to-end blob source API to read key1.
+ // In function TEST_BlobInCache, key1's dummy item is inserted into the
+ // primary cache and a standalone handle is checked by GetValue().
+ ASSERT_TRUE(blob_source.TEST_BlobInCache(file_number, file_size,
+ blob_offsets[1]));
+
+ // key1's dummy handle is in the primary cache and key1's item is still
+ // in the secondary cache. So, the primary cache's Lookup() without
+ // secondary cache support cannot see it. (NOTE: The dummy handle used
+ // to be a leaky abstraction but not anymore.)
+ handle1 = blob_cache->Lookup(key1, statistics);
+ ASSERT_EQ(handle1, nullptr);
+
+ // But after another access, it is promoted to primary cache
+ ASSERT_TRUE(blob_source.TEST_BlobInCache(file_number, file_size,
+ blob_offsets[1]));
+
+ // And Lookup() can find it (without secondary cache support)
+ handle1 = blob_cache->Lookup(key1, statistics);
+ ASSERT_NE(handle1, nullptr);
+ ASSERT_NE(blob_cache->Value(handle1), nullptr);
+ blob_cache->Release(handle1);
+ }
+ }
+}
+
+class BlobSourceCacheReservationTest : public DBTestBase {
+ public:
+ explicit BlobSourceCacheReservationTest()
+ : DBTestBase("blob_source_cache_reservation_test",
+ /*env_do_fsync=*/true) {
+ options_.env = env_;
+ options_.enable_blob_files = true;
+ options_.create_if_missing = true;
+
+ LRUCacheOptions co;
+ co.capacity = kCacheCapacity;
+ co.num_shard_bits = kNumShardBits;
+ co.metadata_charge_policy = kDontChargeCacheMetadata;
+
+ co.high_pri_pool_ratio = 0.0;
+ co.low_pri_pool_ratio = 0.0;
+ std::shared_ptr<Cache> blob_cache = NewLRUCache(co);
+
+ co.high_pri_pool_ratio = 0.5;
+ co.low_pri_pool_ratio = 0.5;
+ std::shared_ptr<Cache> block_cache = NewLRUCache(co);
+
+ options_.blob_cache = blob_cache;
+ options_.lowest_used_cache_tier = CacheTier::kVolatileTier;
+
+ BlockBasedTableOptions block_based_options;
+ block_based_options.no_block_cache = false;
+ block_based_options.block_cache = block_cache;
+ block_based_options.cache_usage_options.options_overrides.insert(
+ {CacheEntryRole::kBlobCache,
+ {/* charged = */ CacheEntryRoleOptions::Decision::kEnabled}});
+ options_.table_factory.reset(
+ NewBlockBasedTableFactory(block_based_options));
+
+ assert(db_->GetDbIdentity(db_id_).ok());
+ assert(db_->GetDbSessionId(db_session_id_).ok());
+ }
+
+ void GenerateKeysAndBlobs() {
+ for (size_t i = 0; i < kNumBlobs; ++i) {
+ key_strs_.push_back("key" + std::to_string(i));
+ blob_strs_.push_back("blob" + std::to_string(i));
+ }
+
+ blob_file_size_ = BlobLogHeader::kSize;
+ for (size_t i = 0; i < kNumBlobs; ++i) {
+ keys_.push_back({key_strs_[i]});
+ blobs_.push_back({blob_strs_[i]});
+ blob_file_size_ +=
+ BlobLogRecord::kHeaderSize + keys_[i].size() + blobs_[i].size();
+ }
+ blob_file_size_ += BlobLogFooter::kSize;
+ }
+
+ static constexpr std::size_t kSizeDummyEntry = CacheReservationManagerImpl<
+ CacheEntryRole::kBlobCache>::GetDummyEntrySize();
+ static constexpr std::size_t kCacheCapacity = 1 * kSizeDummyEntry;
+ static constexpr int kNumShardBits = 0; // 2^0 shard
+
+ static constexpr uint32_t kColumnFamilyId = 1;
+ static constexpr bool kHasTTL = false;
+ static constexpr uint64_t kBlobFileNumber = 1;
+ static constexpr size_t kNumBlobs = 16;
+
+ std::vector<Slice> keys_;
+ std::vector<Slice> blobs_;
+ std::vector<std::string> key_strs_;
+ std::vector<std::string> blob_strs_;
+ uint64_t blob_file_size_;
+
+ Options options_;
+ std::string db_id_;
+ std::string db_session_id_;
+};
+
+#ifndef ROCKSDB_LITE
+TEST_F(BlobSourceCacheReservationTest, SimpleCacheReservation) {
+ options_.cf_paths.emplace_back(
+ test::PerThreadDBPath(
+ env_, "BlobSourceCacheReservationTest_SimpleCacheReservation"),
+ 0);
+
+ GenerateKeysAndBlobs();
+
+ DestroyAndReopen(options_);
+
+ ImmutableOptions immutable_options(options_);
+
+ constexpr ExpirationRange expiration_range;
+
+ std::vector<uint64_t> blob_offsets(keys_.size());
+ std::vector<uint64_t> blob_sizes(keys_.size());
+
+ WriteBlobFile(immutable_options, kColumnFamilyId, kHasTTL, expiration_range,
+ expiration_range, kBlobFileNumber, keys_, blobs_,
+ kNoCompression, blob_offsets, blob_sizes);
+
+ constexpr size_t capacity = 10;
+ std::shared_ptr<Cache> backing_cache = NewLRUCache(capacity);
+
+ FileOptions file_options;
+ constexpr HistogramImpl* blob_file_read_hist = nullptr;
+
+ std::unique_ptr<BlobFileCache> blob_file_cache =
+ std::make_unique<BlobFileCache>(
+ backing_cache.get(), &immutable_options, &file_options,
+ kColumnFamilyId, blob_file_read_hist, nullptr /*IOTracer*/);
+
+ BlobSource blob_source(&immutable_options, db_id_, db_session_id_,
+ blob_file_cache.get());
+
+ ConcurrentCacheReservationManager* cache_res_mgr =
+ static_cast<ChargedCache*>(blob_source.GetBlobCache())
+ ->TEST_GetCacheReservationManager();
+ ASSERT_NE(cache_res_mgr, nullptr);
+
+ ReadOptions read_options;
+ read_options.verify_checksums = true;
+
+ {
+ read_options.fill_cache = false;
+
+ std::vector<PinnableSlice> values(keys_.size());
+
+ for (size_t i = 0; i < kNumBlobs; ++i) {
+ ASSERT_OK(blob_source.GetBlob(
+ read_options, keys_[i], kBlobFileNumber, blob_offsets[i],
+ blob_file_size_, blob_sizes[i], kNoCompression,
+ nullptr /* prefetch_buffer */, &values[i], nullptr /* bytes_read */));
+ ASSERT_EQ(cache_res_mgr->GetTotalReservedCacheSize(), 0);
+ ASSERT_EQ(cache_res_mgr->GetTotalMemoryUsed(), 0);
+ }
+ }
+
+ {
+ read_options.fill_cache = true;
+
+ std::vector<PinnableSlice> values(keys_.size());
+
+    // kNumBlobs is 16, so the total blob cache usage is less than a single
+ // dummy entry. Therefore, cache reservation manager only reserves one dummy
+ // entry here.
+ uint64_t blob_bytes = 0;
+ for (size_t i = 0; i < kNumBlobs; ++i) {
+ ASSERT_OK(blob_source.GetBlob(
+ read_options, keys_[i], kBlobFileNumber, blob_offsets[i],
+ blob_file_size_, blob_sizes[i], kNoCompression,
+ nullptr /* prefetch_buffer */, &values[i], nullptr /* bytes_read */));
+
+ size_t charge = 0;
+ ASSERT_TRUE(blob_source.TEST_BlobInCache(kBlobFileNumber, blob_file_size_,
+ blob_offsets[i], &charge));
+
+ blob_bytes += charge;
+ ASSERT_EQ(cache_res_mgr->GetTotalReservedCacheSize(), kSizeDummyEntry);
+ ASSERT_EQ(cache_res_mgr->GetTotalMemoryUsed(), blob_bytes);
+ ASSERT_EQ(cache_res_mgr->GetTotalMemoryUsed(),
+ options_.blob_cache->GetUsage());
+ }
+ }
+
+ {
+ OffsetableCacheKey base_cache_key(db_id_, db_session_id_, kBlobFileNumber);
+ size_t blob_bytes = options_.blob_cache->GetUsage();
+
+ for (size_t i = 0; i < kNumBlobs; ++i) {
+ size_t charge = 0;
+ ASSERT_TRUE(blob_source.TEST_BlobInCache(kBlobFileNumber, blob_file_size_,
+ blob_offsets[i], &charge));
+
+ CacheKey cache_key = base_cache_key.WithOffset(blob_offsets[i]);
+      // We didn't call options_.blob_cache->Erase() here; instead, the
+      // cache wrapper's Erase() method must be called to update the
+      // cache usage after erasing the cache entry.
+ blob_source.GetBlobCache()->Erase(cache_key.AsSlice());
+ if (i == kNumBlobs - 1) {
+ // All the blobs got removed from the cache. cache_res_mgr should not
+ // reserve any space for them.
+ ASSERT_EQ(cache_res_mgr->GetTotalReservedCacheSize(), 0);
+ } else {
+ ASSERT_EQ(cache_res_mgr->GetTotalReservedCacheSize(), kSizeDummyEntry);
+ }
+ blob_bytes -= charge;
+ ASSERT_EQ(cache_res_mgr->GetTotalMemoryUsed(), blob_bytes);
+ ASSERT_EQ(cache_res_mgr->GetTotalMemoryUsed(),
+ options_.blob_cache->GetUsage());
+ }
+ }
+}
+
+TEST_F(BlobSourceCacheReservationTest, IncreaseCacheReservationOnFullCache) {
+ options_.cf_paths.emplace_back(
+ test::PerThreadDBPath(
+ env_,
+ "BlobSourceCacheReservationTest_IncreaseCacheReservationOnFullCache"),
+ 0);
+
+ GenerateKeysAndBlobs();
+
+ DestroyAndReopen(options_);
+
+ ImmutableOptions immutable_options(options_);
+ constexpr size_t blob_size = kSizeDummyEntry / (kNumBlobs / 2);
+ for (size_t i = 0; i < kNumBlobs; ++i) {
+ blob_file_size_ -= blobs_[i].size(); // old blob size
+ blob_strs_[i].resize(blob_size, '@');
+ blobs_[i] = Slice(blob_strs_[i]);
+ blob_file_size_ += blobs_[i].size(); // new blob size
+ }
+
+ std::vector<uint64_t> blob_offsets(keys_.size());
+ std::vector<uint64_t> blob_sizes(keys_.size());
+
+ constexpr ExpirationRange expiration_range;
+ WriteBlobFile(immutable_options, kColumnFamilyId, kHasTTL, expiration_range,
+ expiration_range, kBlobFileNumber, keys_, blobs_,
+ kNoCompression, blob_offsets, blob_sizes);
+
+ constexpr size_t capacity = 10;
+ std::shared_ptr<Cache> backing_cache = NewLRUCache(capacity);
+
+ FileOptions file_options;
+ constexpr HistogramImpl* blob_file_read_hist = nullptr;
+
+ std::unique_ptr<BlobFileCache> blob_file_cache =
+ std::make_unique<BlobFileCache>(
+ backing_cache.get(), &immutable_options, &file_options,
+ kColumnFamilyId, blob_file_read_hist, nullptr /*IOTracer*/);
+
+ BlobSource blob_source(&immutable_options, db_id_, db_session_id_,
+ blob_file_cache.get());
+
+ ConcurrentCacheReservationManager* cache_res_mgr =
+ static_cast<ChargedCache*>(blob_source.GetBlobCache())
+ ->TEST_GetCacheReservationManager();
+ ASSERT_NE(cache_res_mgr, nullptr);
+
+ ReadOptions read_options;
+ read_options.verify_checksums = true;
+
+ {
+ read_options.fill_cache = false;
+
+ std::vector<PinnableSlice> values(keys_.size());
+
+ for (size_t i = 0; i < kNumBlobs; ++i) {
+ ASSERT_OK(blob_source.GetBlob(
+ read_options, keys_[i], kBlobFileNumber, blob_offsets[i],
+ blob_file_size_, blob_sizes[i], kNoCompression,
+ nullptr /* prefetch_buffer */, &values[i], nullptr /* bytes_read */));
+ ASSERT_EQ(cache_res_mgr->GetTotalReservedCacheSize(), 0);
+ ASSERT_EQ(cache_res_mgr->GetTotalMemoryUsed(), 0);
+ }
+ }
+
+ {
+ read_options.fill_cache = true;
+
+ std::vector<PinnableSlice> values(keys_.size());
+
+    // Since we resized each blob to be kSizeDummyEntry / (kNumBlobs / 2), we
+ // can't fit all the blobs in the cache at the same time, which means we
+ // should observe cache evictions once we reach the cache's capacity.
+ // Due to the overhead of the cache and the BlobContents objects, as well as
+ // jemalloc bin sizes, this happens after inserting seven blobs.
+ uint64_t blob_bytes = 0;
+ for (size_t i = 0; i < kNumBlobs; ++i) {
+ ASSERT_OK(blob_source.GetBlob(
+ read_options, keys_[i], kBlobFileNumber, blob_offsets[i],
+ blob_file_size_, blob_sizes[i], kNoCompression,
+ nullptr /* prefetch_buffer */, &values[i], nullptr /* bytes_read */));
+
+ // Release cache handle
+ values[i].Reset();
+
+ if (i < kNumBlobs / 2 - 1) {
+ size_t charge = 0;
+ ASSERT_TRUE(blob_source.TEST_BlobInCache(
+ kBlobFileNumber, blob_file_size_, blob_offsets[i], &charge));
+
+ blob_bytes += charge;
+ }
+
+ ASSERT_EQ(cache_res_mgr->GetTotalReservedCacheSize(), kSizeDummyEntry);
+ ASSERT_EQ(cache_res_mgr->GetTotalMemoryUsed(), blob_bytes);
+ ASSERT_EQ(cache_res_mgr->GetTotalMemoryUsed(),
+ options_.blob_cache->GetUsage());
+ }
+ }
+}
+#endif // ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/blob/db_blob_basic_test.cc b/src/rocksdb/db/blob/db_blob_basic_test.cc
new file mode 100644
index 000000000..e6832a2ae
--- /dev/null
+++ b/src/rocksdb/db/blob/db_blob_basic_test.cc
@@ -0,0 +1,1789 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include <array>
+#include <sstream>
+#include <string>
+
+#include "cache/compressed_secondary_cache.h"
+#include "db/blob/blob_index.h"
+#include "db/blob/blob_log_format.h"
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+#include "test_util/sync_point.h"
+#include "utilities/fault_injection_env.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBBlobBasicTest : public DBTestBase {
+ protected:
+ DBBlobBasicTest()
+ : DBTestBase("db_blob_basic_test", /* env_do_fsync */ false) {}
+};
+
+TEST_F(DBBlobBasicTest, GetBlob) {
+ Options options = GetDefaultOptions();
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+
+ Reopen(options);
+
+ constexpr char key[] = "key";
+ constexpr char blob_value[] = "blob_value";
+
+ ASSERT_OK(Put(key, blob_value));
+
+ ASSERT_OK(Flush());
+
+ ASSERT_EQ(Get(key), blob_value);
+
+ // Try again with no I/O allowed. The table and the necessary blocks should
+ // already be in their respective caches; however, the blob itself can only be
+ // read from the blob file, so the read should return Incomplete.
+ ReadOptions read_options;
+ read_options.read_tier = kBlockCacheTier;
+
+ PinnableSlice result;
+ ASSERT_TRUE(db_->Get(read_options, db_->DefaultColumnFamily(), key, &result)
+ .IsIncomplete());
+}
+
+TEST_F(DBBlobBasicTest, GetBlobFromCache) {
+ Options options = GetDefaultOptions();
+
+ LRUCacheOptions co;
+ co.capacity = 2 << 20; // 2MB
+ co.num_shard_bits = 2;
+ co.metadata_charge_policy = kDontChargeCacheMetadata;
+ auto backing_cache = NewLRUCache(co);
+
+ options.enable_blob_files = true;
+ options.blob_cache = backing_cache;
+
+ BlockBasedTableOptions block_based_options;
+ block_based_options.no_block_cache = false;
+ block_based_options.block_cache = backing_cache;
+ block_based_options.cache_index_and_filter_blocks = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(block_based_options));
+
+ Reopen(options);
+
+ constexpr char key[] = "key";
+ constexpr char blob_value[] = "blob_value";
+
+ ASSERT_OK(Put(key, blob_value));
+
+ ASSERT_OK(Flush());
+
+ ReadOptions read_options;
+
+ read_options.fill_cache = false;
+
+ {
+ PinnableSlice result;
+
+ read_options.read_tier = kReadAllTier;
+ ASSERT_OK(db_->Get(read_options, db_->DefaultColumnFamily(), key, &result));
+ ASSERT_EQ(result, blob_value);
+
+ result.Reset();
+ read_options.read_tier = kBlockCacheTier;
+
+ // Try again with no I/O allowed. Since we didn't re-fill the cache, the
+ // blob itself can only be read from the blob file, so the read should
+ // return Incomplete.
+ ASSERT_TRUE(db_->Get(read_options, db_->DefaultColumnFamily(), key, &result)
+ .IsIncomplete());
+ ASSERT_TRUE(result.empty());
+ }
+
+ read_options.fill_cache = true;
+
+ {
+ PinnableSlice result;
+
+ read_options.read_tier = kReadAllTier;
+ ASSERT_OK(db_->Get(read_options, db_->DefaultColumnFamily(), key, &result));
+ ASSERT_EQ(result, blob_value);
+
+ result.Reset();
+ read_options.read_tier = kBlockCacheTier;
+
+ // Try again with no I/O allowed. The table and the necessary blocks/blobs
+ // should already be in their respective caches.
+ ASSERT_OK(db_->Get(read_options, db_->DefaultColumnFamily(), key, &result));
+ ASSERT_EQ(result, blob_value);
+ }
+}
+
+TEST_F(DBBlobBasicTest, IterateBlobsFromCache) {
+ Options options = GetDefaultOptions();
+
+ LRUCacheOptions co;
+ co.capacity = 2 << 20; // 2MB
+ co.num_shard_bits = 2;
+ co.metadata_charge_policy = kDontChargeCacheMetadata;
+ auto backing_cache = NewLRUCache(co);
+
+ options.enable_blob_files = true;
+ options.blob_cache = backing_cache;
+
+ BlockBasedTableOptions block_based_options;
+ block_based_options.no_block_cache = false;
+ block_based_options.block_cache = backing_cache;
+ block_based_options.cache_index_and_filter_blocks = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(block_based_options));
+
+ options.statistics = CreateDBStatistics();
+
+ Reopen(options);
+
+ int num_blobs = 5;
+ std::vector<std::string> keys;
+ std::vector<std::string> blobs;
+
+ for (int i = 0; i < num_blobs; ++i) {
+ keys.push_back("key" + std::to_string(i));
+ blobs.push_back("blob" + std::to_string(i));
+ ASSERT_OK(Put(keys[i], blobs[i]));
+ }
+ ASSERT_OK(Flush());
+
+ ReadOptions read_options;
+
+ {
+ read_options.fill_cache = false;
+ read_options.read_tier = kReadAllTier;
+
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ ASSERT_OK(iter->status());
+
+ int i = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ASSERT_OK(iter->status());
+ ASSERT_EQ(iter->key().ToString(), keys[i]);
+ ASSERT_EQ(iter->value().ToString(), blobs[i]);
+ ++i;
+ }
+ ASSERT_EQ(i, num_blobs);
+ ASSERT_EQ(options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_ADD), 0);
+ }
+
+ {
+ read_options.fill_cache = false;
+ read_options.read_tier = kBlockCacheTier;
+
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ ASSERT_OK(iter->status());
+
+ // Try again with no I/O allowed. Since we didn't re-fill the cache,
+ // the blob itself can only be read from the blob file, so iter->Valid()
+ // should be false.
+ iter->SeekToFirst();
+ ASSERT_NOK(iter->status());
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_EQ(options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_ADD), 0);
+ }
+
+ {
+ read_options.fill_cache = true;
+ read_options.read_tier = kReadAllTier;
+
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ ASSERT_OK(iter->status());
+
+ // Read blobs from the file and refill the cache.
+ int i = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ASSERT_OK(iter->status());
+ ASSERT_EQ(iter->key().ToString(), keys[i]);
+ ASSERT_EQ(iter->value().ToString(), blobs[i]);
+ ++i;
+ }
+ ASSERT_EQ(i, num_blobs);
+ ASSERT_EQ(options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_ADD),
+ num_blobs);
+ }
+
+ {
+ read_options.fill_cache = false;
+ read_options.read_tier = kBlockCacheTier;
+
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ ASSERT_OK(iter->status());
+
+ // Try again with no I/O allowed. The table and the necessary blocks/blobs
+ // should already be in their respective caches.
+ int i = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ASSERT_OK(iter->status());
+ ASSERT_EQ(iter->key().ToString(), keys[i]);
+ ASSERT_EQ(iter->value().ToString(), blobs[i]);
+ ++i;
+ }
+ ASSERT_EQ(i, num_blobs);
+ ASSERT_EQ(options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_ADD), 0);
+ }
+}
+
+TEST_F(DBBlobBasicTest, IterateBlobsFromCachePinning) {
+ constexpr size_t min_blob_size = 6;
+
+ Options options = GetDefaultOptions();
+
+ LRUCacheOptions cache_options;
+ cache_options.capacity = 2048;
+ cache_options.num_shard_bits = 0;
+ cache_options.metadata_charge_policy = kDontChargeCacheMetadata;
+
+ options.blob_cache = NewLRUCache(cache_options);
+ options.enable_blob_files = true;
+ options.min_blob_size = min_blob_size;
+
+ Reopen(options);
+
+ // Put then iterate over three key-values. The second value is below the size
+ // limit and is thus stored inline; the other two are stored separately as
+ // blobs. We expect to have something pinned in the cache iff we are
+ // positioned on a blob.
+
+ constexpr char first_key[] = "first_key";
+ constexpr char first_value[] = "long_value";
+ static_assert(sizeof(first_value) - 1 >= min_blob_size,
+ "first_value too short to be stored as blob");
+
+ ASSERT_OK(Put(first_key, first_value));
+
+ constexpr char second_key[] = "second_key";
+ constexpr char second_value[] = "short";
+ static_assert(sizeof(second_value) - 1 < min_blob_size,
+ "second_value too long to be inlined");
+
+ ASSERT_OK(Put(second_key, second_value));
+
+ constexpr char third_key[] = "third_key";
+ constexpr char third_value[] = "other_long_value";
+ static_assert(sizeof(third_value) - 1 >= min_blob_size,
+ "third_value too short to be stored as blob");
+
+ ASSERT_OK(Put(third_key, third_value));
+
+ ASSERT_OK(Flush());
+
+ {
+ ReadOptions read_options;
+ read_options.fill_cache = true;
+
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+
+ iter->SeekToFirst();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ(iter->key(), first_key);
+ ASSERT_EQ(iter->value(), first_value);
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ(iter->key(), second_key);
+ ASSERT_EQ(iter->value(), second_value);
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ(iter->key(), third_key);
+ ASSERT_EQ(iter->value(), third_value);
+
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_OK(iter->status());
+ }
+
+ {
+ ReadOptions read_options;
+ read_options.fill_cache = false;
+ read_options.read_tier = kBlockCacheTier;
+
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+
+ iter->SeekToFirst();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ(iter->key(), first_key);
+ ASSERT_EQ(iter->value(), first_value);
+ ASSERT_GT(options.blob_cache->GetPinnedUsage(), 0);
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ(iter->key(), second_key);
+ ASSERT_EQ(iter->value(), second_value);
+ ASSERT_EQ(options.blob_cache->GetPinnedUsage(), 0);
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ(iter->key(), third_key);
+ ASSERT_EQ(iter->value(), third_value);
+ ASSERT_GT(options.blob_cache->GetPinnedUsage(), 0);
+
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ(options.blob_cache->GetPinnedUsage(), 0);
+ }
+
+ {
+ ReadOptions read_options;
+ read_options.fill_cache = false;
+ read_options.read_tier = kBlockCacheTier;
+
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+
+ iter->SeekToLast();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ(iter->key(), third_key);
+ ASSERT_EQ(iter->value(), third_value);
+ ASSERT_GT(options.blob_cache->GetPinnedUsage(), 0);
+
+ iter->Prev();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ(iter->key(), second_key);
+ ASSERT_EQ(iter->value(), second_value);
+ ASSERT_EQ(options.blob_cache->GetPinnedUsage(), 0);
+
+ iter->Prev();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ(iter->key(), first_key);
+ ASSERT_EQ(iter->value(), first_value);
+ ASSERT_GT(options.blob_cache->GetPinnedUsage(), 0);
+
+ iter->Prev();
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ(options.blob_cache->GetPinnedUsage(), 0);
+ }
+}
+
+TEST_F(DBBlobBasicTest, MultiGetBlobs) {
+ constexpr size_t min_blob_size = 6;
+
+ Options options = GetDefaultOptions();
+ options.enable_blob_files = true;
+ options.min_blob_size = min_blob_size;
+
+ Reopen(options);
+
+ // Put then retrieve three key-values. The first value is below the size limit
+ // and is thus stored inline; the other two are stored separately as blobs.
+ constexpr size_t num_keys = 3;
+
+ constexpr char first_key[] = "first_key";
+ constexpr char first_value[] = "short";
+ static_assert(sizeof(first_value) - 1 < min_blob_size,
+ "first_value too long to be inlined");
+
+ ASSERT_OK(Put(first_key, first_value));
+
+ constexpr char second_key[] = "second_key";
+ constexpr char second_value[] = "long_value";
+ static_assert(sizeof(second_value) - 1 >= min_blob_size,
+ "second_value too short to be stored as blob");
+
+ ASSERT_OK(Put(second_key, second_value));
+
+ constexpr char third_key[] = "third_key";
+ constexpr char third_value[] = "other_long_value";
+ static_assert(sizeof(third_value) - 1 >= min_blob_size,
+ "third_value too short to be stored as blob");
+
+ ASSERT_OK(Put(third_key, third_value));
+
+ ASSERT_OK(Flush());
+
+ ReadOptions read_options;
+
+ std::array<Slice, num_keys> keys{{first_key, second_key, third_key}};
+
+ {
+ std::array<PinnableSlice, num_keys> values;
+ std::array<Status, num_keys> statuses;
+
+ db_->MultiGet(read_options, db_->DefaultColumnFamily(), num_keys, &keys[0],
+ &values[0], &statuses[0]);
+
+ ASSERT_OK(statuses[0]);
+ ASSERT_EQ(values[0], first_value);
+
+ ASSERT_OK(statuses[1]);
+ ASSERT_EQ(values[1], second_value);
+
+ ASSERT_OK(statuses[2]);
+ ASSERT_EQ(values[2], third_value);
+ }
+
+ // Try again with no I/O allowed. The table and the necessary blocks should
+ // already be in their respective caches. The first (inlined) value should be
+ // successfully read; however, the two blob values could only be read from the
+ // blob file, so for those the read should return Incomplete.
+ read_options.read_tier = kBlockCacheTier;
+
+ {
+ std::array<PinnableSlice, num_keys> values;
+ std::array<Status, num_keys> statuses;
+
+ db_->MultiGet(read_options, db_->DefaultColumnFamily(), num_keys, &keys[0],
+ &values[0], &statuses[0]);
+
+ ASSERT_OK(statuses[0]);
+ ASSERT_EQ(values[0], first_value);
+
+ ASSERT_TRUE(statuses[1].IsIncomplete());
+
+ ASSERT_TRUE(statuses[2].IsIncomplete());
+ }
+}
+
+TEST_F(DBBlobBasicTest, MultiGetBlobsFromCache) {
+ Options options = GetDefaultOptions();
+
+ LRUCacheOptions co;
+ co.capacity = 2 << 20; // 2MB
+ co.num_shard_bits = 2;
+ co.metadata_charge_policy = kDontChargeCacheMetadata;
+ auto backing_cache = NewLRUCache(co);
+
+ constexpr size_t min_blob_size = 6;
+ options.min_blob_size = min_blob_size;
+ options.create_if_missing = true;
+ options.enable_blob_files = true;
+ options.blob_cache = backing_cache;
+
+ BlockBasedTableOptions block_based_options;
+ block_based_options.no_block_cache = false;
+ block_based_options.block_cache = backing_cache;
+ block_based_options.cache_index_and_filter_blocks = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(block_based_options));
+
+ DestroyAndReopen(options);
+
+ // Put then retrieve three key-values. The first value is below the size limit
+ // and is thus stored inline; the other two are stored separately as blobs.
+ constexpr size_t num_keys = 3;
+
+ constexpr char first_key[] = "first_key";
+ constexpr char first_value[] = "short";
+ static_assert(sizeof(first_value) - 1 < min_blob_size,
+ "first_value too long to be inlined");
+
+ ASSERT_OK(Put(first_key, first_value));
+
+ constexpr char second_key[] = "second_key";
+ constexpr char second_value[] = "long_value";
+ static_assert(sizeof(second_value) - 1 >= min_blob_size,
+ "second_value too short to be stored as blob");
+
+ ASSERT_OK(Put(second_key, second_value));
+
+ constexpr char third_key[] = "third_key";
+ constexpr char third_value[] = "other_long_value";
+ static_assert(sizeof(third_value) - 1 >= min_blob_size,
+ "third_value too short to be stored as blob");
+
+ ASSERT_OK(Put(third_key, third_value));
+
+ ASSERT_OK(Flush());
+
+ ReadOptions read_options;
+ read_options.fill_cache = false;
+
+ std::array<Slice, num_keys> keys{{first_key, second_key, third_key}};
+
+ {
+ std::array<PinnableSlice, num_keys> values;
+ std::array<Status, num_keys> statuses;
+
+ db_->MultiGet(read_options, db_->DefaultColumnFamily(), num_keys, &keys[0],
+ &values[0], &statuses[0]);
+
+ ASSERT_OK(statuses[0]);
+ ASSERT_EQ(values[0], first_value);
+
+ ASSERT_OK(statuses[1]);
+ ASSERT_EQ(values[1], second_value);
+
+ ASSERT_OK(statuses[2]);
+ ASSERT_EQ(values[2], third_value);
+ }
+
+ // Try again with no I/O allowed. The first (inlined) value should be
+ // successfully read; however, the two blob values could only be read from the
+ // blob file, so for those the read should return Incomplete.
+ read_options.read_tier = kBlockCacheTier;
+
+ {
+ std::array<PinnableSlice, num_keys> values;
+ std::array<Status, num_keys> statuses;
+
+ db_->MultiGet(read_options, db_->DefaultColumnFamily(), num_keys, &keys[0],
+ &values[0], &statuses[0]);
+
+ ASSERT_OK(statuses[0]);
+ ASSERT_EQ(values[0], first_value);
+
+ ASSERT_TRUE(statuses[1].IsIncomplete());
+
+ ASSERT_TRUE(statuses[2].IsIncomplete());
+ }
+
+ // Fill the cache when reading blobs from the blob file.
+ read_options.read_tier = kReadAllTier;
+ read_options.fill_cache = true;
+
+ {
+ std::array<PinnableSlice, num_keys> values;
+ std::array<Status, num_keys> statuses;
+
+ db_->MultiGet(read_options, db_->DefaultColumnFamily(), num_keys, &keys[0],
+ &values[0], &statuses[0]);
+
+ ASSERT_OK(statuses[0]);
+ ASSERT_EQ(values[0], first_value);
+
+ ASSERT_OK(statuses[1]);
+ ASSERT_EQ(values[1], second_value);
+
+ ASSERT_OK(statuses[2]);
+ ASSERT_EQ(values[2], third_value);
+ }
+
+ // Try again with no I/O allowed. All blobs should be successfully read from
+ // the cache.
+ read_options.read_tier = kBlockCacheTier;
+
+ {
+ std::array<PinnableSlice, num_keys> values;
+ std::array<Status, num_keys> statuses;
+
+ db_->MultiGet(read_options, db_->DefaultColumnFamily(), num_keys, &keys[0],
+ &values[0], &statuses[0]);
+
+ ASSERT_OK(statuses[0]);
+ ASSERT_EQ(values[0], first_value);
+
+ ASSERT_OK(statuses[1]);
+ ASSERT_EQ(values[1], second_value);
+
+ ASSERT_OK(statuses[2]);
+ ASSERT_EQ(values[2], third_value);
+ }
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBBlobBasicTest, MultiGetWithDirectIO) {
+ Options options = GetDefaultOptions();
+
+ // First, create an external SST file ["b"].
+ const std::string file_path = dbname_ + "/test.sst";
+ {
+ SstFileWriter sst_file_writer(EnvOptions(), GetDefaultOptions());
+ Status s = sst_file_writer.Open(file_path);
+ ASSERT_OK(s);
+ ASSERT_OK(sst_file_writer.Put("b", "b_value"));
+ ASSERT_OK(sst_file_writer.Finish());
+ }
+
+ options.enable_blob_files = true;
+ options.min_blob_size = 1000;
+ options.use_direct_reads = true;
+ options.allow_ingest_behind = true;
+
+ // Open DB with fixed-prefix sst-partitioner so that compaction will cut
+ // new table file when encountering a new key whose 1-byte prefix changes.
+ constexpr size_t key_len = 1;
+ options.sst_partitioner_factory =
+ NewSstPartitionerFixedPrefixFactory(key_len);
+
+ Status s = TryReopen(options);
+ if (s.IsInvalidArgument()) {
+ ROCKSDB_GTEST_SKIP("This test requires direct IO support");
+ return;
+ }
+ ASSERT_OK(s);
+
+ constexpr size_t num_keys = 3;
+ constexpr size_t blob_size = 3000;
+
+ constexpr char first_key[] = "a";
+ const std::string first_blob(blob_size, 'a');
+ ASSERT_OK(Put(first_key, first_blob));
+
+ constexpr char second_key[] = "b";
+ const std::string second_blob(2 * blob_size, 'b');
+ ASSERT_OK(Put(second_key, second_blob));
+
+ constexpr char third_key[] = "d";
+ const std::string third_blob(blob_size, 'd');
+ ASSERT_OK(Put(third_key, third_blob));
+
+ // first_blob, second_blob and third_blob in the same blob file.
+ // SST Blob file
+ // L0 ["a", "b", "d"] |'aaaa', 'bbbb', 'dddd'|
+ // | | | ^ ^ ^
+ // | | | | | |
+ // | | +---------|-------|--------+
+ // | +-----------------|-------+
+ // +-------------------------+
+ ASSERT_OK(Flush());
+
+ constexpr char fourth_key[] = "c";
+ const std::string fourth_blob(blob_size, 'c');
+ ASSERT_OK(Put(fourth_key, fourth_blob));
+ // fourth_blob in another blob file.
+ // SST Blob file SST Blob file
+ // L0 ["a", "b", "d"] |'aaaa', 'bbbb', 'dddd'| ["c"] |'cccc'|
+ // | | | ^ ^ ^ | ^
+ // | | | | | | | |
+ // | | +---------|-------|--------+ +-------+
+ // | +-----------------|-------+
+ // +-------------------------+
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr,
+ /*end=*/nullptr));
+
+ // Due to the above sst partitioner, we get 4 L1 files. The blob files are
+ // unchanged.
+ // |'aaaa', 'bbbb', 'dddd'| |'cccc'|
+ // ^ ^ ^ ^
+ // | | | |
+ // L0 | | | |
+ // L1 ["a"] ["b"] ["c"] | | ["d"] |
+ // | | | | | |
+ // | | +---------|-------|---------------+
+ // | +-----------------|-------+
+ // +-------------------------+
+ ASSERT_EQ(4, NumTableFilesAtLevel(/*level=*/1));
+
+ {
+ // Ingest the external SST file into bottommost level.
+ std::vector<std::string> ext_files{file_path};
+ IngestExternalFileOptions opts;
+ opts.ingest_behind = true;
+ ASSERT_OK(
+ db_->IngestExternalFile(db_->DefaultColumnFamily(), ext_files, opts));
+ }
+
+ // Now the database becomes as follows.
+ // |'aaaa', 'bbbb', 'dddd'| |'cccc'|
+ // ^ ^ ^ ^
+ // | | | |
+ // L0 | | | |
+ // L1 ["a"] ["b"] ["c"] | | ["d"] |
+ // | | | | | |
+ // | | +---------|-------|---------------+
+ // | +-----------------|-------+
+ // +-------------------------+
+ //
+ // L6 ["b"]
+
+ {
+ // Compact ["b"] to bottommost level.
+ Slice begin = Slice(second_key);
+ Slice end = Slice(second_key);
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+ ASSERT_OK(db_->CompactRange(cro, &begin, &end));
+ }
+
+ // |'aaaa', 'bbbb', 'dddd'| |'cccc'|
+ // ^ ^ ^ ^
+ // | | | |
+ // L0 | | | |
+ // L1 ["a"] ["c"] | | ["d"] |
+ // | | | | |
+ // | +---------|-------|---------------+
+ // | +-----------------|-------+
+ // +-------|-----------------+
+ // |
+ // L6 ["b"]
+ ASSERT_EQ(3, NumTableFilesAtLevel(/*level=*/1));
+ ASSERT_EQ(1, NumTableFilesAtLevel(/*level=*/6));
+
+ bool called = false;
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->SetCallBack(
+ "RandomAccessFileReader::MultiRead:AlignedReqs", [&](void* arg) {
+ auto* aligned_reqs = static_cast<std::vector<FSReadRequest>*>(arg);
+ assert(aligned_reqs);
+ ASSERT_EQ(1, aligned_reqs->size());
+ called = true;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ std::array<Slice, num_keys> keys{{first_key, third_key, second_key}};
+
+ {
+ std::array<PinnableSlice, num_keys> values;
+ std::array<Status, num_keys> statuses;
+
+ // The MultiGet(), when constructing the KeyContexts, will process the keys
+ // in such order: a, d, b. The reason is that ["a"] and ["d"] are in L1,
+ // while ["b"] resides in L6.
+ // Consequently, the original FSReadRequest list prepared by
+ // Version::MultiGetBlob() will be for "a", "d" and "b". It is unsorted as
+ // follows:
+ //
+ // ["a", offset=30, len=3033],
+ // ["d", offset=9096, len=3033],
+ // ["b", offset=3063, len=6033]
+ //
+ // If we do not sort them before calling MultiRead() in DirectIO, then the
+ // underlying IO merging logic will yield two requests.
+ //
+ // [offset=0, len=4096] (for "a")
+ // [offset=0, len=12288] (result of merging the request for "d" and "b")
+ //
+ // We need to sort them in Version::MultiGetBlob() so that the underlying
+ // IO merging logic in DirectIO mode works as expected. The correct
+ // behavior will be one aligned request:
+ //
+ // [offset=0, len=12288]
+
+ db_->MultiGet(ReadOptions(), db_->DefaultColumnFamily(), num_keys, &keys[0],
+ &values[0], &statuses[0]);
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ ASSERT_TRUE(called);
+
+ ASSERT_OK(statuses[0]);
+ ASSERT_EQ(values[0], first_blob);
+
+ ASSERT_OK(statuses[1]);
+ ASSERT_EQ(values[1], third_blob);
+
+ ASSERT_OK(statuses[2]);
+ ASSERT_EQ(values[2], second_blob);
+ }
+}
+#endif // !ROCKSDB_LITE
+
+// Writes blobs spread across several blob files (one Flush per file) with a
+// single cache shared between blocks and blobs, then exercises MultiGet under
+// the read_tier / fill_cache combinations: uncached reads, cache-only reads,
+// cache warm-up, and finally cache-only reads that are expected to hit.
+TEST_F(DBBlobBasicTest, MultiGetBlobsFromMultipleFiles) {
+ Options options = GetDefaultOptions();
+
+ LRUCacheOptions co;
+ co.capacity = 2 << 20; // 2MB
+ co.num_shard_bits = 2;
+ co.metadata_charge_policy = kDontChargeCacheMetadata;
+ auto backing_cache = NewLRUCache(co);
+
+ options.min_blob_size = 0;
+ options.create_if_missing = true;
+ options.enable_blob_files = true;
+ options.blob_cache = backing_cache;
+
+ BlockBasedTableOptions block_based_options;
+ block_based_options.no_block_cache = false;
+ block_based_options.block_cache = backing_cache;
+ block_based_options.cache_index_and_filter_blocks = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(block_based_options));
+
+ Reopen(options);
+
+ constexpr size_t kNumBlobFiles = 3;
+ constexpr size_t kNumBlobsPerFile = 3;
+ constexpr size_t kNumKeys = kNumBlobsPerFile * kNumBlobFiles;
+
+ std::vector<std::string> key_strs;
+ std::vector<std::string> value_strs;
+ for (size_t i = 0; i < kNumBlobFiles; ++i) {
+ for (size_t j = 0; j < kNumBlobsPerFile; ++j) {
+ std::string key = "key" + std::to_string(i) + "_" + std::to_string(j);
+ std::string value =
+ "value_as_blob" + std::to_string(i) + "_" + std::to_string(j);
+ ASSERT_OK(Put(key, value));
+ key_strs.push_back(key);
+ value_strs.push_back(value);
+ }
+ ASSERT_OK(Flush());
+ }
+ assert(key_strs.size() == kNumKeys);
+ std::array<Slice, kNumKeys> keys;
+ for (size_t i = 0; i < keys.size(); ++i) {
+ keys[i] = key_strs[i];
+ }
+
+ ReadOptions read_options;
+ read_options.read_tier = kReadAllTier;
+ read_options.fill_cache = false;
+
+ {
+ // Reads go to the blob files; with fill_cache == false nothing is cached.
+ std::array<PinnableSlice, kNumKeys> values;
+ std::array<Status, kNumKeys> statuses;
+ db_->MultiGet(read_options, db_->DefaultColumnFamily(), kNumKeys, &keys[0],
+ &values[0], &statuses[0]);
+
+ for (size_t i = 0; i < kNumKeys; ++i) {
+ ASSERT_OK(statuses[i]);
+ ASSERT_EQ(value_strs[i], values[i]);
+ }
+ }
+
+ read_options.read_tier = kBlockCacheTier;
+
+ {
+ // Cache-only reads must fail with Incomplete since the cache is cold.
+ std::array<PinnableSlice, kNumKeys> values;
+ std::array<Status, kNumKeys> statuses;
+ db_->MultiGet(read_options, db_->DefaultColumnFamily(), kNumKeys, &keys[0],
+ &values[0], &statuses[0]);
+
+ for (size_t i = 0; i < kNumKeys; ++i) {
+ ASSERT_TRUE(statuses[i].IsIncomplete());
+ ASSERT_TRUE(values[i].empty());
+ }
+ }
+
+ read_options.read_tier = kReadAllTier;
+ read_options.fill_cache = true;
+
+ {
+ // Full reads with fill_cache == true populate the blob cache.
+ std::array<PinnableSlice, kNumKeys> values;
+ std::array<Status, kNumKeys> statuses;
+ db_->MultiGet(read_options, db_->DefaultColumnFamily(), kNumKeys, &keys[0],
+ &values[0], &statuses[0]);
+
+ for (size_t i = 0; i < kNumKeys; ++i) {
+ ASSERT_OK(statuses[i]);
+ ASSERT_EQ(value_strs[i], values[i]);
+ }
+ }
+
+ read_options.read_tier = kBlockCacheTier;
+
+ {
+ // Now the cache is warm, so cache-only reads are expected to succeed.
+ std::array<PinnableSlice, kNumKeys> values;
+ std::array<Status, kNumKeys> statuses;
+ db_->MultiGet(read_options, db_->DefaultColumnFamily(), kNumKeys, &keys[0],
+ &values[0], &statuses[0]);
+
+ for (size_t i = 0; i < kNumKeys; ++i) {
+ ASSERT_OK(statuses[i]);
+ ASSERT_EQ(value_strs[i], values[i]);
+ }
+ }
+}
+
+// Corrupts the blob index via a sync point while Get() decodes it and checks
+// that the lookup surfaces Status::Corruption instead of returning bad data.
+TEST_F(DBBlobBasicTest, GetBlob_CorruptIndex) {
+ Options options = GetDefaultOptions();
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+
+ Reopen(options);
+
+ constexpr char key[] = "key";
+ constexpr char blob[] = "blob";
+
+ ASSERT_OK(Put(key, blob));
+ ASSERT_OK(Flush());
+
+ // Drop the first byte of the encoded blob index to make it undecodable.
+ SyncPoint::GetInstance()->SetCallBack(
+ "Version::Get::TamperWithBlobIndex", [](void* arg) {
+ Slice* const blob_index = static_cast<Slice*>(arg);
+ assert(blob_index);
+ assert(!blob_index->empty());
+ blob_index->remove_prefix(1);
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ PinnableSlice result;
+ ASSERT_TRUE(db_->Get(ReadOptions(), db_->DefaultColumnFamily(), key, &result)
+ .IsCorruption());
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+// Same as GetBlob_CorruptIndex but through MultiGet: only the blob index of
+// the last key ("key") is tampered with, so that key must come back as
+// Corruption while all other keys still return their values.
+TEST_F(DBBlobBasicTest, MultiGetBlob_CorruptIndex) {
+ Options options = GetDefaultOptions();
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+ options.create_if_missing = true;
+
+ DestroyAndReopen(options);
+
+ constexpr size_t kNumOfKeys = 3;
+ std::array<std::string, kNumOfKeys> key_strs;
+ std::array<std::string, kNumOfKeys> value_strs;
+ std::array<Slice, kNumOfKeys + 1> keys;
+ for (size_t i = 0; i < kNumOfKeys; ++i) {
+ key_strs[i] = "foo" + std::to_string(i);
+ value_strs[i] = "blob_value" + std::to_string(i);
+ ASSERT_OK(Put(key_strs[i], value_strs[i]));
+ keys[i] = key_strs[i];
+ }
+
+ // The extra (kNumOfKeys + 1st) key whose index will be corrupted.
+ constexpr char key[] = "key";
+ constexpr char blob[] = "blob";
+ ASSERT_OK(Put(key, blob));
+ keys[kNumOfKeys] = key;
+
+ ASSERT_OK(Flush());
+
+ // Only corrupt the blob index belonging to "key"; leave the rest intact.
+ SyncPoint::GetInstance()->SetCallBack(
+ "Version::MultiGet::TamperWithBlobIndex", [&key](void* arg) {
+ KeyContext* const key_context = static_cast<KeyContext*>(arg);
+ assert(key_context);
+ assert(key_context->key);
+
+ if (*(key_context->key) == key) {
+ Slice* const blob_index = key_context->value;
+ assert(blob_index);
+ assert(!blob_index->empty());
+ blob_index->remove_prefix(1);
+ }
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ std::array<PinnableSlice, kNumOfKeys + 1> values;
+ std::array<Status, kNumOfKeys + 1> statuses;
+ db_->MultiGet(ReadOptions(), dbfull()->DefaultColumnFamily(), kNumOfKeys + 1,
+ keys.data(), values.data(), statuses.data(),
+ /*sorted_input=*/false);
+ for (size_t i = 0; i < kNumOfKeys + 1; ++i) {
+ if (i != kNumOfKeys) {
+ ASSERT_OK(statuses[i]);
+ ASSERT_EQ("blob_value" + std::to_string(i), values[i]);
+ } else {
+ ASSERT_TRUE(statuses[i].IsCorruption());
+ }
+ }
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+// Sets ReadOptions::value_size_soft_limit to 1 byte so every blob read trips
+// the limit, and verifies MultiGet reports Status::Aborted for all keys.
+TEST_F(DBBlobBasicTest, MultiGetBlob_ExceedSoftLimit) {
+ Options options = GetDefaultOptions();
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+
+ Reopen(options);
+
+ constexpr size_t kNumOfKeys = 3;
+ std::array<std::string, kNumOfKeys> key_bufs;
+ std::array<std::string, kNumOfKeys> value_bufs;
+ std::array<Slice, kNumOfKeys> keys;
+ for (size_t i = 0; i < kNumOfKeys; ++i) {
+ key_bufs[i] = "foo" + std::to_string(i);
+ value_bufs[i] = "blob_value" + std::to_string(i);
+ ASSERT_OK(Put(key_bufs[i], value_bufs[i]));
+ keys[i] = key_bufs[i];
+ }
+ ASSERT_OK(Flush());
+
+ std::array<PinnableSlice, kNumOfKeys> values;
+ std::array<Status, kNumOfKeys> statuses;
+ ReadOptions read_opts;
+ read_opts.value_size_soft_limit = 1;
+ db_->MultiGet(read_opts, dbfull()->DefaultColumnFamily(), kNumOfKeys,
+ keys.data(), values.data(), statuses.data(),
+ /*sorted_input=*/true);
+ for (const auto& s : statuses) {
+ ASSERT_TRUE(s.IsAborted());
+ }
+}
+
+// Writes a hand-crafted inlined-TTL blob index (a BlobDB-style index that the
+// integrated blob implementation does not support) and verifies that Get()
+// reports Status::Corruption when it encounters it.
+TEST_F(DBBlobBasicTest, GetBlob_InlinedTTLIndex) {
+ constexpr uint64_t min_blob_size = 10;
+
+ Options options = GetDefaultOptions();
+ options.enable_blob_files = true;
+ options.min_blob_size = min_blob_size;
+
+ Reopen(options);
+
+ constexpr char key[] = "key";
+ constexpr char blob[] = "short";
+ // Check the length of the blob literal itself. The original code asserted
+ // sizeof(short) - 1, i.e. the size of the integer type `short` (a pun on
+ // the literal "short"), which does not actually constrain the value being
+ // stored; the intent is that the value is below min_blob_size so it would
+ // qualify for inlining.
+ static_assert(sizeof(blob) - 1 < min_blob_size,
+ "Blob too long to be inlined");
+
+ // Fake an inlined TTL blob index.
+ std::string blob_index;
+
+ constexpr uint64_t expiration = 1234567890;
+
+ BlobIndex::EncodeInlinedTTL(&blob_index, expiration, blob);
+
+ // Inject the bogus index directly via a write batch, bypassing the normal
+ // blob writing path.
+ WriteBatch batch;
+ ASSERT_OK(WriteBatchInternal::PutBlobIndex(&batch, 0, key, blob_index));
+ ASSERT_OK(db_->Write(WriteOptions(), &batch));
+
+ ASSERT_OK(Flush());
+
+ PinnableSlice result;
+ ASSERT_TRUE(db_->Get(ReadOptions(), db_->DefaultColumnFamily(), key, &result)
+ .IsCorruption());
+}
+
+// Writes a blob index that points at a blob file which does not exist and
+// verifies that Get() reports Status::Corruption.
+TEST_F(DBBlobBasicTest, GetBlob_IndexWithInvalidFileNumber) {
+ Options options = GetDefaultOptions();
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+
+ Reopen(options);
+
+ constexpr char key[] = "key";
+
+ // Fake a blob index referencing a non-existent blob file.
+ std::string blob_index;
+
+ constexpr uint64_t blob_file_number = 1000;
+ constexpr uint64_t offset = 1234;
+ constexpr uint64_t size = 5678;
+
+ BlobIndex::EncodeBlob(&blob_index, blob_file_number, offset, size,
+ kNoCompression);
+
+ // Inject the bogus index directly via a write batch, bypassing the normal
+ // blob writing path.
+ WriteBatch batch;
+ ASSERT_OK(WriteBatchInternal::PutBlobIndex(&batch, 0, key, blob_index));
+ ASSERT_OK(db_->Write(WriteOptions(), &batch));
+
+ ASSERT_OK(Flush());
+
+ PinnableSlice result;
+ ASSERT_TRUE(db_->Get(ReadOptions(), db_->DefaultColumnFamily(), key, &result)
+ .IsCorruption());
+}
+
+#ifndef ROCKSDB_LITE
+// Enables IO tracing, performs a blob write/flush/read cycle, then parses the
+// trace file and checks that operations on blob files were recorded.
+TEST_F(DBBlobBasicTest, GenerateIOTracing) {
+ Options options = GetDefaultOptions();
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+ std::string trace_file = dbname_ + "/io_trace_file";
+
+ Reopen(options);
+ {
+ // Create IO trace file
+ std::unique_ptr<TraceWriter> trace_writer;
+ ASSERT_OK(
+ NewFileTraceWriter(env_, EnvOptions(), trace_file, &trace_writer));
+ ASSERT_OK(db_->StartIOTrace(TraceOptions(), std::move(trace_writer)));
+
+ constexpr char key[] = "key";
+ constexpr char blob_value[] = "blob_value";
+
+ ASSERT_OK(Put(key, blob_value));
+ ASSERT_OK(Flush());
+ ASSERT_EQ(Get(key), blob_value);
+
+ ASSERT_OK(db_->EndIOTrace());
+ ASSERT_OK(env_->FileExists(trace_file));
+ }
+ {
+ // Parse trace file to check file operations related to blob files are
+ // recorded.
+ std::unique_ptr<TraceReader> trace_reader;
+ ASSERT_OK(
+ NewFileTraceReader(env_, EnvOptions(), trace_file, &trace_reader));
+ IOTraceReader reader(std::move(trace_reader));
+
+ IOTraceHeader header;
+ ASSERT_OK(reader.ReadHeader(&header));
+ ASSERT_EQ(kMajorVersion, static_cast<int>(header.rocksdb_major_version));
+ ASSERT_EQ(kMinorVersion, static_cast<int>(header.rocksdb_minor_version));
+
+ // Read records until the reader reports a non-OK status (end of trace),
+ // counting every record whose file name looks like a blob file.
+ int blob_files_op_count = 0;
+ Status status;
+ while (true) {
+ IOTraceRecord record;
+ status = reader.ReadIOOp(&record);
+ if (!status.ok()) {
+ break;
+ }
+ if (record.file_name.find("blob") != std::string::npos) {
+ blob_files_op_count++;
+ }
+ }
+ // Assuming blob files will have Append, Close and then Read operations.
+ ASSERT_GT(blob_files_op_count, 2);
+ }
+}
+#endif // !ROCKSDB_LITE
+
+// Deletes the newest blob file on disk and verifies that reopening with
+// best_efforts_recovery rolls the DB back to the last consistent state, so
+// reads return the previous (second-to-last) round of values.
+TEST_F(DBBlobBasicTest, BestEffortsRecovery_MissingNewestBlobFile) {
+ Options options = GetDefaultOptions();
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+ options.create_if_missing = true;
+ Reopen(options);
+
+ // Keep obsolete files around so the older blob files survive on disk.
+ ASSERT_OK(dbfull()->DisableFileDeletions());
+ constexpr int kNumTableFiles = 2;
+ for (int i = 0; i < kNumTableFiles; ++i) {
+ for (char ch = 'a'; ch != 'c'; ++ch) {
+ std::string key(1, ch);
+ ASSERT_OK(Put(key, "value" + std::to_string(i)));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ Close();
+
+ // Find the blob file with the highest file number and delete it.
+ std::vector<std::string> files;
+ ASSERT_OK(env_->GetChildren(dbname_, &files));
+ std::string blob_file_path;
+ uint64_t max_blob_file_num = kInvalidBlobFileNumber;
+ for (const auto& fname : files) {
+ uint64_t file_num = 0;
+ FileType type;
+ if (ParseFileName(fname, &file_num, /*info_log_name_prefix=*/"", &type) &&
+ type == kBlobFile) {
+ if (file_num > max_blob_file_num) {
+ max_blob_file_num = file_num;
+ blob_file_path = dbname_ + "/" + fname;
+ }
+ }
+ }
+ ASSERT_OK(env_->DeleteFile(blob_file_path));
+
+ options.best_efforts_recovery = true;
+ Reopen(options);
+ std::string value;
+ ASSERT_OK(db_->Get(ReadOptions(), "a", &value));
+ // Recovery rolls back to the state before the last flush.
+ ASSERT_EQ("value" + std::to_string(kNumTableFiles - 2), value);
+}
+
+// Verifies that Get() resolves a merge chain whose base value and operands
+// are stored as blobs (Put followed by Merges, each flushed separately).
+TEST_F(DBBlobBasicTest, GetMergeBlobWithPut) {
+ Options options = GetDefaultOptions();
+ options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+
+ Reopen(options);
+
+ ASSERT_OK(Put("Key1", "v1"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Merge("Key1", "v2"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Merge("Key1", "v3"));
+ ASSERT_OK(Flush());
+
+ std::string value;
+ ASSERT_OK(db_->Get(ReadOptions(), "Key1", &value));
+ ASSERT_EQ(Get("Key1"), "v1,v2,v3");
+}
+
+// Same as GetMergeBlobWithPut but through MultiGet, with merge chains of
+// different lengths (three, two, and one operand respectively).
+TEST_F(DBBlobBasicTest, MultiGetMergeBlobWithPut) {
+ constexpr size_t num_keys = 3;
+
+ Options options = GetDefaultOptions();
+ options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+
+ Reopen(options);
+
+ ASSERT_OK(Put("Key0", "v0_0"));
+ ASSERT_OK(Put("Key1", "v1_0"));
+ ASSERT_OK(Put("Key2", "v2_0"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Merge("Key0", "v0_1"));
+ ASSERT_OK(Merge("Key1", "v1_1"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Merge("Key0", "v0_2"));
+ ASSERT_OK(Flush());
+
+ std::array<Slice, num_keys> keys{{"Key0", "Key1", "Key2"}};
+ std::array<PinnableSlice, num_keys> values;
+ std::array<Status, num_keys> statuses;
+
+ db_->MultiGet(ReadOptions(), db_->DefaultColumnFamily(), num_keys, &keys[0],
+ &values[0], &statuses[0]);
+
+ ASSERT_OK(statuses[0]);
+ ASSERT_EQ(values[0], "v0_0,v0_1,v0_2");
+
+ ASSERT_OK(statuses[1]);
+ ASSERT_EQ(values[1], "v1_0,v1_1");
+
+ ASSERT_OK(statuses[2]);
+ ASSERT_EQ(values[2], "v2_0");
+}
+
+#ifndef ROCKSDB_LITE
+// Checks the blob-related DB properties (number of blob files, live/total
+// blob file size, garbage size, and the human-readable blob stats string)
+// against sizes computed from the blob log header/record/footer constants.
+TEST_F(DBBlobBasicTest, Properties) {
+ Options options = GetDefaultOptions();
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+
+ Reopen(options);
+
+ constexpr char key1[] = "key1";
+ constexpr size_t key1_size = sizeof(key1) - 1;
+
+ constexpr char key2[] = "key2";
+ constexpr size_t key2_size = sizeof(key2) - 1;
+
+ constexpr char key3[] = "key3";
+ constexpr size_t key3_size = sizeof(key3) - 1;
+
+ constexpr char blob[] = "00000000000000";
+ constexpr size_t blob_size = sizeof(blob) - 1;
+
+ constexpr char longer_blob[] = "00000000000000000000";
+ constexpr size_t longer_blob_size = sizeof(longer_blob) - 1;
+
+ ASSERT_OK(Put(key1, blob));
+ ASSERT_OK(Put(key2, longer_blob));
+ ASSERT_OK(Flush());
+
+ // Expected on-disk size of the first blob file: header + two records
+ // (key + blob payload each) + footer.
+ constexpr size_t first_blob_file_expected_size =
+ BlobLogHeader::kSize +
+ BlobLogRecord::CalculateAdjustmentForRecordHeader(key1_size) + blob_size +
+ BlobLogRecord::CalculateAdjustmentForRecordHeader(key2_size) +
+ longer_blob_size + BlobLogFooter::kSize;
+
+ ASSERT_OK(Put(key3, blob));
+ ASSERT_OK(Flush());
+
+ constexpr size_t second_blob_file_expected_size =
+ BlobLogHeader::kSize +
+ BlobLogRecord::CalculateAdjustmentForRecordHeader(key3_size) + blob_size +
+ BlobLogFooter::kSize;
+
+ constexpr size_t total_expected_size =
+ first_blob_file_expected_size + second_blob_file_expected_size;
+
+ // Number of blob files
+ uint64_t num_blob_files = 0;
+ ASSERT_TRUE(
+ db_->GetIntProperty(DB::Properties::kNumBlobFiles, &num_blob_files));
+ ASSERT_EQ(num_blob_files, 2);
+
+ // Total size of live blob files
+ uint64_t live_blob_file_size = 0;
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kLiveBlobFileSize,
+ &live_blob_file_size));
+ ASSERT_EQ(live_blob_file_size, total_expected_size);
+
+ // Total amount of garbage in live blob files
+ {
+ uint64_t live_blob_file_garbage_size = 0;
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kLiveBlobFileGarbageSize,
+ &live_blob_file_garbage_size));
+ ASSERT_EQ(live_blob_file_garbage_size, 0);
+ }
+
+ // Total size of all blob files across all versions
+ // Note: this should be the same as above since we only have one
+ // version at this point.
+ uint64_t total_blob_file_size = 0;
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kTotalBlobFileSize,
+ &total_blob_file_size));
+ ASSERT_EQ(total_blob_file_size, total_expected_size);
+
+ // Delete key2 to create some garbage
+ ASSERT_OK(Delete(key2));
+ ASSERT_OK(Flush());
+
+ constexpr Slice* begin = nullptr;
+ constexpr Slice* end = nullptr;
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end));
+
+ // The deleted record for key2 becomes garbage in the first blob file.
+ constexpr size_t expected_garbage_size =
+ BlobLogRecord::CalculateAdjustmentForRecordHeader(key2_size) +
+ longer_blob_size;
+
+ constexpr double expected_space_amp =
+ static_cast<double>(total_expected_size) /
+ (total_expected_size - expected_garbage_size);
+
+ // Blob file stats
+ std::string blob_stats;
+ ASSERT_TRUE(db_->GetProperty(DB::Properties::kBlobStats, &blob_stats));
+
+ std::ostringstream oss;
+ oss << "Number of blob files: 2\nTotal size of blob files: "
+ << total_expected_size
+ << "\nTotal size of garbage in blob files: " << expected_garbage_size
+ << "\nBlob file space amplification: " << expected_space_amp << '\n';
+
+ ASSERT_EQ(blob_stats, oss.str());
+
+ // Total amount of garbage in live blob files
+ {
+ uint64_t live_blob_file_garbage_size = 0;
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kLiveBlobFileGarbageSize,
+ &live_blob_file_garbage_size));
+ ASSERT_EQ(live_blob_file_garbage_size, expected_garbage_size);
+ }
+}
+
+// Checks that kTotalBlobFileSize counts each blob file exactly once even when
+// a file is shared by multiple live versions (an iterator pins the older
+// version while a delete + compaction produces a newer one).
+TEST_F(DBBlobBasicTest, PropertiesMultiVersion) {
+ Options options = GetDefaultOptions();
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+
+ Reopen(options);
+
+ constexpr char key1[] = "key1";
+ constexpr char key2[] = "key2";
+ constexpr char key3[] = "key3";
+
+ constexpr size_t key_size = sizeof(key1) - 1;
+ static_assert(sizeof(key2) - 1 == key_size, "unexpected size: key2");
+ static_assert(sizeof(key3) - 1 == key_size, "unexpected size: key3");
+
+ constexpr char blob[] = "0000000000";
+ constexpr size_t blob_size = sizeof(blob) - 1;
+
+ ASSERT_OK(Put(key1, blob));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put(key2, blob));
+ ASSERT_OK(Flush());
+
+ // Create an iterator to keep the current version alive
+ std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
+ ASSERT_OK(iter->status());
+
+ // Note: the Delete and subsequent compaction results in the first blob file
+ // not making it to the final version. (It is still part of the previous
+ // version kept alive by the iterator though.) On the other hand, the Put
+ // results in a third blob file.
+ ASSERT_OK(Delete(key1));
+ ASSERT_OK(Put(key3, blob));
+ ASSERT_OK(Flush());
+
+ constexpr Slice* begin = nullptr;
+ constexpr Slice* end = nullptr;
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end));
+
+ // Total size of all blob files across all versions: between the two versions,
+ // we should have three blob files of the same size with one blob each.
+ // The version kept alive by the iterator contains the first and the second
+ // blob file, while the final version contains the second and the third blob
+ // file. (The second blob file is thus shared by the two versions but should
+ // be counted only once.)
+ uint64_t total_blob_file_size = 0;
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kTotalBlobFileSize,
+ &total_blob_file_size));
+ ASSERT_EQ(total_blob_file_size,
+ 3 * (BlobLogHeader::kSize +
+ BlobLogRecord::CalculateAdjustmentForRecordHeader(key_size) +
+ blob_size + BlobLogFooter::kSize));
+}
+#endif // !ROCKSDB_LITE
+
+// Fixture parameterized by a sync point name; tests install a callback at
+// that sync point which deactivates the fault injection filesystem with an
+// IOError, simulating a failure at that exact stage of a blob read.
+class DBBlobBasicIOErrorTest : public DBBlobBasicTest,
+ public testing::WithParamInterface<std::string> {
+ protected:
+ DBBlobBasicIOErrorTest() : sync_point_(GetParam()) {
+ fault_injection_env_.reset(new FaultInjectionTestEnv(env_));
+ }
+ // Close the DB before the fault injection env is destroyed.
+ ~DBBlobBasicIOErrorTest() { Close(); }
+
+ std::unique_ptr<FaultInjectionTestEnv> fault_injection_env_;
+ std::string sync_point_; // sync point at which the IOError is injected
+};
+
+// Variant of the IO-error fixture instantiated with the MultiGet-specific
+// sync points (see INSTANTIATE_TEST_CASE_P below).
+class DBBlobBasicIOErrorMultiGetTest : public DBBlobBasicIOErrorTest {
+ public:
+ DBBlobBasicIOErrorMultiGetTest() : DBBlobBasicIOErrorTest() {}
+};
+
+// Run the IO-error tests once per failure site: opening the blob file and
+// reading the blob (Get vs. MultiGet paths respectively).
+INSTANTIATE_TEST_CASE_P(DBBlobBasicTest, DBBlobBasicIOErrorTest,
+ ::testing::ValuesIn(std::vector<std::string>{
+ "BlobFileReader::OpenFile:NewRandomAccessFile",
+ "BlobFileReader::GetBlob:ReadFromFile"}));
+
+INSTANTIATE_TEST_CASE_P(DBBlobBasicTest, DBBlobBasicIOErrorMultiGetTest,
+ ::testing::ValuesIn(std::vector<std::string>{
+ "BlobFileReader::OpenFile:NewRandomAccessFile",
+ "BlobFileReader::MultiGetBlob:ReadFromFile"}));
+
+// Injects an IOError at the parameterized sync point and checks that Get()
+// on a blob value surfaces it as Status::IOError.
+TEST_P(DBBlobBasicIOErrorTest, GetBlob_IOError) {
+ Options options;
+ options.env = fault_injection_env_.get();
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+
+ Reopen(options);
+
+ constexpr char key[] = "key";
+ constexpr char blob_value[] = "blob_value";
+
+ ASSERT_OK(Put(key, blob_value));
+
+ ASSERT_OK(Flush());
+
+ // Deactivate the filesystem once the read path hits the sync point.
+ SyncPoint::GetInstance()->SetCallBack(sync_point_, [this](void* /* arg */) {
+ fault_injection_env_->SetFilesystemActive(false,
+ Status::IOError(sync_point_));
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ PinnableSlice result;
+ ASSERT_TRUE(db_->Get(ReadOptions(), db_->DefaultColumnFamily(), key, &result)
+ .IsIOError());
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+// Injects an IOError at the parameterized sync point and checks that every
+// key in a MultiGet batch reading from the same blob file reports IOError.
+TEST_P(DBBlobBasicIOErrorMultiGetTest, MultiGetBlobs_IOError) {
+ Options options = GetDefaultOptions();
+ options.env = fault_injection_env_.get();
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+
+ Reopen(options);
+
+ constexpr size_t num_keys = 2;
+
+ constexpr char first_key[] = "first_key";
+ constexpr char first_value[] = "first_value";
+
+ ASSERT_OK(Put(first_key, first_value));
+
+ constexpr char second_key[] = "second_key";
+ constexpr char second_value[] = "second_value";
+
+ ASSERT_OK(Put(second_key, second_value));
+
+ ASSERT_OK(Flush());
+
+ std::array<Slice, num_keys> keys{{first_key, second_key}};
+ std::array<PinnableSlice, num_keys> values;
+ std::array<Status, num_keys> statuses;
+
+ // Deactivate the filesystem once the read path hits the sync point.
+ SyncPoint::GetInstance()->SetCallBack(sync_point_, [this](void* /* arg */) {
+ fault_injection_env_->SetFilesystemActive(false,
+ Status::IOError(sync_point_));
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ db_->MultiGet(ReadOptions(), db_->DefaultColumnFamily(), num_keys, &keys[0],
+ &values[0], &statuses[0]);
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ ASSERT_TRUE(statuses[0].IsIOError());
+ ASSERT_TRUE(statuses[1].IsIOError());
+}
+
+// MultiGet spanning two blob files: the first file is read successfully, then
+// the fault is armed so the second file's read fails. Verifies the per-key
+// statuses reflect the partial failure (first OK, second IOError).
+TEST_P(DBBlobBasicIOErrorMultiGetTest, MultipleBlobFiles) {
+ Options options = GetDefaultOptions();
+ options.env = fault_injection_env_.get();
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+
+ Reopen(options);
+
+ constexpr size_t num_keys = 2;
+
+ constexpr char key1[] = "key1";
+ constexpr char value1[] = "blob1";
+
+ ASSERT_OK(Put(key1, value1));
+ ASSERT_OK(Flush());
+
+ constexpr char key2[] = "key2";
+ constexpr char value2[] = "blob2";
+
+ ASSERT_OK(Put(key2, value2));
+ ASSERT_OK(Flush());
+
+ std::array<Slice, num_keys> keys{{key1, key2}};
+ std::array<PinnableSlice, num_keys> values;
+ std::array<Status, num_keys> statuses;
+
+ // Let the first blob file through; fail the filesystem on the second.
+ bool first_blob_file = true;
+ SyncPoint::GetInstance()->SetCallBack(
+ sync_point_, [&first_blob_file, this](void* /* arg */) {
+ if (first_blob_file) {
+ first_blob_file = false;
+ return;
+ }
+ fault_injection_env_->SetFilesystemActive(false,
+ Status::IOError(sync_point_));
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ db_->MultiGet(ReadOptions(), db_->DefaultColumnFamily(), num_keys,
+ keys.data(), values.data(), statuses.data());
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ ASSERT_OK(statuses[0]);
+ ASSERT_EQ(value1, values[0]);
+ ASSERT_TRUE(statuses[1].IsIOError());
+}
+
+namespace {
+
+// Compaction filter that rewrites every plain value with a copy of itself.
+// The kChangeValue decision forces compaction to materialize (i.e. read) the
+// underlying blob, which is what the IO-error test below relies on.
+class ReadBlobCompactionFilter : public CompactionFilter {
+ public:
+ ReadBlobCompactionFilter() = default;
+ const char* Name() const override {
+ return "rocksdb.compaction.filter.read.blob";
+ }
+ CompactionFilter::Decision FilterV2(
+ int /*level*/, const Slice& /*key*/, ValueType value_type,
+ const Slice& existing_value, std::string* new_value,
+ std::string* /*skip_until*/) const override {
+ if (value_type != CompactionFilter::ValueType::kValue) {
+ return CompactionFilter::Decision::kKeep;
+ }
+ assert(new_value);
+ new_value->assign(existing_value.data(), existing_value.size());
+ return CompactionFilter::Decision::kChangeValue;
+ }
+};
+
+} // anonymous namespace
+
+// Uses ReadBlobCompactionFilter to force compaction to read blob values, then
+// injects an IOError at the parameterized sync point and checks that
+// CompactRange() fails with IOError.
+TEST_P(DBBlobBasicIOErrorTest, CompactionFilterReadBlob_IOError) {
+ Options options = GetDefaultOptions();
+ options.env = fault_injection_env_.get();
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+ options.create_if_missing = true;
+ std::unique_ptr<CompactionFilter> compaction_filter_guard(
+ new ReadBlobCompactionFilter);
+ options.compaction_filter = compaction_filter_guard.get();
+
+ DestroyAndReopen(options);
+ constexpr char key[] = "foo";
+ constexpr char blob_value[] = "foo_blob_value";
+ ASSERT_OK(Put(key, blob_value));
+ ASSERT_OK(Flush());
+
+ // Deactivate the filesystem once compaction's blob read hits the sync point.
+ SyncPoint::GetInstance()->SetCallBack(sync_point_, [this](void* /* arg */) {
+ fault_injection_env_->SetFilesystemActive(false,
+ Status::IOError(sync_point_));
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_TRUE(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr,
+ /*end=*/nullptr)
+ .IsIOError());
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+// With prepopulate_blob_cache = kFlushOnly, every flush should insert the
+// freshly written blobs into the blob cache; subsequent Gets must be cache
+// hits. Also checks that GC-driven compaction does not add cache entries.
+TEST_F(DBBlobBasicTest, WarmCacheWithBlobsDuringFlush) {
+ Options options = GetDefaultOptions();
+
+ LRUCacheOptions co;
+ co.capacity = 1 << 25;
+ co.num_shard_bits = 2;
+ co.metadata_charge_policy = kDontChargeCacheMetadata;
+ auto backing_cache = NewLRUCache(co);
+
+ options.blob_cache = backing_cache;
+
+ BlockBasedTableOptions block_based_options;
+ block_based_options.no_block_cache = false;
+ block_based_options.block_cache = backing_cache;
+ block_based_options.cache_index_and_filter_blocks = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(block_based_options));
+
+ options.enable_blob_files = true;
+ options.create_if_missing = true;
+ options.disable_auto_compactions = true;
+ options.enable_blob_garbage_collection = true;
+ options.blob_garbage_collection_age_cutoff = 1.0;
+ options.prepopulate_blob_cache = PrepopulateBlobCache::kFlushOnly;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+
+ DestroyAndReopen(options);
+
+ constexpr size_t kNumBlobs = 10;
+ constexpr size_t kValueSize = 100;
+
+ std::string value(kValueSize, 'a');
+
+ for (size_t i = 1; i <= kNumBlobs; i++) {
+ ASSERT_OK(Put(std::to_string(i), value));
+ ASSERT_OK(Put(std::to_string(i + kNumBlobs), value)); // Add some overlap
+ ASSERT_OK(Flush());
+ // Two blobs per flush, so the cumulative add count grows by two each loop.
+ ASSERT_EQ(i * 2, options.statistics->getTickerCount(BLOB_DB_CACHE_ADD));
+ ASSERT_EQ(value, Get(std::to_string(i)));
+ ASSERT_EQ(value, Get(std::to_string(i + kNumBlobs)));
+ ASSERT_EQ(0, options.statistics->getTickerCount(BLOB_DB_CACHE_MISS));
+ ASSERT_EQ(i * 2, options.statistics->getTickerCount(BLOB_DB_CACHE_HIT));
+ }
+
+ // Verify compaction not counted
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr,
+ /*end=*/nullptr));
+ EXPECT_EQ(kNumBlobs * 2,
+ options.statistics->getTickerCount(BLOB_DB_CACHE_ADD));
+}
+
+#ifndef ROCKSDB_LITE
+// Like WarmCacheWithBlobsDuringFlush, but toggles prepopulate_blob_cache off
+// via SetOptions halfway through: the first half of the flushes warm the
+// cache (hits, no misses), the second half do not (misses on first read).
+TEST_F(DBBlobBasicTest, DynamicallyWarmCacheDuringFlush) {
+ Options options = GetDefaultOptions();
+
+ LRUCacheOptions co;
+ co.capacity = 1 << 25;
+ co.num_shard_bits = 2;
+ co.metadata_charge_policy = kDontChargeCacheMetadata;
+ auto backing_cache = NewLRUCache(co);
+
+ options.blob_cache = backing_cache;
+
+ BlockBasedTableOptions block_based_options;
+ block_based_options.no_block_cache = false;
+ block_based_options.block_cache = backing_cache;
+ block_based_options.cache_index_and_filter_blocks = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(block_based_options));
+
+ options.enable_blob_files = true;
+ options.create_if_missing = true;
+ options.disable_auto_compactions = true;
+ options.enable_blob_garbage_collection = true;
+ options.blob_garbage_collection_age_cutoff = 1.0;
+ options.prepopulate_blob_cache = PrepopulateBlobCache::kFlushOnly;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+
+ DestroyAndReopen(options);
+
+ constexpr size_t kNumBlobs = 10;
+ constexpr size_t kValueSize = 100;
+
+ std::string value(kValueSize, 'a');
+
+ // Phase 1: prepopulation enabled — flush warms the cache, reads are hits.
+ for (size_t i = 1; i <= 5; i++) {
+ ASSERT_OK(Put(std::to_string(i), value));
+ ASSERT_OK(Put(std::to_string(i + kNumBlobs), value)); // Add some overlap
+ ASSERT_OK(Flush());
+ ASSERT_EQ(2, options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_ADD));
+
+ ASSERT_EQ(value, Get(std::to_string(i)));
+ ASSERT_EQ(value, Get(std::to_string(i + kNumBlobs)));
+ ASSERT_EQ(0, options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_ADD));
+ ASSERT_EQ(0,
+ options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_MISS));
+ ASSERT_EQ(2, options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_HIT));
+ }
+
+ ASSERT_OK(dbfull()->SetOptions({{"prepopulate_blob_cache", "kDisable"}}));
+
+ // Phase 2: prepopulation disabled — flush adds nothing; the first read of
+ // each blob misses and inserts it into the cache.
+ for (size_t i = 6; i <= kNumBlobs; i++) {
+ ASSERT_OK(Put(std::to_string(i), value));
+ ASSERT_OK(Put(std::to_string(i + kNumBlobs), value)); // Add some overlap
+ ASSERT_OK(Flush());
+ ASSERT_EQ(0, options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_ADD));
+
+ ASSERT_EQ(value, Get(std::to_string(i)));
+ ASSERT_EQ(value, Get(std::to_string(i + kNumBlobs)));
+ ASSERT_EQ(2, options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_ADD));
+ ASSERT_EQ(2,
+ options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_MISS));
+ ASSERT_EQ(0, options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_HIT));
+ }
+
+ // Verify compaction not counted
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr,
+ /*end=*/nullptr));
+ EXPECT_EQ(0, options.statistics->getTickerCount(BLOB_DB_CACHE_ADD));
+}
+#endif // !ROCKSDB_LITE
+
+TEST_F(DBBlobBasicTest, WarmCacheWithBlobsSecondary) {
+ CompressedSecondaryCacheOptions secondary_cache_opts;
+ secondary_cache_opts.capacity = 1 << 20;
+ secondary_cache_opts.num_shard_bits = 0;
+ secondary_cache_opts.metadata_charge_policy = kDontChargeCacheMetadata;
+ secondary_cache_opts.compression_type = kNoCompression;
+
+ LRUCacheOptions primary_cache_opts;
+ primary_cache_opts.capacity = 1024;
+ primary_cache_opts.num_shard_bits = 0;
+ primary_cache_opts.metadata_charge_policy = kDontChargeCacheMetadata;
+ primary_cache_opts.secondary_cache =
+ NewCompressedSecondaryCache(secondary_cache_opts);
+
+ Options options = GetDefaultOptions();
+ options.create_if_missing = true;
+ options.statistics = CreateDBStatistics();
+ options.enable_blob_files = true;
+ options.blob_cache = NewLRUCache(primary_cache_opts);
+ options.prepopulate_blob_cache = PrepopulateBlobCache::kFlushOnly;
+
+ DestroyAndReopen(options);
+
+ // Note: only one of the two blobs fit in the primary cache at any given time.
+ constexpr char first_key[] = "foo";
+ constexpr size_t first_blob_size = 512;
+ const std::string first_blob(first_blob_size, 'a');
+
+ constexpr char second_key[] = "bar";
+ constexpr size_t second_blob_size = 768;
+ const std::string second_blob(second_blob_size, 'b');
+
+ // First blob is inserted into primary cache during flush.
+ ASSERT_OK(Put(first_key, first_blob));
+ ASSERT_OK(Flush());
+ ASSERT_EQ(options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_ADD), 1);
+
+ // Second blob is inserted into primary cache during flush,
+ // First blob is evicted but only a dummy handle is inserted into secondary
+ // cache.
+ ASSERT_OK(Put(second_key, second_blob));
+ ASSERT_OK(Flush());
+ ASSERT_EQ(options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_ADD), 1);
+
+ // First blob is inserted into primary cache.
+ // Second blob is evicted but only a dummy handle is inserted into secondary
+ // cache.
+ ASSERT_EQ(Get(first_key), first_blob);
+ ASSERT_EQ(options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_MISS), 1);
+ ASSERT_EQ(options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_HIT), 0);
+ ASSERT_EQ(options.statistics->getAndResetTickerCount(SECONDARY_CACHE_HITS),
+ 0);
+ // Second blob is inserted into primary cache,
+ // First blob is evicted and is inserted into secondary cache.
+ ASSERT_EQ(Get(second_key), second_blob);
+ ASSERT_EQ(options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_MISS), 1);
+ ASSERT_EQ(options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_HIT), 0);
+ ASSERT_EQ(options.statistics->getAndResetTickerCount(SECONDARY_CACHE_HITS),
+ 0);
+
+ // First blob's dummy item is inserted into primary cache b/c of lookup.
+ // Second blob is still in primary cache.
+ ASSERT_EQ(Get(first_key), first_blob);
+ ASSERT_EQ(options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_MISS), 0);
+ ASSERT_EQ(options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_HIT), 1);
+ ASSERT_EQ(options.statistics->getAndResetTickerCount(SECONDARY_CACHE_HITS),
+ 1);
+
+ // First blob's item is inserted into primary cache b/c of lookup.
+ // Second blob is evicted and inserted into secondary cache.
+ ASSERT_EQ(Get(first_key), first_blob);
+ ASSERT_EQ(options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_MISS), 0);
+ ASSERT_EQ(options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_HIT), 1);
+ ASSERT_EQ(options.statistics->getAndResetTickerCount(SECONDARY_CACHE_HITS),
+ 1);
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ RegisterCustomObjects(argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/blob/db_blob_compaction_test.cc b/src/rocksdb/db/blob/db_blob_compaction_test.cc
new file mode 100644
index 000000000..f3fe3c03b
--- /dev/null
+++ b/src/rocksdb/db/blob/db_blob_compaction_test.cc
@@ -0,0 +1,913 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_index.h"
+#include "db/blob/blob_log_format.h"
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+#include "test_util/sync_point.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBBlobCompactionTest : public DBTestBase {
+ public:
+ explicit DBBlobCompactionTest()
+ : DBTestBase("db_blob_compaction_test", /*env_do_fsync=*/false) {}
+
+#ifndef ROCKSDB_LITE
+ const std::vector<InternalStats::CompactionStats>& GetCompactionStats() {
+ VersionSet* const versions = dbfull()->GetVersionSet();
+ assert(versions);
+ assert(versions->GetColumnFamilySet());
+
+ ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
+ assert(cfd);
+
+ const InternalStats* const internal_stats = cfd->internal_stats();
+ assert(internal_stats);
+
+ return internal_stats->TEST_GetCompactionStats();
+ }
+#endif // ROCKSDB_LITE
+};
+
+namespace {
+
+class FilterByKeyLength : public CompactionFilter {
+ public:
+ explicit FilterByKeyLength(size_t len) : length_threshold_(len) {}
+ const char* Name() const override {
+ return "rocksdb.compaction.filter.by.key.length";
+ }
+ CompactionFilter::Decision FilterBlobByKey(
+ int /*level*/, const Slice& key, std::string* /*new_value*/,
+ std::string* /*skip_until*/) const override {
+ if (key.size() < length_threshold_) {
+ return CompactionFilter::Decision::kRemove;
+ }
+ return CompactionFilter::Decision::kKeep;
+ }
+
+ private:
+ size_t length_threshold_;
+};
+
+class FilterByValueLength : public CompactionFilter {
+ public:
+ explicit FilterByValueLength(size_t len) : length_threshold_(len) {}
+ const char* Name() const override {
+ return "rocksdb.compaction.filter.by.value.length";
+ }
+ CompactionFilter::Decision FilterV2(
+ int /*level*/, const Slice& /*key*/, ValueType /*value_type*/,
+ const Slice& existing_value, std::string* /*new_value*/,
+ std::string* /*skip_until*/) const override {
+ if (existing_value.size() < length_threshold_) {
+ return CompactionFilter::Decision::kRemove;
+ }
+ return CompactionFilter::Decision::kKeep;
+ }
+
+ private:
+ size_t length_threshold_;
+};
+
+class BadBlobCompactionFilter : public CompactionFilter {
+ public:
+ explicit BadBlobCompactionFilter(std::string prefix,
+ CompactionFilter::Decision filter_by_key,
+ CompactionFilter::Decision filter_v2)
+ : prefix_(std::move(prefix)),
+ filter_blob_by_key_(filter_by_key),
+ filter_v2_(filter_v2) {}
+ const char* Name() const override { return "rocksdb.compaction.filter.bad"; }
+ CompactionFilter::Decision FilterBlobByKey(
+ int /*level*/, const Slice& key, std::string* /*new_value*/,
+ std::string* /*skip_until*/) const override {
+ if (key.size() >= prefix_.size() &&
+ 0 == strncmp(prefix_.data(), key.data(), prefix_.size())) {
+ return CompactionFilter::Decision::kUndetermined;
+ }
+ return filter_blob_by_key_;
+ }
+ CompactionFilter::Decision FilterV2(
+ int /*level*/, const Slice& /*key*/, ValueType /*value_type*/,
+ const Slice& /*existing_value*/, std::string* /*new_value*/,
+ std::string* /*skip_until*/) const override {
+ return filter_v2_;
+ }
+
+ private:
+ const std::string prefix_;
+ const CompactionFilter::Decision filter_blob_by_key_;
+ const CompactionFilter::Decision filter_v2_;
+};
+
+class ValueBlindWriteFilter : public CompactionFilter {
+ public:
+ explicit ValueBlindWriteFilter(std::string new_val)
+ : new_value_(std::move(new_val)) {}
+ const char* Name() const override {
+ return "rocksdb.compaction.filter.blind.write";
+ }
+ CompactionFilter::Decision FilterBlobByKey(
+ int level, const Slice& key, std::string* new_value,
+ std::string* skip_until) const override;
+
+ private:
+ const std::string new_value_;
+};
+
+CompactionFilter::Decision ValueBlindWriteFilter::FilterBlobByKey(
+ int /*level*/, const Slice& /*key*/, std::string* new_value,
+ std::string* /*skip_until*/) const {
+ assert(new_value);
+ new_value->assign(new_value_);
+ return CompactionFilter::Decision::kChangeValue;
+}
+
+class ValueMutationFilter : public CompactionFilter {
+ public:
+ explicit ValueMutationFilter(std::string padding)
+ : padding_(std::move(padding)) {}
+ const char* Name() const override {
+ return "rocksdb.compaction.filter.value.mutation";
+ }
+ CompactionFilter::Decision FilterV2(int level, const Slice& key,
+ ValueType value_type,
+ const Slice& existing_value,
+ std::string* new_value,
+ std::string* skip_until) const override;
+
+ private:
+ const std::string padding_;
+};
+
+CompactionFilter::Decision ValueMutationFilter::FilterV2(
+ int /*level*/, const Slice& /*key*/, ValueType value_type,
+ const Slice& existing_value, std::string* new_value,
+ std::string* /*skip_until*/) const {
+ assert(CompactionFilter::ValueType::kBlobIndex != value_type);
+ if (CompactionFilter::ValueType::kValue != value_type) {
+ return CompactionFilter::Decision::kKeep;
+ }
+ assert(new_value);
+ new_value->assign(existing_value.data(), existing_value.size());
+ new_value->append(padding_);
+ return CompactionFilter::Decision::kChangeValue;
+}
+
+class AlwaysKeepFilter : public CompactionFilter {
+ public:
+ explicit AlwaysKeepFilter() = default;
+ const char* Name() const override {
+ return "rocksdb.compaction.filter.always.keep";
+ }
+ CompactionFilter::Decision FilterV2(
+ int /*level*/, const Slice& /*key*/, ValueType /*value_type*/,
+ const Slice& /*existing_value*/, std::string* /*new_value*/,
+ std::string* /*skip_until*/) const override {
+ return CompactionFilter::Decision::kKeep;
+ }
+};
+
+class SkipUntilFilter : public CompactionFilter {
+ public:
+ explicit SkipUntilFilter(std::string skip_until)
+ : skip_until_(std::move(skip_until)) {}
+
+ const char* Name() const override {
+ return "rocksdb.compaction.filter.skip.until";
+ }
+
+ CompactionFilter::Decision FilterV2(int /* level */, const Slice& /* key */,
+ ValueType /* value_type */,
+ const Slice& /* existing_value */,
+ std::string* /* new_value */,
+ std::string* skip_until) const override {
+ assert(skip_until);
+ *skip_until = skip_until_;
+
+ return CompactionFilter::Decision::kRemoveAndSkipUntil;
+ }
+
+ private:
+ std::string skip_until_;
+};
+
+} // anonymous namespace
+
+class DBBlobBadCompactionFilterTest
+ : public DBBlobCompactionTest,
+ public testing::WithParamInterface<
+ std::tuple<std::string, CompactionFilter::Decision,
+ CompactionFilter::Decision>> {
+ public:
+ explicit DBBlobBadCompactionFilterTest()
+ : compaction_filter_guard_(new BadBlobCompactionFilter(
+ std::get<0>(GetParam()), std::get<1>(GetParam()),
+ std::get<2>(GetParam()))) {}
+
+ protected:
+ std::unique_ptr<CompactionFilter> compaction_filter_guard_;
+};
+
+INSTANTIATE_TEST_CASE_P(
+ BadCompactionFilter, DBBlobBadCompactionFilterTest,
+ testing::Combine(
+ testing::Values("a"),
+ testing::Values(CompactionFilter::Decision::kChangeBlobIndex,
+ CompactionFilter::Decision::kIOError),
+ testing::Values(CompactionFilter::Decision::kUndetermined,
+ CompactionFilter::Decision::kChangeBlobIndex,
+ CompactionFilter::Decision::kIOError)));
+
+TEST_F(DBBlobCompactionTest, FilterByKeyLength) {
+ Options options = GetDefaultOptions();
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+ options.create_if_missing = true;
+ constexpr size_t kKeyLength = 2;
+ std::unique_ptr<CompactionFilter> compaction_filter_guard(
+ new FilterByKeyLength(kKeyLength));
+ options.compaction_filter = compaction_filter_guard.get();
+
+ constexpr char short_key[] = "a";
+ constexpr char long_key[] = "abc";
+ constexpr char blob_value[] = "value";
+
+ DestroyAndReopen(options);
+ ASSERT_OK(Put(short_key, blob_value));
+ ASSERT_OK(Put(long_key, blob_value));
+ ASSERT_OK(Flush());
+ CompactRangeOptions cro;
+ ASSERT_OK(db_->CompactRange(cro, /*begin=*/nullptr, /*end=*/nullptr));
+ std::string value;
+ ASSERT_TRUE(db_->Get(ReadOptions(), short_key, &value).IsNotFound());
+ value.clear();
+ ASSERT_OK(db_->Get(ReadOptions(), long_key, &value));
+ ASSERT_EQ("value", value);
+
+#ifndef ROCKSDB_LITE
+ const auto& compaction_stats = GetCompactionStats();
+ ASSERT_GE(compaction_stats.size(), 2);
+
+ // Filter decides between kKeep and kRemove solely based on key;
+ // this involves neither reading nor writing blobs
+ ASSERT_EQ(compaction_stats[1].bytes_read_blob, 0);
+ ASSERT_EQ(compaction_stats[1].bytes_written_blob, 0);
+#endif // ROCKSDB_LITE
+
+ Close();
+}
+
+TEST_F(DBBlobCompactionTest, FilterByValueLength) {
+ Options options = GetDefaultOptions();
+ options.enable_blob_files = true;
+ options.min_blob_size = 5;
+ options.create_if_missing = true;
+ constexpr size_t kValueLength = 5;
+ std::unique_ptr<CompactionFilter> compaction_filter_guard(
+ new FilterByValueLength(kValueLength));
+ options.compaction_filter = compaction_filter_guard.get();
+
+ const std::vector<std::string> short_value_keys = {"a", "e", "j"};
+ constexpr char short_value[] = "val";
+ const std::vector<std::string> long_value_keys = {"b", "f", "k"};
+ constexpr char long_value[] = "valuevalue";
+
+ DestroyAndReopen(options);
+ for (size_t i = 0; i < short_value_keys.size(); ++i) {
+ ASSERT_OK(Put(short_value_keys[i], short_value));
+ }
+ for (size_t i = 0; i < short_value_keys.size(); ++i) {
+ ASSERT_OK(Put(long_value_keys[i], long_value));
+ }
+ ASSERT_OK(Flush());
+ CompactRangeOptions cro;
+ ASSERT_OK(db_->CompactRange(cro, /*begin=*/nullptr, /*end=*/nullptr));
+ std::string value;
+ for (size_t i = 0; i < short_value_keys.size(); ++i) {
+ ASSERT_TRUE(
+ db_->Get(ReadOptions(), short_value_keys[i], &value).IsNotFound());
+ value.clear();
+ }
+ for (size_t i = 0; i < long_value_keys.size(); ++i) {
+ ASSERT_OK(db_->Get(ReadOptions(), long_value_keys[i], &value));
+ ASSERT_EQ(long_value, value);
+ }
+
+#ifndef ROCKSDB_LITE
+ const auto& compaction_stats = GetCompactionStats();
+ ASSERT_GE(compaction_stats.size(), 2);
+
+ // Filter decides between kKeep and kRemove based on value;
+ // this involves reading but not writing blobs
+ ASSERT_GT(compaction_stats[1].bytes_read_blob, 0);
+ ASSERT_EQ(compaction_stats[1].bytes_written_blob, 0);
+#endif // ROCKSDB_LITE
+
+ Close();
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBBlobCompactionTest, BlobCompactWithStartingLevel) {
+ Options options = GetDefaultOptions();
+
+ options.enable_blob_files = true;
+ options.min_blob_size = 1000;
+ options.blob_file_starting_level = 5;
+ options.create_if_missing = true;
+
+ // Open DB with fixed-prefix sst-partitioner so that compaction will cut
+ // new table file when encountering a new key whose 1-byte prefix changes.
+ constexpr size_t key_len = 1;
+ options.sst_partitioner_factory =
+ NewSstPartitionerFixedPrefixFactory(key_len);
+
+ ASSERT_OK(TryReopen(options));
+
+ constexpr size_t blob_size = 3000;
+
+ constexpr char first_key[] = "a";
+ const std::string first_blob(blob_size, 'a');
+ ASSERT_OK(Put(first_key, first_blob));
+
+ constexpr char second_key[] = "b";
+ const std::string second_blob(2 * blob_size, 'b');
+ ASSERT_OK(Put(second_key, second_blob));
+
+ constexpr char third_key[] = "d";
+ const std::string third_blob(blob_size, 'd');
+ ASSERT_OK(Put(third_key, third_blob));
+
+ ASSERT_OK(Flush());
+
+ constexpr char fourth_key[] = "c";
+ const std::string fourth_blob(blob_size, 'c');
+ ASSERT_OK(Put(fourth_key, fourth_blob));
+
+ ASSERT_OK(Flush());
+
+ ASSERT_EQ(0, GetBlobFileNumbers().size());
+ ASSERT_EQ(2, NumTableFilesAtLevel(/*level=*/0));
+ ASSERT_EQ(0, NumTableFilesAtLevel(/*level=*/1));
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr,
+ /*end=*/nullptr));
+
+ // No blob file should be created since blob_file_starting_level is 5.
+ ASSERT_EQ(0, GetBlobFileNumbers().size());
+ ASSERT_EQ(0, NumTableFilesAtLevel(/*level=*/0));
+ ASSERT_EQ(4, NumTableFilesAtLevel(/*level=*/1));
+
+ {
+ options.blob_file_starting_level = 1;
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put(first_key, first_blob));
+ ASSERT_OK(Put(second_key, second_blob));
+ ASSERT_OK(Put(third_key, third_blob));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put(fourth_key, fourth_blob));
+ ASSERT_OK(Flush());
+
+ ASSERT_EQ(0, GetBlobFileNumbers().size());
+ ASSERT_EQ(2, NumTableFilesAtLevel(/*level=*/0));
+ ASSERT_EQ(0, NumTableFilesAtLevel(/*level=*/1));
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr,
+ /*end=*/nullptr));
+ // The compaction's output level equals to blob_file_starting_level.
+ ASSERT_EQ(1, GetBlobFileNumbers().size());
+ ASSERT_EQ(0, NumTableFilesAtLevel(/*level=*/0));
+ ASSERT_EQ(4, NumTableFilesAtLevel(/*level=*/1));
+ }
+
+ Close();
+}
+#endif
+
+TEST_F(DBBlobCompactionTest, BlindWriteFilter) {
+ Options options = GetDefaultOptions();
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+ options.create_if_missing = true;
+ constexpr char new_blob_value[] = "new_blob_value";
+ std::unique_ptr<CompactionFilter> compaction_filter_guard(
+ new ValueBlindWriteFilter(new_blob_value));
+ options.compaction_filter = compaction_filter_guard.get();
+ DestroyAndReopen(options);
+ const std::vector<std::string> keys = {"a", "b", "c"};
+ const std::vector<std::string> values = {"a_value", "b_value", "c_value"};
+ assert(keys.size() == values.size());
+ for (size_t i = 0; i < keys.size(); ++i) {
+ ASSERT_OK(Put(keys[i], values[i]));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr,
+ /*end=*/nullptr));
+ for (const auto& key : keys) {
+ ASSERT_EQ(new_blob_value, Get(key));
+ }
+
+#ifndef ROCKSDB_LITE
+ const auto& compaction_stats = GetCompactionStats();
+ ASSERT_GE(compaction_stats.size(), 2);
+
+ // Filter unconditionally changes value in FilterBlobByKey;
+ // this involves writing but not reading blobs
+ ASSERT_EQ(compaction_stats[1].bytes_read_blob, 0);
+ ASSERT_GT(compaction_stats[1].bytes_written_blob, 0);
+#endif // ROCKSDB_LITE
+
+ Close();
+}
+
+TEST_F(DBBlobCompactionTest, SkipUntilFilter) {
+ Options options = GetDefaultOptions();
+ options.enable_blob_files = true;
+
+ std::unique_ptr<CompactionFilter> compaction_filter_guard(
+ new SkipUntilFilter("z"));
+ options.compaction_filter = compaction_filter_guard.get();
+
+ Reopen(options);
+
+ const std::vector<std::string> keys{"a", "b", "c"};
+ const std::vector<std::string> values{"a_value", "b_value", "c_value"};
+ assert(keys.size() == values.size());
+
+ for (size_t i = 0; i < keys.size(); ++i) {
+ ASSERT_OK(Put(keys[i], values[i]));
+ }
+
+ ASSERT_OK(Flush());
+
+ int process_in_flow_called = 0;
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "BlobCountingIterator::UpdateAndCountBlobIfNeeded:ProcessInFlow",
+ [&process_in_flow_called](void* /* arg */) { ++process_in_flow_called; });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /* begin */ nullptr,
+ /* end */ nullptr));
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ for (const auto& key : keys) {
+ ASSERT_EQ(Get(key), "NOT_FOUND");
+ }
+
+ // Make sure SkipUntil was performed using iteration rather than Seek
+ ASSERT_EQ(process_in_flow_called, keys.size());
+
+ Close();
+}
+
+TEST_P(DBBlobBadCompactionFilterTest, BadDecisionFromCompactionFilter) {
+ Options options = GetDefaultOptions();
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+ options.create_if_missing = true;
+ options.compaction_filter = compaction_filter_guard_.get();
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("b", "value"));
+ ASSERT_OK(Flush());
+ ASSERT_TRUE(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr,
+ /*end=*/nullptr)
+ .IsNotSupported());
+ Close();
+
+ DestroyAndReopen(options);
+ std::string key(std::get<0>(GetParam()));
+ ASSERT_OK(Put(key, "value"));
+ ASSERT_OK(Flush());
+ ASSERT_TRUE(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr,
+ /*end=*/nullptr)
+ .IsNotSupported());
+ Close();
+}
+
+TEST_F(DBBlobCompactionTest, CompactionFilter_InlinedTTLIndex) {
+ Options options = GetDefaultOptions();
+ options.create_if_missing = true;
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+ std::unique_ptr<CompactionFilter> compaction_filter_guard(
+ new ValueMutationFilter(""));
+ options.compaction_filter = compaction_filter_guard.get();
+ DestroyAndReopen(options);
+ constexpr char key[] = "key";
+ constexpr char blob[] = "blob";
+ // Fake an inlined TTL blob index.
+ std::string blob_index;
+ constexpr uint64_t expiration = 1234567890;
+ BlobIndex::EncodeInlinedTTL(&blob_index, expiration, blob);
+ WriteBatch batch;
+ ASSERT_OK(WriteBatchInternal::PutBlobIndex(&batch, 0, key, blob_index));
+ ASSERT_OK(db_->Write(WriteOptions(), &batch));
+ ASSERT_OK(Flush());
+ ASSERT_TRUE(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr,
+ /*end=*/nullptr)
+ .IsCorruption());
+ Close();
+}
+
+TEST_F(DBBlobCompactionTest, CompactionFilter) {
+ Options options = GetDefaultOptions();
+ options.create_if_missing = true;
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+ constexpr char padding[] = "_delta";
+ std::unique_ptr<CompactionFilter> compaction_filter_guard(
+ new ValueMutationFilter(padding));
+ options.compaction_filter = compaction_filter_guard.get();
+ DestroyAndReopen(options);
+ const std::vector<std::pair<std::string, std::string>> kvs = {
+ {"a", "a_value"}, {"b", "b_value"}, {"c", "c_value"}};
+ for (const auto& kv : kvs) {
+ ASSERT_OK(Put(kv.first, kv.second));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr,
+ /*end=*/nullptr));
+ for (const auto& kv : kvs) {
+ ASSERT_EQ(kv.second + std::string(padding), Get(kv.first));
+ }
+
+#ifndef ROCKSDB_LITE
+ const auto& compaction_stats = GetCompactionStats();
+ ASSERT_GE(compaction_stats.size(), 2);
+
+ // Filter changes the value using the previous value in FilterV2;
+ // this involves reading and writing blobs
+ ASSERT_GT(compaction_stats[1].bytes_read_blob, 0);
+ ASSERT_GT(compaction_stats[1].bytes_written_blob, 0);
+#endif // ROCKSDB_LITE
+
+ Close();
+}
+
+TEST_F(DBBlobCompactionTest, CorruptedBlobIndex) {
+ Options options = GetDefaultOptions();
+ options.create_if_missing = true;
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+ std::unique_ptr<CompactionFilter> compaction_filter_guard(
+ new ValueMutationFilter(""));
+ options.compaction_filter = compaction_filter_guard.get();
+ DestroyAndReopen(options);
+
+ constexpr char key[] = "key";
+ constexpr char blob[] = "blob";
+
+ ASSERT_OK(Put(key, blob));
+ ASSERT_OK(Flush());
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "CompactionIterator::InvokeFilterIfNeeded::TamperWithBlobIndex",
+ [](void* arg) {
+ Slice* const blob_index = static_cast<Slice*>(arg);
+ assert(blob_index);
+ assert(!blob_index->empty());
+ blob_index->remove_prefix(1);
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_TRUE(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr,
+ /*end=*/nullptr)
+ .IsCorruption());
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ Close();
+}
+
+TEST_F(DBBlobCompactionTest, CompactionFilterReadBlobAndKeep) {
+ Options options = GetDefaultOptions();
+ options.create_if_missing = true;
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+ std::unique_ptr<CompactionFilter> compaction_filter_guard(
+ new AlwaysKeepFilter());
+ options.compaction_filter = compaction_filter_guard.get();
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("foo", "foo_value"));
+ ASSERT_OK(Flush());
+ std::vector<uint64_t> blob_files = GetBlobFileNumbers();
+ ASSERT_EQ(1, blob_files.size());
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr,
+ /*end=*/nullptr));
+ ASSERT_EQ(blob_files, GetBlobFileNumbers());
+
+#ifndef ROCKSDB_LITE
+ const auto& compaction_stats = GetCompactionStats();
+ ASSERT_GE(compaction_stats.size(), 2);
+
+ // Filter decides to keep the existing value in FilterV2;
+ // this involves reading but not writing blobs
+ ASSERT_GT(compaction_stats[1].bytes_read_blob, 0);
+ ASSERT_EQ(compaction_stats[1].bytes_written_blob, 0);
+#endif // ROCKSDB_LITE
+
+ Close();
+}
+
+TEST_F(DBBlobCompactionTest, TrackGarbage) {
+ Options options = GetDefaultOptions();
+ options.enable_blob_files = true;
+
+ Reopen(options);
+
+ // First table+blob file pair: 4 blobs with different keys
+ constexpr char first_key[] = "first_key";
+ constexpr char first_value[] = "first_value";
+ constexpr char second_key[] = "second_key";
+ constexpr char second_value[] = "second_value";
+ constexpr char third_key[] = "third_key";
+ constexpr char third_value[] = "third_value";
+ constexpr char fourth_key[] = "fourth_key";
+ constexpr char fourth_value[] = "fourth_value";
+
+ ASSERT_OK(Put(first_key, first_value));
+ ASSERT_OK(Put(second_key, second_value));
+ ASSERT_OK(Put(third_key, third_value));
+ ASSERT_OK(Put(fourth_key, fourth_value));
+ ASSERT_OK(Flush());
+
+ // Second table+blob file pair: overwrite 2 existing keys
+ constexpr char new_first_value[] = "new_first_value";
+ constexpr char new_second_value[] = "new_second_value";
+
+ ASSERT_OK(Put(first_key, new_first_value));
+ ASSERT_OK(Put(second_key, new_second_value));
+ ASSERT_OK(Flush());
+
+ // Compact them together. The first blob file should have 2 garbage blobs
+ // corresponding to the 2 overwritten keys.
+ constexpr Slice* begin = nullptr;
+ constexpr Slice* end = nullptr;
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end));
+
+ VersionSet* const versions = dbfull()->GetVersionSet();
+ assert(versions);
+ assert(versions->GetColumnFamilySet());
+
+ ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
+ assert(cfd);
+
+ Version* const current = cfd->current();
+ assert(current);
+
+ const VersionStorageInfo* const storage_info = current->storage_info();
+ assert(storage_info);
+
+ const auto& blob_files = storage_info->GetBlobFiles();
+ ASSERT_EQ(blob_files.size(), 2);
+
+ {
+ const auto& meta = blob_files.front();
+ assert(meta);
+
+ constexpr uint64_t first_expected_bytes =
+ sizeof(first_value) - 1 +
+ BlobLogRecord::CalculateAdjustmentForRecordHeader(sizeof(first_key) -
+ 1);
+ constexpr uint64_t second_expected_bytes =
+ sizeof(second_value) - 1 +
+ BlobLogRecord::CalculateAdjustmentForRecordHeader(sizeof(second_key) -
+ 1);
+ constexpr uint64_t third_expected_bytes =
+ sizeof(third_value) - 1 +
+ BlobLogRecord::CalculateAdjustmentForRecordHeader(sizeof(third_key) -
+ 1);
+ constexpr uint64_t fourth_expected_bytes =
+ sizeof(fourth_value) - 1 +
+ BlobLogRecord::CalculateAdjustmentForRecordHeader(sizeof(fourth_key) -
+ 1);
+
+ ASSERT_EQ(meta->GetTotalBlobCount(), 4);
+ ASSERT_EQ(meta->GetTotalBlobBytes(),
+ first_expected_bytes + second_expected_bytes +
+ third_expected_bytes + fourth_expected_bytes);
+ ASSERT_EQ(meta->GetGarbageBlobCount(), 2);
+ ASSERT_EQ(meta->GetGarbageBlobBytes(),
+ first_expected_bytes + second_expected_bytes);
+ }
+
+ {
+ const auto& meta = blob_files.back();
+ assert(meta);
+
+ constexpr uint64_t new_first_expected_bytes =
+ sizeof(new_first_value) - 1 +
+ BlobLogRecord::CalculateAdjustmentForRecordHeader(sizeof(first_key) -
+ 1);
+ constexpr uint64_t new_second_expected_bytes =
+ sizeof(new_second_value) - 1 +
+ BlobLogRecord::CalculateAdjustmentForRecordHeader(sizeof(second_key) -
+ 1);
+
+ ASSERT_EQ(meta->GetTotalBlobCount(), 2);
+ ASSERT_EQ(meta->GetTotalBlobBytes(),
+ new_first_expected_bytes + new_second_expected_bytes);
+ ASSERT_EQ(meta->GetGarbageBlobCount(), 0);
+ ASSERT_EQ(meta->GetGarbageBlobBytes(), 0);
+ }
+}
+
+TEST_F(DBBlobCompactionTest, MergeBlobWithBase) {
+ Options options = GetDefaultOptions();
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+ options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ options.disable_auto_compactions = true;
+
+ Reopen(options);
+ ASSERT_OK(Put("Key1", "v1_1"));
+ ASSERT_OK(Put("Key2", "v2_1"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Merge("Key1", "v1_2"));
+ ASSERT_OK(Merge("Key2", "v2_2"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Merge("Key1", "v1_3"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr,
+ /*end=*/nullptr));
+ ASSERT_EQ(Get("Key1"), "v1_1,v1_2,v1_3");
+ ASSERT_EQ(Get("Key2"), "v2_1,v2_2");
+ Close();
+}
+
+TEST_F(DBBlobCompactionTest, CompactionReadaheadGarbageCollection) {
+ Options options = GetDefaultOptions();
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+ options.enable_blob_garbage_collection = true;
+ options.blob_garbage_collection_age_cutoff = 1.0;
+ options.blob_compaction_readahead_size = 1 << 10;
+ options.disable_auto_compactions = true;
+
+ Reopen(options);
+
+ ASSERT_OK(Put("key", "lime"));
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put("key", "pie"));
+ ASSERT_OK(Put("foo", "baz"));
+ ASSERT_OK(Flush());
+
+ size_t num_non_prefetch_reads = 0;
+ SyncPoint::GetInstance()->SetCallBack(
+ "BlobFileReader::GetBlob:ReadFromFile",
+ [&num_non_prefetch_reads](void* /* arg */) { ++num_non_prefetch_reads; });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ constexpr Slice* begin = nullptr;
+ constexpr Slice* end = nullptr;
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end));
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ ASSERT_EQ(Get("key"), "pie");
+ ASSERT_EQ(Get("foo"), "baz");
+ ASSERT_EQ(num_non_prefetch_reads, 0);
+
+ Close();
+}
+
+TEST_F(DBBlobCompactionTest, CompactionReadaheadFilter) {
+ Options options = GetDefaultOptions();
+
+ std::unique_ptr<CompactionFilter> compaction_filter_guard(
+ new ValueMutationFilter("pie"));
+
+ options.compaction_filter = compaction_filter_guard.get();
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+ options.blob_compaction_readahead_size = 1 << 10;
+ options.disable_auto_compactions = true;
+
+ Reopen(options);
+
+ ASSERT_OK(Put("key", "lime"));
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Flush());
+
+ size_t num_non_prefetch_reads = 0;
+ SyncPoint::GetInstance()->SetCallBack(
+ "BlobFileReader::GetBlob:ReadFromFile",
+ [&num_non_prefetch_reads](void* /* arg */) { ++num_non_prefetch_reads; });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ constexpr Slice* begin = nullptr;
+ constexpr Slice* end = nullptr;
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end));
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ ASSERT_EQ(Get("key"), "limepie");
+ ASSERT_EQ(Get("foo"), "barpie");
+ ASSERT_EQ(num_non_prefetch_reads, 0);
+
+ Close();
+}
+
+TEST_F(DBBlobCompactionTest, CompactionReadaheadMerge) {
+ Options options = GetDefaultOptions();
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+ options.blob_compaction_readahead_size = 1 << 10;
+ options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ options.disable_auto_compactions = true;
+
+ Reopen(options);
+
+ ASSERT_OK(Put("key", "lime"));
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Merge("key", "pie"));
+ ASSERT_OK(Merge("foo", "baz"));
+ ASSERT_OK(Flush());
+
+ size_t num_non_prefetch_reads = 0;
+ SyncPoint::GetInstance()->SetCallBack(
+ "BlobFileReader::GetBlob:ReadFromFile",
+ [&num_non_prefetch_reads](void* /* arg */) { ++num_non_prefetch_reads; });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ constexpr Slice* begin = nullptr;
+ constexpr Slice* end = nullptr;
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end));
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ ASSERT_EQ(Get("key"), "lime,pie");
+ ASSERT_EQ(Get("foo"), "bar,baz");
+ ASSERT_EQ(num_non_prefetch_reads, 0);
+
+ Close();
+}
+
+TEST_F(DBBlobCompactionTest, CompactionDoNotFillCache) {
+ Options options = GetDefaultOptions();
+
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+ options.enable_blob_garbage_collection = true;
+ options.blob_garbage_collection_age_cutoff = 1.0;
+ options.disable_auto_compactions = true;
+ options.statistics = CreateDBStatistics();
+
+ LRUCacheOptions cache_options;
+ cache_options.capacity = 1 << 20;
+ cache_options.metadata_charge_policy = kDontChargeCacheMetadata;
+
+ options.blob_cache = NewLRUCache(cache_options);
+
+ Reopen(options);
+
+ ASSERT_OK(Put("key", "lime"));
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put("key", "pie"));
+ ASSERT_OK(Put("foo", "baz"));
+ ASSERT_OK(Flush());
+
+ constexpr Slice* begin = nullptr;
+ constexpr Slice* end = nullptr;
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end));
+
+ ASSERT_EQ(options.statistics->getTickerCount(BLOB_DB_CACHE_ADD), 0);
+
+ Close();
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ RegisterCustomObjects(argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/blob/db_blob_corruption_test.cc b/src/rocksdb/db/blob/db_blob_corruption_test.cc
new file mode 100644
index 000000000..7ac7ce3fc
--- /dev/null
+++ b/src/rocksdb/db/blob/db_blob_corruption_test.cc
@@ -0,0 +1,82 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+#include "test_util/sync_point.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Test fixture for blob file corruption scenarios.
+class DBBlobCorruptionTest : public DBTestBase {
+ protected:
+  DBBlobCorruptionTest()
+      : DBTestBase("db_blob_corruption_test", /* env_do_fsync */ false) {}
+
+  // Corrupts `bytes_to_corrupt` bytes starting at `offset` in the file of
+  // type `filetype` with the highest file number under dbname_. Fails the
+  // test if no file of that type exists.
+  void Corrupt(FileType filetype, int offset, int bytes_to_corrupt) {
+    // Pick file to corrupt
+    std::vector<std::string> filenames;
+    ASSERT_OK(env_->GetChildren(dbname_, &filenames));
+    uint64_t number;
+    FileType type;
+    std::string fname;
+    uint64_t picked_number = kInvalidBlobFileNumber;
+    for (size_t i = 0; i < filenames.size(); i++) {
+      if (ParseFileName(filenames[i], &number, &type) && type == filetype &&
+          number > picked_number) { // Pick latest file
+        fname = dbname_ + "/" + filenames[i];
+        picked_number = number;
+      }
+    }
+    ASSERT_TRUE(!fname.empty()) << filetype;
+    ASSERT_OK(test::CorruptFile(env_, fname, offset, bytes_to_corrupt));
+  }
+};
+
+#ifndef ROCKSDB_LITE
+// Verifies that whole-file checksum verification detects blob file
+// corruption: after flipping two bytes at the start of the newest blob
+// file, VerifyFileChecksums() must return Status::Corruption and report
+// exactly one mismatch through the sync point.
+TEST_F(DBBlobCorruptionTest, VerifyWholeBlobFileChecksum) {
+  Options options = GetDefaultOptions();
+  options.enable_blob_files = true;
+  options.min_blob_size = 0;  // store every value as a blob
+  options.create_if_missing = true;
+  // Note: this TU is inside namespace ROCKSDB_NAMESPACE, so no explicit
+  // qualification is needed here (kept consistent with the SyncPoint
+  // calls below).
+  options.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory();
+  Reopen(options);
+
+  ASSERT_OK(Put(Slice("key_1"), Slice("blob_value_1")));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put(Slice("key_2"), Slice("blob_value_2")));
+  ASSERT_OK(Flush());
+  // Sanity check: checksums verify cleanly before any corruption.
+  ASSERT_OK(db_->VerifyFileChecksums(ReadOptions()));
+  Close();
+
+  // Corrupt the first two bytes of the latest blob file while the DB is
+  // closed; reopening alone must still succeed.
+  Corrupt(kBlobFile, 0, 2);
+
+  ASSERT_OK(TryReopen(options));
+
+  int count{0};
+  SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::VerifyFullFileChecksum:mismatch", [&](void* arg) {
+        const Status* s = static_cast<Status*>(arg);
+        ASSERT_NE(s, nullptr);
+        ++count;
+        ASSERT_NOK(*s);
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_TRUE(db_->VerifyFileChecksums(ReadOptions()).IsCorruption());
+  // Exactly one file (the corrupted blob file) should be reported.
+  ASSERT_EQ(1, count);
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+#endif // !ROCKSDB_LITE
+} // namespace ROCKSDB_NAMESPACE
+
+// Standard gtest entry point for this test binary.
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  RegisterCustomObjects(argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/blob/db_blob_index_test.cc b/src/rocksdb/db/blob/db_blob_index_test.cc
new file mode 100644
index 000000000..64c550894
--- /dev/null
+++ b/src/rocksdb/db/blob/db_blob_index_test.cc
@@ -0,0 +1,602 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <functional>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/arena_wrapped_db_iter.h"
+#include "db/blob/blob_index.h"
+#include "db/column_family.h"
+#include "db/db_iter.h"
+#include "db/db_test_util.h"
+#include "db/dbformat.h"
+#include "db/write_batch_internal.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "util/string_util.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// kTypeBlobIndex is a value type used by BlobDB only. The base rocksdb
+// should accept the value type on write, and report not supported value
+// for reads, unless caller request for it explicitly. The base rocksdb
+// doesn't understand format of actual blob index (the value).
+class DBBlobIndexTest : public DBTestBase {
+ public:
+  // Storage tiers that test data can be moved to before verification.
+  enum Tier {
+    kMemtable = 0,
+    kImmutableMemtables = 1,
+    kL0SstFile = 2,
+    kLnSstFile = 3,
+  };
+  const std::vector<Tier> kAllTiers = {Tier::kMemtable,
+                                       Tier::kImmutableMemtables,
+                                       Tier::kL0SstFile, Tier::kLnSstFile};
+
+  DBBlobIndexTest() : DBTestBase("db_blob_index_test", /*env_do_fsync=*/true) {}
+
+  // Handle of the default column family.
+  ColumnFamilyHandle* cfh() { return dbfull()->DefaultColumnFamily(); }
+
+  // ColumnFamilyData of the default column family.
+  ColumnFamilyData* cfd() {
+    return static_cast_with_check<ColumnFamilyHandleImpl>(cfh())->cfd();
+  }
+
+  // Appends a kTypeBlobIndex entry for `key` with payload `blob_index`
+  // to `batch`.
+  Status PutBlobIndex(WriteBatch* batch, const Slice& key,
+                      const Slice& blob_index) {
+    return WriteBatchInternal::PutBlobIndex(batch, cfd()->GetID(), key,
+                                            blob_index);
+  }
+
+  Status Write(WriteBatch* batch) {
+    return dbfull()->Write(WriteOptions(), batch);
+  }
+
+  // Reads `key` via DBImpl::GetImpl, mapping non-OK statuses to the
+  // sentinel strings "NOT_FOUND"/"CORRUPTION"/"NOT_SUPPORTED". When
+  // `is_blob_index` is non-null, the read is allowed to return a raw blob
+  // index and *is_blob_index reports whether it did.
+  std::string GetImpl(const Slice& key, bool* is_blob_index = nullptr,
+                      const Snapshot* snapshot = nullptr) {
+    ReadOptions read_options;
+    read_options.snapshot = snapshot;
+    PinnableSlice value;
+    DBImpl::GetImplOptions get_impl_options;
+    get_impl_options.column_family = cfh();
+    get_impl_options.value = &value;
+    get_impl_options.is_blob_index = is_blob_index;
+    auto s = dbfull()->GetImpl(read_options, key, get_impl_options);
+    if (s.IsNotFound()) {
+      return "NOT_FOUND";
+    }
+    if (s.IsCorruption()) {
+      return "CORRUPTION";
+    }
+    if (s.IsNotSupported()) {
+      return "NOT_SUPPORTED";
+    }
+    if (!s.ok()) {
+      return s.ToString();
+    }
+    return value.ToString();
+  }
+
+  // Like GetImpl, but expects the stored entry to be a blob index;
+  // returns "NOT_BLOB" if it is a regular value.
+  std::string GetBlobIndex(const Slice& key,
+                           const Snapshot* snapshot = nullptr) {
+    bool is_blob_index = false;
+    std::string value = GetImpl(key, &is_blob_index, snapshot);
+    if (!is_blob_index) {
+      return "NOT_BLOB";
+    }
+    return value;
+  }
+
+  // Creates an iterator with expose_blob_index=true so kTypeBlobIndex
+  // entries are surfaced verbatim instead of causing an error.
+  ArenaWrappedDBIter* GetBlobIterator() {
+    return dbfull()->NewIteratorImpl(
+        ReadOptions(), cfd(), dbfull()->GetLatestSequenceNumber(),
+        nullptr /*read_callback*/, true /*expose_blob_index*/);
+  }
+
+  // Options that keep written data in whatever tier the test moved it to:
+  // both automatic flushes and automatic compactions are disabled.
+  Options GetTestOptions() {
+    Options options;
+    options.env = CurrentOptions().env;
+    options.create_if_missing = true;
+    options.num_levels = 2;
+    options.disable_auto_compactions = true;
+    // Disable auto flushes.
+    options.max_write_buffer_number = 10;
+    options.min_write_buffer_number_to_merge = 10;
+    options.merge_operator = MergeOperators::CreateStringAppendOperator();
+    return options;
+  }
+
+  // Pushes the current memtable contents down to `tier` (no-op for
+  // kMemtable; switch/flush/compact for the deeper tiers).
+  void MoveDataTo(Tier tier) {
+    switch (tier) {
+      case Tier::kMemtable:
+        break;
+      case Tier::kImmutableMemtables:
+        ASSERT_OK(dbfull()->TEST_SwitchMemtable());
+        break;
+      case Tier::kL0SstFile:
+        ASSERT_OK(Flush());
+        break;
+      case Tier::kLnSstFile:
+        ASSERT_OK(Flush());
+        // "a"/"z" bracket all test keys so the full range compacts to L1.
+        ASSERT_OK(Put("a", "dummy"));
+        ASSERT_OK(Put("z", "dummy"));
+        ASSERT_OK(Flush());
+        ASSERT_OK(
+            dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+#ifndef ROCKSDB_LITE
+        ASSERT_EQ("0,1", FilesPerLevel());
+#endif  // !ROCKSDB_LITE
+        break;
+    }
+  }
+};
+
+// Note: the following test case pertains to the StackableDB-based BlobDB
+// implementation. We should be able to write kTypeBlobIndex to memtables and
+// SST files.
+// Note: the following test case pertains to the StackableDB-based BlobDB
+// implementation. We should be able to write kTypeBlobIndex to memtables and
+// SST files.
+TEST_F(DBBlobIndexTest, Write) {
+  for (auto tier : kAllTiers) {
+    DestroyAndReopen(GetTestOptions());
+
+    constexpr size_t kNumKeyValues = 5;
+
+    std::vector<std::pair<std::string, std::string>> expected;
+    expected.reserve(kNumKeyValues);
+
+    // Encode one inlined-TTL blob index per key and write it through a
+    // WriteBatch so the entry is stored with value type kTypeBlobIndex.
+    for (size_t i = 1; i <= kNumKeyValues; ++i) {
+      const std::string key = "key" + std::to_string(i);
+
+      std::string blob_index;
+      BlobIndex::EncodeInlinedTTL(&blob_index, /* expiration */ 9876543210,
+                                  "blob" + std::to_string(i));
+
+      WriteBatch batch;
+      ASSERT_OK(PutBlobIndex(&batch, key, blob_index));
+      ASSERT_OK(Write(&batch));
+
+      expected.emplace_back(key, std::move(blob_index));
+    }
+
+    MoveDataTo(tier);
+
+    // Every blob index must read back verbatim from the target tier.
+    for (const auto& kv : expected) {
+      ASSERT_EQ(GetBlobIndex(kv.first), kv.second);
+    }
+  }
+}
+
+// Note: the following test case pertains to the StackableDB-based BlobDB
+// implementation. Get should be able to return blob index if is_blob_index is
+// provided, otherwise it should return Status::NotSupported (when reading from
+// memtable) or Status::Corruption (when reading from SST). Reading from SST
+// returns Corruption because we can't differentiate between the application
+// accidentally opening the base DB of a stacked BlobDB and actual corruption
+// when using the integrated BlobDB.
+TEST_F(DBBlobIndexTest, Get) {
+  std::string blob_index;
+  BlobIndex::EncodeInlinedTTL(&blob_index, /* expiration */ 9876543210, "blob");
+
+  for (auto tier : kAllTiers) {
+    DestroyAndReopen(GetTestOptions());
+
+    // One regular value and one blob index, written atomically.
+    WriteBatch batch;
+    ASSERT_OK(batch.Put("key", "value"));
+    ASSERT_OK(PutBlobIndex(&batch, "blob_key", blob_index));
+    ASSERT_OK(Write(&batch));
+
+    MoveDataTo(tier);
+
+    // Verify normal value
+    bool is_blob_index = false;
+    PinnableSlice value;
+    ASSERT_EQ("value", Get("key"));
+    ASSERT_EQ("value", GetImpl("key"));
+    ASSERT_EQ("value", GetImpl("key", &is_blob_index));
+    ASSERT_FALSE(is_blob_index);
+
+    // Verify blob index: without is_blob_index, (immutable) memtable reads
+    // report NotSupported while SST reads report Corruption (see the
+    // comment above this test case).
+    if (tier <= kImmutableMemtables) {
+      ASSERT_TRUE(Get("blob_key", &value).IsNotSupported());
+      ASSERT_EQ("NOT_SUPPORTED", GetImpl("blob_key"));
+    } else {
+      ASSERT_TRUE(Get("blob_key", &value).IsCorruption());
+      ASSERT_EQ("CORRUPTION", GetImpl("blob_key"));
+    }
+    // With is_blob_index requested, the raw blob index is returned.
+    ASSERT_EQ(blob_index, GetImpl("blob_key", &is_blob_index));
+    ASSERT_TRUE(is_blob_index);
+  }
+}
+
+// Note: the following test case pertains to the StackableDB-based BlobDB
+// implementation. Get should NOT return Status::NotSupported/Status::Corruption
+// if blob index is updated with a normal value. See the test case above for
+// more details.
+TEST_F(DBBlobIndexTest, Updated) {
+  std::string blob_index;
+  BlobIndex::EncodeInlinedTTL(&blob_index, /* expiration */ 9876543210, "blob");
+
+  for (auto tier : kAllTiers) {
+    DestroyAndReopen(GetTestOptions());
+    // Write the same blob index under key0..key9 in one batch.
+    WriteBatch batch;
+    for (int i = 0; i < 10; i++) {
+      ASSERT_OK(PutBlobIndex(&batch, "key" + std::to_string(i), blob_index));
+    }
+    ASSERT_OK(Write(&batch));
+    // Avoid blob values from being purged.
+    const Snapshot* snapshot = dbfull()->GetSnapshot();
+    // Overwrite each key with a different kind of update: key1 plain Put,
+    // key2 merges on top of the blob index, key3/key4 (single) delete,
+    // key5 delete followed by merges, key6..key8 covered by a range
+    // deletion; key0 and key9 remain untouched.
+    ASSERT_OK(Put("key1", "new_value"));
+    ASSERT_OK(Merge("key2", "a"));
+    ASSERT_OK(Merge("key2", "b"));
+    ASSERT_OK(Merge("key2", "c"));
+    ASSERT_OK(Delete("key3"));
+    ASSERT_OK(SingleDelete("key4"));
+    ASSERT_OK(Delete("key5"));
+    ASSERT_OK(Merge("key5", "a"));
+    ASSERT_OK(Merge("key5", "b"));
+    ASSERT_OK(Merge("key5", "c"));
+    ASSERT_OK(dbfull()->DeleteRange(WriteOptions(), cfh(), "key6", "key9"));
+    MoveDataTo(tier);
+    // Through the snapshot, every key still exposes the old blob index.
+    for (int i = 0; i < 10; i++) {
+      ASSERT_EQ(blob_index, GetBlobIndex("key" + std::to_string(i), snapshot));
+    }
+    ASSERT_EQ("new_value", Get("key1"));
+    // Merge on top of a blob index is NotSupported/Corruption depending on
+    // the tier, mirroring the Get test above.
+    if (tier <= kImmutableMemtables) {
+      ASSERT_EQ("NOT_SUPPORTED", GetImpl("key2"));
+    } else {
+      ASSERT_EQ("CORRUPTION", GetImpl("key2"));
+    }
+    ASSERT_EQ("NOT_FOUND", Get("key3"));
+    ASSERT_EQ("NOT_FOUND", Get("key4"));
+    ASSERT_EQ("a,b,c", GetImpl("key5"));
+    for (int i = 6; i < 9; i++) {
+      ASSERT_EQ("NOT_FOUND", Get("key" + std::to_string(i)));
+    }
+    // key9 is the exclusive end of the range deletion, so it survives.
+    ASSERT_EQ(blob_index, GetBlobIndex("key9"));
+    dbfull()->ReleaseSnapshot(snapshot);
+  }
+}
+
+// Note: the following test case pertains to the StackableDB-based BlobDB
+// implementation. When a blob iterator is used, it should set the
+// expose_blob_index flag for the underlying DBIter, and retrieve/return the
+// corresponding blob value. If a regular DBIter is created (i.e.
+// expose_blob_index is not set), it should return Status::Corruption.
+TEST_F(DBBlobIndexTest, Iterate) {
+  // Value types written per key. Within each inner vector, the entry at
+  // index 0 is written LAST (the fill loop below iterates j from back to
+  // front), so index 0 is the newest version of the key.
+  const std::vector<std::vector<ValueType>> data = {
+      /*00*/ {kTypeValue},
+      /*01*/ {kTypeBlobIndex},
+      /*02*/ {kTypeValue},
+      /*03*/ {kTypeBlobIndex, kTypeValue},
+      /*04*/ {kTypeValue},
+      /*05*/ {kTypeValue, kTypeBlobIndex},
+      /*06*/ {kTypeValue},
+      /*07*/ {kTypeDeletion, kTypeBlobIndex},
+      /*08*/ {kTypeValue},
+      /*09*/ {kTypeSingleDeletion, kTypeBlobIndex},
+      /*10*/ {kTypeValue},
+      /*11*/ {kTypeMerge, kTypeMerge, kTypeMerge, kTypeBlobIndex},
+      /*12*/ {kTypeValue},
+      /*13*/
+      {kTypeMerge, kTypeMerge, kTypeMerge, kTypeDeletion, kTypeBlobIndex},
+      /*14*/ {kTypeValue},
+      /*15*/ {kTypeBlobIndex},
+      /*16*/ {kTypeValue},
+  };
+
+  // "key00", "key01", ... (zero-padded so key order matches the indices).
+  auto get_key = [](int index) {
+    char buf[20];
+    snprintf(buf, sizeof(buf), "%02d", index);
+    return "key" + std::string(buf);
+  };
+
+  auto get_value = [&](int index, int version) {
+    return get_key(index) + "_value" + std::to_string(version);
+  };
+
+  // Asserts the iterator's status code; on kOk also checks validity and
+  // the value, otherwise expects the iterator to have become invalid.
+  auto check_iterator = [&](Iterator* iterator, Status::Code expected_status,
+                            const Slice& expected_value) {
+    ASSERT_EQ(expected_status, iterator->status().code());
+    if (expected_status == Status::kOk) {
+      ASSERT_TRUE(iterator->Valid());
+      ASSERT_EQ(expected_value, iterator->value());
+    } else {
+      ASSERT_FALSE(iterator->Valid());
+    }
+  };
+
+  auto create_normal_iterator = [&]() -> Iterator* {
+    return dbfull()->NewIterator(ReadOptions());
+  };
+
+  auto create_blob_iterator = [&]() -> Iterator* { return GetBlobIterator(); };
+
+  // Returns an extra check that asserts whether the current entry was
+  // exposed as a blob index.
+  auto check_is_blob = [&](bool is_blob) {
+    return [is_blob](Iterator* iterator) {
+      ASSERT_EQ(is_blob,
+                reinterpret_cast<ArenaWrappedDBIter*>(iterator)->IsBlob());
+    };
+  };
+
+  // Exercises Seek, Next, SeekForPrev and Prev around `index`, each with a
+  // fresh iterator; expects `forward_value` when arriving from the front
+  // and `backward_value` when arriving from the back.
+  auto verify = [&](int index, Status::Code expected_status,
+                    const Slice& forward_value, const Slice& backward_value,
+                    std::function<Iterator*()> create_iterator,
+                    std::function<void(Iterator*)> extra_check = nullptr) {
+    // Seek
+    auto* iterator = create_iterator();
+    ASSERT_OK(iterator->status());
+    ASSERT_OK(iterator->Refresh());
+    iterator->Seek(get_key(index));
+    check_iterator(iterator, expected_status, forward_value);
+    if (extra_check) {
+      extra_check(iterator);
+    }
+    delete iterator;
+
+    // Next
+    iterator = create_iterator();
+    ASSERT_OK(iterator->Refresh());
+    iterator->Seek(get_key(index - 1));
+    ASSERT_TRUE(iterator->Valid());
+    ASSERT_OK(iterator->status());
+    iterator->Next();
+    check_iterator(iterator, expected_status, forward_value);
+    if (extra_check) {
+      extra_check(iterator);
+    }
+    delete iterator;
+
+    // SeekForPrev
+    iterator = create_iterator();
+    ASSERT_OK(iterator->status());
+    ASSERT_OK(iterator->Refresh());
+    iterator->SeekForPrev(get_key(index));
+    check_iterator(iterator, expected_status, backward_value);
+    if (extra_check) {
+      extra_check(iterator);
+    }
+    delete iterator;
+
+    // Prev
+    // NOTE(review): unlike the cases above, no status()/Refresh() check is
+    // done before the Seek here -- presumably intentional, but confirm.
+    iterator = create_iterator();
+    iterator->Seek(get_key(index + 1));
+    ASSERT_TRUE(iterator->Valid());
+    ASSERT_OK(iterator->status());
+    iterator->Prev();
+    check_iterator(iterator, expected_status, backward_value);
+    if (extra_check) {
+      extra_check(iterator);
+    }
+    delete iterator;
+  };
+
+  // NOTE(review): only the memtable tier is exercised; the commented-out
+  // kAllTiers suggests the remaining tiers were deliberately disabled --
+  // confirm whether they should be re-enabled.
+  for (auto tier : {Tier::kMemtable} /*kAllTiers*/) {
+    // Avoid values from being purged.
+    std::vector<const Snapshot*> snapshots;
+    DestroyAndReopen(GetTestOptions());
+
+    // fill data; a snapshot is taken after each key so no version can be
+    // purged while the test runs
+    for (int i = 0; i < static_cast<int>(data.size()); i++) {
+      for (int j = static_cast<int>(data[i].size()) - 1; j >= 0; j--) {
+        std::string key = get_key(i);
+        std::string value = get_value(i, j);
+        WriteBatch batch;
+        switch (data[i][j]) {
+          case kTypeValue:
+            ASSERT_OK(Put(key, value));
+            break;
+          case kTypeDeletion:
+            ASSERT_OK(Delete(key));
+            break;
+          case kTypeSingleDeletion:
+            ASSERT_OK(SingleDelete(key));
+            break;
+          case kTypeMerge:
+            ASSERT_OK(Merge(key, value));
+            break;
+          case kTypeBlobIndex:
+            // Blob indexes need a WriteBatch; the other ops use helpers.
+            ASSERT_OK(PutBlobIndex(&batch, key, value));
+            ASSERT_OK(Write(&batch));
+            break;
+          default:
+            FAIL();
+        };
+      }
+      snapshots.push_back(dbfull()->GetSnapshot());
+    }
+    // Range-delete key15 (exclusive of key16) on top of its blob index.
+    ASSERT_OK(
+        dbfull()->DeleteRange(WriteOptions(), cfh(), get_key(15), get_key(16)));
+    snapshots.push_back(dbfull()->GetSnapshot());
+    MoveDataTo(tier);
+
+    // Normal iterator: a key whose newest version is a blob index yields
+    // Corruption; deleted keys are skipped to their neighbors.
+    verify(1, Status::kCorruption, "", "", create_normal_iterator);
+    verify(3, Status::kCorruption, "", "", create_normal_iterator);
+    verify(5, Status::kOk, get_value(5, 0), get_value(5, 0),
+           create_normal_iterator);
+    verify(7, Status::kOk, get_value(8, 0), get_value(6, 0),
+           create_normal_iterator);
+    verify(9, Status::kOk, get_value(10, 0), get_value(8, 0),
+           create_normal_iterator);
+    verify(11, Status::kCorruption, "", "", create_normal_iterator);
+    verify(13, Status::kOk,
+           get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0),
+           get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0),
+           create_normal_iterator);
+    verify(15, Status::kOk, get_value(16, 0), get_value(14, 0),
+           create_normal_iterator);
+
+    // Iterator with blob support: blob indexes are exposed verbatim, and
+    // IsBlob() reports whether the current entry is one.
+    verify(1, Status::kOk, get_value(1, 0), get_value(1, 0),
+           create_blob_iterator, check_is_blob(true));
+    verify(3, Status::kOk, get_value(3, 0), get_value(3, 0),
+           create_blob_iterator, check_is_blob(true));
+    verify(5, Status::kOk, get_value(5, 0), get_value(5, 0),
+           create_blob_iterator, check_is_blob(false));
+    verify(7, Status::kOk, get_value(8, 0), get_value(6, 0),
+           create_blob_iterator, check_is_blob(false));
+    verify(9, Status::kOk, get_value(10, 0), get_value(8, 0),
+           create_blob_iterator, check_is_blob(false));
+    // Merge on top of a blob index cannot be resolved even when blob
+    // indexes are exposed.
+    if (tier <= kImmutableMemtables) {
+      verify(11, Status::kNotSupported, "", "", create_blob_iterator);
+    } else {
+      verify(11, Status::kCorruption, "", "", create_blob_iterator);
+    }
+    verify(13, Status::kOk,
+           get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0),
+           get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0),
+           create_blob_iterator, check_is_blob(false));
+    verify(15, Status::kOk, get_value(16, 0), get_value(14, 0),
+           create_blob_iterator, check_is_blob(false));
+
+#ifndef ROCKSDB_LITE
+    // Iterator with blob support and using seek: disabling sequential
+    // skips forces DBIter::FindValueForCurrentKeyUsingSeek.
+    ASSERT_OK(dbfull()->SetOptions(
+        cfh(), {{"max_sequential_skip_in_iterations", "0"}}));
+    verify(1, Status::kOk, get_value(1, 0), get_value(1, 0),
+           create_blob_iterator, check_is_blob(true));
+    verify(3, Status::kOk, get_value(3, 0), get_value(3, 0),
+           create_blob_iterator, check_is_blob(true));
+    verify(5, Status::kOk, get_value(5, 0), get_value(5, 0),
+           create_blob_iterator, check_is_blob(false));
+    verify(7, Status::kOk, get_value(8, 0), get_value(6, 0),
+           create_blob_iterator, check_is_blob(false));
+    verify(9, Status::kOk, get_value(10, 0), get_value(8, 0),
+           create_blob_iterator, check_is_blob(false));
+    if (tier <= kImmutableMemtables) {
+      verify(11, Status::kNotSupported, "", "", create_blob_iterator);
+    } else {
+      verify(11, Status::kCorruption, "", "", create_blob_iterator);
+    }
+    verify(13, Status::kOk,
+           get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0),
+           get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0),
+           create_blob_iterator, check_is_blob(false));
+    verify(15, Status::kOk, get_value(16, 0), get_value(14, 0),
+           create_blob_iterator, check_is_blob(false));
+#endif  // !ROCKSDB_LITE
+
+    for (auto* snapshot : snapshots) {
+      dbfull()->ReleaseSnapshot(snapshot);
+    }
+  }
+}
+
+// Verifies that iterators over the integrated BlobDB (enable_blob_files)
+// resolve blob values transparently, including for merge chains.
+TEST_F(DBBlobIndexTest, IntegratedBlobIterate) {
+  // Operations applied per key; each operation is flushed individually so
+  // every version lands in its own file.
+  const std::vector<std::vector<std::string>> data = {
+      /*00*/ {"Put"},
+      /*01*/ {"Put", "Merge", "Merge", "Merge"},
+      /*02*/ {"Put"}};
+
+  auto get_key = [](size_t index) { return ("key" + std::to_string(index)); };
+
+  auto get_value = [&](size_t index, size_t version) {
+    return get_key(index) + "_value" + std::to_string(version);
+  };
+
+  // Asserts the iterator's status; on OK also checks validity and value.
+  auto check_iterator = [&](Iterator* iterator, Status expected_status,
+                            const Slice& expected_value) {
+    ASSERT_EQ(expected_status, iterator->status());
+    if (expected_status.ok()) {
+      ASSERT_TRUE(iterator->Valid());
+      ASSERT_EQ(expected_value, iterator->value());
+    } else {
+      ASSERT_FALSE(iterator->Valid());
+    }
+  };
+
+  // Exercises Seek, Next, SeekForPrev and Prev around `index`, each with a
+  // fresh iterator, expecting the same value from every direction.
+  auto verify = [&](size_t index, Status expected_status,
+                    const Slice& expected_value) {
+    // Seek
+    {
+      Iterator* iterator = db_->NewIterator(ReadOptions());
+      std::unique_ptr<Iterator> iterator_guard(iterator);
+      ASSERT_OK(iterator->status());
+      ASSERT_OK(iterator->Refresh());
+      iterator->Seek(get_key(index));
+      check_iterator(iterator, expected_status, expected_value);
+    }
+    // Next
+    {
+      Iterator* iterator = db_->NewIterator(ReadOptions());
+      std::unique_ptr<Iterator> iterator_guard(iterator);
+      ASSERT_OK(iterator->Refresh());
+      iterator->Seek(get_key(index - 1));
+      ASSERT_TRUE(iterator->Valid());
+      ASSERT_OK(iterator->status());
+      iterator->Next();
+      check_iterator(iterator, expected_status, expected_value);
+    }
+    // SeekForPrev
+    {
+      Iterator* iterator = db_->NewIterator(ReadOptions());
+      std::unique_ptr<Iterator> iterator_guard(iterator);
+      ASSERT_OK(iterator->status());
+      ASSERT_OK(iterator->Refresh());
+      iterator->SeekForPrev(get_key(index));
+      check_iterator(iterator, expected_status, expected_value);
+    }
+    // Prev
+    {
+      Iterator* iterator = db_->NewIterator(ReadOptions());
+      std::unique_ptr<Iterator> iterator_guard(iterator);
+      iterator->Seek(get_key(index + 1));
+      ASSERT_TRUE(iterator->Valid());
+      ASSERT_OK(iterator->status());
+      iterator->Prev();
+      check_iterator(iterator, expected_status, expected_value);
+    }
+  };
+
+  Options options = GetTestOptions();
+  options.enable_blob_files = true;
+  options.min_blob_size = 0;  // store every value as a blob
+
+  DestroyAndReopen(options);
+
+  // fill data
+  for (size_t i = 0; i < data.size(); i++) {
+    for (size_t j = 0; j < data[i].size(); j++) {
+      std::string key = get_key(i);
+      std::string value = get_value(i, j);
+      if (data[i][j] == "Put") {
+        ASSERT_OK(Put(key, value));
+        ASSERT_OK(Flush());
+      } else if (data[i][j] == "Merge") {
+        ASSERT_OK(Merge(key, value));
+        ASSERT_OK(Flush());
+      }
+    }
+  }
+
+  // key1's merge chain should resolve to all four versions joined by ','.
+  std::string expected_value = get_value(1, 0) + "," + get_value(1, 1) + "," +
+                               get_value(1, 2) + "," + get_value(1, 3);
+  Status expected_status;
+  verify(1, expected_status, expected_value);
+
+#ifndef ROCKSDB_LITE
+  // Test DBIter::FindValueForCurrentKeyUsingSeek flow.
+  ASSERT_OK(dbfull()->SetOptions(cfh(),
+                                 {{"max_sequential_skip_in_iterations", "0"}}));
+  verify(1, expected_status, expected_value);
+#endif  // !ROCKSDB_LITE
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+// Standard gtest entry point for this test binary.
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  RegisterCustomObjects(argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/blob/prefetch_buffer_collection.cc b/src/rocksdb/db/blob/prefetch_buffer_collection.cc
new file mode 100644
index 000000000..079576f51
--- /dev/null
+++ b/src/rocksdb/db/blob/prefetch_buffer_collection.cc
@@ -0,0 +1,21 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/prefetch_buffer_collection.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Returns the prefetch buffer associated with `file_number`, constructing
+// one (with this collection's readahead size) on first use. The returned
+// pointer remains owned by the collection.
+FilePrefetchBuffer* PrefetchBufferCollection::GetOrCreatePrefetchBuffer(
+    uint64_t file_number) {
+  auto it = prefetch_buffers_.find(file_number);
+  if (it == prefetch_buffers_.end()) {
+    it = prefetch_buffers_
+             .emplace(file_number, std::make_unique<FilePrefetchBuffer>(
+                                       readahead_size_, readahead_size_))
+             .first;
+  }
+
+  return it->second.get();
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/prefetch_buffer_collection.h b/src/rocksdb/db/blob/prefetch_buffer_collection.h
new file mode 100644
index 000000000..b973eddc0
--- /dev/null
+++ b/src/rocksdb/db/blob/prefetch_buffer_collection.h
@@ -0,0 +1,38 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cassert>
+#include <cstdint>
+#include <memory>
+#include <unordered_map>
+
+#include "file/file_prefetch_buffer.h"
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// A class that owns a collection of FilePrefetchBuffers using the file number
+// as key. Used for implementing compaction readahead for blob files. Designed
+// to be accessed by a single thread only: every (sub)compaction needs its own
+// buffers since they are guaranteed to read different blobs from different
+// positions even when reading the same file.
+class PrefetchBufferCollection {
+ public:
+  // `readahead_size` (bytes) is used for every FilePrefetchBuffer this
+  // collection creates; it must be positive.
+  explicit PrefetchBufferCollection(uint64_t readahead_size)
+      : readahead_size_(readahead_size) {
+    assert(readahead_size_ > 0);
+  }
+
+  // Returns the prefetch buffer for `file_number`, creating it on first
+  // use. The returned pointer is owned by this collection and stays valid
+  // for the collection's lifetime.
+  FilePrefetchBuffer* GetOrCreatePrefetchBuffer(uint64_t file_number);
+
+ private:
+  uint64_t readahead_size_;  // readahead size used for each buffer
+  std::unordered_map<uint64_t, std::unique_ptr<FilePrefetchBuffer>>
+      prefetch_buffers_;  // maps file number to prefetch buffer
+};
+
+} // namespace ROCKSDB_NAMESPACE