Diffstat (limited to 'src/test/objectstore')
-rwxr-xr-x  src/test/objectstore/Allocator_aging_fragmentation.cc    463
-rw-r--r--  src/test/objectstore/Allocator_bench.cc                  368
-rw-r--r--  src/test/objectstore/Allocator_test.cc                   566
-rw-r--r--  src/test/objectstore/CMakeLists.txt                      140
-rw-r--r--  src/test/objectstore/ObjectStoreTransactionBenchmark.cc  266
-rw-r--r--  src/test/objectstore/TestObjectStoreState.cc             299
-rw-r--r--  src/test/objectstore/TestObjectStoreState.h              158
-rw-r--r--  src/test/objectstore/TestRocksdbOptionParse.cc            78
-rw-r--r--  src/test/objectstore/allocator_replay_test.cc            694
-rw-r--r--  src/test/objectstore/fastbmap_allocator_test.cc         1145
-rwxr-xr-x  src/test/objectstore/hybrid_allocator_test.cc            231
-rwxr-xr-x  src/test/objectstore/run_seed_to.sh                      293
-rwxr-xr-x  src/test/objectstore/run_seed_to_range.sh                 24
-rw-r--r--  src/test/objectstore/run_smr_bluestore_test.sh            48
-rwxr-xr-x  src/test/objectstore/run_test_deferred.sh                 52
-rw-r--r--  src/test/objectstore/store_test.cc                     10932
-rw-r--r--  src/test/objectstore/store_test_fixture.cc               135
-rw-r--r--  src/test/objectstore/store_test_fixture.h                 52
-rwxr-xr-x  src/test/objectstore/test_bdev.cc                        111
-rw-r--r--  src/test/objectstore/test_bluefs.cc                     1422
-rw-r--r--  src/test/objectstore/test_bluestore_types.cc            2346
-rw-r--r--  src/test/objectstore/test_deferred.cc                    146
-rw-r--r--  src/test/objectstore/test_kv.cc                         1304
-rw-r--r--  src/test/objectstore/test_memstore_clone.cc              202
-rw-r--r--  src/test/objectstore/test_transaction.cc                 215
25 files changed, 21690 insertions, 0 deletions
diff --git a/src/test/objectstore/Allocator_aging_fragmentation.cc b/src/test/objectstore/Allocator_aging_fragmentation.cc
new file mode 100755
index 000000000..220f8841b
--- /dev/null
+++ b/src/test/objectstore/Allocator_aging_fragmentation.cc
@@ -0,0 +1,463 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Bitmap allocator fragmentation benchmarks.
+ * Author: Adam Kupczyk, akupczyk@redhat.com
+ */
+#include <bit>
+#include <iostream>
+#include <boost/scoped_ptr.hpp>
+#include <gtest/gtest.h>
+#include <boost/random/triangle_distribution.hpp>
+
+#include "common/ceph_mutex.h"
+#include "common/Cond.h"
+#include "common/errno.h"
+#include "global/global_init.h"
+#include "include/stringify.h"
+#include "include/Context.h"
+#include "os/bluestore/Allocator.h"
+
+#include <boost/random/uniform_int.hpp>
+
+typedef boost::mt11213b gen_type;
+
+#include "common/debug.h"
+#define dout_context cct
+#define dout_subsys ceph_subsys_
+
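+// A single aging scenario. Note on units (inferred from how the tests use
+// these fields): capacity is in GiB (multiplied by _1G below), alloc_unit is
+// in bytes, high_mark/low_mark are fill fractions of the capacity, leakness
+// is roughly the probability of leaking (randomly freeing) each allocated
+// fragment during a fill pass, and repeats is the number of aging iterations.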
+struct Scenario {
+ uint64_t capacity;
+ uint64_t alloc_unit;
+ double high_mark;
+ double low_mark;
+ double leakness;
+ uint32_t repeats;
+};
+
+std::vector<Scenario> scenarios{
+ Scenario{512, 65536, 0.8, 0.6, 0.1, 3},
+ Scenario{512, 65536, 0.9, 0.7, 0.0, 3},
+ Scenario{512, 65536, 0.9, 0.7, 0.1, 3},
+ Scenario{512, 65536, 0.8, 0.6, 0.5, 3},
+ Scenario{512, 65536, 0.9, 0.7, 0.5, 3},
+ Scenario{1024, 65536, 0.8, 0.6, 0.1, 3},
+ Scenario{1024, 65536, 0.9, 0.7, 0.0, 3},
+ Scenario{1024, 65536, 0.9, 0.7, 0.1, 3},
+ Scenario{1024*2, 65536, 0.8, 0.6, 0.3, 3},
+ Scenario{1024*2, 65536, 0.9, 0.7, 0.0, 3},
+ Scenario{1024*2, 65536, 0.9, 0.7, 0.3, 3},
+ Scenario{512, 65536/16, 0.8, 0.6, 0.1, 3},
+ Scenario{512, 65536/16, 0.9, 0.7, 0.0, 3},
+ Scenario{512, 65536/16, 0.9, 0.7, 0.1, 3},
+ Scenario{512, 65536/16, 0.8, 0.6, 0.5, 3},
+ Scenario{512, 65536/16, 0.9, 0.7, 0.5, 3},
+ Scenario{1024, 65536/16, 0.8, 0.6, 0.1, 3},
+ Scenario{1024, 65536/16, 0.9, 0.7, 0.0, 3},
+ Scenario{1024, 65536/16, 0.9, 0.7, 0.1, 3},
+ Scenario{1024*2, 65536/16, 0.8, 0.6, 0.3, 3},
+ Scenario{1024*2, 65536/16, 0.9, 0.7, 0.0, 3},
+ Scenario{1024*2, 65536/16, 0.9, 0.7, 0.3, 3}
+};
+
+void PrintTo(const Scenario& s, ::std::ostream* os)
+{
+ *os << "(capacity=" << s.capacity;
+ *os << "G, alloc_unit=" << s.alloc_unit;
+ *os << ", high_mark=" << s.high_mark;
+ *os << ", low_mark=" << s.low_mark;
+ *os << ", leakness=" << s.leakness;
+ *os << ", repeats=" << s.repeats << ")";
+}
+bool verbose = getenv("VERBOSE") != nullptr;
+
+class AllocTracker;
+class AllocTest : public ::testing::TestWithParam<std::string> {
+protected:
+ boost::scoped_ptr<AllocTracker> at;
+ gen_type rng;
+ static boost::intrusive_ptr<CephContext> cct;
+
+public:
+ boost::scoped_ptr<Allocator> alloc;
+ AllocTest(): alloc(nullptr) {}
+ void init_alloc(const std::string& alloc_name, int64_t size, uint64_t min_alloc_size);
+ void init_close();
+ void doAgingTest(std::function<uint32_t()> size_generator,
+ const std::string& alloc_name, uint64_t capacity, uint32_t alloc_unit,
+ uint64_t high_mark, uint64_t low_mark, uint32_t iterations, double leak_factor = 0);
+
+ uint64_t capacity;
+ uint32_t alloc_unit;
+
+ uint64_t level = 0;
+ uint64_t allocs = 0;
+ uint64_t fragmented = 0;
+ uint64_t fragments = 0;
+ uint64_t total_fragments = 0;
+
+ void do_fill(uint64_t high_mark, std::function<uint32_t()> size_generator, double leak_factor = 0);
+ void do_free(uint64_t low_mark);
+ uint32_t free_random();
+
+ void TearDown() final;
+ static void SetUpTestSuite();
+ static void TearDownTestSuite();
+};
+
+struct test_result {
+ uint64_t tests_cnt = 0;
+ double fragmented_percent = 0;
+ double fragments_count = 0;
+ double time = 0;
+ double frag_score = 0;
+};
+
+std::map<std::string, test_result> results_per_allocator;
+
+const uint64_t _1m = 1024 * 1024;
+const uint64_t _1G = 1024 * 1024 * 1024;
+
+const uint64_t _2m = 2 * 1024 * 1024;
+
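+// Book-keeping for live allocations so that random extents can be picked for
+// release. pop_random() may hand back only the first max_len bytes of a
+// stored extent, keeping the remainder tracked, i.e. frees can be partial.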
+class AllocTracker
+{
+ std::vector<bluestore_pextent_t> allocations;
+ uint64_t size = 0;
+
+public:
+ bool push(uint64_t offs, uint32_t len)
+ {
+ assert(len != 0);
+ if (size + 1 > allocations.size())
+ allocations.resize(size + 100);
+ allocations[size++] = bluestore_pextent_t(offs, len);
+ return true;
+ }
+
+ bool pop_random(gen_type& rng, uint64_t* offs, uint32_t* len,
+ uint32_t max_len = 0)
+ {
+ if (size == 0)
+ return false;
+ uint64_t pos = rng() % size;
+ *len = allocations[pos].length;
+ *offs = allocations[pos].offset;
+
+ if (max_len && *len > max_len) {
+ allocations[pos].length = *len - max_len;
+ allocations[pos].offset = *offs + max_len;
+ *len = max_len;
+ } else {
+ allocations[pos] = allocations[size-1];
+ --size;
+ }
+ return true;
+ }
+};
+
+boost::intrusive_ptr<CephContext> AllocTest::cct;
+
+void AllocTest::init_alloc(const std::string& allocator_name, int64_t size, uint64_t min_alloc_size) {
+ this->capacity = size;
+ this->alloc_unit = min_alloc_size;
+ rng.seed(0);
+ alloc.reset(Allocator::create(cct.get(), allocator_name, size,
+ min_alloc_size));
+ at.reset(new AllocTracker());
+}
+
+void AllocTest::init_close() {
+ alloc.reset(0);
+ at.reset(nullptr);
+}
+
+uint32_t AllocTest::free_random() {
+ uint64_t o = 0;
+ uint32_t l = 0;
+ interval_set<uint64_t> release_set;
+ if (!at->pop_random(rng, &o, &l)) {
+ //empty?
+ return 0;
+ }
+ release_set.insert(o, l);
+ alloc->release(release_set);
+ level -= l;
+ return l;
+}
+
+
+void AllocTest::do_fill(uint64_t high_mark, std::function<uint32_t()> size_generator, double leak_factor) {
+ assert (leak_factor >= 0);
+ assert (leak_factor < 1);
+ uint32_t leak_level = leak_factor * std::numeric_limits<uint32_t>::max();
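+  // Map leak_factor onto the RNG's full 32-bit range: each allocated
+  // fragment below is then leaked (a random extent freed) with probability
+  // of roughly leak_factor.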
+ PExtentVector tmp;
+ while (level < high_mark)
+ {
+ uint32_t want = size_generator();
+ tmp.clear();
+ auto r = alloc->allocate(want, alloc_unit, 0, 0, &tmp);
+ if (r < want) {
+ break;
+ }
+ level += r;
+ for(auto a : tmp) {
+ bool full = !at->push(a.offset, a.length);
+ EXPECT_EQ(full, false);
+ }
+ allocs++;
+ if (tmp.size() > 1) {
+ fragmented ++;
+ total_fragments += r;
+ fragments += tmp.size();
+ }
+ if (leak_level > 0) {
+ for (size_t i=0; i<tmp.size(); i++) {
+ if (uint32_t(rng()) < leak_level) {
+ free_random();
+ }
+ }
+ }
+ }
+}
+
+void AllocTest::do_free(uint64_t low_mark) {
+ while (level > low_mark)
+ {
+ if (free_random() == 0)
+ break;
+ }
+}
+
+void AllocTest::doAgingTest(
+ std::function<uint32_t()> size_generator,
+ const std::string& allocator_name,
+ uint64_t capacity, uint32_t alloc_unit,
+ uint64_t high_mark, uint64_t low_mark, uint32_t iterations, double leak_factor)
+{
+ assert(std::has_single_bit(alloc_unit));
+ cct->_conf->bdev_block_size = alloc_unit;
+ PExtentVector allocated, tmp;
+ init_alloc(allocator_name, capacity, alloc_unit);
+ alloc->init_add_free(0, capacity);
+
+ utime_t start = ceph_clock_now();
+ level = 0;
+ allocs = 0;
+ fragmented = 0;
+ fragments = 0;
+ total_fragments = 0;
+ if (verbose) std::cout << "INITIAL FILL" << std::endl;
+ do_fill(high_mark, size_generator, leak_factor); //initial fill with data
+ if (verbose) std::cout << " fragmented allocs=" << 100.0 * fragmented / allocs << "%" <<
+ " #frags=" << ( fragmented != 0 ? double(fragments) / fragmented : 0 )<<
+ " time=" << (ceph_clock_now() - start) * 1000 << "ms" << std::endl;
+
+ for (uint32_t i=0; i < iterations; i++)
+ {
+ allocs = 0;
+ fragmented = 0;
+ fragments = 0;
+ total_fragments = 0;
+
+ uint64_t level_previous = level;
+ start = ceph_clock_now();
+ if (verbose) std::cout << "ADDING CAPACITY " << i + 1 << std::endl;
+ do_free(low_mark); //simulates adding new capacity to cluster
+ if (verbose) std::cout << " level change: " <<
+ double(level_previous) / capacity * 100 << "% -> " <<
+ double(level) / capacity * 100 << "% time=" <<
+ (ceph_clock_now() - start) * 1000 << "ms" << std::endl;
+
+ start = ceph_clock_now();
+ if (verbose) std::cout << "APPENDING " << i + 1 << std::endl;
+ do_fill(high_mark, size_generator, leak_factor); //only creating elements
+ if (verbose) std::cout << " fragmented allocs=" << 100.0 * fragmented / allocs << "%" <<
+ " #frags=" << ( fragmented != 0 ? double(fragments) / fragmented : 0 ) <<
+ " time=" << (ceph_clock_now() - start) * 1000 << "ms" << std::endl;
+ }
+ double frag_score = alloc->get_fragmentation_score();
+ do_free(0);
+ double free_frag_score = alloc->get_fragmentation_score();
+ ASSERT_EQ(alloc->get_free(), capacity);
+
+ std::cout << " fragmented allocs=" << 100.0 * fragmented / allocs << "%" <<
+ " #frags=" << ( fragmented != 0 ? double(fragments) / fragmented : 0 ) <<
+ " time=" << (ceph_clock_now() - start) * 1000 << "ms" <<
+ " frag.score=" << frag_score << " after free frag.score=" << free_frag_score << std::endl;
+
+ uint64_t sum = 0;
+ uint64_t cnt = 0;
+ auto list_free = [&](size_t off, size_t len) {
+ cnt++;
+ sum+=len;
+ };
+ alloc->dump(list_free);
+ ASSERT_EQ(sum, capacity);
+ if (verbose)
+ std::cout << "free chunks sum=" << sum << " free chunks count=" << cnt << std::endl;
+
+ //adding to totals
+ test_result &r = results_per_allocator[allocator_name];
+ r.tests_cnt ++;
+ r.fragmented_percent += 100.0 * fragmented / allocs;
+ r.fragments_count += ( fragmented != 0 ? double(fragments) / fragmented : 2 );
+ r.time += ceph_clock_now() - start;
+ r.frag_score += frag_score;
+}
+
+void AllocTest::SetUpTestSuite()
+{
+ vector<const char*> args;
+ cct = global_init(NULL, args,
+ CEPH_ENTITY_TYPE_CLIENT,
+ CODE_ENVIRONMENT_UTILITY,
+ CINIT_FLAG_NO_DEFAULT_CONFIG_FILE);
+ common_init_finish(cct.get());
+}
+
+void AllocTest::TearDown()
+{
+ at.reset();
+ alloc.reset();
+}
+
+void AllocTest::TearDownTestSuite()
+{
+ cct.reset();
+
+ std::cout << "Summary: " << std::endl;
+ for (auto& r: results_per_allocator) {
+ std::cout << r.first <<
+ " fragmented allocs=" << r.second.fragmented_percent / r.second.tests_cnt << "%" <<
+ " #frags=" << r.second.fragments_count / r.second.tests_cnt <<
+ " free_score=" << r.second.frag_score / r.second.tests_cnt <<
+ " time=" << r.second.time * 1000 << "ms" << std::endl;
+ }
+}
+
+
+TEST_P(AllocTest, test_alloc_triangle_0_8M_16M)
+{
+ std::string allocator_name = GetParam();
+ boost::triangle_distribution<double> D(1, (8 * 1024 * 1024) , (16 * 1024 * 1024) );
+ for (auto& s:scenarios) {
+ std::cout << "Allocator: " << allocator_name << ", ";
+ PrintTo(s, &std::cout);
+ std::cout << std::endl;
+
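+    // Sizes come from a triangular distribution (lower bound 1, mode 8 MiB,
+    // upper bound 16 MiB), rounded to a multiple of alloc_unit; the mask
+    // trick requires a power-of-two alloc_unit (asserted in doAgingTest).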
+ auto size_generator = [&]() -> uint32_t {
+ return (uint32_t(D(rng)) + s.alloc_unit) & ~(s.alloc_unit - 1);
+ };
+
+ doAgingTest(size_generator, allocator_name, s.capacity * _1G, s.alloc_unit,
+ s.high_mark * s.capacity * _1G,
+ s.low_mark * s.capacity * _1G,
+ s.repeats, s.leakness);
+ }
+}
+
+TEST_P(AllocTest, test_alloc_8M_and_64K)
+{
+ std::string allocator_name = GetParam();
+ constexpr uint32_t max_chunk_size = 8*1024*1024;
+ constexpr uint32_t min_chunk_size = 64*1024;
+ for (auto& s:scenarios) {
+ std::cout << "Allocator: " << allocator_name << ", ";
+ PrintTo(s, &std::cout);
+ std::cout << std::endl;
+ boost::uniform_int<> D(0, 1);
+
+ auto size_generator = [&]() -> uint32_t {
+ if (D(rng) == 0)
+ return max_chunk_size;
+ else
+ return min_chunk_size;
+ };
+
+ doAgingTest(size_generator, allocator_name, s.capacity * _1G, s.alloc_unit,
+ s.high_mark * s.capacity * _1G,
+ s.low_mark * s.capacity * _1G,
+ s.repeats, s.leakness);
+ }
+}
+
+TEST_P(AllocTest, test_alloc_fragmentation_max_chunk_8M)
+{
+ std::string allocator_name = GetParam();
+ constexpr uint32_t max_object_size = 150*1000*1000;
+ constexpr uint32_t max_chunk_size = 8*1024*1024;
+ for (auto& s:scenarios) {
+ std::cout << "Allocator: " << allocator_name << ", ";
+ PrintTo(s, &std::cout);
+ std::cout << std::endl;
+ boost::uniform_int<> D(1, max_object_size / s.alloc_unit);
+
+ uint32_t object_size = 0;
+
+ auto size_generator = [&]() -> uint32_t {
+ uint32_t c;
+ if (object_size == 0)
+ object_size = (uint32_t(D(rng))* s.alloc_unit);
+ if (object_size > max_chunk_size)
+ c = max_chunk_size;
+ else
+ c = object_size;
+ object_size -= c;
+ return c;
+ };
+
+ doAgingTest(size_generator, allocator_name, s.capacity * _1G, s.alloc_unit,
+ s.high_mark * s.capacity * _1G,
+ s.low_mark * s.capacity * _1G,
+ s.repeats, s.leakness);
+ }
+}
+
+TEST_P(AllocTest, test_bonus_empty_fragmented)
+{
+ uint64_t capacity = uint64_t(512) * 1024 * 1024 * 1024; //512 G
+ uint64_t alloc_unit = 64 * 1024;
+ std::string allocator_name = GetParam();
+ std::cout << "Allocator: " << allocator_name << std::endl;
+ init_alloc(allocator_name, capacity, alloc_unit);
+ alloc->init_add_free(0, capacity);
+ PExtentVector tmp;
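+  // Allocate 1M chunks across the device, returning each one immediately.
+  // Contiguous chunks are released in three separate pieces (middle, head,
+  // tail), so the free-space map ends up fragmented even though the
+  // allocator is completely empty again at the end.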
+ for (size_t i = 0; i < capacity / (1024 * 1024); i++) {
+ tmp.clear();
+ uint32_t want = 1024 * 1024;
+ int r = alloc->allocate(want, alloc_unit, 0, 0, &tmp);
+ ASSERT_EQ(r, want);
+ if (tmp.size() > 1) {
+ interval_set<uint64_t> release_set;
+ for (auto& t: tmp) {
+ release_set.insert(t.offset, t.length);
+ }
+ alloc->release(release_set);
+ } else {
+ interval_set<uint64_t> release_set;
+ uint64_t offset = tmp[0].offset;
+ uint64_t length = tmp[0].length;
+
+ release_set.insert(offset + alloc_unit, length - 3 * alloc_unit);
+ alloc->release(release_set);
+ release_set.clear();
+
+ release_set.insert(offset , alloc_unit);
+ alloc->release(release_set);
+ release_set.clear();
+
+ release_set.insert(offset + length - 2 * alloc_unit, 2 * alloc_unit);
+ alloc->release(release_set);
+ release_set.clear();
+ }
+ }
+ double frag_score = alloc->get_fragmentation_score();
+ ASSERT_EQ(alloc->get_free(), capacity);
+ std::cout << " empty storage frag.score=" << frag_score << std::endl;
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ Allocator,
+ AllocTest,
+ ::testing::Values("stupid", "bitmap", "avl", "btree"));
diff --git a/src/test/objectstore/Allocator_bench.cc b/src/test/objectstore/Allocator_bench.cc
new file mode 100644
index 000000000..0d04a854e
--- /dev/null
+++ b/src/test/objectstore/Allocator_bench.cc
@@ -0,0 +1,368 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * In memory space allocator benchmarks.
+ * Author: Igor Fedotov, ifedotov@suse.com
+ */
+#include <iostream>
+#include <boost/scoped_ptr.hpp>
+#include <gtest/gtest.h>
+
+#include "common/Cond.h"
+#include "common/errno.h"
+#include "include/stringify.h"
+#include "include/Context.h"
+#include "os/bluestore/Allocator.h"
+
+#include <boost/random/uniform_int.hpp>
+typedef boost::mt11213b gen_type;
+
+#include "common/debug.h"
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_
+
+using namespace std;
+
+class AllocTest : public ::testing::TestWithParam<const char*> {
+
+public:
+ boost::scoped_ptr<Allocator> alloc;
+ AllocTest(): alloc(0) { }
+ void init_alloc(int64_t size, uint64_t min_alloc_size) {
+ std::cout << "Creating alloc type " << string(GetParam()) << " \n";
+ alloc.reset(Allocator::create(g_ceph_context, GetParam(), size,
+ min_alloc_size));
+ }
+
+ void init_close() {
+ alloc.reset(0);
+ }
+ void doOverwriteTest(uint64_t capacity, uint64_t prefill,
+ uint64_t overwrite);
+};
+
+const uint64_t _1m = 1024 * 1024;
+
+void dump_mempools()
+{
+ ostringstream ostr;
+ Formatter* f = Formatter::create("json-pretty", "json-pretty", "json-pretty");
+ ostr << "Mempools: ";
+ f->open_object_section("mempools");
+ mempool::dump(f);
+ f->close_section();
+ f->flush(ostr);
+ delete f;
+ ldout(g_ceph_context, 0) << ostr.str() << dendl;
+}
+
+class AllocTracker
+{
+ std::vector<uint64_t> allocations;
+ uint64_t head = 0;
+ uint64_t tail = 0;
+ uint64_t size = 0;
+ boost::uniform_int<> u1;
+
+public:
+ AllocTracker(uint64_t capacity, uint64_t alloc_unit)
+ : u1(0, capacity)
+ {
+ ceph_assert(alloc_unit >= 0x100);
+ ceph_assert(capacity <= (uint64_t(1) << 48)); // we use 5 octets (bytes 1 - 5) to store
+ // offset to save the required space.
+ // This supports capacity up to 281 TB
+
+ allocations.resize(capacity / alloc_unit);
+ }
+ inline uint64_t get_head() const
+ {
+ return head;
+ }
+
+ inline uint64_t get_tail() const
+ {
+ return tail;
+ }
+
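+  // Extents are packed into a single uint64_t: offsets and lengths are
+  // 256-byte aligned, so the low 8 bits of each are dropped and the value
+  // stores (offs << 16) | (len >> 8); pop()/pop_random() reverse this.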
+ bool push(uint64_t offs, uint32_t len)
+ {
+ ceph_assert((len & 0xff) == 0);
+ ceph_assert((offs & 0xff) == 0);
+ ceph_assert((offs & 0xffff000000000000) == 0);
+
+ if (head + 1 == tail)
+ return false;
+ uint64_t val = (offs << 16) | (len >> 8);
+ allocations[head++] = val;
+ head %= allocations.size();
+ ++size;
+ return true;
+ }
+ bool pop(uint64_t* offs, uint32_t* len)
+ {
+ if (size == 0)
+ return false;
+ uint64_t val = allocations[tail++];
+ *len = uint64_t((val & 0xffffff) << 8);
+ *offs = (val >> 16) & ~uint64_t(0xff);
+ tail %= allocations.size();
+ --size;
+ return true;
+ }
+ bool pop_random(gen_type& rng, uint64_t* offs, uint32_t* len,
+ uint32_t max_len = 0)
+ {
+ if (size == 0)
+ return false;
+
+ uint64_t pos = (u1(rng) % size) + tail;
+ pos %= allocations.size();
+ uint64_t val = allocations[pos];
+ *len = uint64_t((val & 0xffffff) << 8);
+ *offs = (val >> 16) & ~uint64_t(0xff);
+ if (max_len && *len > max_len) {
+ val = ((*offs + max_len) << 16) | ((*len - max_len) >> 8);
+ allocations[pos] = val;
+ *len = max_len;
+ } else {
+ allocations[pos] = allocations[tail++];
+ tail %= allocations.size();
+ --size;
+ }
+ return true;
+ }
+};
+
+TEST_P(AllocTest, test_alloc_bench_seq)
+{
+ uint64_t capacity = uint64_t(1024) * 1024 * 1024 * 1024;
+ uint64_t alloc_unit = 4096;
+ uint64_t want_size = alloc_unit;
+ PExtentVector allocated, tmp;
+
+ init_alloc(capacity, alloc_unit);
+ alloc->init_add_free(0, capacity);
+
+ utime_t start = ceph_clock_now();
+ for (uint64_t i = 0; i < capacity; i += want_size)
+ {
+ tmp.clear();
+ EXPECT_EQ(static_cast<int64_t>(want_size),
+ alloc->allocate(want_size, alloc_unit, 0, 0, &tmp));
+ if (0 == (i % (1 * 1024 * _1m))) {
+ std::cout << "alloc " << i / 1024 / 1024 << " mb of "
+ << capacity / 1024 / 1024 << std::endl;
+ }
+ }
+
+ std::cout << "releasing..." << std::endl;
+ for (size_t i = 0; i < capacity; i += want_size)
+ {
+ interval_set<uint64_t> release_set;
+ release_set.insert(i, want_size);
+ alloc->release(release_set);
+ if (0 == (i % (1 * 1024 * _1m))) {
+ std::cout << "release " << i / 1024 / 1024 << " mb of "
+ << capacity / 1024 / 1024 << std::endl;
+ }
+ }
+ std::cout<<"Executed in "<< ceph_clock_now() - start << std::endl;
+ dump_mempools();
+}
+
+TEST_P(AllocTest, test_alloc_bench)
+{
+ uint64_t capacity = uint64_t(1024) * 1024 * 1024 * 1024;
+ uint64_t alloc_unit = 4096;
+ PExtentVector allocated, tmp;
+ AllocTracker at(capacity, alloc_unit);
+
+ init_alloc(capacity, alloc_unit);
+ alloc->init_add_free(0, capacity);
+
+ gen_type rng(time(NULL));
+ boost::uniform_int<> u1(0, 9); // 4K-2M
+ boost::uniform_int<> u2(0, 7); // 4K-512K
+
+ utime_t start = ceph_clock_now();
+ for (uint64_t i = 0; i < capacity * 2; )
+ {
+ uint32_t want = alloc_unit << u1(rng);
+
+ tmp.clear();
+ auto r = alloc->allocate(want, alloc_unit, 0, 0, &tmp);
+ if (r < want) {
+ break;
+ }
+ i += r;
+
+ for(auto a : tmp) {
+ bool full = !at.push(a.offset, a.length);
+ EXPECT_EQ(full, false);
+ }
+ uint64_t want_release = alloc_unit << u2(rng);
+ uint64_t released = 0;
+ do {
+ uint64_t o = 0;
+ uint32_t l = 0;
+ interval_set<uint64_t> release_set;
+ if (!at.pop_random(rng, &o, &l, want_release - released)) {
+ break;
+ }
+ release_set.insert(o, l);
+ alloc->release(release_set);
+ released += l;
+ } while (released < want_release);
+
+ if (0 == (i % (1 * 1024 * _1m))) {
+ std::cout << "alloc " << i / 1024 / 1024 << " mb of "
+ << capacity / 1024 / 1024 << std::endl;
+ }
+ }
+ std::cout<<"Executed in "<< ceph_clock_now() - start << std::endl;
+ std::cout<<"Avail "<< alloc->get_free() / _1m << " MB" << std::endl;
+ dump_mempools();
+}
+
+void AllocTest::doOverwriteTest(uint64_t capacity, uint64_t prefill,
+ uint64_t overwrite)
+{
+ uint64_t alloc_unit = 4096;
+ PExtentVector allocated, tmp;
+ AllocTracker at(capacity, alloc_unit);
+
+ init_alloc(capacity, alloc_unit);
+ alloc->init_add_free(0, capacity);
+
+ gen_type rng(time(NULL));
+ boost::uniform_int<> u1(0, 9); // 4K-2M
+  boost::uniform_int<> u2(0, 9); // 4K-2M
+
+ utime_t start = ceph_clock_now();
+  // prefill the requested share of the capacity
+ auto cap = prefill;
+ for (uint64_t i = 0; i < cap; )
+ {
+ uint32_t want = alloc_unit << u1(rng);
+ tmp.clear();
+ auto r = alloc->allocate(want, alloc_unit, 0, 0, &tmp);
+ if (r < want) {
+ break;
+ }
+ i += r;
+
+ for(auto a : tmp) {
+ bool full = !at.push(a.offset, a.length);
+ EXPECT_EQ(full, false);
+ }
+ if (0 == (i % (1 * 1024 * _1m))) {
+ std::cout << "alloc " << i / 1024 / 1024 << " mb of "
+ << cap / 1024 / 1024 << std::endl;
+ }
+ }
+
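+  // Overwrite phase: repeatedly free roughly want_release bytes of randomly
+  // chosen live extents, then allocate a fresh random-sized chunk; this
+  // simulates steady-state overwrites at the chosen occupancy.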
+ cap = overwrite;
+ for (uint64_t i = 0; i < cap; )
+ {
+ uint64_t want_release = alloc_unit << u2(rng);
+ uint64_t released = 0;
+ do {
+ uint64_t o = 0;
+ uint32_t l = 0;
+ interval_set<uint64_t> release_set;
+ if (!at.pop_random(rng, &o, &l, want_release - released)) {
+ break;
+ }
+ release_set.insert(o, l);
+ alloc->release(release_set);
+ released += l;
+ } while (released < want_release);
+
+ uint32_t want = alloc_unit << u1(rng);
+ tmp.clear();
+ auto r = alloc->allocate(want, alloc_unit, 0, 0, &tmp);
+ if (r != want) {
+ std::cout<<"Can't allocate more space, stopping."<< std::endl;
+ break;
+ }
+ i += r;
+
+ for(auto a : tmp) {
+ bool full = !at.push(a.offset, a.length);
+ EXPECT_EQ(full, false);
+ }
+
+ if (0 == (i % (1 * 1024 * _1m))) {
+ std::cout << "reuse " << i / 1024 / 1024 << " mb of "
+ << cap / 1024 / 1024 << std::endl;
+ }
+ }
+ std::cout<<"Executed in "<< ceph_clock_now() - start << std::endl;
+ std::cout<<"Avail "<< alloc->get_free() / _1m << " MB" << std::endl;
+
+ dump_mempools();
+}
+
+TEST_P(AllocTest, test_alloc_bench_90_300)
+{
+ uint64_t capacity = uint64_t(1024) * 1024 * 1024 * 1024;
+ auto prefill = capacity - capacity / 10;
+ auto overwrite = capacity * 3;
+ doOverwriteTest(capacity, prefill, overwrite);
+}
+
+TEST_P(AllocTest, test_alloc_bench_50_300)
+{
+ uint64_t capacity = uint64_t(1024) * 1024 * 1024 * 1024;
+ auto prefill = capacity / 2;
+ auto overwrite = capacity * 3;
+ doOverwriteTest(capacity, prefill, overwrite);
+}
+
+TEST_P(AllocTest, test_alloc_bench_10_300)
+{
+ uint64_t capacity = uint64_t(1024) * 1024 * 1024 * 1024;
+ auto prefill = capacity / 10;
+ auto overwrite = capacity * 3;
+ doOverwriteTest(capacity, prefill, overwrite);
+}
+
+TEST_P(AllocTest, mempoolAccounting)
+{
+ uint64_t bytes = mempool::bluestore_alloc::allocated_bytes();
+ uint64_t items = mempool::bluestore_alloc::allocated_items();
+
+ uint64_t alloc_size = 4 * 1024;
+ uint64_t capacity = 512ll * 1024 * 1024 * 1024;
+ Allocator* alloc = Allocator::create(g_ceph_context, GetParam(),
+ capacity, alloc_size);
+ ASSERT_NE(alloc, nullptr);
+ alloc->init_add_free(0, capacity);
+
+ std::map<uint32_t, PExtentVector> all_allocs;
+ for (size_t i = 0; i < 10000; i++) {
+ PExtentVector tmp;
+ alloc->allocate(alloc_size, alloc_size, 0, 0, &tmp);
+ all_allocs[rand()] = tmp;
+ tmp.clear();
+ alloc->allocate(alloc_size, alloc_size, 0, 0, &tmp);
+ all_allocs[rand()] = tmp;
+ tmp.clear();
+
+ auto it = all_allocs.upper_bound(rand());
+ if (it != all_allocs.end()) {
+ alloc->release(it->second);
+ all_allocs.erase(it);
+ }
+ }
+
+ delete(alloc);
+ ASSERT_EQ(mempool::bluestore_alloc::allocated_bytes(), bytes);
+ ASSERT_EQ(mempool::bluestore_alloc::allocated_items(), items);
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ Allocator,
+ AllocTest,
+ ::testing::Values("stupid", "bitmap", "avl", "btree", "hybrid"));
diff --git a/src/test/objectstore/Allocator_test.cc b/src/test/objectstore/Allocator_test.cc
new file mode 100644
index 000000000..b00650015
--- /dev/null
+++ b/src/test/objectstore/Allocator_test.cc
@@ -0,0 +1,566 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * In memory space allocator test cases.
+ * Author: Ramesh Chander, Ramesh.Chander@sandisk.com
+ */
+#include <iostream>
+#include <boost/scoped_ptr.hpp>
+#include <gtest/gtest.h>
+
+#include "common/Cond.h"
+#include "common/errno.h"
+#include "include/stringify.h"
+#include "include/Context.h"
+#include "os/bluestore/Allocator.h"
+
+using namespace std;
+
+typedef boost::mt11213b gen_type;
+
+class AllocTest : public ::testing::TestWithParam<const char*> {
+
+public:
+ boost::scoped_ptr<Allocator> alloc;
+ AllocTest(): alloc(0) { }
+ void init_alloc(int64_t size, uint64_t min_alloc_size) {
+ std::cout << "Creating alloc type " << string(GetParam()) << " \n";
+ alloc.reset(Allocator::create(g_ceph_context, GetParam(), size,
+ min_alloc_size,
+ 256*1048576, 100*256*1048576ull));
+ }
+
+ void init_close() {
+ alloc.reset(0);
+ }
+};
+
+TEST_P(AllocTest, test_alloc_init)
+{
+ int64_t blocks = 64;
+ init_alloc(blocks, 1);
+ ASSERT_EQ(0U, alloc->get_free());
+ alloc->shutdown();
+ blocks = 1024 * 2 + 16;
+ init_alloc(blocks, 1);
+ ASSERT_EQ(0U, alloc->get_free());
+ alloc->shutdown();
+ blocks = 1024 * 2;
+ init_alloc(blocks, 1);
+ ASSERT_EQ(alloc->get_free(), (uint64_t) 0);
+}
+
+TEST_P(AllocTest, test_init_add_free)
+{
+ int64_t block_size = 1024;
+ int64_t capacity = 4 * 1024 * block_size;
+
+ {
+ init_alloc(capacity, block_size);
+
+ auto free = alloc->get_free();
+ alloc->init_add_free(block_size, 0);
+ ASSERT_EQ(free, alloc->get_free());
+
+ alloc->init_rm_free(block_size, 0);
+ ASSERT_EQ(free, alloc->get_free());
+ }
+}
+
+TEST_P(AllocTest, test_alloc_min_alloc)
+{
+ int64_t block_size = 1024;
+ int64_t capacity = 4 * 1024 * block_size;
+
+ {
+ init_alloc(capacity, block_size);
+
+ alloc->init_add_free(block_size, block_size);
+ PExtentVector extents;
+ EXPECT_EQ(block_size, alloc->allocate(block_size, block_size,
+ 0, (int64_t) 0, &extents));
+ }
+
+ /*
+   * Allocate an extent and make sure it comes back as a single extent.
+ */
+ {
+ init_alloc(capacity, block_size);
+ alloc->init_add_free(0, block_size * 4);
+ PExtentVector extents;
+ EXPECT_EQ(4*block_size,
+ alloc->allocate(4 * (uint64_t)block_size, (uint64_t) block_size,
+ 0, (int64_t) 0, &extents));
+ EXPECT_EQ(1u, extents.size());
+ EXPECT_EQ(extents[0].length, 4 * block_size);
+ }
+
+ /*
+   * Allocate an extent and make sure we get two separate extents.
+ */
+ {
+ init_alloc(capacity, block_size);
+ alloc->init_add_free(0, block_size * 2);
+ alloc->init_add_free(3 * block_size, block_size * 2);
+ PExtentVector extents;
+
+ EXPECT_EQ(4*block_size,
+ alloc->allocate(4 * (uint64_t)block_size, (uint64_t) block_size,
+ 0, (int64_t) 0, &extents));
+ EXPECT_EQ(2u, extents.size());
+ EXPECT_EQ(extents[0].length, 2 * block_size);
+ EXPECT_EQ(extents[1].length, 2 * block_size);
+ }
+ alloc->shutdown();
+}
+
+TEST_P(AllocTest, test_alloc_min_max_alloc)
+{
+ int64_t block_size = 1024;
+
+ int64_t capacity = 4 * 1024 * block_size;
+ init_alloc(capacity, block_size);
+
+ /*
+   * Make sure we get separate extents when
+ * min_alloc_size == max_alloc_size
+ */
+ {
+ init_alloc(capacity, block_size);
+ alloc->init_add_free(0, block_size * 4);
+ PExtentVector extents;
+ EXPECT_EQ(4*block_size,
+ alloc->allocate(4 * (uint64_t)block_size, (uint64_t) block_size,
+ block_size, (int64_t) 0, &extents));
+ for (auto e : extents) {
+ EXPECT_EQ(e.length, block_size);
+ }
+ EXPECT_EQ(4u, extents.size());
+ }
+
+
+ /*
+   * Make sure we get extents of length max_alloc_size
+   * when max_alloc_size > min_alloc_size
+ */
+ {
+ init_alloc(capacity, block_size);
+ alloc->init_add_free(0, block_size * 4);
+ PExtentVector extents;
+ EXPECT_EQ(4*block_size,
+ alloc->allocate(4 * (uint64_t)block_size, (uint64_t) block_size,
+ 2 * block_size, (int64_t) 0, &extents));
+ EXPECT_EQ(2u, extents.size());
+ for (auto& e : extents) {
+ EXPECT_EQ(e.length, block_size * 2);
+ }
+ }
+
+ /*
+ * Make sure allocations are of min_alloc_size when min_alloc_size > block_size.
+ */
+ {
+ init_alloc(capacity, block_size);
+ alloc->init_add_free(0, block_size * 1024);
+ PExtentVector extents;
+ EXPECT_EQ(1024 * block_size,
+ alloc->allocate(1024 * (uint64_t)block_size,
+ (uint64_t) block_size * 4,
+ block_size * 4, (int64_t) 0, &extents));
+ for (auto& e : extents) {
+ EXPECT_EQ(e.length, block_size * 4);
+ }
+ EXPECT_EQ(1024u/4, extents.size());
+ }
+
+ /*
+ * Allocate and free.
+ */
+ {
+ init_alloc(capacity, block_size);
+ alloc->init_add_free(0, block_size * 16);
+ PExtentVector extents;
+ EXPECT_EQ(16 * block_size,
+ alloc->allocate(16 * (uint64_t)block_size, (uint64_t) block_size,
+ 2 * block_size, (int64_t) 0, &extents));
+
+ EXPECT_EQ(extents.size(), 8u);
+ for (auto& e : extents) {
+ EXPECT_EQ(e.length, 2 * block_size);
+ }
+ }
+}
+
+TEST_P(AllocTest, test_alloc_failure)
+{
+ int64_t block_size = 1024;
+ int64_t capacity = 4 * 1024 * block_size;
+
+ {
+ init_alloc(capacity, block_size);
+ alloc->init_add_free(0, block_size * 256);
+ alloc->init_add_free(block_size * 512, block_size * 256);
+
+ PExtentVector extents;
+ EXPECT_EQ(512 * block_size,
+ alloc->allocate(512 * (uint64_t)block_size,
+ (uint64_t) block_size * 256,
+ block_size * 256, (int64_t) 0, &extents));
+ alloc->init_add_free(0, block_size * 256);
+ alloc->init_add_free(block_size * 512, block_size * 256);
+ extents.clear();
+ EXPECT_EQ(-ENOSPC,
+ alloc->allocate(512 * (uint64_t)block_size,
+ (uint64_t) block_size * 512,
+ block_size * 512, (int64_t) 0, &extents));
+ }
+}
+
+TEST_P(AllocTest, test_alloc_big)
+{
+ int64_t block_size = 4096;
+ int64_t blocks = 104857600;
+ int64_t mas = 4096;
+ init_alloc(blocks*block_size, block_size);
+ alloc->init_add_free(2*block_size, (blocks-2)*block_size);
+ for (int64_t big = mas; big < 1048576*128; big*=2) {
+ cout << big << std::endl;
+ PExtentVector extents;
+ EXPECT_EQ(big,
+ alloc->allocate(big, mas, 0, &extents));
+ }
+}
+
+TEST_P(AllocTest, test_alloc_non_aligned_len)
+{
+ int64_t block_size = 1 << 12;
+ int64_t blocks = (1 << 20) * 100;
+ int64_t want_size = 1 << 22;
+ int64_t alloc_unit = 1 << 20;
+
+ init_alloc(blocks*block_size, block_size);
+ alloc->init_add_free(0, 2097152);
+ alloc->init_add_free(2097152, 1064960);
+ alloc->init_add_free(3670016, 2097152);
+
+ PExtentVector extents;
+ EXPECT_EQ(want_size, alloc->allocate(want_size, alloc_unit, 0, &extents));
+}
+
+TEST_P(AllocTest, test_alloc_39334)
+{
+ uint64_t block = 0x4000;
+ uint64_t size = 0x5d00000000;
+
+ init_alloc(size, block);
+ alloc->init_add_free(0x4000, 0x5cffffc000);
+ EXPECT_EQ(size - block, alloc->get_free());
+}
+
+TEST_P(AllocTest, test_alloc_fragmentation)
+{
+ uint64_t capacity = 4 * 1024 * 1024;
+ uint64_t alloc_unit = 4096;
+ uint64_t want_size = alloc_unit;
+ PExtentVector allocated, tmp;
+
+ init_alloc(capacity, alloc_unit);
+ alloc->init_add_free(0, capacity);
+ bool bitmap_alloc = GetParam() == std::string("bitmap");
+
+ EXPECT_EQ(0.0, alloc->get_fragmentation());
+
+ for (size_t i = 0; i < capacity / alloc_unit; ++i)
+ {
+ tmp.clear();
+ EXPECT_EQ(static_cast<int64_t>(want_size),
+ alloc->allocate(want_size, alloc_unit, 0, 0, &tmp));
+ allocated.insert(allocated.end(), tmp.begin(), tmp.end());
+
+    // the bitmap allocator's fragmentation calculation doesn't provide such
+    // a constant estimate
+ if (!bitmap_alloc) {
+ EXPECT_EQ(0.0, alloc->get_fragmentation());
+ }
+ }
+ tmp.clear();
+ EXPECT_EQ(-ENOSPC, alloc->allocate(want_size, alloc_unit, 0, 0, &tmp));
+
+ if (GetParam() == string("avl")) {
+ // AVL allocator uses a different allocating strategy
+ GTEST_SKIP() << "skipping for AVL allocator";
+ } else if (GetParam() == string("hybrid")) {
+    // Hybrid allocator uses a different allocating strategy
+ GTEST_SKIP() << "skipping for Hybrid allocator";
+ }
+
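+  // Free every second 4K extent so free space is maximally interleaved with
+  // allocations; this should be reported as 100% fragmentation.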
+ for (size_t i = 0; i < allocated.size(); i += 2)
+ {
+ interval_set<uint64_t> release_set;
+ release_set.insert(allocated[i].offset, allocated[i].length);
+ alloc->release(release_set);
+ }
+ EXPECT_EQ(1.0, alloc->get_fragmentation());
+ EXPECT_EQ(66u, uint64_t(alloc->get_fragmentation_score() * 100));
+
+ for (size_t i = 1; i < allocated.size() / 2; i += 2)
+ {
+ interval_set<uint64_t> release_set;
+ release_set.insert(allocated[i].offset, allocated[i].length);
+ alloc->release(release_set);
+ }
+ if (bitmap_alloc) {
+ // fragmentation = one l1 slot is free + one l1 slot is partial
+ EXPECT_EQ(50U, uint64_t(alloc->get_fragmentation() * 100));
+ } else {
+ // fragmentation approx = 257 intervals / 768 max intervals
+ EXPECT_EQ(33u, uint64_t(alloc->get_fragmentation() * 100));
+ }
+ EXPECT_EQ(27u, uint64_t(alloc->get_fragmentation_score() * 100));
+
+ for (size_t i = allocated.size() / 2 + 1; i < allocated.size(); i += 2)
+ {
+ interval_set<uint64_t> release_set;
+ release_set.insert(allocated[i].offset, allocated[i].length);
+ alloc->release(release_set);
+ }
+  // The stupid allocator doesn't merge all the extents it releases, which
+  // leaves some minor fragmentation behind (minor bug or by-design
+  // behavior?). Hence we compare only the first two digits after the
+  // decimal point.
+ EXPECT_EQ(0u, uint64_t(alloc->get_fragmentation() * 100));
+ if (bitmap_alloc) {
+ EXPECT_EQ(0u, uint64_t(alloc->get_fragmentation_score() * 100));
+ } else {
+ EXPECT_EQ(11u, uint64_t(alloc->get_fragmentation_score() * 100));
+ }
+}
+
+TEST_P(AllocTest, test_dump_fragmentation_score)
+{
+ uint64_t capacity = 1024 * 1024 * 1024;
+ uint64_t one_alloc_max = 2 * 1024 * 1024;
+ uint64_t alloc_unit = 4096;
+ uint64_t want_size = alloc_unit;
+ uint64_t rounds = 10;
+ uint64_t actions_per_round = 1000;
+ PExtentVector allocated, tmp;
+ gen_type rng;
+
+ init_alloc(capacity, alloc_unit);
+ alloc->init_add_free(0, capacity);
+
+ EXPECT_EQ(0.0, alloc->get_fragmentation());
+ EXPECT_EQ(0.0, alloc->get_fragmentation_score());
+
+ uint64_t allocated_cnt = 0;
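+  // Random walk over occupancy: an action is an allocation with probability
+  // (1 - allocated_cnt/capacity), otherwise a free, so the fuller the
+  // allocator gets, the more likely the next action is a free.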
+ for (size_t round = 0; round < rounds ; round++) {
+ for (size_t j = 0; j < actions_per_round ; j++) {
+ //free or allocate ?
+ if ( rng() % capacity >= allocated_cnt ) {
+ //allocate
+ want_size = ( rng() % one_alloc_max ) / alloc_unit * alloc_unit + alloc_unit;
+ tmp.clear();
+ int64_t r = alloc->allocate(want_size, alloc_unit, 0, 0, &tmp);
+ if (r > 0) {
+ for (auto& t: tmp) {
+ if (t.length > 0)
+ allocated.push_back(t);
+ }
+ allocated_cnt += r;
+ }
+ } else {
+ //free
+ ceph_assert(allocated.size() > 0);
+ size_t item = rng() % allocated.size();
+ ceph_assert(allocated[item].length > 0);
+ allocated_cnt -= allocated[item].length;
+ interval_set<uint64_t> release_set;
+ release_set.insert(allocated[item].offset, allocated[item].length);
+ alloc->release(release_set);
+ std::swap(allocated[item], allocated[allocated.size() - 1]);
+ allocated.resize(allocated.size() - 1);
+ }
+ }
+
+ size_t free_sum = 0;
+ auto iterated_allocation = [&](size_t off, size_t len) {
+ ceph_assert(len > 0);
+ free_sum += len;
+ };
+ alloc->foreach(iterated_allocation);
+ EXPECT_GT(1, alloc->get_fragmentation_score());
+ EXPECT_EQ(capacity, free_sum + allocated_cnt);
+ }
+
+ for (size_t i = 0; i < allocated.size(); i ++)
+ {
+ interval_set<uint64_t> release_set;
+ release_set.insert(allocated[i].offset, allocated[i].length);
+ alloc->release(release_set);
+ }
+}
+
+TEST_P(AllocTest, test_alloc_bug_24598)
+{
+ if (string(GetParam()) != "bitmap")
+ return;
+
+ uint64_t capacity = 0x2625a0000ull;
+ uint64_t alloc_unit = 0x4000;
+ uint64_t want_size = 0x200000;
+ PExtentVector allocated, tmp;
+
+ init_alloc(capacity, alloc_unit);
+
+ alloc->init_add_free(0x4800000, 0x100000);
+ alloc->init_add_free(0x4a00000, 0x100000);
+
+ alloc->init_rm_free(0x4800000, 0x100000);
+ alloc->init_rm_free(0x4a00000, 0x100000);
+
+ alloc->init_add_free(0x3f00000, 0x500000);
+ alloc->init_add_free(0x4500000, 0x100000);
+ alloc->init_add_free(0x4700000, 0x100000);
+ alloc->init_add_free(0x4900000, 0x100000);
+ alloc->init_add_free(0x4b00000, 0x200000);
+
+ EXPECT_EQ(static_cast<int64_t>(want_size),
+ alloc->allocate(want_size, 0x100000, 0, 0, &tmp));
+ EXPECT_EQ(1u, tmp.size());
+ EXPECT_EQ(0x4b00000u, tmp[0].offset);
+ EXPECT_EQ(0x200000u, tmp[0].length);
+}
+
+//Verifies issue from
+//http://tracker.ceph.com/issues/40703
+//
+TEST_P(AllocTest, test_alloc_big2)
+{
+ int64_t block_size = 4096;
+ int64_t blocks = 1048576 * 2;
+ int64_t mas = 1024*1024;
+ init_alloc(blocks*block_size, block_size);
+ alloc->init_add_free(0, blocks * block_size);
+
+ PExtentVector extents;
+ uint64_t need = block_size * blocks / 4; // 2GB
+ EXPECT_EQ(need,
+ alloc->allocate(need, mas, 0, &extents));
+ need = block_size * blocks / 4; // 2GB
+ extents.clear();
+ EXPECT_EQ(need,
+ alloc->allocate(need, mas, 0, &extents));
+ EXPECT_TRUE(extents[0].length > 0);
+}
+
+//Verifies stuck 4GB chunk allocation
+//in StupidAllocator
+//
+TEST_P(AllocTest, test_alloc_big3)
+{
+ int64_t block_size = 4096;
+ int64_t blocks = 1048576 * 2;
+ int64_t mas = 1024*1024;
+ init_alloc(blocks*block_size, block_size);
+ alloc->init_add_free(0, blocks * block_size);
+
+ PExtentVector extents;
+ uint64_t need = block_size * blocks / 2; // 4GB
+ EXPECT_EQ(need,
+ alloc->allocate(need, mas, 0, &extents));
+ EXPECT_TRUE(extents[0].length > 0);
+}
+
+TEST_P(AllocTest, test_alloc_contiguous)
+{
+ int64_t block_size = 0x1000;
+ int64_t capacity = block_size * 1024 * 1024;
+
+ {
+ init_alloc(capacity, block_size);
+
+ alloc->init_add_free(0, capacity);
+ PExtentVector extents;
+ uint64_t need = 4 * block_size;
+ EXPECT_EQ(need,
+ alloc->allocate(need, need,
+ 0, (int64_t)0, &extents));
+ EXPECT_EQ(1u, extents.size());
+ EXPECT_EQ(extents[0].offset, 0);
+ EXPECT_EQ(extents[0].length, 4 * block_size);
+
+ extents.clear();
+ EXPECT_EQ(need,
+ alloc->allocate(need, need,
+ 0, (int64_t)0, &extents));
+ EXPECT_EQ(1u, extents.size());
+ EXPECT_EQ(extents[0].offset, 4 * block_size);
+ EXPECT_EQ(extents[0].length, 4 * block_size);
+ }
+
+ alloc->shutdown();
+}
+
+TEST_P(AllocTest, test_alloc_47883)
+{
+ uint64_t block = 0x1000;
+ uint64_t size = 1599858540544ul;
+
+ init_alloc(size, block);
+
+ alloc->init_add_free(0x1b970000, 0x26000);
+ alloc->init_add_free(0x1747e9d5000, 0x493000);
+ alloc->init_add_free(0x1747ee6a000, 0x196000);
+
+ PExtentVector extents;
+ auto need = 0x3f980000;
+ auto got = alloc->allocate(need, 0x10000, 0, (int64_t)0, &extents);
+ EXPECT_GE(got, 0x630000);
+}
+
+TEST_P(AllocTest, test_alloc_50656_best_fit)
+{
+ uint64_t block = 0x1000;
+ uint64_t size = 0x3b9e400000;
+
+ init_alloc(size, block);
+
+  // too few free extents - forces best-fit mode for AVL-based allocators
+ for (size_t i = 0; i < 0x10; i++) {
+ alloc->init_add_free(i * 2 * 0x100000, 0x100000);
+ }
+
+ alloc->init_add_free(0x1e1bd13000, 0x404000);
+
+ PExtentVector extents;
+ auto need = 0x400000;
+ auto got = alloc->allocate(need, 0x10000, 0, (int64_t)0, &extents);
+ EXPECT_GT(got, 0);
+ EXPECT_EQ(got, 0x400000);
+}
+
+TEST_P(AllocTest, test_alloc_50656_first_fit)
+{
+ uint64_t block = 0x1000;
+ uint64_t size = 0x3b9e400000;
+
+ init_alloc(size, block);
+
+ for (size_t i = 0; i < 0x10000; i += 2) {
+ alloc->init_add_free(i * 0x100000, 0x100000);
+ }
+
+ alloc->init_add_free(0x1e1bd13000, 0x404000);
+
+ PExtentVector extents;
+ auto need = 0x400000;
+ auto got = alloc->allocate(need, 0x10000, 0, (int64_t)0, &extents);
+ EXPECT_GT(got, 0);
+ EXPECT_EQ(got, 0x400000);
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ Allocator,
+ AllocTest,
+ ::testing::Values("stupid", "bitmap", "avl", "hybrid"));
diff --git a/src/test/objectstore/CMakeLists.txt b/src/test/objectstore/CMakeLists.txt
new file mode 100644
index 000000000..a012264e8
--- /dev/null
+++ b/src/test/objectstore/CMakeLists.txt
@@ -0,0 +1,140 @@
+add_executable(ceph_perf_objectstore
+ ObjectStoreTransactionBenchmark.cc)
+target_link_libraries(ceph_perf_objectstore os osdc global ${UNITTEST_LIBS})
+install(TARGETS ceph_perf_objectstore
+ DESTINATION bin)
+
+add_library(store_test_fixture OBJECT store_test_fixture.cc)
+target_include_directories(store_test_fixture PRIVATE
+ $<TARGET_PROPERTY:GTest::GTest,INTERFACE_INCLUDE_DIRECTORIES>)
+
+add_executable(ceph_test_objectstore
+ store_test.cc
+ $<TARGET_OBJECTS:store_test_fixture>)
+target_link_libraries(ceph_test_objectstore
+ os
+ ceph-common
+ ${UNITTEST_LIBS}
+ global
+ ${EXTRALIBS}
+ ${BLKID_LIBRARIES}
+ ${CMAKE_DL_LIBS}
+ )
+install(TARGETS ceph_test_objectstore
+ DESTINATION ${CMAKE_INSTALL_BINDIR})
+
+add_executable(ceph_test_keyvaluedb
+ test_kv.cc)
+target_link_libraries(ceph_test_keyvaluedb
+ os
+ ceph-common
+ ${UNITTEST_LIBS}
+ global
+ ${EXTRALIBS}
+ ${BLKID_LIBRARIES}
+ ${CMAKE_DL_LIBS}
+ )
+install(TARGETS ceph_test_keyvaluedb
+ DESTINATION ${CMAKE_INSTALL_BINDIR})
+
+# unittest_rocksdb_option
+add_executable(unittest_rocksdb_option
+ TestRocksdbOptionParse.cc
+ $<TARGET_OBJECTS:unit-main>
+ )
+add_ceph_unittest(unittest_rocksdb_option)
+target_link_libraries(unittest_rocksdb_option global os ${BLKID_LIBRARIES})
+
+if(WITH_EVENTTRACE)
+ add_dependencies(os eventtrace_tp)
+endif()
+
+if(WITH_BLUESTORE)
+
+ add_executable(unittest_alloc
+ Allocator_test.cc
+ $<TARGET_OBJECTS:unit-main>
+ )
+ add_ceph_unittest(unittest_alloc)
+ target_link_libraries(unittest_alloc os global)
+
+ add_executable(unittest_alloc_bench
+ Allocator_bench.cc
+ $<TARGET_OBJECTS:unit-main>
+ )
+ target_link_libraries(unittest_alloc_bench ${UNITTEST_LIBS} os global)
+
+ add_executable(unittest_fastbmap_allocator
+ fastbmap_allocator_test.cc
+ $<TARGET_OBJECTS:unit-main>
+ )
+ add_ceph_unittest(unittest_fastbmap_allocator)
+ target_link_libraries(unittest_fastbmap_allocator os global)
+
+ set_target_properties(unittest_fastbmap_allocator PROPERTIES COMPILE_FLAGS
+ "${UNITTEST_CXX_FLAGS}")
+
+ add_executable(unittest_hybrid_allocator
+ hybrid_allocator_test.cc
+ $<TARGET_OBJECTS:unit-main>
+ )
+ add_ceph_unittest(unittest_hybrid_allocator)
+ target_link_libraries(unittest_hybrid_allocator os global)
+
+ set_target_properties(unittest_hybrid_allocator PROPERTIES COMPILE_FLAGS
+ "${UNITTEST_CXX_FLAGS}")
+
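+  # EXCLUDE_FROM_ALL: the aging benchmark is long-running, so it is not part
+  # of the default build or the regular unit tests; build it explicitly.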
+ add_executable(unittest_alloc_aging EXCLUDE_FROM_ALL
+ Allocator_aging_fragmentation.cc)
+ target_link_libraries(unittest_alloc_aging os global GTest::Main)
+
+ # unittest_bluefs
+ add_executable(unittest_bluefs
+ test_bluefs.cc
+ )
+ add_ceph_unittest(unittest_bluefs)
+ target_link_libraries(unittest_bluefs os global)
+
+ # unittest_bluestore_types
+ add_executable(unittest_bluestore_types
+ test_bluestore_types.cc
+ )
+ add_ceph_unittest(unittest_bluestore_types)
+ target_link_libraries(unittest_bluestore_types os global)
+
+ # unittest_bdev
+ add_executable(unittest_bdev
+ test_bdev.cc
+ )
+ add_ceph_unittest(unittest_bdev)
+ target_link_libraries(unittest_bdev os global)
+
+ # unittest_deferred
+ add_executable(unittest_deferred
+ test_deferred.cc
+ )
+ add_ceph_unittest(unittest_deferred)
+ target_link_libraries(unittest_deferred os global)
+
+endif(WITH_BLUESTORE)
+
+# unittest_transaction
+add_executable(unittest_transaction
+ test_transaction.cc)
+add_ceph_unittest(unittest_transaction)
+target_link_libraries(unittest_transaction os ceph-common)
+
+# unittest_memstore_clone
+add_executable(unittest_memstore_clone
+ test_memstore_clone.cc
+ $<TARGET_OBJECTS:store_test_fixture>)
+add_ceph_unittest(unittest_memstore_clone)
+target_link_libraries(unittest_memstore_clone os global)
+
+if(WITH_BLUESTORE)
+ add_executable(ceph_test_alloc_replay
+ allocator_replay_test.cc)
+ target_link_libraries(ceph_test_alloc_replay os global ${UNITTEST_LIBS})
+ install(TARGETS ceph_test_alloc_replay
+ DESTINATION bin)
+endif()
diff --git a/src/test/objectstore/ObjectStoreTransactionBenchmark.cc b/src/test/objectstore/ObjectStoreTransactionBenchmark.cc
new file mode 100644
index 000000000..e2ce3b2ef
--- /dev/null
+++ b/src/test/objectstore/ObjectStoreTransactionBenchmark.cc
@@ -0,0 +1,266 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <string>
+#include <iostream>
+
+using namespace std;
+
+#include "common/ceph_argparse.h"
+#include "common/debug.h"
+#include "common/Cycles.h"
+#include "global/global_init.h"
+#include "os/ObjectStore.h"
+
+class Transaction {
+ private:
+ ObjectStore::Transaction t;
+
+ public:
+ struct Tick {
+ uint64_t ticks;
+ uint64_t count;
+ Tick(): ticks(0), count(0) {}
+ void add(uint64_t a) {
+ ticks += a;
+ count++;
+ }
+ };
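+  // Each wrapped transaction op below is timed with raw TSC reads
+  // (Cycles::rdtsc) and accumulated into its corresponding Tick counter.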
+ static Tick write_ticks, setattr_ticks, omap_setkeys_ticks, omap_rmkey_ticks;
+ static Tick encode_ticks, decode_ticks, iterate_ticks;
+
+ void write(coll_t cid, const ghobject_t& oid, uint64_t off, uint64_t len,
+ const bufferlist& data) {
+ uint64_t start_time = Cycles::rdtsc();
+ t.write(cid, oid, off, len, data);
+ write_ticks.add(Cycles::rdtsc() - start_time);
+ }
+ void setattr(coll_t cid, const ghobject_t& oid, const string &name,
+ bufferlist& val) {
+ uint64_t start_time = Cycles::rdtsc();
+ t.setattr(cid, oid, name, val);
+ setattr_ticks.add(Cycles::rdtsc() - start_time);
+ }
+ void omap_setkeys(coll_t cid, const ghobject_t &oid,
+ const map<string, bufferlist> &attrset) {
+
+ uint64_t start_time = Cycles::rdtsc();
+ t.omap_setkeys(cid, oid, attrset);
+ omap_setkeys_ticks.add(Cycles::rdtsc() - start_time);
+ }
+ void omap_rmkey(coll_t cid, const ghobject_t &oid,
+ const string &key) {
+ uint64_t start_time = Cycles::rdtsc();
+ t.omap_rmkey(cid, oid, key);
+ omap_rmkey_ticks.add(Cycles::rdtsc() - start_time);
+ }
+
+ void apply_encode_decode() {
+ bufferlist bl;
+ ObjectStore::Transaction d;
+ uint64_t start_time = Cycles::rdtsc();
+ t.encode(bl);
+ encode_ticks.add(Cycles::rdtsc() - start_time);
+
+ auto bliter = bl.cbegin();
+ start_time = Cycles::rdtsc();
+ d.decode(bliter);
+ decode_ticks.add(Cycles::rdtsc() - start_time);
+ }
+
+ void apply_iterate() {
+ uint64_t start_time = Cycles::rdtsc();
+ ObjectStore::Transaction::iterator i = t.begin();
+ while (i.have_op()) {
+ ObjectStore::Transaction::Op *op = i.decode_op();
+
+ switch (op->op) {
+ case ObjectStore::Transaction::OP_WRITE:
+ {
+ ghobject_t oid = i.get_oid(op->oid);
+ bufferlist bl;
+ i.decode_bl(bl);
+ }
+ break;
+ case ObjectStore::Transaction::OP_SETATTR:
+ {
+ ghobject_t oid = i.get_oid(op->oid);
+ string name = i.decode_string();
+ bufferlist bl;
+ i.decode_bl(bl);
+ map<string, bufferptr> to_set;
+ to_set[name] = bufferptr(bl.c_str(), bl.length());
+ }
+ break;
+ case ObjectStore::Transaction::OP_OMAP_SETKEYS:
+ {
+ ghobject_t oid = i.get_oid(op->oid);
+ map<string, bufferptr> aset;
+ i.decode_attrset(aset);
+ }
+ break;
+ case ObjectStore::Transaction::OP_OMAP_RMKEYS:
+ {
+ ghobject_t oid = i.get_oid(op->oid);
+ set<string> keys;
+ i.decode_keyset(keys);
+ }
+ break;
+ }
+ }
+ iterate_ticks.add(Cycles::rdtsc() - start_time);
+ }
+
+ static void dump_stat() {
+ cerr << " write op: " << Cycles::to_microseconds(write_ticks.ticks) << "us count: " << write_ticks.count << std::endl;
+ cerr << " setattr op: " << Cycles::to_microseconds(setattr_ticks.ticks) << "us count: " << setattr_ticks.count << std::endl;
+ cerr << " omap_setkeys op: " << Cycles::to_microseconds(Transaction::omap_setkeys_ticks.ticks) << "us count: " << Transaction::omap_setkeys_ticks.count << std::endl;
+ cerr << " omap_rmkey op: " << Cycles::to_microseconds(Transaction::omap_rmkey_ticks.ticks) << "us count: " << Transaction::omap_rmkey_ticks.count << std::endl;
+ cerr << " encode op: " << Cycles::to_microseconds(Transaction::encode_ticks.ticks) << "us count: " << Transaction::encode_ticks.count << std::endl;
+ cerr << " decode op: " << Cycles::to_microseconds(Transaction::decode_ticks.ticks) << "us count: " << Transaction::decode_ticks.count << std::endl;
+ cerr << " iterate op: " << Cycles::to_microseconds(Transaction::iterate_ticks.ticks) << "us count: " << Transaction::iterate_ticks.count << std::endl;
+ }
+};
+
+class PerfCase {
+ static const uint64_t Kib = 1024;
+ static const uint64_t Mib = 1024 * 1024;
+ static const string info_epoch_attr;
+ static const string info_info_attr;
+ static const string attr;
+ static const string snapset_attr;
+ static const string pglog_attr;
+ static const coll_t meta_cid;
+ static const coll_t cid;
+ static const ghobject_t pglog_oid;
+ static const ghobject_t info_oid;
+ map<string, bufferlist> data;
+
+ ghobject_t create_object() {
+ bufferlist bl = generate_random(100, 1);
+ return ghobject_t(hobject_t(string("obj_")+string(bl.c_str()), string(), rand() & 2 ? CEPH_NOSNAP : rand(), rand() & 0xFF, 0, ""));
+ }
+
+
+ bufferlist generate_random(uint64_t len, int frag) {
+ static const char alphanum[] = "0123456789"
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+ "abcdefghijklmnopqrstuvwxyz";
+ uint64_t per_frag = len / frag;
+ bufferlist bl;
+ for (int i = 0; i < frag; i++ ) {
+ bufferptr bp(per_frag);
+      for (unsigned int j = 0; j < per_frag; j++) {
+ bp[j] = alphanum[rand() % (sizeof(alphanum) - 1)];
+ }
+ bl.append(bp);
+ }
+ return bl;
+ }
+ public:
+ PerfCase() {
+ uint64_t four_kb = Kib * 4;
+ uint64_t one_mb = Mib * 1;
+ uint64_t four_mb = Mib * 4;
+ data["4k"] = generate_random(four_kb, 1);
+ data["1m"] = generate_random(one_mb, 1);
+ data["4m"] = generate_random(four_mb, 1);
+ data[attr] = generate_random(256, 1);
+ data[snapset_attr] = generate_random(32, 1);
+ data[pglog_attr] = generate_random(128, 1);
+ data[info_epoch_attr] = generate_random(4, 1);
+ data[info_info_attr] = generate_random(560, 1);
+ }
+
+ uint64_t rados_write_4k(int times) {
+ uint64_t ticks = 0;
+ uint64_t len = Kib *4;
+ for (int i = 0; i < times; i++) {
+ uint64_t start_time = 0;
+ {
+ Transaction t;
+ ghobject_t oid = create_object();
+ start_time = Cycles::rdtsc();
+ t.write(cid, oid, 0, len, data["4k"]);
+ t.setattr(cid, oid, attr, data[attr]);
+ t.setattr(cid, oid, snapset_attr, data[snapset_attr]);
+ t.apply_encode_decode();
+ t.apply_iterate();
+ ticks += Cycles::rdtsc() - start_time;
+ }
+ {
+ Transaction t;
+ map<string, bufferlist> pglog_attrset;
+ map<string, bufferlist> info_attrset;
+ pglog_attrset[pglog_attr] = data[pglog_attr];
+ info_attrset[info_epoch_attr] = data[info_epoch_attr];
+ info_attrset[info_info_attr] = data[info_info_attr];
+ start_time = Cycles::rdtsc();
+ t.omap_setkeys(meta_cid, pglog_oid, pglog_attrset);
+ t.omap_setkeys(meta_cid, info_oid, info_attrset);
+ t.omap_rmkey(meta_cid, pglog_oid, pglog_attr);
+ t.apply_encode_decode();
+ t.apply_iterate();
+ ticks += Cycles::rdtsc() - start_time;
+ }
+ }
+ return ticks;
+ }
+};
+const string PerfCase::info_epoch_attr("11.40_epoch");
+const string PerfCase::info_info_attr("11.40_info");
+const string PerfCase::attr("_");
+const string PerfCase::snapset_attr("snapset");
+const string PerfCase::pglog_attr("pglog_attr");
+const coll_t PerfCase::meta_cid;
+const coll_t PerfCase::cid;
+const ghobject_t PerfCase::pglog_oid(hobject_t(sobject_t(object_t("cid_pglog"), 0)));
+const ghobject_t PerfCase::info_oid(hobject_t(sobject_t(object_t("infos"), 0)));
+Transaction::Tick Transaction::write_ticks, Transaction::setattr_ticks, Transaction::omap_setkeys_ticks, Transaction::omap_rmkey_ticks;
+Transaction::Tick Transaction::encode_ticks, Transaction::decode_ticks, Transaction::iterate_ticks;
+
+void usage(const string &name) {
+ cerr << "Usage: " << name << " [times] "
+ << std::endl;
+}
+
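+// Usage sketch: "ceph_perf_objectstore 10000" (binary name as per this
+// change's CMakeLists.txt) runs 10000 simulated 4K rados write transactions
+// and prints the accumulated per-op timings to stderr.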
+int main(int argc, char **argv)
+{
+ auto args = argv_to_vec(argc, argv);
+
+ auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT,
+ CODE_ENVIRONMENT_UTILITY,
+ CINIT_FLAG_NO_DEFAULT_CONFIG_FILE);
+ common_init_finish(g_ceph_context);
+ g_ceph_context->_conf.apply_changes(nullptr);
+ Cycles::init();
+
+ cerr << "args: " << args << std::endl;
+ if (args.size() < 1) {
+ usage(argv[0]);
+ return 1;
+ }
+
+ uint64_t times = atoi(args[0]);
+ PerfCase c;
+ uint64_t ticks = c.rados_write_4k(times);
+ Transaction::dump_stat();
+ cerr << " Total rados op " << times << " run time " << Cycles::to_microseconds(ticks) << "us." << std::endl;
+
+ return 0;
+}
diff --git a/src/test/objectstore/TestObjectStoreState.cc b/src/test/objectstore/TestObjectStoreState.cc
new file mode 100644
index 000000000..f4ccef4f0
--- /dev/null
+++ b/src/test/objectstore/TestObjectStoreState.cc
@@ -0,0 +1,299 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+* Ceph - scalable distributed file system
+*
+* Copyright (C) 2012 New Dream Network
+*
+* This is free software; you can redistribute it and/or
+* modify it under the terms of the GNU Lesser General Public
+* License version 2.1, as published by the Free Software
+* Foundation. See file COPYING.
+*/
+#include <stdio.h>
+#include <string.h>
+#include <iostream>
+#include <time.h>
+#include <stdlib.h>
+#include <signal.h>
+#include "os/ObjectStore.h"
+#include "common/ceph_argparse.h"
+#include "global/global_init.h"
+#include "common/debug.h"
+#include <boost/scoped_ptr.hpp>
+#include <boost/lexical_cast.hpp>
+#include "TestObjectStoreState.h"
+#include "include/ceph_assert.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_filestore
+#undef dout_prefix
+#define dout_prefix *_dout << "ceph_test_objectstore_state "
+
+using namespace std;
+
+void TestObjectStoreState::init(int colls, int objs)
+{
+ dout(5) << "init " << colls << " colls " << objs << " objs" << dendl;
+
+ ObjectStore::Transaction t;
+ auto meta_ch = m_store->create_new_collection(coll_t::meta());
+ t.create_collection(coll_t::meta(), 0);
+ m_store->queue_transaction(meta_ch, std::move(t));
+
+ wait_for_ready();
+
+ int baseid = 0;
+ for (int i = 0; i < colls; i++) {
+ spg_t pgid(pg_t(i, 1), shard_id_t::NO_SHARD);
+ coll_t cid(pgid);
+ auto ch = m_store->create_new_collection(cid);
+ coll_entry_t *entry = coll_create(pgid, ch);
+ dout(5) << "init create collection " << entry->m_cid
+ << " meta " << entry->m_meta_obj << dendl;
+
+ ObjectStore::Transaction *t = new ObjectStore::Transaction;
+ t->create_collection(entry->m_cid, 32);
+ bufferlist hint;
+ uint32_t pg_num = colls;
+ uint64_t num_objs = uint64_t(objs / colls);
+ encode(pg_num, hint);
+ encode(num_objs, hint);
+ t->collection_hint(entry->m_cid, ObjectStore::Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS, hint);
+ dout(5) << "give collection hint, number of objects per collection: " << num_objs << dendl;
+ t->touch(cid, entry->m_meta_obj);
+
+ for (int i = 0; i < objs; i++) {
+ hobject_t *obj = entry->touch_obj(i + baseid);
+ t->touch(entry->m_cid, ghobject_t(*obj));
+ ceph_assert(i + baseid == m_num_objects);
+ m_num_objects++;
+ }
+ baseid += objs;
+
+ t->register_on_commit(new C_OnFinished(this));
+ m_store->queue_transaction(entry->m_ch, std::move(*t), nullptr);
+
+ delete t;
+ inc_in_flight();
+
+ m_collections.insert(make_pair(cid, entry));
+ rebuild_id_vec();
+ m_next_coll_nr++;
+ }
+ dout(5) << "init has " << m_in_flight.load() << "in-flight transactions" << dendl;
+ wait_for_done();
+ dout(5) << "init finished" << dendl;
+}
+
+TestObjectStoreState::coll_entry_t *TestObjectStoreState::coll_create(
+ spg_t pgid, ObjectStore::CollectionHandle ch)
+{
+ char meta_buf[100];
+ memset(meta_buf, 0, 100);
+ snprintf(meta_buf, 100, "pglog_0_head");
+ return (new coll_entry_t(pgid, ch, meta_buf));
+}
+
+TestObjectStoreState::coll_entry_t*
+TestObjectStoreState::get_coll(coll_t cid, bool erase)
+{
+ dout(5) << "get_coll id " << cid << dendl;
+
+ coll_entry_t *entry = NULL;
+ auto it = m_collections.find(cid);
+ if (it != m_collections.end()) {
+ entry = it->second;
+ if (erase) {
+ m_collections.erase(it);
+ rebuild_id_vec();
+ }
+ }
+
+ dout(5) << "get_coll id " << cid;
+ if (!entry)
+ *_dout << " non-existent";
+ else
+ *_dout << " name " << entry->m_cid;
+ *_dout << dendl;
+ return entry;
+}
+
+TestObjectStoreState::coll_entry_t*
+TestObjectStoreState::get_coll_at(int pos, bool erase)
+{
+ dout(5) << "get_coll_at pos " << pos << dendl;
+
+ if (m_collections.empty())
+ return NULL;
+
+ ceph_assert((size_t) pos < m_collections_ids.size());
+
+ coll_t cid = m_collections_ids[pos];
+ auto it = m_collections.find(cid);
+
+ if (it == m_collections.end()) {
+ dout(5) << "get_coll_at pos " << pos << " non-existent" << dendl;
+ return NULL;
+ }
+ coll_entry_t *entry = it->second;
+
+ if (erase) {
+ m_collections.erase(cid);
+ rebuild_id_vec();
+ }
+
+ dout(5) << "get_coll_at pos " << pos << ": "
+ << entry->m_cid << "(removed: " << erase << ")" << dendl;
+
+ return entry;
+}
+
+TestObjectStoreState::coll_entry_t::~coll_entry_t()
+{
+ for (auto& it : m_objects) {
+ delete it.second;
+ }
+ m_objects.clear();
+}
+
+bool TestObjectStoreState::coll_entry_t::check_for_obj(int id)
+{
+ return m_objects.count(id) > 0;
+}
+
+hobject_t *TestObjectStoreState::coll_entry_t::touch_obj(int id)
+{
+ map<int, hobject_t*>::iterator it = m_objects.find(id);
+ if (it != m_objects.end()) {
+ dout(5) << "touch_obj coll id " << m_cid
+ << " name " << it->second->oid.name << dendl;
+ return it->second;
+ }
+
+ char buf[100];
+ memset(buf, 0, 100);
+ snprintf(buf, 100, "obj%d", id);
+
+ hobject_t *obj = new hobject_t(sobject_t(object_t(buf), CEPH_NOSNAP));
+ obj->set_hash(m_pgid.ps());
+ obj->pool = m_pgid.pool();
+ m_objects.insert(make_pair(id, obj));
+
+ dout(5) << "touch_obj coll id " << m_cid << " name " << buf << dendl;
+ return obj;
+}
+
+hobject_t *TestObjectStoreState::coll_entry_t::get_obj(int id)
+{
+ return get_obj(id, false);
+}
+
+/**
+ * remove_obj - Removes object without freeing it.
+ * @param id Object's id in the map.
+ * @return The object or NULL in case of error.
+ */
+hobject_t *TestObjectStoreState::coll_entry_t::remove_obj(int id)
+{
+ return get_obj(id, true);
+}
+
+hobject_t *TestObjectStoreState::coll_entry_t::get_obj(int id, bool remove)
+{
+ map<int, hobject_t*>::iterator it = m_objects.find(id);
+ if (it == m_objects.end()) {
+ dout(5) << "get_obj coll " << m_cid
+ << " obj #" << id << " non-existent" << dendl;
+ return NULL;
+ }
+
+ hobject_t *obj = it->second;
+ if (remove)
+ m_objects.erase(it);
+
+ dout(5) << "get_obj coll " << m_cid << " id " << id
+ << ": " << obj->oid.name << "(removed: " << remove << ")" << dendl;
+
+ return obj;
+}
+
+hobject_t *TestObjectStoreState::coll_entry_t::get_obj_at(int pos, int *key)
+{
+ return get_obj_at(pos, false, key);
+}
+
+/**
+ * remove_obj_at - Removes object without freeing it.
+ * @param pos The map's position in which the object lies.
+ * @return The object or NULL in case of error.
+ */
+hobject_t *TestObjectStoreState::coll_entry_t::remove_obj_at(int pos, int *key)
+{
+ return get_obj_at(pos, true, key);
+}
+
+hobject_t *TestObjectStoreState::coll_entry_t::get_obj_at(int pos,
+ bool remove, int *key)
+{
+ if (m_objects.empty()) {
+ dout(5) << "get_obj_at coll " << m_cid << " pos " << pos
+ << " in an empty collection" << dendl;
+ return NULL;
+ }
+
+ hobject_t *ret = NULL;
+ map<int, hobject_t*>::iterator it = m_objects.begin();
+ for (int i = 0; it != m_objects.end(); ++it, i++) {
+ if (i == pos) {
+ ret = it->second;
+ break;
+ }
+ }
+
+ if (ret == NULL) {
+ dout(5) << "get_obj_at coll " << m_cid << " pos " << pos
+ << " non-existent" << dendl;
+ return NULL;
+ }
+
+ if (key != NULL)
+ *key = it->first;
+
+ if (remove)
+ m_objects.erase(it);
+
+ dout(5) << "get_obj_at coll id " << m_cid << " pos " << pos
+ << ": " << ret->oid.name << "(removed: " << remove << ")" << dendl;
+
+ return ret;
+}
+
+hobject_t*
+TestObjectStoreState::coll_entry_t::replace_obj(int id, hobject_t *obj) {
+ hobject_t *old_obj = remove_obj(id);
+ m_objects.insert(make_pair(id, obj));
+ return old_obj;
+}
+
+int TestObjectStoreState::coll_entry_t::get_random_obj_id(rngen_t& gen)
+{
+ ceph_assert(!m_objects.empty());
+
+ boost::uniform_int<> orig_obj_rng(0, m_objects.size()-1);
+ int pos = orig_obj_rng(gen);
+ map<int, hobject_t*>::iterator it = m_objects.begin();
+ for (int i = 0; it != m_objects.end(); ++it, i++) {
+ if (i == pos) {
+ return it->first;
+ }
+ }
+ ceph_abort_msg("INTERNAL ERROR");
+}
diff --git a/src/test/objectstore/TestObjectStoreState.h b/src/test/objectstore/TestObjectStoreState.h
new file mode 100644
index 000000000..d1e31bd8a
--- /dev/null
+++ b/src/test/objectstore/TestObjectStoreState.h
@@ -0,0 +1,158 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+* Ceph - scalable distributed file system
+*
+* Copyright (C) 2012 New Dream Network
+*
+* This is free software; you can redistribute it and/or
+* modify it under the terms of the GNU Lesser General Public
+* License version 2.1, as published by the Free Software
+* Foundation. See file COPYING.
+*/
+#ifndef TEST_OBJECTSTORE_STATE_H_
+#define TEST_OBJECTSTORE_STATE_H_
+
+#include <boost/scoped_ptr.hpp>
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_int.hpp>
+#include <map>
+#include <vector>
+
+#include "os/ObjectStore.h"
+#include "common/Cond.h"
+
+typedef boost::mt11213b rngen_t;
+
+class TestObjectStoreState {
+public:
+ struct coll_entry_t {
+ spg_t m_pgid;
+ coll_t m_cid;
+ ghobject_t m_meta_obj;
+ ObjectStore::CollectionHandle m_ch;
+ std::map<int, hobject_t*> m_objects;
+ int m_next_object_id;
+
+ coll_entry_t(spg_t pgid, ObjectStore::CollectionHandle& ch,
+ char *meta_obj_buf)
+ : m_pgid(pgid),
+ m_cid(m_pgid),
+ m_meta_obj(hobject_t(sobject_t(object_t(meta_obj_buf), CEPH_NOSNAP))),
+ m_ch(ch),
+ m_next_object_id(0) {
+ m_meta_obj.hobj.pool = m_pgid.pool();
+ m_meta_obj.hobj.set_hash(m_pgid.ps());
+ }
+ ~coll_entry_t();
+
+ hobject_t *touch_obj(int id);
+ bool check_for_obj(int id);
+ hobject_t *get_obj(int id);
+ hobject_t *remove_obj(int id);
+ hobject_t *get_obj_at(int pos, int *key = NULL);
+ hobject_t *remove_obj_at(int pos, int *key = NULL);
+ hobject_t *replace_obj(int id, hobject_t *obj);
+ int get_random_obj_id(rngen_t& gen);
+
+ private:
+ hobject_t *get_obj(int id, bool remove);
+ hobject_t *get_obj_at(int pos, bool remove, int *key = NULL);
+ };
+
+ protected:
+ boost::shared_ptr<ObjectStore> m_store;
+ std::map<coll_t, coll_entry_t*> m_collections;
+ std::vector<coll_t> m_collections_ids;
+ int m_next_coll_nr;
+ int m_num_objs_per_coll;
+ int m_num_objects;
+
+ int m_max_in_flight;
+ std::atomic<int> m_in_flight = { 0 };
+ ceph::mutex m_finished_lock = ceph::make_mutex("Finished Lock");
+ ceph::condition_variable m_finished_cond;
+
+ void rebuild_id_vec() {
+ m_collections_ids.clear();
+ m_collections_ids.reserve(m_collections.size());
+ for (auto& i : m_collections) {
+ m_collections_ids.push_back(i.first);
+ }
+ }
+
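+ // Throttling protocol: callers bump m_in_flight (inc_in_flight) before
+ // queueing a transaction and register a C_OnFinished completion, whose
+ // finish() decrements the counter and signals m_finished_cond.
+ // wait_for_ready() gates new submissions on m_max_in_flight;
+ // wait_for_done() drains them all.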
+ void wait_for_ready() {
+ std::unique_lock locker{m_finished_lock};
+ m_finished_cond.wait(locker, [this] {
+ return m_max_in_flight <= 0 || m_in_flight < m_max_in_flight;
+ });
+ }
+
+ void wait_for_done() {
+ std::unique_lock locker{m_finished_lock};
+ m_finished_cond.wait(locker, [this] { return m_in_flight == 0; });
+ }
+
+ void set_max_in_flight(int max) {
+ m_max_in_flight = max;
+ }
+ void set_num_objs_per_coll(int val) {
+ m_num_objs_per_coll = val;
+ }
+
+ coll_entry_t *get_coll(coll_t cid, bool erase = false);
+ coll_entry_t *get_coll_at(int pos, bool erase = false);
+ int get_next_pool_id() { return m_next_pool++; }
+
+ private:
+ static const int m_default_num_colls = 30;
+ // The pool ID used for collection creation; ID 0 is reserved for other tests
+ int m_next_pool;
+
+ public:
+ explicit TestObjectStoreState(ObjectStore *store) :
+ m_next_coll_nr(0), m_num_objs_per_coll(10), m_num_objects(0),
+ m_max_in_flight(0), m_next_pool(2) {
+ m_store.reset(store);
+ }
+ ~TestObjectStoreState() {
+ auto it = m_collections.begin();
+ while (it != m_collections.end()) {
+ if (it->second)
+ delete it->second;
+ m_collections.erase(it++);
+ }
+ }
+
+ void init(int colls, int objs);
+ void init() {
+ init(m_default_num_colls, 0);
+ }
+
+ int inc_in_flight() {
+ return ++m_in_flight;
+ }
+
+ int dec_in_flight() {
+ return --m_in_flight;
+ }
+
+ coll_entry_t *coll_create(spg_t pgid, ObjectStore::CollectionHandle ch);
+
+ class C_OnFinished: public Context {
+ protected:
+ TestObjectStoreState *m_state;
+
+ public:
+ explicit C_OnFinished(TestObjectStoreState *state) : m_state(state) { }
+
+ void finish(int r) override {
+ std::lock_guard locker{m_state->m_finished_lock};
+ m_state->dec_in_flight();
+ m_state->m_finished_cond.notify_all();
+ }
+ };
+};
+
+#endif /* TEST_OBJECTSTORE_STATE_H_ */
diff --git a/src/test/objectstore/TestRocksdbOptionParse.cc b/src/test/objectstore/TestRocksdbOptionParse.cc
new file mode 100644
index 000000000..c34ea6bc2
--- /dev/null
+++ b/src/test/objectstore/TestRocksdbOptionParse.cc
@@ -0,0 +1,78 @@
+#include <gtest/gtest.h>
+#include "include/Context.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/thread_status.h"
+#include "kv/RocksDBStore.h"
+#include <iostream>
+
+using namespace std;
+
+const string dir("rocksdb.test_temp_dir");
+
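+// These tests exercise RocksDBStore::ParseOptionsFromString, which maps a
+// semicolon-separated "key=value" string (the format of Ceph's rocksdb
+// options strings) onto a rocksdb::Options instance, including nested
+// structures such as compaction_options_universal.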
+TEST(RocksDBOption, simple) {
+ rocksdb::Options options;
+ rocksdb::Status status;
+ map<string,string> kvoptions;
+ RocksDBStore *db = new RocksDBStore(g_ceph_context, dir, kvoptions, NULL);
+ string options_string = ""
+ "write_buffer_size=536870912;"
+ "create_if_missing=true;"
+ "max_write_buffer_number=4;"
+ "max_background_compactions=4;"
+ "stats_dump_period_sec = 5;"
+ "min_write_buffer_number_to_merge = 2;"
+ "level0_file_num_compaction_trigger = 4;"
+ "max_bytes_for_level_base = 104857600;"
+ "target_file_size_base = 10485760;"
+ "num_levels = 3;"
+ "compression = kNoCompression;"
+ "compaction_options_universal = {min_merge_width=4;size_ratio=2;max_size_amplification_percent=500}";
+ int r = db->ParseOptionsFromString(options_string, options);
+ ASSERT_EQ(0, r);
+ ASSERT_EQ(536870912u, options.write_buffer_size);
+ ASSERT_EQ(4, options.max_write_buffer_number);
+ ASSERT_EQ(4, options.max_background_compactions);
+ ASSERT_EQ(5u, options.stats_dump_period_sec);
+ ASSERT_EQ(2, options.min_write_buffer_number_to_merge);
+ ASSERT_EQ(4, options.level0_file_num_compaction_trigger);
+ ASSERT_EQ(104857600u, options.max_bytes_for_level_base);
+ ASSERT_EQ(10485760u, options.target_file_size_base);
+ ASSERT_EQ(3, options.num_levels);
+ ASSERT_EQ(rocksdb::kNoCompression, options.compression);
+ ASSERT_EQ(2, options.compaction_options_universal.size_ratio);
+ ASSERT_EQ(4, options.compaction_options_universal.min_merge_width);
+ ASSERT_EQ(500, options.compaction_options_universal.max_size_amplification_percent);
+ delete db;
+}
+TEST(RocksDBOption, interpret) {
+ rocksdb::Options options;
+ rocksdb::Status status;
+ map<string,string> kvoptions;
+ RocksDBStore *db = new RocksDBStore(g_ceph_context, dir, kvoptions, NULL);
+ string options_string = "compact_on_mount = true; compaction_threads=10;flusher_threads=5;";
+
+ int r = db->ParseOptionsFromString(options_string, options);
+ ASSERT_EQ(0, r);
+ ASSERT_TRUE(db->compact_on_mount);
+ // check thread pool settings
+ options.env->SleepForMicroseconds(100000);
+ std::vector<rocksdb::ThreadStatus> thread_list;
+ status = options.env->GetThreadList(&thread_list);
+ ASSERT_TRUE(status.ok());
+
+ int num_high_pri_threads = 0;
+ int num_low_pri_threads = 0;
+ for (vector<rocksdb::ThreadStatus>::iterator it = thread_list.begin();
+ it!= thread_list.end();
+ ++it) {
+ if (it->thread_type == rocksdb::ThreadStatus::HIGH_PRIORITY)
+ num_high_pri_threads++;
+ if (it->thread_type == rocksdb::ThreadStatus::LOW_PRIORITY)
+ num_low_pri_threads++;
+ }
+ ASSERT_EQ(15u, thread_list.size());
+ // low-priority threads correspond to compaction_threads
+ ASSERT_EQ(10, num_low_pri_threads);
+ // high-priority threads correspond to flusher_threads
+ ASSERT_EQ(5, num_high_pri_threads);
+ delete db;
+}
diff --git a/src/test/objectstore/allocator_replay_test.cc b/src/test/objectstore/allocator_replay_test.cc
new file mode 100644
index 000000000..18959a56c
--- /dev/null
+++ b/src/test/objectstore/allocator_replay_test.cc
@@ -0,0 +1,694 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Allocator replay tool.
+ * Author: Igor Fedotov, ifedotov@suse.com
+ */
+#include <iostream>
+#include <vector>
+
+#include "common/ceph_argparse.h"
+#include "common/debug.h"
+#include "common/Cycles.h"
+#include "common/errno.h"
+#include "common/ceph_json.h"
+#include "common/admin_socket.h"
+#include "include/denc.h"
+#include "global/global_init.h"
+#include "os/bluestore/Allocator.h"
+
+using namespace std;
+
+void usage(const string &name) {
+ cerr << "Usage: " << name << " <log_to_replay> <raw_duplicates|duplicates|free_dump|try_alloc count want alloc_unit|replay_alloc alloc_list_file|export_binary out_file>" << std::endl;
+}
+
+void usage_replay_alloc(const string &name) {
+ cerr << "Detailed replay_alloc usage: " << name << " <allocator_dump_JSON> replay_alloc <alloc_list_file> [number of replays]" << std::endl;
+ cerr << "The number of replays defaults to 1." << std::endl;
+ cerr << "The \"alloc_list_file\" parameter should be a file with allocation requests, one per line." << std::endl;
+ cerr << "Allocation request format (space separated, optional parameters are 0 if not given): want unit [max] [hint]" << std::endl;
+}
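+
+// Example invocation (binary and file names are illustrative only):
+//   ceph_test_alloc_replay osd.0_alloc_dump.json replay_alloc alloc_list.txt 3
+// with alloc_list.txt containing lines such as "0x100000 0x1000 0 0"
+// (want/unit/max/hint, parsed by the sscanf in main() below).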
+
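+// Compact binary form of an allocator's free-extent list; produced by the
+// export_binary command and consumed via the "extents_file" JSON field, as a
+// faster-to-load alternative to the plain JSON "extents" array.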
+struct binary_alloc_map_t {
+ std::vector<std::pair<uint64_t, uint64_t>> free_extents;
+
+ DENC(binary_alloc_map_t, v, p) {
+ DENC_START(1, 1, p);
+ denc(v.free_extents, p);
+ DENC_FINISH(p);
+ }
+};
+WRITE_CLASS_DENC(binary_alloc_map_t)
+
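+// Replays an allocator debug log (allocator lines logged at level 10) and
+// cross-checks every init_add_free/release/init_rm_free/allocate line against
+// an interval_set of extents currently owned by the application, flagging
+// double frees and overlapping allocations.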
+int replay_and_check_for_duplicate(char* fname)
+{
+ unique_ptr<Allocator> alloc;
+
+ FILE* f = fopen(fname, "r");
+ if (!f) {
+ std::cerr << "error: unable to open " << fname << std::endl;
+ return -1;
+ }
+
+ PExtentVector tmp;
+ bool init_done = false;
+ char s[4096];
+ char* sp, *token;
+ interval_set<uint64_t> owned_by_app;
+ while (true) {
+ if (fgets(s, sizeof(s), f) == nullptr) {
+ break;
+ }
+ sp = strstr(s, "init_add_free");
+ if (!sp) {
+ sp = strstr(s, "release");
+ }
+ if (sp) {
+ //2019-05-30 03:23:46.780 7f889a5edf00 10 fbmap_alloc 0x5642ed370600 init_add_free 0x100000~680000000
+ // or
+ //2019-05-30 03:23:46.780 7f889a5edf00 10 fbmap_alloc 0x5642ed370600 init_add_free done
+ // or
+ // 2019-10-08T16:19:32.257+0300 7f5679f3fe80 10 fbmap_alloc 0x564fab96f100 release 0x450000~10000
+ // or
+ // 2019-10-08T16:19:32.257+0300 7f5679f3fe80 10 fbmap_alloc 0x564fab96f100 release done
+ if (strstr(sp, "done") != nullptr) {
+ continue;
+ }
+ std::cout << s << std::endl;
+ if (!init_done) {
+ std::cerr << "error: no allocator init before: " << s << std::endl;
+ return -1;
+ }
+ uint64_t offs, len;
+ strtok(sp, " ~");
+ token = strtok(nullptr, " ~");
+ ceph_assert(token);
+ offs = strtoul(token, nullptr, 16);
+ token = strtok(nullptr, " ~");
+ ceph_assert(token);
+ len = strtoul(token, nullptr, 16);
+ if (len == 0) {
+ std::cerr << "error: " << sp <<": " << s << std::endl;
+ return -1;
+ }
+ if (!owned_by_app.contains(offs, len)) {
+ std::cerr << "error: unexpected return to allocator, not owned by app: "
+ << s << std::endl;
+ return -1;
+ }
+ owned_by_app.erase(offs, len);
+ if (strstr(sp, "init_add_free") != nullptr) {
+ alloc->init_add_free(offs, len);
+ } else {
+ PExtentVector release_set;
+ release_set.emplace_back(offs, len);
+ alloc->release(release_set);
+ }
+ continue;
+ }
+ sp = strstr(s, "init_rm_free");
+ if (sp) {
+ //2019-05-30 03:23:46.912 7f889a5edf00 10 fbmap_alloc 0x5642ed370600 init_rm_free 0x100000~680000000
+ // or
+ // 2019-05-30 03:23:46.916 7f889a5edf00 10 fbmap_alloc 0x5642ed370600 init_rm_free done
+
+ if (strstr(sp, "done") != nullptr) {
+ continue;
+ }
+ std::cout << s << std::endl;
+ if (!init_done) {
+ std::cerr << "error: no allocator init before: " << s << std::endl;
+ return -1;
+ }
+ uint64_t offs, len;
+ strtok(sp, " ~");
+ token = strtok(nullptr, " ~");
+ ceph_assert(token);
+ offs = strtoul(token, nullptr, 16);
+ token = strtok(nullptr, " ~");
+ ceph_assert(token);
+ len = strtoul(token, nullptr, 16);
+ if (len == 0) {
+ std::cerr << "error: " << sp <<": " << s << std::endl;
+ return -1;
+ }
+ alloc->init_rm_free(offs, len);
+
+ if (owned_by_app.intersects(offs, len)) {
+ std::cerr
+ << "error: unexpected takeover from allocator, already owned by app: "
+ << s << std::endl;
+ return -1;
+ } else {
+ owned_by_app.insert(offs, len);
+ }
+
+ continue;
+ }
+ sp = strstr(s, "allocate");
+ if (sp) {
+ //2019-05-30 03:23:48.780 7f889a5edf00 10 fbmap_alloc 0x5642ed370600 allocate 0x80000000/100000,0,0
+ // and need to bypass
+ // 2019-05-30 03:23:48.780 7f889a5edf00 10 fbmap_alloc 0x5642ed370600 allocate 0x69d400000~200000/100000,0,0
+
+ // Very simple and stupid check to bypass actual allocations
+ if (strstr(sp, "~") != nullptr) {
+ continue;
+ }
+
+ std::cout << s << std::endl;
+ if (!init_done) {
+ std::cerr << "error: no allocator init before: " << s << std::endl;
+ return -1;
+ }
+ uint64_t want, alloc_unit;
+ strtok(sp, " /");
+ token = strtok(nullptr, " /");
+ ceph_assert(token);
+ want = strtoul(token, nullptr, 16);
+ token = strtok(nullptr, " ~");
+ ceph_assert(token);
+ alloc_unit = strtoul(token, nullptr, 16);
+ if (want == 0 || alloc_unit == 0) {
+ std::cerr << "error: allocate: " << s << std::endl;
+ return -1;
+ }
+ tmp.clear();
+ auto allocated = alloc->allocate(want, alloc_unit, 0, 0, &tmp);
+ std::cout << "allocated TOTAL: " << allocated << std::endl;
+ for (auto& ee : tmp) {
+ std::cerr << "dump extent: " << std::hex
+ << ee.offset << "~" << ee.length
+ << std::dec << std::endl;
+ }
+ std::cerr << "dump completed." << std::endl;
+ for (auto& e : tmp) {
+ if (owned_by_app.intersects(e.offset, e.length)) {
+ std::cerr << "error: unexpected allocated extent: " << std::hex
+ << e.offset << "~" << e.length
+ << " dumping all allocations:" << std::dec << std::endl;
+ for (auto& ee : tmp) {
+ std::cerr <<"dump extent: " << std::hex
+ << ee.offset << "~" << ee.length
+ << std::dec << std::endl;
+ }
+ std::cerr <<"dump completed." << std::endl;
+ return -1;
+ } else {
+ owned_by_app.insert(e.offset, e.length);
+ }
+ }
+ continue;
+ }
+
+ string alloc_type = "bitmap";
+ sp = strstr(s, "BitmapAllocator");
+ if (!sp) {
+ alloc_type = "avl";
+ sp = strstr(s, "AvlAllocator");
+ }
+ if (!sp) {
+ alloc_type = "hybrid";
+ sp = strstr(s, "HybridAllocator");
+ }
+ if (!sp) {
+ alloc_type = "stupid";
+ sp = strstr(s, "StupidAllocator");
+ }
+ if (sp) {
+ // 2019-05-30 03:23:43.460 7f889a5edf00 10 fbmap_alloc 0x5642ed36e900 BitmapAllocator 0x15940000000/100000
+ std::cout << s << std::endl;
+ if (init_done) {
+ std::cerr << "error: duplicate init: " << s << std::endl;
+ return -1;
+ }
+ uint64_t total, alloc_unit;
+ strtok(sp, " /");
+ token = strtok(nullptr, " /");
+ ceph_assert(token);
+ total = strtoul(token, nullptr, 16);
+ token = strtok(nullptr, " /");
+ ceph_assert(token);
+ alloc_unit = strtoul(token, nullptr, 16);
+ if (total == 0 || alloc_unit == 0) {
+ std::cerr << "error: invalid init: " << s << std::endl;
+ return -1;
+ }
+ alloc.reset(Allocator::create(g_ceph_context, alloc_type, total,
+ alloc_unit));
+ owned_by_app.insert(0, total);
+
+ init_done = true;
+ continue;
+ }
+ }
+ fclose(f);
+ return 0;
+}
+
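+// Parses a JSON allocator dump and feeds it through two callbacks: `create`
+// with the allocator's type/capacity/alloc_unit/name, then `add_ext` once per
+// free extent, taken either from the inline "extents" array or from a
+// DENC-encoded "extents_file".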
+int replay_free_dump_and_apply_raw(
+ char* fname,
+ std::function<void (
+ std::string_view,
+ int64_t,
+ int64_t,
+ std::string_view)> create,
+ std::function<void (uint64_t, uint64_t)> add_ext)
+{
+ string alloc_type;
+ string alloc_name;
+ uint64_t capacity = 0;
+ uint64_t alloc_unit = 0;
+
+ JSONParser p;
+ std::cout << "parsing..." << std::endl;
+ bool b = p.parse(fname);
+ if (!b) {
+ std::cerr << "Failed to parse json: " << fname << std::endl;
+ return -1;
+ }
+
+ JSONObj::data_val v;
+ ceph_assert(p.is_object());
+
+ auto *o = p.find_obj("alloc_type");
+ ceph_assert(o);
+ alloc_type = o->get_data_val().str;
+
+ o = p.find_obj("alloc_name");
+ ceph_assert(o);
+ alloc_name = o->get_data_val().str;
+
+ o = p.find_obj("capacity");
+ ceph_assert(o);
+ decode_json_obj(capacity, o);
+ o = p.find_obj("alloc_unit");
+ ceph_assert(o);
+ decode_json_obj(alloc_unit, o);
+
+ int fd = -1;
+ o = p.find_obj("extents_file");
+ if (o) {
+ string filename = o->get_data_val().str;
+ fd = open(filename.c_str(), O_RDONLY);
+ if (fd < 0) {
+ std::cerr << "error: unable to open extents file: " << filename
+ << ", " << cpp_strerror(-errno)
+ << std::endl;
+ return -1;
+ }
+ } else {
+ o = p.find_obj("extents");
+ ceph_assert(o);
+ ceph_assert(o->is_array());
+ }
+ std::cout << "parsing completed!" << std::endl;
+
+ create(alloc_type, capacity, alloc_unit, alloc_name);
+ int r = 0;
+ if (fd < 0) {
+ auto it = o->find_first();
+ while (!it.end()) {
+ auto *item_obj = *it;
+ uint64_t offset = 0;
+ uint64_t length = 0;
+ string offset_str, length_str;
+
+ bool b = JSONDecoder::decode_json("offset", offset_str, item_obj);
+ ceph_assert(b);
+ b = JSONDecoder::decode_json("length", length_str, item_obj);
+ ceph_assert(b);
+
+ char* endp = nullptr;
+ offset = strtoull(offset_str.c_str(), &endp, 16);
+ length = strtoull(length_str.c_str(), &endp, 16);
+
+ // intentionally skip/trim entries that are above the capacity,
+ // just to be able to "shrink" allocator by editing that field
+ if (offset < capacity) {
+ if (offset + length > capacity) {
+ length = capacity - offset;
+ }
+ add_ext(offset, length);
+ }
+ ++it;
+ }
+ } else {
+ bufferlist bl;
+ char buf[4096];
+ do {
+ r = read(fd, buf, sizeof(buf));
+ if (r > 0) {
+ bl.append(buf, r);
+ }
+ } while(r > 0);
+ if (r < 0) {
+ std::cerr << "error: error reading from extents file: "
+ << cpp_strerror(-errno)
+ << std::endl;
+ } else {
+ auto p = bl.cbegin();
+ binary_alloc_map_t amap;
+ try {
+ decode(amap, p);
+ for (auto& ext : amap.free_extents) {
+ add_ext(ext.first, ext.second);
+ }
+ } catch (ceph::buffer::error& e) {
+ std::cerr << __func__ << " unable to decode extents "
+ << ": " << e.what()
+ << std::endl;
+ r = -1;
+ }
+ }
+ close(fd);
+ }
+ return r;
+}
+
+/*
+ * Replays an allocator dump (in JSON) reported by the
+ * "ceph daemon <osd> bluestore allocator dump <name>"
+ * command and applies a custom function to the restored allocator.
+ */
+int replay_free_dump_and_apply(char* fname,
+ std::function<int (Allocator*, const string& aname)> fn)
+{
+ unique_ptr<Allocator> alloc;
+ auto create_fn = [&](std::string_view alloc_type,
+ int64_t capacity,
+ int64_t alloc_unit,
+ std::string_view alloc_name) {
+ alloc.reset(
+ Allocator::create(
+ g_ceph_context, alloc_type, capacity, alloc_unit, 0, 0, alloc_name));
+ };
+ auto add_fn = [&](uint64_t offset,
+ uint64_t len) {
+ alloc->init_add_free(offset, len);
+ };
+ int r = replay_free_dump_and_apply_raw(
+ fname,
+ create_fn,
+ add_fn);
+ if (r == 0) {
+ r = fn(alloc.get(), alloc->get_name());
+ }
+
+ return r;
+}
+
+void dump_alloc(Allocator* alloc, const string& aname)
+{
+ AdminSocket* admin_socket = g_ceph_context->get_admin_socket();
+ ceph_assert(admin_socket);
+
+ ceph::bufferlist in, out;
+ ostringstream err;
+
+ string cmd = "{\"prefix\": \"bluestore allocator dump " + aname + "\"}";
+ auto r = admin_socket->execute_command(
+ { cmd },
+ in, err, &out);
+ if (r != 0) {
+ cerr << "failure querying: " << cpp_strerror(r) << std::endl;
+ }
+ else {
+ std::cout << std::string(out.c_str(), out.length()) << std::endl;
+ }
+}
+
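+// Converts a JSON allocator dump into the DENC-encoded binary_alloc_map_t
+// form, so later runs can reference it via the "extents_file" field instead
+// of re-parsing the extent list as JSON.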
+int export_as_binary(char* fname, char* target_fname)
+{
+ int fd = creat(target_fname, 0644);
+ if (fd < 0) {
+ std::cerr << "error: unable to open target file: " << target_fname
+ << ", " << cpp_strerror(-errno)
+ << std::endl;
+ return -1;
+ }
+
+ binary_alloc_map_t amap;
+ auto dummy_create_fn =
+ [&](std::string_view alloc_type,
+ int64_t capacity,
+ int64_t alloc_unit,
+ std::string_view alloc_name) {
+ };
+ auto add_fn = [&](uint64_t offset,
+ uint64_t len) {
+ amap.free_extents.emplace_back(offset, len);
+ };
+ int r = replay_free_dump_and_apply_raw(
+ fname,
+ dummy_create_fn,
+ add_fn);
+ if (r == 0) {
+ bufferlist out;
+ ceph::encode(amap, out);
+ auto w = write(fd, out.c_str(), out.length());
+ if (w < 0 || size_t(w) != out.length()) {
+ std::cerr << "error: unable to write to target file: " << target_fname
+ << ", " << cpp_strerror(-errno)
+ << std::endl;
+ }
+ }
+ close(fd);
+ return r;
+}
+
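+// Two-pass overlap detector: the first pass records extents that intersect
+// already-seen free space (invalid_extentsB); the second pass re-reads the
+// dump and collects the earlier extents they collide with (invalid_extentsA),
+// so both sides of every duplicate can be reported.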
+int check_duplicates(char* fname)
+{
+ interval_set<uint64_t> free_extents;
+ interval_set<uint64_t> invalid_extentsA;
+ interval_set<uint64_t> invalid_extentsB;
+ auto dummy_create_fn =
+ [&](std::string_view alloc_type,
+ int64_t capacity,
+ int64_t alloc_unit,
+ std::string_view alloc_name) {
+ };
+ size_t errors = 0;
+ size_t pos = 0;
+ size_t first_err_pos = 0;
+ auto add_fn = [&](uint64_t offset,
+ uint64_t len) {
+ ++pos;
+ if (free_extents.intersects(offset, len)) {
+ invalid_extentsB.insert(offset, len);
+ ++errors;
+ if (first_err_pos == 0) {
+ first_err_pos = pos;
+ }
+ } else {
+ free_extents.insert(offset, len);
+ }
+ };
+ int r = replay_free_dump_and_apply_raw(
+ fname,
+ dummy_create_fn,
+ add_fn);
+ if (r < 0) {
+ return r;
+ }
+ pos = 0;
+ auto add_fn2 = [&](uint64_t offset,
+ uint64_t len) {
+ ++pos;
+ if (pos < first_err_pos) {
+ if (invalid_extentsB.intersects(offset, len)) {
+ invalid_extentsA.insert(offset, len);
+ }
+ }
+ };
+ r = replay_free_dump_and_apply_raw(
+ fname,
+ dummy_create_fn,
+ add_fn2);
+ ceph_assert(r >= 0);
+ auto itA = invalid_extentsA.begin();
+ auto itB = invalid_extentsB.begin();
+ while (itA != invalid_extentsA.end()) {
+ std::cerr << "error: overlapping extents: " << std::hex
+ << itA.get_start() << "~" << itA.get_end() - itA.get_start()
+ << " vs.";
+ while (itB != invalid_extentsB.end() &&
+ itB.get_start() >= itA.get_start() &&
+ itB.get_end() <= itA.get_end()) {
+ std::cerr << " " << itB.get_start() << "~" << itB.get_end() - itB.get_start();
+ ++itB;
+ }
+ std::cerr << std::dec << std::endl;
+ ++itA;
+ }
+ return r >= 0 ? errors != 0 : r;
+}
+
+int main(int argc, char **argv)
+{
+ vector<const char*> args;
+ auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT,
+ CODE_ENVIRONMENT_UTILITY,
+ CINIT_FLAG_NO_DEFAULT_CONFIG_FILE);
+ common_init_finish(g_ceph_context);
+ g_ceph_context->_conf.apply_changes(nullptr);
+
+ if (argc < 3) {
+ usage(argv[0]);
+ return 1;
+ }
+ if (strcmp(argv[2], "raw_duplicates") == 0) {
+ return replay_and_check_for_duplicate(argv[1]);
+ } else if (strcmp(argv[2], "free_dump") == 0) {
+ return replay_free_dump_and_apply(argv[1],
+ [&](Allocator* a, const string& aname) {
+ ceph_assert(a);
+ std::cout << "Fragmentation:" << a->get_fragmentation()
+ << std::endl;
+ std::cout << "Fragmentation score:" << a->get_fragmentation_score()
+ << std::endl;
+ std::cout << "Free:" << std::hex << a->get_free() << std::dec
+ << std::endl;
+ {
+ // stub to implement various testing stuff on properly initialized allocator
+ // e.g. one can dump allocator back via dump_alloc(a, aname);
+ }
+ return 0;
+ });
+ } else if (strcmp(argv[2], "try_alloc") == 0) {
+ if (argc < 6) {
+ std::cerr << "Error: insufficient arguments for \"try_alloc\" operation."
+ << std::endl;
+ usage(argv[0]);
+ return 1;
+ }
+ auto count = strtoul(argv[3], nullptr, 10);
+ auto want = strtoul(argv[4], nullptr, 10);
+ auto alloc_unit = strtoul(argv[5], nullptr, 10);
+
+ return replay_free_dump_and_apply(argv[1],
+ [&](Allocator* a, const string& aname) {
+ ceph_assert(a);
+ std::cout << "Fragmentation:" << a->get_fragmentation()
+ << std::endl;
+ std::cout << "Fragmentation score:" << a->get_fragmentation_score()
+ << std::endl;
+ std::cout << "Free:" << std::hex << a->get_free() << std::dec
+ << std::endl;
+ {
+ PExtentVector extents;
+ for(size_t i = 0; i < count; i++) {
+ extents.clear();
+ auto r = a->allocate(want, alloc_unit, 0, &extents);
+ if (r < 0) {
+ std::cerr << "Error: allocation failure at step:" << i + 1
+ << ", ret = " << r << std::endl;
+ return -1;
+ }
+ }
+ }
+ std::cout << "Successfully allocated: " << count << " * " << want
+ << ", unit:" << alloc_unit << std::endl;
+ return 0;
+ });
+ } else if (strcmp(argv[2], "replay_alloc") == 0) {
+ if (argc < 4) {
+ std::cerr << "Error: insufficient arguments for \"replay_alloc\" option."
+ << std::endl;
+ usage_replay_alloc(argv[0]);
+ return 1;
+ }
+ return replay_free_dump_and_apply(argv[1],
+ [&](Allocator *a, const string &aname) {
+ ceph_assert(a);
+ std::cout << "Fragmentation:" << a->get_fragmentation()
+ << std::endl;
+ std::cout << "Fragmentation score:" << a->get_fragmentation_score()
+ << std::endl;
+ std::cout << "Free:" << std::hex << a->get_free() << std::dec
+ << std::endl;
+ {
+ /* replay a set of allocation requests */
+ char s[4096];
+
+ FILE *f_alloc_list = fopen(argv[3], "r");
+ if (!f_alloc_list) {
+ std::cerr << "error: unable to open " << argv[3] << std::endl;
+ return -1;
+ }
+
+ /* Replay user specified number of times to simulate extended activity
+ * Defaults to 1 replay.
+ */
+ auto replay_count = 1;
+ if (argc == 5) {
+ replay_count = atoi(argv[4]);
+ }
+
+ for (auto i = 0; i < replay_count; ++i) {
+ while (fgets(s, sizeof(s), f_alloc_list) != nullptr) {
+ /* parse allocation request (%ji matches intmax_t) */
+ intmax_t want = 0, unit = 0, max = 0, hint = 0;
+
+ if (std::sscanf(s, "%ji %ji %ji %ji", &want, &unit, &max, &hint) < 2)
+ {
+ cerr << "Error: malformed allocation request:" << std::endl;
+ cerr << s << std::endl;
+ /* do not attempt to allocate a malformed request */
+ continue;
+ }
+
+ /* timestamp for allocation start */
+ auto t0 = ceph::mono_clock::now();
+
+ /* allocate */
+ PExtentVector extents;
+ auto r = a->allocate(want, unit, max, hint, &extents);
+ if (r < 0) {
+ /* blind replays of allocations may run out of space, provide info for easy confirmation */
+ std::cerr << "Error: allocation failure code: " << r
+ << " requested want/unit/max/hint (hex): " << std::hex
+ << want << "/" << unit << "/" << max << "/" << hint
+ << std::dec << std::endl;
+ std::cerr << "Fragmentation:" << a->get_fragmentation()
+ << std::endl;
+ std::cerr << "Fragmentation score:" << a->get_fragmentation_score()
+ << std::endl;
+ std::cerr << "Free:" << std::hex << a->get_free() << std::dec
+ << std::endl;
+ /* return 0 if the allocator ran out of space */
+ if (r == -ENOSPC) {
+ return 0;
+ }
+ return -1;
+ }
+
+ /* Outputs the allocation's duration in nanoseconds and the allocation request parameters */
+ std::cout << "Duration (ns): " << (ceph::mono_clock::now() - t0).count()
+ << " want/unit/max/hint (hex): " << std::hex
+ << want << "/" << unit << "/" << max << "/" << hint
+ << std::dec << std::endl;
+
+ /* Do not release. */
+ //alloc->release(extents);
+ extents.clear();
+ }
+ fseek(f_alloc_list, 0, SEEK_SET);
+ }
+ fclose(f_alloc_list);
+ std::cout << "Fragmentation:" << a->get_fragmentation()
+ << std::endl;
+ std::cout << "Fragmentation score:" << a->get_fragmentation_score()
+ << std::endl;
+ std::cout << "Free:" << std::hex << a->get_free() << std::dec
+ << std::endl;
+ }
+ return 0;
+ });
+ } else if (strcmp(argv[2], "export_binary") == 0) {
+ return export_as_binary(argv[1], argv[3]);
+ } else if (strcmp(argv[2], "duplicates") == 0) {
+ return check_duplicates(argv[1]);
+ }
+}
diff --git a/src/test/objectstore/fastbmap_allocator_test.cc b/src/test/objectstore/fastbmap_allocator_test.cc
new file mode 100644
index 000000000..710b3798f
--- /dev/null
+++ b/src/test/objectstore/fastbmap_allocator_test.cc
@@ -0,0 +1,1145 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <iostream>
+#include <gtest/gtest.h>
+
+#include "os/bluestore/fastbmap_allocator_impl.h"
+
+class TestAllocatorLevel01 : public AllocatorLevel01Loose
+{
+public:
+ void init(uint64_t capacity, uint64_t alloc_unit)
+ {
+ _init(capacity, alloc_unit);
+ }
+ interval_t allocate_l1_cont(uint64_t length, uint64_t min_length,
+ uint64_t pos_start, uint64_t pos_end)
+ {
+ return _allocate_l1_contiguous(length, min_length, 0, pos_start, pos_end);
+ }
+ void free_l1(const interval_t& r)
+ {
+ _free_l1(r.offset, r.length);
+ }
+};
+
+class TestAllocatorLevel02 : public AllocatorLevel02<AllocatorLevel01Loose>
+{
+public:
+ void init(uint64_t capacity, uint64_t alloc_unit)
+ {
+ _init(capacity, alloc_unit);
+ }
+ void allocate_l2(uint64_t length, uint64_t min_length,
+ uint64_t* allocated0,
+ interval_vector_t* res)
+ {
+ uint64_t allocated = 0;
+ uint64_t hint = 0; // trigger internal l2 hint support
+ _allocate_l2(length, min_length, 0, hint, &allocated, res);
+ *allocated0 += allocated;
+ }
+ void free_l2(const interval_vector_t& r)
+ {
+ _free_l2(r);
+ }
+ void mark_free(uint64_t o, uint64_t len)
+ {
+ _mark_free(o, len);
+ }
+ void mark_allocated(uint64_t o, uint64_t len)
+ {
+ _mark_allocated(o, len);
+ }
+};
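+
+// The two wrappers above re-export the protected _init/_allocate/_free entry
+// points of the bitmap allocator levels, letting the tests drive L1/L2 logic
+// directly instead of going through the public Allocator interface.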
+
+const uint64_t _1m = 1024 * 1024;
+const uint64_t _2m = 2 * 1024 * 1024;
+
+TEST(TestAllocatorLevel01, test_l1)
+{
+ TestAllocatorLevel01 al1;
+ uint64_t num_l1_entries = 3 * 256;
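+ // as sized here each L1 entry covers 512 * 4K = 2M, so capacity is 768 * 2M = 1.5G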
+ uint64_t capacity = num_l1_entries * 512 * 4096;
+ al1.init(capacity, 0x1000);
+ ASSERT_EQ(capacity, al1.debug_get_free());
+
+ auto i1 = al1.allocate_l1_cont(0x1000, 0x1000, 0, num_l1_entries);
+ ASSERT_EQ(i1.offset, 0u);
+ ASSERT_EQ(i1.length, 0x1000u);
+ ASSERT_EQ(capacity - 0x1000, al1.debug_get_free());
+
+ auto i2 = al1.allocate_l1_cont(0x1000, 0x1000, 0, num_l1_entries);
+ ASSERT_EQ(i2.offset, 0x1000u);
+ ASSERT_EQ(i2.length, 0x1000u);
+ al1.free_l1(i2);
+ al1.free_l1(i1);
+ i1 = al1.allocate_l1_cont(0x1000, 0x1000, 0, num_l1_entries);
+ ASSERT_EQ(i1.offset, 0u);
+ ASSERT_EQ(i1.length, 0x1000u);
+ i2 = al1.allocate_l1_cont(0x1000, 0x1000, 0, num_l1_entries);
+ ASSERT_EQ(i2.offset, 0x1000u);
+ ASSERT_EQ(i2.length, 0x1000u);
+ al1.free_l1(i1);
+ al1.free_l1(i2);
+
+ i1 = al1.allocate_l1_cont(0x2000, 0x1000, 0, num_l1_entries);
+ ASSERT_EQ(i1.offset, 0u);
+ ASSERT_EQ(i1.length, 0x2000u);
+
+ i2 = al1.allocate_l1_cont(0x3000, 0x1000, 0, num_l1_entries);
+ ASSERT_EQ(i2.offset, 0x2000u);
+ ASSERT_EQ(i2.length, 0x3000u);
+
+ al1.free_l1(i1);
+ al1.free_l1(i2);
+
+ i1 = al1.allocate_l1_cont(0x2000, 0x1000, 0, num_l1_entries);
+ ASSERT_EQ(i1.offset, 0u);
+ ASSERT_EQ(i1.length, 0x2000u);
+
+ i2 = al1.allocate_l1_cont(2 * 1024 * 1024, 0x1000, 0, num_l1_entries);
+ ASSERT_EQ(i2.offset, 2u * 1024u * 1024u);
+ ASSERT_EQ(i2.length, 2u * 1024u * 1024u);
+
+ al1.free_l1(i1);
+ i1 = al1.allocate_l1_cont(1024 * 1024, 0x1000, 0, num_l1_entries);
+ ASSERT_EQ(i1.offset, 0u);
+ ASSERT_EQ(i1.length, 1024u * 1024u);
+
+ auto i3 = al1.allocate_l1_cont(1024 * 1024 + 0x1000, 0x1000, 0, num_l1_entries);
+ ASSERT_EQ(i3.offset, 2u * 2u * 1024u * 1024u);
+ ASSERT_EQ(i3.length, 1024u * 1024u + 0x1000u);
+
+ // here we have the following layout:
+ // Alloc: 0~1M, 2M~2M, 4M~1M+4K
+ // Free: 1M~1M, 5M+4K ~ 1M-4K, 6M ~...
+ //
+ auto i4 = al1.allocate_l1_cont(1024 * 1024, 0x1000, 0, num_l1_entries);
+ ASSERT_EQ(1 * 1024 * 1024u, i4.offset);
+ ASSERT_EQ(1024 * 1024u, i4.length);
+ al1.free_l1(i4);
+
+ i4 = al1.allocate_l1_cont(1024 * 1024 - 0x1000, 0x1000, 0, num_l1_entries);
+ ASSERT_EQ(i4.offset, 5u * 1024u * 1024u + 0x1000u);
+ ASSERT_EQ(i4.length, 1024u * 1024u - 0x1000u);
+ al1.free_l1(i4);
+
+ i4 = al1.allocate_l1_cont(1024 * 1024 + 0x1000, 0x1000, 0, num_l1_entries);
+ ASSERT_EQ(i4.offset, 6u * 1024u * 1024u);
+ //ASSERT_EQ(i4.offset, 5 * 1024 * 1024 + 0x1000);
+ ASSERT_EQ(i4.length, 1024u * 1024u + 0x1000u);
+
+ al1.free_l1(i1);
+ al1.free_l1(i2);
+ al1.free_l1(i3);
+ al1.free_l1(i4);
+
+ i1 = al1.allocate_l1_cont(1024 * 1024, 0x1000, 0, num_l1_entries);
+ ASSERT_EQ(i1.offset, 0u);
+ ASSERT_EQ(i1.length, 1024u * 1024u);
+
+ i2 = al1.allocate_l1_cont(1024 * 1024, 0x1000, 0, num_l1_entries);
+ ASSERT_EQ(i2.offset, 1u * 1024u * 1024u);
+ ASSERT_EQ(i2.length, 1024u * 1024u);
+
+ i3 = al1.allocate_l1_cont(512 * 1024, 0x1000, 0, num_l1_entries);
+ ASSERT_EQ(i3.offset, 2u * 1024u * 1024u);
+ ASSERT_EQ(i3.length, 512u * 1024u);
+
+ i4 = al1.allocate_l1_cont(1536 * 1024, 0x1000, 0, num_l1_entries);
+ ASSERT_EQ(i4.offset, (2u * 1024u + 512u) * 1024u);
+ ASSERT_EQ(i4.length, 1536u * 1024u);
+ // making a hole 1.5 Mb length
+ al1.free_l1(i2);
+ al1.free_l1(i3);
+ // and trying to fill it
+ i2 = al1.allocate_l1_cont(1536 * 1024, 0x1000, 0, num_l1_entries);
+ ASSERT_EQ(i2.offset, 1024u * 1024u);
+ ASSERT_EQ(i2.length, 1536u * 1024u);
+
+ al1.free_l1(i2);
+ // and trying to fill it partially
+ i2 = al1.allocate_l1_cont(1528 * 1024, 0x1000, 0, num_l1_entries);
+ ASSERT_EQ(i2.offset, 1024u * 1024u);
+ ASSERT_EQ(i2.length, 1528u * 1024u);
+
+ i3 = al1.allocate_l1_cont(8 * 1024, 0x1000, 0, num_l1_entries);
+ ASSERT_EQ(i3.offset, 2552u * 1024u);
+ ASSERT_EQ(i3.length, 8u * 1024u);
+
+ al1.free_l1(i2);
+ // here we have the following layout:
+ // Alloc: 0~1M, 2552K~8K, 2560K~1.5M
+ // Free: 1M~1528K, 4M ~...
+ //
+ i2 = al1.allocate_l1_cont(1536 * 1024, 0x1000, 0, num_l1_entries);
+ ASSERT_EQ(i2.offset, 4u * 1024u * 1024u);
+ ASSERT_EQ(i2.length, 1536u * 1024u);
+
+ al1.free_l1(i1);
+ al1.free_l1(i2);
+ al1.free_l1(i3);
+ al1.free_l1(i4);
+ ASSERT_EQ(capacity, al1.debug_get_free());
+
+ for (uint64_t i = 0; i < capacity; i += _2m) {
+ i1 = al1.allocate_l1_cont(_2m, _2m, 0, num_l1_entries);
+ ASSERT_EQ(i1.offset, i);
+ ASSERT_EQ(i1.length, _2m);
+ }
+ ASSERT_EQ(0u, al1.debug_get_free());
+ i2 = al1.allocate_l1_cont(_2m, _2m, 0, num_l1_entries);
+ ASSERT_EQ(i2.length, 0u);
+ ASSERT_EQ(0u, al1.debug_get_free());
+
+ al1.free_l1(i1);
+ i2 = al1.allocate_l1_cont(_2m, _2m, 0, num_l1_entries);
+ ASSERT_EQ(i2, i1);
+ al1.free_l1(i2);
+ i2 = al1.allocate_l1_cont(_1m, _1m, 0, num_l1_entries);
+ ASSERT_EQ(i2.offset, i1.offset);
+ ASSERT_EQ(i2.length, _1m);
+
+ i3 = al1.allocate_l1_cont(_2m, _2m, 0, num_l1_entries);
+ ASSERT_EQ(i3.length, 0u);
+
+ i3 = al1.allocate_l1_cont(_2m, _1m, 0, num_l1_entries);
+ ASSERT_EQ(i3.length, _1m);
+
+ i4 = al1.allocate_l1_cont(_2m, _1m, 0, num_l1_entries);
+ ASSERT_EQ(i4.length, 0u);
+
+ al1.free_l1(i2);
+ i2 = al1.allocate_l1_cont(_2m, _2m, 0, num_l1_entries);
+ ASSERT_EQ(i2.length, 0u);
+
+ i2 = al1.allocate_l1_cont(_2m, 0x1000, 0, num_l1_entries);
+ ASSERT_EQ(i2.length, _1m);
+
+ al1.free_l1(i2);
+ al1.free_l1(i3);
+ ASSERT_EQ(_2m, al1.debug_get_free());
+
+ i1 = al1.allocate_l1_cont(_2m - 3 * 0x1000, 0x1000, 0, num_l1_entries);
+ i2 = al1.allocate_l1_cont(0x1000, 0x1000, 0, num_l1_entries);
+ i3 = al1.allocate_l1_cont(0x1000, 0x1000, 0, num_l1_entries);
+ i4 = al1.allocate_l1_cont(0x1000, 0x1000, 0, num_l1_entries);
+ ASSERT_EQ(0u, al1.debug_get_free());
+
+ al1.free_l1(i2);
+ al1.free_l1(i4);
+
+ i2 = al1.allocate_l1_cont(0x4000, 0x2000, 0, num_l1_entries);
+ ASSERT_EQ(i2.length, 0u);
+ i2 = al1.allocate_l1_cont(0x4000, 0x1000, 0, num_l1_entries);
+ ASSERT_EQ(i2.length, 0x1000u);
+
+ al1.free_l1(i3);
+ i3 = al1.allocate_l1_cont(0x6000, 0x3000, 0, num_l1_entries);
+ ASSERT_EQ(i3.length, 0u);
+ i3 = al1.allocate_l1_cont(0x6000, 0x1000, 0, num_l1_entries);
+ ASSERT_EQ(i3.length, 0x2000u);
+ ASSERT_EQ(0u, al1.debug_get_free());
+
+ std::cout << "Done L1" << std::endl;
+}
+
+TEST(TestAllocatorLevel01, test_l2)
+{
+ TestAllocatorLevel02 al2;
+ uint64_t num_l2_entries = 64;// *512;
+ uint64_t capacity = num_l2_entries * 256 * 512 * 4096;
+ al2.init(capacity, 0x1000);
+ std::cout << "Init L2" << std::endl;
+
+ uint64_t allocated1 = 0;
+ interval_vector_t a1;
+ al2.allocate_l2(0x2000, 0x2000, &allocated1, &a1);
+ ASSERT_EQ(allocated1, 0x2000u);
+ ASSERT_EQ(a1[0].offset, 0u);
+ ASSERT_EQ(a1[0].length, 0x2000u);
+
+ // limit query range in debug_get_free for the sake of performance
+ ASSERT_EQ(0x2000u, al2.debug_get_allocated(0, 1));
+ ASSERT_EQ(0u, al2.debug_get_allocated(1, 2));
+
+ uint64_t allocated2 = 0;
+ interval_vector_t a2;
+ al2.allocate_l2(0x2000, 0x2000, &allocated2, &a2);
+ ASSERT_EQ(allocated2, 0x2000u);
+ ASSERT_EQ(a2[0].offset, 0x2000u);
+ ASSERT_EQ(a2[0].length, 0x2000u);
+ // limit query range in debug_get_free for the sake of performance
+ ASSERT_EQ(0x4000u, al2.debug_get_allocated(0, 1));
+ ASSERT_EQ(0u, al2.debug_get_allocated(1, 2));
+
+ al2.free_l2(a1);
+
+ allocated2 = 0;
+ a2.clear();
+ al2.allocate_l2(0x1000, 0x1000, &allocated2, &a2);
+ ASSERT_EQ(allocated2, 0x1000u);
+ ASSERT_EQ(a2[0].offset, 0x0000u);
+ ASSERT_EQ(a2[0].length, 0x1000u);
+ // limit query range in debug_get_free for the sake of performance
+ ASSERT_EQ(0x3000u, al2.debug_get_allocated(0, 1));
+ ASSERT_EQ(0u, al2.debug_get_allocated(1, 2));
+
+ uint64_t allocated3 = 0;
+ interval_vector_t a3;
+ al2.allocate_l2(0x2000, 0x1000, &allocated3, &a3);
+ ASSERT_EQ(allocated3, 0x2000u);
+ ASSERT_EQ(a3.size(), 2u);
+ ASSERT_EQ(a3[0].offset, 0x1000u);
+ ASSERT_EQ(a3[0].length, 0x1000u);
+ ASSERT_EQ(a3[1].offset, 0x4000u);
+ ASSERT_EQ(a3[1].length, 0x1000u);
+ // limit query range in debug_get_free for the sake of performance
+ ASSERT_EQ(0x5000u, al2.debug_get_allocated(0, 1));
+ ASSERT_EQ(0u, al2.debug_get_allocated(1, 2));
+ {
+ interval_vector_t r;
+ r.emplace_back(0x0, 0x5000);
+ al2.free_l2(r);
+ }
+
+ a3.clear();
+ allocated3 = 0;
+ al2.allocate_l2(_1m, _1m, &allocated3, &a3);
+ ASSERT_EQ(a3.size(), 1u);
+ ASSERT_EQ(a3[0].offset, 0u);
+ ASSERT_EQ(a3[0].length, _1m);
+
+ al2.free_l2(a3);
+
+ a3.clear();
+ allocated3 = 0;
+ al2.allocate_l2(4 * _1m, _1m, &allocated3, &a3);
+ ASSERT_EQ(a3.size(), 1u);
+ ASSERT_EQ(a3[0].offset, 0u);
+ ASSERT_EQ(a3[0].length, 4 * _1m);
+
+ al2.free_l2(a3);
+
+#ifndef _DEBUG
+ for (uint64_t i = 0; i < capacity; i += 0x1000) {
+ uint64_t allocated4 = 0;
+ interval_vector_t a4;
+ al2.allocate_l2(0x1000, 0x1000, &allocated4, &a4);
+ ASSERT_EQ(a4.size(), 1u);
+ ASSERT_EQ(a4[0].offset, i);
+ ASSERT_EQ(a4[0].length, 0x1000u);
+ if (0 == (i % (1 * 1024 * _1m))) {
+ std::cout << "alloc1 " << i / 1024 / 1024 << " mb of "
+ << capacity / 1024 / 1024 << std::endl;
+ }
+ }
+#else
+ for (uint64_t i = 0; i < capacity; i += _2m) {
+ uint64_t allocated4 = 0;
+ interval_vector_t a4;
+ al2.allocate_l2(_2m, _2m, &allocated4, &a4);
+ ASSERT_EQ(a4.size(), 1);
+ ASSERT_EQ(a4[0].offset, i);
+ ASSERT_EQ(a4[0].length, _2m);
+ if (0 == (i % (1 * 1024 * _1m))) {
+ std::cout << "alloc1 " << i / 1024 / 1024 << " mb of "
+ << capacity / 1024 / 1024 << std::endl;
+ }
+ }
+#endif
+
+ ASSERT_EQ(0u, al2.debug_get_free());
+ for (uint64_t i = 0; i < capacity; i += _1m) {
+ interval_vector_t r;
+ r.emplace_back(i, _1m);
+ al2.free_l2(r);
+ if (0 == (i % (1 * 1024 * _1m))) {
+ std::cout << "free1 " << i / 1024 / 1024 << " mb of "
+ << capacity / 1024 / 1024 << std::endl;
+ }
+ }
+ ASSERT_EQ(capacity, al2.debug_get_free());
+
+ for (uint64_t i = 0; i < capacity; i += _1m) {
+ uint64_t allocated4 = 0;
+ interval_vector_t a4;
+ al2.allocate_l2(_1m, _1m, &allocated4, &a4);
+ ASSERT_EQ(a4.size(), 1u);
+ ASSERT_EQ(allocated4, _1m);
+ ASSERT_EQ(a4[0].offset, i);
+ ASSERT_EQ(a4[0].length, _1m);
+ if (0 == (i % (1 * 1024 * _1m))) {
+ std::cout << "alloc2 " << i / 1024 / 1024 << " mb of "
+ << capacity / 1024 / 1024 << std::endl;
+ }
+ }
+ ASSERT_EQ(0u, al2.debug_get_free());
+ uint64_t allocated4 = 0;
+ interval_vector_t a4;
+ al2.allocate_l2(_1m, _1m, &allocated4, &a4);
+ ASSERT_EQ(a4.size(), 0u);
+ al2.allocate_l2(0x1000, 0x1000, &allocated4, &a4);
+ ASSERT_EQ(a4.size(), 0u);
+
+ for (uint64_t i = 0; i < capacity; i += 0x2000) {
+ interval_vector_t r;
+ r.emplace_back(i, 0x1000);
+ al2.free_l2(r);
+ if (0 == (i % (1 * 1024 * _1m))) {
+ std::cout << "free2 " << i / 1024 / 1024 << " mb of "
+ << capacity / 1024 / 1024 << std::endl;
+ }
+ }
+ ASSERT_EQ(capacity / 2, al2.debug_get_free());
+
+ // unable to allocate due to fragmentation
+ al2.allocate_l2(_1m, _1m, &allocated4, &a4);
+ ASSERT_EQ(a4.size(), 0u);
+
+ for (uint64_t i = 0; i < capacity; i += 2 * _1m) {
+ a4.clear();
+ allocated4 = 0;
+ al2.allocate_l2(_1m, 0x1000, &allocated4, &a4);
+ ASSERT_EQ(a4.size(), _1m / 0x1000);
+ ASSERT_EQ(allocated4, _1m);
+ ASSERT_EQ(a4[0].offset, i);
+ ASSERT_EQ(a4[0].length, 0x1000u);
+ if (0 == (i % (1 * 1024 * _1m))) {
+ std::cout << "alloc3 " << i / 1024 / 1024 << " mb of "
+ << capacity / 1024 / 1024 << std::endl;
+ }
+ }
+ ASSERT_EQ(0u, al2.debug_get_free());
+
+ std::cout << "Done L2" << std::endl;
+}
+
+TEST(TestAllocatorLevel01, test_l2_huge)
+{
+ TestAllocatorLevel02 al2;
+ uint64_t num_l2_entries = 4 * 512;
+ uint64_t capacity = num_l2_entries * 256 * 512 * 4096; // 1 TB
+ al2.init(capacity, 0x1000);
+ std::cout << "Init L2 Huge" << std::endl;
+
+ for (uint64_t i = 0; i < capacity; i += _1m) {
+ uint64_t allocated4 = 0;
+ interval_vector_t a4;
+ al2.allocate_l2(0x1000, 0x1000, &allocated4, &a4);
+ ASSERT_EQ(a4.size(), 1u);
+ ASSERT_EQ(allocated4, 0x1000u);
+ ASSERT_EQ(a4[0].offset, i);
+ ASSERT_EQ(a4[0].length, 0x1000u);
+
+ allocated4 = 0;
+ a4.clear();
+ al2.allocate_l2(_1m - 0x1000, 0x1000, &allocated4, &a4);
+ ASSERT_EQ(a4.size(), 1u);
+ ASSERT_EQ(allocated4, _1m - 0x1000);
+ ASSERT_EQ(a4[0].offset, i + 0x1000);
+ ASSERT_EQ(a4[0].length, _1m - 0x1000);
+ if (0 == (i % (1 * 1024 * _1m))) {
+ std::cout << "allocH " << i / 1024 / 1024 << " mb of "
+ << capacity / 1024 / 1024 << std::endl;
+ }
+ }
+ for (uint64_t i = 0; i < capacity; i += _1m) {
+ interval_vector_t a4;
+ a4.emplace_back(i, 0x1000);
+ al2.free_l2(a4);
+ if (0 == (i % (1 * 1024 * _1m))) {
+ std::cout << "freeH1 " << i / 1024 / 1024 << " mb of "
+ << capacity / 1024 / 1024 << std::endl;
+ }
+ }
+ {
+ std::cout << "Try" << std::endl;
+ time_t t = time(NULL);
+ for (int i = 0; i < 10; ++i) {
+ uint64_t allocated = 0;
+ interval_vector_t a;
+ al2.allocate_l2(0x2000, 0x2000, &allocated, &a);
+ ASSERT_EQ(a.size(), 0u);
+ }
+ std::cout << "End try in " << time(NULL) - t << " seconds" << std::endl;
+ }
+ {
+ std::cout << "Try" << std::endl;
+ time_t t = time(NULL);
+ for (int i = 0; i < 10; ++i) {
+ uint64_t allocated = 0;
+ interval_vector_t a;
+ al2.allocate_l2(_2m, _2m, &allocated, &a);
+ ASSERT_EQ(a.size(), 0u);
+ }
+ std::cout << "End try in " << time(NULL) - t << " seconds" << std::endl;
+ }
+
+ ASSERT_EQ((capacity / _1m) * 0x1000, al2.debug_get_free());
+
+ std::cout << "Done L2 Huge" << std::endl;
+}
+
+TEST(TestAllocatorLevel01, test_l2_unaligned)
+{
+ {
+ TestAllocatorLevel02 al2;
+ uint64_t num_l2_entries = 3;
+ uint64_t capacity = num_l2_entries * 256 * 512 * 4096; // 3x512 MB
+ al2.init(capacity, 0x1000);
+ std::cout << "Init L2 Unaligned" << std::endl;
+
+ for (uint64_t i = 0; i < capacity; i += _1m / 2) {
+ uint64_t allocated4 = 0;
+ interval_vector_t a4;
+ al2.allocate_l2(_1m / 2, _1m / 2, &allocated4, &a4);
+ ASSERT_EQ(a4.size(), 1u);
+ ASSERT_EQ(allocated4, _1m / 2);
+ ASSERT_EQ(a4[0].offset, i);
+ ASSERT_EQ(a4[0].length, _1m / 2);
+ if (0 == (i % (1 * 1024 * _1m))) {
+ std::cout << "allocU " << i / 1024 / 1024 << " mb of "
+ << capacity / 1024 / 1024 << std::endl;
+ }
+ }
+ ASSERT_EQ(0u, al2.debug_get_free());
+ {
+ // no space to allocate
+ uint64_t allocated4 = 0;
+ interval_vector_t a4;
+ al2.allocate_l2(0x1000, 0x1000, &allocated4, &a4);
+ ASSERT_EQ(a4.size(), 0u);
+ }
+ }
+ {
+ TestAllocatorLevel02 al2;
+ uint64_t capacity = 500 * 512 * 4096; // 500x2 MB
+ al2.init(capacity, 0x1000);
+ std::cout << ("Init L2 Unaligned2\n");
+ for (uint64_t i = 0; i < capacity; i += _1m / 2) {
+ uint64_t allocated4 = 0;
+ interval_vector_t a4;
+ al2.allocate_l2(_1m / 2, _1m / 2, &allocated4, &a4);
+ ASSERT_EQ(a4.size(), 1u);
+ ASSERT_EQ(allocated4, _1m / 2);
+ ASSERT_EQ(a4[0].offset, i);
+ ASSERT_EQ(a4[0].length, _1m / 2);
+ if (0 == (i % (1 * 1024 * _1m))) {
+ std::cout << "allocU2 " << i / 1024 / 1024 << " mb of "
+ << capacity / 1024 / 1024 << std::endl;
+ }
+ }
+ ASSERT_EQ(0u, al2.debug_get_free());
+ {
+ // no space to allocate
+ uint64_t allocated4 = 0;
+ interval_vector_t a4;
+ al2.allocate_l2(0x1000, 0x1000, &allocated4, &a4);
+ ASSERT_EQ(a4.size(), 0u);
+ }
+ }
+
+ {
+ TestAllocatorLevel02 al2;
+ uint64_t capacity = 100 * 512 * 4096 + 127 * 4096;
+ al2.init(capacity, 0x1000);
+ std::cout << "Init L2 Unaligned2" << std::endl;
+ for (uint64_t i = 0; i < capacity; i += 0x1000) {
+ uint64_t allocated4 = 0;
+ interval_vector_t a4;
+ al2.allocate_l2(0x1000, 0x1000, &allocated4, &a4);
+ ASSERT_EQ(a4.size(), 1u);
+ ASSERT_EQ(allocated4, 0x1000u);
+ ASSERT_EQ(a4[0].offset, i);
+ ASSERT_EQ(a4[0].length, 0x1000u);
+ }
+ ASSERT_EQ(0u, al2.debug_get_free());
+ {
+ // no space to allocate
+ uint64_t allocated4 = 0;
+ interval_vector_t a4;
+ al2.allocate_l2(0x1000, 0x1000, &allocated4, &a4);
+ ASSERT_EQ(a4.size(), 0u);
+ }
+ }
+ {
+ TestAllocatorLevel02 al2;
+ uint64_t capacity = 3 * 4096;
+ al2.init(capacity, 0x1000);
+ std::cout << "Init L2 Unaligned2" << std::endl;
+ for (uint64_t i = 0; i < capacity; i += 0x1000) {
+ uint64_t allocated4 = 0;
+ interval_vector_t a4;
+ al2.allocate_l2(0x1000, 0x1000, &allocated4, &a4);
+ ASSERT_EQ(a4.size(), 1u);
+ ASSERT_EQ(allocated4, 0x1000u);
+ ASSERT_EQ(a4[0].offset, i);
+ ASSERT_EQ(a4[0].length, 0x1000u);
+ }
+ ASSERT_EQ(0u, al2.debug_get_free());
+ {
+ // no space to allocate
+ uint64_t allocated4 = 0;
+ interval_vector_t a4;
+ al2.allocate_l2(0x1000, 0x1000, &allocated4, &a4);
+ ASSERT_EQ(a4.size(), 0u);
+ }
+ }
+
+ std::cout << "Done L2 Unaligned" << std::endl;
+}
+
+TEST(TestAllocatorLevel01, test_l2_contiguous_alignment)
+{
+ {
+ TestAllocatorLevel02 al2;
+ uint64_t num_l2_entries = 3;
+ uint64_t capacity = num_l2_entries * 256 * 512 * 4096; // 3x512 MB
+ uint64_t num_chunks = capacity / 4096;
+ al2.init(capacity, 4096);
+ std::cout << "Init L2 cont aligned" << std::endl;
+
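+ // collect_stats() histograms free extents by size: each key is
+ // cbits(extent length in alloc-unit chunks) - 1, i.e. the power-of-two
+ // bin, and each value is the number of free extents in that bin. A fresh
+ // allocator holds one extent spanning all num_chunks chunks, as the
+ // assertions below show.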
+ std::map<size_t, size_t> bins_overall;
+ al2.collect_stats(bins_overall);
+ ASSERT_EQ(bins_overall.size(), 1u);
+// std::cout<<bins_overall.begin()->first << std::endl;
+ ASSERT_EQ(bins_overall[cbits(num_chunks) - 1], 1u);
+
+ for (uint64_t i = 0; i < capacity / 2; i += _1m) {
+ uint64_t allocated4 = 0;
+ interval_vector_t a4;
+ al2.allocate_l2(_1m, _1m, &allocated4, &a4);
+ ASSERT_EQ(a4.size(), 1u);
+ ASSERT_EQ(allocated4, _1m);
+ ASSERT_EQ(a4[0].offset, i);
+ ASSERT_EQ(a4[0].length, _1m);
+ }
+ ASSERT_EQ(capacity / 2, al2.debug_get_free());
+
+ bins_overall.clear();
+ al2.collect_stats(bins_overall);
+ ASSERT_EQ(bins_overall.size(), 1u);
+ ASSERT_EQ(bins_overall[cbits(num_chunks / 2) - 1], 1u);
+
+ {
+ // Original free space disposition (start chunk, count):
+ // <NC/2, NC/2>
+ size_t to_release = 2 * _1m + 0x1000;
+ // release 2M + 4K at the beginning
+ interval_vector_t r;
+ r.emplace_back(0, to_release);
+ al2.free_l2(r);
+ bins_overall.clear();
+ al2.collect_stats(bins_overall);
+ ASSERT_EQ(bins_overall.size(), 2u);
+ ASSERT_EQ(bins_overall[cbits(to_release / 0x1000) - 1], 1u);
+ ASSERT_EQ(bins_overall[cbits(num_chunks / 2) - 1], 1u);
+ }
+ {
+ // Original free space disposition (start chunk, count):
+ // <0, 513>, <NC / 2, NC / 2>
+ // allocate 4K within the deallocated range
+ uint64_t allocated4 = 0;
+ interval_vector_t a4;
+ al2.allocate_l2(0x1000, 0x1000, &allocated4, &a4);
+ ASSERT_EQ(a4.size(), 1u);
+ ASSERT_EQ(allocated4, 0x1000u);
+ ASSERT_EQ(a4[0].offset, 0u);
+ ASSERT_EQ(a4[0].length, 0x1000u);
+ bins_overall.clear();
+ al2.collect_stats(bins_overall);
+ ASSERT_EQ(bins_overall.size(), 2u);
+ ASSERT_EQ(bins_overall[cbits(2 * _1m / 0x1000) - 1], 1u);
+ ASSERT_EQ(bins_overall[cbits(num_chunks / 2) - 1], 1u);
+ }
+ {
+ // Original free space disposition (start chunk, count):
+ // <1, 512>, <NC / 2, NC / 2>
+ // allocate 1M - should go to offset 4096
+ uint64_t allocated4 = 0;
+ interval_vector_t a4;
+ al2.allocate_l2(_1m, _1m, &allocated4, &a4);
+ ASSERT_EQ(a4.size(), 1u);
+ ASSERT_EQ(allocated4, _1m);
+ ASSERT_EQ(a4[0].offset, 4096);
+ ASSERT_EQ(a4[0].length, _1m);
+ bins_overall.clear();
+ al2.collect_stats(bins_overall);
+ ASSERT_EQ(bins_overall.size(), 2u);
+ ASSERT_EQ(bins_overall[cbits(_1m / 0x1000) - 1], 1u);
+ ASSERT_EQ(bins_overall[cbits(num_chunks / 2) - 1], 1u);
+ }
+ {
+ // Original free space disposition (start chunk, count):
+ // <257, 256>, <NC / 2, NC / 2>
+ // and allocate yet another 8K within the deallocated range
+ uint64_t allocated4 = 0;
+ interval_vector_t a4;
+ al2.allocate_l2(0x2000, 0x1000, &allocated4, &a4);
+ ASSERT_EQ(a4.size(), 1u);
+ ASSERT_EQ(allocated4, 0x2000u);
+ ASSERT_EQ(a4[0].offset, _1m + 0x1000u);
+ ASSERT_EQ(a4[0].length, 0x2000u);
+ bins_overall.clear();
+ al2.collect_stats(bins_overall);
+ ASSERT_EQ(bins_overall.size(), 2u);
+ ASSERT_EQ(bins_overall[cbits((_1m - 0x2000) / 0x1000) - 1], 1u);
+ ASSERT_EQ(bins_overall[cbits(num_chunks / 2) - 1], 1u);
+ }
+ {
+ // Original free space disposition (start chunk, count):
+ // <259, 254>, <NC / 2, NC / 2>
+ // release 4K~1M
+ interval_vector_t r;
+ r.emplace_back(0x1000, _1m);
+ al2.free_l2(r);
+ bins_overall.clear();
+ al2.collect_stats(bins_overall);
+ ASSERT_EQ(bins_overall.size(), 3u);
+ //ASSERT_EQ(bins_overall[cbits((2 * _1m - 0x3000) / 0x1000) - 1], 1u);
+ ASSERT_EQ(bins_overall[cbits(_1m / 0x1000) - 1], 1u);
+ ASSERT_EQ(bins_overall[cbits((_1m - 0x2000) / 0x1000) - 1], 1u);
+ ASSERT_EQ(bins_overall[cbits(num_chunks / 2) - 1], 1u);
+ }
+ {
+ // Original free space disposition (start chunk, count):
+ // <1, 257>, <259, 254>, <NC / 2, NC / 2>
+ // allocate 3M - should go to the first 1M chunk and @capacity/2
+ uint64_t allocated4 = 0;
+ interval_vector_t a4;
+ al2.allocate_l2(3 * _1m, _1m, &allocated4, &a4);
+ ASSERT_EQ(a4.size(), 2u);
+ ASSERT_EQ(allocated4, 3 * _1m);
+ ASSERT_EQ(a4[0].offset, 0x1000);
+ ASSERT_EQ(a4[0].length, _1m);
+ ASSERT_EQ(a4[1].offset, capacity / 2);
+ ASSERT_EQ(a4[1].length, 2 * _1m);
+ bins_overall.clear();
+ al2.collect_stats(bins_overall);
+ ASSERT_EQ(bins_overall.size(), 2u);
+ ASSERT_EQ(bins_overall[cbits((_1m - 0x2000) / 0x1000) - 1], 1u);
+ ASSERT_EQ(bins_overall[cbits((num_chunks - 512) / 2) - 1], 1u);
+ }
+ {
+ // Original free space disposition (start chunk, count):
+ // <259, 254>, <NC / 2 - 512, NC / 2 - 512>
+ // release allocated 1M in the first meg chunk except
+ // the first 4K chunk
+ interval_vector_t r;
+ r.emplace_back(0x1000, _1m);
+ al2.free_l2(r);
+ bins_overall.clear();
+ al2.collect_stats(bins_overall);
+ ASSERT_EQ(bins_overall.size(), 3u);
+ ASSERT_EQ(bins_overall[cbits(_1m / 0x1000) - 1], 1u);
+ ASSERT_EQ(bins_overall[cbits((_1m - 0x2000) / 0x1000) - 1], 1u);
+ ASSERT_EQ(bins_overall[cbits((num_chunks - 512) / 2) - 1], 1u);
+ }
+ {
+ // Original free space disposition (start chunk, count):
+ // <1, 256>, <259, 254>, <NC / 2 - 512, NC / 2 - 512>
+ // release 2M @(capacity / 2)
+ interval_vector_t r;
+ r.emplace_back(capacity / 2, 2 * _1m);
+ al2.free_l2(r);
+ bins_overall.clear();
+ al2.collect_stats(bins_overall);
+ ASSERT_EQ(bins_overall.size(), 3u);
+ ASSERT_EQ(bins_overall[cbits(_1m / 0x1000) - 1], 1u);
+ ASSERT_EQ(bins_overall[cbits((_1m - 0x2000) / 0x1000) - 1], 1u);
+ ASSERT_EQ(bins_overall[cbits((num_chunks) / 2) - 1], 1u);
+ }
+ {
+ // Original free space disposition (start chunk, count):
+ // <1, 256>, <259, 254>, <NC / 2, NC / 2>
+ // allocate 4x512K - should go to the second halves of
+ // the first and second 1M chunks and @(capacity / 2)
+ uint64_t allocated4 = 0;
+ interval_vector_t a4;
+ al2.allocate_l2(2 * _1m, _1m / 2, &allocated4, &a4);
+ ASSERT_EQ(a4.size(), 3u);
+ ASSERT_EQ(allocated4, 2 * _1m);
+ ASSERT_EQ(a4[1].offset, 0x1000);
+ ASSERT_EQ(a4[1].length, _1m);
+ ASSERT_EQ(a4[0].offset, _1m + 0x3000);
+ ASSERT_EQ(a4[0].length, _1m / 2);
+ ASSERT_EQ(a4[2].offset, capacity / 2);
+ ASSERT_EQ(a4[2].length, _1m / 2);
+
+ bins_overall.clear();
+ al2.collect_stats(bins_overall);
+ ASSERT_EQ(bins_overall.size(), 2u);
+ ASSERT_EQ(bins_overall[cbits((_1m - 0x2000 - 0x80000) / 0x1000) - 1], 1u);
+ ASSERT_EQ(bins_overall[cbits((num_chunks - 256) / 2) - 1], 1u);
+
+ }
+ {
+ // Original free space disposition (start chunk, count):
+ // <387, 126>, <NC / 2 + 128, NC / 2 - 128>
+ // cleanup first 1536K except the last 4K chunk
+ interval_vector_t r;
+ r.emplace_back(0, _1m + _1m / 2 - 0x1000);
+ al2.free_l2(r);
+ bins_overall.clear();
+ al2.collect_stats(bins_overall);
+
+ ASSERT_EQ(bins_overall.size(), 3u);
+ ASSERT_EQ(bins_overall[cbits((_1m + _1m / 2 - 0x1000) / 0x1000) - 1], 1u);
+ ASSERT_EQ(bins_overall[cbits((_1m - 0x2000 - 0x80000) / 0x1000) - 1], 1u);
+ ASSERT_EQ(bins_overall[cbits((num_chunks - 256) / 2) - 1], 1u);
+ }
+ {
+ // Original free space disposition (start chunk, count):
+ // <0, 383> <387, 126>, <NC / 2 + 128, NC / 2 - 128>
+ // release 512K @(capacity / 2)
+ interval_vector_t r;
+ r.emplace_back(capacity / 2, _1m / 2);
+ al2.free_l2(r);
+ bins_overall.clear();
+ al2.collect_stats(bins_overall);
+
+ ASSERT_EQ(bins_overall.size(), 3u);
+ ASSERT_EQ(bins_overall[cbits((_1m + _1m / 2 - 0x1000) / 0x1000) - 1], 1u);
+ ASSERT_EQ(bins_overall[cbits((_1m - 0x2000 - 0x80000) / 0x1000) - 1], 1u);
+ ASSERT_EQ(bins_overall[cbits(num_chunks / 2) - 1], 1u);
+ }
+ {
+ // Original free space disposition (start chunk, count):
+ // <0, 383> <387, 126>, <NC / 2, NC / 2>
+ // allocate 132M (= 33792 * 4096) using 4M granularity; should go to (capacity / 2)
+ uint64_t allocated4 = 0;
+ interval_vector_t a4;
+ al2.allocate_l2(132 * _1m, 4 * _1m , &allocated4, &a4);
+ ASSERT_EQ(a4.size(), 1u);
+ ASSERT_EQ(a4[0].offset, capacity / 2);
+ ASSERT_EQ(a4[0].length, 132 * _1m);
+
+ bins_overall.clear();
+ al2.collect_stats(bins_overall);
+ ASSERT_EQ(bins_overall.size(), 3u);
+ ASSERT_EQ(bins_overall[cbits((_1m + _1m / 2 - 0x1000) / 0x1000) - 1], 1u);
+ ASSERT_EQ(bins_overall[cbits((_1m - 0x2000 - 0x80000) / 0x1000) - 1], 1u);
+ ASSERT_EQ(bins_overall[cbits(num_chunks / 2 - 33792) - 1], 1u);
+ }
+ {
+ // Original free space disposition (start chunk, count):
+ // <0, 383> <387, 126>, <NC / 2 + 33792, NC / 2 - 33792>
+ // cleanup remaining 4*4K chunks in the first 2M
+ interval_vector_t r;
+ r.emplace_back(383 * 4096, 4 * 0x1000);
+ al2.free_l2(r);
+ bins_overall.clear();
+ al2.collect_stats(bins_overall);
+
+ ASSERT_EQ(bins_overall.size(), 2u);
+ ASSERT_EQ(bins_overall[cbits((2 * _1m + 0x1000) / 0x1000) - 1], 1u);
+ ASSERT_EQ(bins_overall[cbits(num_chunks / 2 - 33792) - 1], 1u);
+ }
+ {
+ // Original free space disposition (start chunk, count):
+ // <0, 513>, <NC / 2 + 33792, NC / 2 - 33792>
+ // release 132M @(capacity / 2)
+ interval_vector_t r;
+ r.emplace_back(capacity / 2, 132 * _1m);
+ al2.free_l2(r);
+ bins_overall.clear();
+ al2.collect_stats(bins_overall);
+ ASSERT_EQ(bins_overall.size(), 2u);
+ ASSERT_EQ(bins_overall[cbits((2 * _1m + 0x1000) / 0x1000) - 1], 1u);
+ ASSERT_EQ(bins_overall[cbits(num_chunks / 2) - 1], 1u);
+ }
+ {
+ // Original free space disposition (start chunk, count):
+ // <0, 513>, <NC / 2, NC / 2>
+    // allocate 132M using 2M granularity; should go to the first chunk and
+    // to (capacity / 2)
+ uint64_t allocated4 = 0;
+ interval_vector_t a4;
+ al2.allocate_l2(132 * _1m, 2 * _1m , &allocated4, &a4);
+ ASSERT_EQ(a4.size(), 2u);
+ ASSERT_EQ(a4[0].offset, 0u);
+ ASSERT_EQ(a4[0].length, 2 * _1m);
+ ASSERT_EQ(a4[1].offset, capacity / 2);
+ ASSERT_EQ(a4[1].length, 130 * _1m);
+
+ bins_overall.clear();
+ al2.collect_stats(bins_overall);
+
+ ASSERT_EQ(bins_overall.size(), 2u);
+ ASSERT_EQ(bins_overall[cbits(0)], 1u);
+ ASSERT_EQ(bins_overall[cbits(num_chunks / 2 - 33792) - 1], 1u);
+ }
+ {
+ // Original free space disposition (start chunk, count):
+ // <512, 1>, <NC / 2 + 33792, NC / 2 - 33792>
+ // release 130M @(capacity / 2)
+ interval_vector_t r;
+ r.emplace_back(capacity / 2, 132 * _1m);
+ al2.free_l2(r);
+ bins_overall.clear();
+ al2.collect_stats(bins_overall);
+
+ ASSERT_EQ(bins_overall.size(), 2u);
+ ASSERT_EQ(bins_overall[cbits(0)], 1u);
+ ASSERT_EQ(bins_overall[cbits(num_chunks / 2) - 1], 1u);
+ }
+ {
+ // Original free space disposition (start chunk, count):
+ // <512,1>, <NC / 2, NC / 2>
+ // release 4K~16K
+ // release 28K~32K
+ // release 68K~24K
+ interval_vector_t r;
+ r.emplace_back(0x1000, 0x4000);
+ r.emplace_back(0x7000, 0x8000);
+ r.emplace_back(0x11000, 0x6000);
+ al2.free_l2(r);
+
+ bins_overall.clear();
+ al2.collect_stats(bins_overall);
+
+ ASSERT_EQ(bins_overall.size(), 4u);
+ ASSERT_EQ(bins_overall[cbits(0)], 1u);
+    ASSERT_EQ(bins_overall[cbits(0x4000 / 0x1000) - 1], 2u); // accounts for both 0x4000 & 0x6000
+ ASSERT_EQ(bins_overall[cbits(0x8000 / 0x1000) - 1], 1u);
+ ASSERT_EQ(bins_overall[cbits(num_chunks / 2) - 1], 1u);
+ }
+ {
+ // Original free space disposition (start chunk, count):
+ // <1, 4>, <7, 8>, <17, 6> <512,1>, <NC / 2, NC / 2>
+ // allocate 80K using 16K granularity
+ uint64_t allocated4 = 0;
+ interval_vector_t a4;
+ al2.allocate_l2(0x14000, 0x4000, &allocated4, &a4);
+
+    ASSERT_EQ(a4.size(), 4u);
+ ASSERT_EQ(a4[1].offset, 0x1000u);
+ ASSERT_EQ(a4[1].length, 0x4000u);
+ ASSERT_EQ(a4[0].offset, 0x7000u);
+ ASSERT_EQ(a4[0].length, 0x8000u);
+ ASSERT_EQ(a4[2].offset, 0x11000u);
+ ASSERT_EQ(a4[2].length, 0x4000u);
+ ASSERT_EQ(a4[3].offset, capacity / 2);
+ ASSERT_EQ(a4[3].length, 0x4000u);
+
+ bins_overall.clear();
+ al2.collect_stats(bins_overall);
+
+ ASSERT_EQ(bins_overall.size(), 3u);
+ ASSERT_EQ(bins_overall[cbits(0)], 1u);
+ ASSERT_EQ(bins_overall[cbits(0x2000 / 0x1000) - 1], 1u);
+ ASSERT_EQ(bins_overall[cbits(num_chunks / 2 - 1) - 1], 1u);
+ }
+ {
+ // Original free space disposition (start chunk, count):
+ // <21, 2> <512,1>, <NC / 2 + 1, NC / 2 - 1>
+ }
+ }
+ std::cout << "Done L2 cont aligned" << std::endl;
+}
+
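+// A note on the bins_overall checks above (a reading of the convention the
+// expected values assume, not a documented spec): collect_stats() appears to
+// bucket every contiguous free run by the bit width of its length expressed
+// in min-size chunks, i.e. bin index = cbits(run_length_in_chunks) - 1.
+// For example, a 1M run of 256 x 4K chunks lands in bin cbits(256) - 1 = 8,
+// while a lone 4K chunk lands in bin cbits(1) - 1 = 0.
+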
+TEST(TestAllocatorLevel01, test_4G_alloc_bug)
+{
+ {
+ TestAllocatorLevel02 al2;
+ uint64_t capacity = 0x8000 * _1m; // = 32GB
+ al2.init(capacity, 0x10000);
+ std::cout << "Init L2 cont aligned" << std::endl;
+
+ uint64_t allocated4 = 0;
+ interval_vector_t a4;
+ al2.allocate_l2(_1m, _1m, &allocated4, &a4);
+ ASSERT_EQ(a4.size(), 1u); // the bug caused no allocations here
+ ASSERT_EQ(allocated4, _1m);
+ ASSERT_EQ(a4[0].offset, 0u);
+ ASSERT_EQ(a4[0].length, _1m);
+ }
+}
+
+TEST(TestAllocatorLevel01, test_4G_alloc_bug2)
+{
+ {
+ TestAllocatorLevel02 al2;
+ uint64_t capacity = 0x8000 * _1m; // = 32GB
+ al2.init(capacity, 0x10000);
+
+ for (uint64_t i = 0; i < capacity; i += _1m) {
+ uint64_t allocated4 = 0;
+ interval_vector_t a4;
+ al2.allocate_l2(_1m, _1m, &allocated4, &a4);
+ ASSERT_EQ(a4.size(), 1u);
+ ASSERT_EQ(allocated4, _1m);
+ ASSERT_EQ(a4[0].offset, i);
+ ASSERT_EQ(a4[0].length, _1m);
+ }
+ ASSERT_EQ(0u , al2.debug_get_free());
+
+ interval_vector_t r;
+ r.emplace_back(0x5fec30000, 0x13d0000);
+ r.emplace_back(0x628000000, 0x80000000);
+ r.emplace_back(0x6a8000000, 0x80000000);
+ r.emplace_back(0x728100000, 0x70000);
+ al2.free_l2(r);
+
+ std::map<size_t, size_t> bins_overall;
+ al2.collect_stats(bins_overall);
+
+ uint64_t allocated4 = 0;
+ interval_vector_t a4;
+ al2.allocate_l2(0x3e000000, _1m, &allocated4, &a4);
+ ASSERT_EQ(a4.size(), 2u);
+ ASSERT_EQ(allocated4, 0x3e000000u);
+ ASSERT_EQ(a4[0].offset, 0x5fec30000u);
+ ASSERT_EQ(a4[0].length, 0x1300000u);
+ ASSERT_EQ(a4[1].offset, 0x628000000u);
+ ASSERT_EQ(a4[1].length, 0x3cd00000u);
+ }
+}
+
+TEST(TestAllocatorLevel01, test_4G_alloc_bug3)
+{
+ {
+ TestAllocatorLevel02 al2;
+ uint64_t capacity = 0x8000 * _1m; // = 32GB
+ al2.init(capacity, 0x10000);
+ std::cout << "Init L2 cont aligned" << std::endl;
+
+ uint64_t allocated4 = 0;
+ interval_vector_t a4;
+ al2.allocate_l2(4096ull * _1m, _1m, &allocated4, &a4);
+ ASSERT_EQ(a4.size(), 2u); // allocator has to split into 2 allocations
+ ASSERT_EQ(allocated4, 4096ull * _1m);
+ ASSERT_EQ(a4[0].offset, 0u);
+ ASSERT_EQ(a4[0].length, 2048ull * _1m);
+ ASSERT_EQ(a4[1].offset, 2048ull * _1m);
+ ASSERT_EQ(a4[1].length, 2048ull * _1m);
+ }
+}
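+// (The split above is presumably forced by an internal cap on a single
+// returned extent: 4096M does not fit the allocator's per-interval length
+// accounting, so the request comes back as two 2048M extents. This is an
+// inference from the test's expectations, not a documented guarantee.)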
+
+TEST(TestAllocatorLevel01, test_claim_free_l2)
+{
+ TestAllocatorLevel02 al2;
+  uint64_t num_l2_entries = 64; // *512
+ uint64_t capacity = num_l2_entries * 256 * 512 * 4096;
+ al2.init(capacity, 0x1000);
+ std::cout << "Init L2" << std::endl;
+
+ uint64_t max_available = 0x20000;
+ al2.mark_allocated(max_available, capacity - max_available);
+
+ uint64_t allocated1 = 0;
+ interval_vector_t a1;
+ al2.allocate_l2(0x2000, 0x2000, &allocated1, &a1);
+ ASSERT_EQ(allocated1, 0x2000u);
+ ASSERT_EQ(a1[0].offset, 0u);
+ ASSERT_EQ(a1[0].length, 0x2000u);
+
+ uint64_t allocated2 = 0;
+ interval_vector_t a2;
+ al2.allocate_l2(0x2000, 0x2000, &allocated2, &a2);
+ ASSERT_EQ(allocated2, 0x2000u);
+ ASSERT_EQ(a2[0].offset, 0x2000u);
+ ASSERT_EQ(a2[0].length, 0x2000u);
+
+ uint64_t allocated3 = 0;
+ interval_vector_t a3;
+ al2.allocate_l2(0x3000, 0x3000, &allocated3, &a3);
+ ASSERT_EQ(allocated3, 0x3000u);
+ ASSERT_EQ(a3[0].offset, 0x4000u);
+ ASSERT_EQ(a3[0].length, 0x3000u);
+
+ al2.free_l2(a1);
+ al2.free_l2(a3);
+ ASSERT_EQ(max_available - 0x2000, al2.debug_get_free());
+
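+  // Expected claim semantics, as exercised below (a reading of the test
+  // rather than a contract): claim_free_to_right(off) grabs every free chunk
+  // from off rightwards up to the first allocated chunk (or the end of
+  // space), marks it allocated and returns the amount claimed;
+  // claim_free_to_left(off) is the mirror image, scanning left from off.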
+ auto claimed = al2.claim_free_to_right(0x4000);
+ ASSERT_EQ(max_available - 0x4000u, claimed);
+ ASSERT_EQ(0x2000, al2.debug_get_free());
+
+ claimed = al2.claim_free_to_right(0x4000);
+ ASSERT_EQ(0, claimed);
+ ASSERT_EQ(0x2000, al2.debug_get_free());
+
+ claimed = al2.claim_free_to_left(0x2000);
+ ASSERT_EQ(0x2000u, claimed);
+ ASSERT_EQ(0, al2.debug_get_free());
+
+ claimed = al2.claim_free_to_left(0x2000);
+ ASSERT_EQ(0, claimed);
+ ASSERT_EQ(0, al2.debug_get_free());
+
+
+ al2.mark_free(0x3000, 0x4000);
+ ASSERT_EQ(0x4000, al2.debug_get_free());
+
+ claimed = al2.claim_free_to_right(0x7000);
+ ASSERT_EQ(0, claimed);
+ ASSERT_EQ(0x4000, al2.debug_get_free());
+
+ claimed = al2.claim_free_to_right(0x6000);
+ ASSERT_EQ(0x1000, claimed);
+ ASSERT_EQ(0x3000, al2.debug_get_free());
+
+ claimed = al2.claim_free_to_right(0x6000);
+ ASSERT_EQ(0, claimed);
+ ASSERT_EQ(0x3000, al2.debug_get_free());
+
+ claimed = al2.claim_free_to_left(0x3000);
+ ASSERT_EQ(0u, claimed);
+ ASSERT_EQ(0x3000, al2.debug_get_free());
+
+ claimed = al2.claim_free_to_left(0x4000);
+ ASSERT_EQ(0x1000, claimed);
+ ASSERT_EQ(0x2000, al2.debug_get_free());
+
+ // claiming on the right boundary
+ claimed = al2.claim_free_to_right(capacity);
+ ASSERT_EQ(0x0, claimed);
+ ASSERT_EQ(0x2000, al2.debug_get_free());
+
+ // extend allocator space up to 64M
+ auto max_available2 = 64 * 1024 * 1024;
+ al2.mark_free(max_available, max_available2 - max_available);
+ ASSERT_EQ(max_available2 - max_available + 0x2000, al2.debug_get_free());
+
+ // pin some allocations
+ al2.mark_allocated(0x400000 + 0x2000, 1000);
+ al2.mark_allocated(0x400000 + 0x5000, 1000);
+ al2.mark_allocated(0x400000 + 0x20000, 1000);
+ ASSERT_EQ(max_available2 - max_available - 0x1000, al2.debug_get_free());
+
+ claimed = al2.claim_free_to_left(0x403000);
+ ASSERT_EQ(0x0, claimed);
+
+ claimed = al2.claim_free_to_left(0x404000);
+ ASSERT_EQ(0x1000, claimed);
+ ASSERT_EQ(max_available2 - max_available - 0x2000, al2.debug_get_free());
+
+ claimed = al2.claim_free_to_left(max_available);
+ ASSERT_EQ(0, claimed);
+
+ claimed = al2.claim_free_to_left(0x400000);
+ ASSERT_EQ(0x3e0000, claimed);
+ ASSERT_EQ(max_available2 - max_available - 0x3e2000, al2.get_available());
+ ASSERT_EQ(max_available2 - max_available - 0x3e2000, al2.debug_get_free());
+
+ claimed = al2.claim_free_to_right(0x407000);
+ ASSERT_EQ(0x19000, claimed);
+ ASSERT_EQ(max_available2 - max_available - 0x3e2000 - 0x19000,
+ al2.get_available());
+ ASSERT_EQ(max_available2 - max_available - 0x3e2000 - 0x19000,
+ al2.debug_get_free());
+
+ claimed = al2.claim_free_to_right(0x407000);
+ ASSERT_EQ(0, claimed);
+
+ claimed = al2.claim_free_to_right(0x430000);
+ ASSERT_EQ(max_available2 - 0x430000, claimed);
+ ASSERT_EQ(0x15000,
+ al2.get_available());
+ ASSERT_EQ(0x15000,
+ al2.debug_get_free());
+}
diff --git a/src/test/objectstore/hybrid_allocator_test.cc b/src/test/objectstore/hybrid_allocator_test.cc
new file mode 100755
index 000000000..e43d28b28
--- /dev/null
+++ b/src/test/objectstore/hybrid_allocator_test.cc
@@ -0,0 +1,231 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <iostream>
+#include <gtest/gtest.h>
+
+#include "os/bluestore/HybridAllocator.h"
+
+class TestHybridAllocator : public HybridAllocator {
+public:
+ TestHybridAllocator(CephContext* cct,
+ int64_t device_size,
+ int64_t _block_size,
+ uint64_t max_entries,
+ const std::string& name) :
+ HybridAllocator(cct, device_size, _block_size,
+ max_entries,
+ name) {
+ }
+
+ uint64_t get_bmap_free() {
+ return get_bmap() ? get_bmap()->get_free() : 0;
+ }
+ uint64_t get_avl_free() {
+ return AvlAllocator::get_free();
+ }
+};
+
+const uint64_t _1m = 1024 * 1024;
+const uint64_t _4m = 4 * 1024 * 1024;
+
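+// The tests below rely on the hybrid design: free ranges live in the AVL
+// (range-tree) part until its memory budget is exhausted, then spill over
+// into a bitmap. Passing 4 * sizeof(range_seg_t) as the budget above caps
+// the tree at roughly four range segments, which is why the fifth distinct
+// range is expected to land in the bitmap. (An inference from the asserts
+// in this file, not a documented constant.)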
+TEST(HybridAllocator, basic)
+{
+ {
+ uint64_t block_size = 0x1000;
+ uint64_t capacity = 0x10000 * _1m; // = 64GB
+ TestHybridAllocator ha(g_ceph_context, capacity, block_size,
+ 4 * sizeof(range_seg_t), "test_hybrid_allocator");
+
+ ASSERT_EQ(0, ha.get_free());
+ ASSERT_EQ(0, ha.get_avl_free());
+ ASSERT_EQ(0, ha.get_bmap_free());
+
+ ha.init_add_free(0, _4m);
+ ASSERT_EQ(_4m, ha.get_free());
+ ASSERT_EQ(_4m, ha.get_avl_free());
+ ASSERT_EQ(0, ha.get_bmap_free());
+
+ ha.init_add_free(2 * _4m, _4m);
+ ASSERT_EQ(_4m * 2, ha.get_free());
+ ASSERT_EQ(_4m * 2, ha.get_avl_free());
+ ASSERT_EQ(0, ha.get_bmap_free());
+
+ ha.init_add_free(100 * _4m, _4m);
+ ha.init_add_free(102 * _4m, _4m);
+
+ ASSERT_EQ(_4m * 4, ha.get_free());
+ ASSERT_EQ(_4m * 4, ha.get_avl_free());
+ ASSERT_EQ(0, ha.get_bmap_free());
+
+ // next allocs will go to bitmap
+ ha.init_add_free(4 * _4m, _4m);
+ ASSERT_EQ(_4m * 5, ha.get_free());
+ ASSERT_EQ(_4m * 4, ha.get_avl_free());
+ ASSERT_EQ(_4m * 1, ha.get_bmap_free());
+
+ ha.init_add_free(6 * _4m, _4m);
+ ASSERT_EQ(_4m * 6, ha.get_free());
+ ASSERT_EQ(_4m * 4, ha.get_avl_free());
+ ASSERT_EQ(_4m * 2, ha.get_bmap_free());
+
+ // so we have 6x4M chunks, 4 chunks at AVL and 2 at bitmap
+
+ ha.init_rm_free(_1m, _1m); // take 1M from AVL
+ ASSERT_EQ(_1m * 23, ha.get_free());
+ ASSERT_EQ(_1m * 14, ha.get_avl_free());
+ ASSERT_EQ(_1m * 9, ha.get_bmap_free());
+
+ ha.init_rm_free(6 * _4m + _1m, _1m); // take 1M from bmap
+ ASSERT_EQ(_1m * 22, ha.get_free());
+ ASSERT_EQ(_1m * 14, ha.get_avl_free());
+ ASSERT_EQ(_1m * 8, ha.get_bmap_free());
+
+ // so we have at avl: 2M~2M, 8M~4M, 400M~4M , 408M~4M
+ // and at bmap: 0~1M, 16M~1M, 18M~2M, 24~4M
+
+ PExtentVector extents;
+ // allocate 4K, to be served from bitmap
+ EXPECT_EQ(block_size, ha.allocate(block_size, block_size,
+ 0, (int64_t)0, &extents));
+ ASSERT_EQ(1, extents.size());
+ ASSERT_EQ(0, extents[0].offset);
+
+ ASSERT_EQ(_1m * 14, ha.get_avl_free());
+ ASSERT_EQ(_1m * 8 - block_size, ha.get_bmap_free());
+
+ interval_set<uint64_t> release_set;
+ // release 4K, to be returned to bitmap
+ release_set.insert(extents[0].offset, extents[0].length);
+ ha.release(release_set);
+
+ ASSERT_EQ(_1m * 14, ha.get_avl_free());
+ ASSERT_EQ(_1m * 8, ha.get_bmap_free());
+ extents.clear();
+ release_set.clear();
+
+ // again we have at avl: 2M~2M, 8M~4M, 400M~4M , 408M~4M
+ // and at bmap: 0~1M, 16M~1M, 18M~2M, 24~4M
+
+ // add 12M~3M which will go to avl
+ ha.init_add_free(3 * _4m, 3 * _1m);
+ ASSERT_EQ(_1m * 17, ha.get_avl_free());
+ ASSERT_EQ(_1m * 8, ha.get_bmap_free());
+
+
+ // add 15M~4K which will be appended to existing slot
+ ha.init_add_free(15 * _1m, 0x1000);
+ ASSERT_EQ(_1m * 17 + 0x1000, ha.get_avl_free());
+ ASSERT_EQ(_1m * 8, ha.get_bmap_free());
+
+
+ // again we have at avl: 2M~2M, 8M~(7M+4K), 400M~4M , 408M~4M
+ // and at bmap: 0~1M, 16M~1M, 18M~2M, 24~4M
+
+ //some removals from bmap
+ ha.init_rm_free(28 * _1m - 0x1000, 0x1000);
+ ASSERT_EQ(_1m * 17 + 0x1000, ha.get_avl_free());
+ ASSERT_EQ(_1m * 8 - 0x1000, ha.get_bmap_free());
+
+ ha.init_rm_free(24 * _1m + 0x1000, 0x1000);
+ ASSERT_EQ(_1m * 17 + 0x1000, ha.get_avl_free());
+ ASSERT_EQ(_1m * 8 - 0x2000, ha.get_bmap_free());
+
+ ha.init_rm_free(24 * _1m + 0x1000, _4m - 0x2000);
+ ASSERT_EQ(_1m * 17 + 0x1000, ha.get_avl_free());
+ ASSERT_EQ(_1m * 4, ha.get_bmap_free());
+
+ //4K removal from avl
+ ha.init_rm_free(15 * _1m, 0x1000);
+ ASSERT_EQ(_1m * 17, ha.get_avl_free());
+ ASSERT_EQ(_1m * 4, ha.get_bmap_free());
+
+ //remove highest 4Ms from avl
+ ha.init_rm_free(_1m * 400, _4m);
+ ha.init_rm_free(_1m * 408, _4m);
+ ASSERT_EQ(_1m * 9, ha.get_avl_free());
+ ASSERT_EQ(_1m * 4, ha.get_bmap_free());
+
+ // we have at avl: 2M~2M, 8M~7M
+ // and at bmap: 0~1M, 16M~1M, 18M~2M
+
+ // this will be merged with neighbors from bmap and go to avl
+ ha.init_add_free(17 * _1m, _1m);
+ ASSERT_EQ(_1m * 1, ha.get_bmap_free());
+ ASSERT_EQ(_1m * 13, ha.get_avl_free());
+
+ // we have at avl: 2M~2M, 8M~7M, 16M~4M
+ // and at bmap: 0~1M
+
+ // and now do some cutoffs from 0~1M span
+
+ //cut off 4K from bmap
+ ha.init_rm_free(0 * _1m, 0x1000);
+ ASSERT_EQ(_1m * 13, ha.get_avl_free());
+ ASSERT_EQ(_1m * 1 - 0x1000, ha.get_bmap_free());
+
+ //cut off 1M-4K from bmap
+ ha.init_rm_free(0 * _1m + 0x1000, _1m - 0x1000);
+ ASSERT_EQ(_1m * 13, ha.get_avl_free());
+ ASSERT_EQ(0, ha.get_bmap_free());
+
+ //cut off 512K avl
+ ha.init_rm_free(17 * _1m + 0x1000, _1m / 2);
+ ASSERT_EQ(_1m * 13 - _1m / 2, ha.get_avl_free());
+ ASSERT_EQ(0, ha.get_bmap_free());
+
+ //cut off the rest from avl
+ ha.init_rm_free(17 * _1m + 0x1000 + _1m / 2, _1m / 2);
+ ASSERT_EQ(_1m * 12, ha.get_avl_free());
+ ASSERT_EQ(0, ha.get_bmap_free());
+ }
+
+ {
+ uint64_t block_size = 0x1000;
+ uint64_t capacity = 0x10000 * _1m; // = 64GB
+ TestHybridAllocator ha(g_ceph_context, capacity, block_size,
+ 4 * sizeof(range_seg_t), "test_hybrid_allocator");
+
+ ha.init_add_free(_1m, _1m);
+ ha.init_add_free(_1m * 3, _1m);
+ ha.init_add_free(_1m * 5, _1m);
+ ha.init_add_free(0x4000, 0x1000);
+
+ ASSERT_EQ(_1m * 3 + 0x1000, ha.get_free());
+ ASSERT_EQ(_1m * 3 + 0x1000, ha.get_avl_free());
+ ASSERT_EQ(0, ha.get_bmap_free());
+
+    // This will substitute chunk 0x4000~1000.
+    // Since the new chunk is inserted into AvlAllocator::range_tree
+    // immediately before the 0x4000~1000 chunk, care must be taken to
+    // order operations properly and not to use an already-disposed iterator.
+ ha.init_add_free(0, 0x2000);
+
+ ASSERT_EQ(_1m * 3 + 0x3000, ha.get_free());
+ ASSERT_EQ(_1m * 3 + 0x2000, ha.get_avl_free());
+ ASSERT_EQ(0x1000, ha.get_bmap_free());
+ }
+}
+
+TEST(HybridAllocator, fragmentation)
+{
+ {
+ uint64_t block_size = 0x1000;
+ uint64_t capacity = 0x1000 * 0x1000; // = 16M
+ TestHybridAllocator ha(g_ceph_context, capacity, block_size,
+ 4 * sizeof(range_seg_t), "test_hybrid_allocator");
+
+ ha.init_add_free(0, 0x2000);
+ ha.init_add_free(0x4000, 0x2000);
+ ha.init_add_free(0x8000, 0x2000);
+ ha.init_add_free(0xc000, 0x1000);
+
+ ASSERT_EQ(0.5, ha.get_fragmentation());
+
+    // this will go to bmap with fragmentation = 1
+ ha.init_add_free(0x10000, 0x1000);
+
+ // which results in the following total fragmentation
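+    // (assumed weighting: 7 of the 8 free 4K units sit in the AVL part at
+    // fragmentation 0.5 and 1 unit sits in the bitmap at 1.0, so the blend
+    // is 0.5 * 7/8 + 1.0 * 1/8)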
+ ASSERT_EQ(0.5 * 7 / 8 + 1.0 / 8, ha.get_fragmentation());
+ }
+}
diff --git a/src/test/objectstore/run_seed_to.sh b/src/test/objectstore/run_seed_to.sh
new file mode 100755
index 000000000..5a624a5d4
--- /dev/null
+++ b/src/test/objectstore/run_seed_to.sh
@@ -0,0 +1,293 @@
+#!/usr/bin/env bash
+# vim: ts=8 sw=2 smarttab
+#
+# run_seed_to.sh - Run ceph_test_filestore_idempotent_sequence up until an
+# injection point, generating a sequence of operations based on a
+# provided seed.
+#
+# We also perform three additional tests, focused on assessing whether
+# replaying a larger chunk of the journal affects the expected store
+# behavior. These tests will be performed by increasing the store's
+# journal sync interval to a very large value, allowing the store to
+# finish execution before the first sync (unless the store runs for
+# over 10 hours, case on which the interval variables must be changed
+# to an appropriate value). Unless the '--no-journal-test' option is
+# specified, we will run the 3 following scenarios:
+#
+# 1) journal sync'ing for both stores is as good as disabled
+# (we call it '00', for store naming purposes)
+# 2) journal sync'ing for store A is as good as disabled
+# (we call it '01', for store naming purposes)
+# 3) journal sync'ing for store B is as good as disabled
+# (we call it '10', for store naming purposes)
+#
+# All log files are named accordingly (e.g., a.00.fail,
+# a.10.recover, or b.01.clean).
+#
+# By default, the test will not exit on error, although it will show the
+# fail message. It is defined this way so that we run the whole battery of
+# tests and obtain as many mismatches as possible in one go. We may force
+# the test to exit on error by specifying the '--exit-on-error' option.
+#
+#
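+# Example invocation (hypothetical seed and kill-at values):
+#
+#   ./run_seed_to.sh -c 4 -o 10 --exit-on-error 12345 7
+#
+# runs the op sequence generated by seed 12345 on store A, killing it at
+# injection point 7, then replays store B up to the last op A completed
+# and diffs the two stores.
+#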
+set -e
+
+test_opts=""
+
+usage() {
+ echo "usage: $1 [options..] <seed> <kill-at>"
+ echo
+ echo "options:"
+ echo " -c, --colls <VAL> # of collections"
+ echo " -o, --objs <VAL> # of objects"
+ echo " -b, --btrfs <VAL> seq number for btrfs stores"
+ echo " --no-journal-test don't perform journal replay tests"
+ echo " -e, --exit-on-error exit with 1 on error"
+ echo " -v, --valgrind run commands through valgrind"
+ echo
+ echo "env vars:"
+ echo " OPTS_STORE additional opts for both stores"
+ echo " OPTS_STORE_A additional opts for store A"
+ echo " OPTS_STORE_B additional opts for store B"
+ echo
+}
+
+echo $0 $*
+
+die_on_missing_arg() {
+ if [[ "$2" == "" ]]; then
+ echo "$1: missing required parameter"
+ exit 1
+ fi
+}
+
+
+required_args=2
+obtained_args=0
+
+seed=""
+killat=""
+on_btrfs=0
+on_btrfs_seq=0
+journal_test=1
+min_sync_interval="36000" # ten hours, yes.
+max_sync_interval="36001"
+exit_on_error=0
+v=""
+
+do_rm() {
+ if [[ $on_btrfs -eq 0 ]]; then
+ rm -fr $*
+ fi
+}
+
+set_arg() {
+ if [[ $1 -eq 1 ]]; then
+ seed=$2
+ elif [[ $1 -eq 2 ]]; then
+ killat=$2
+ else
+ echo "error: unknown purpose for '$2'"
+ usage $0
+ exit 1
+ fi
+}
+
+while [[ $# -gt 0 ]];
+do
+ case "$1" in
+ -c | --colls)
+ die_on_missing_arg "$1" "$2"
+ test_opts="$test_opts --test-num-colls $2"
+ shift 2
+ ;;
+ -o | --objs)
+ die_on_missing_arg "$1" "$2"
+ test_opts="$test_opts --test-num-objs $2"
+ shift 2
+ ;;
+ -h | --help)
+ usage $0 ;
+ exit 0
+ ;;
+ -b | --btrfs)
+ die_on_missing_arg "$1" "$2"
+ on_btrfs=1
+ on_btrfs_seq=$2
+ shift 2
+ ;;
+ --no-journal-test)
+ journal_test=0
+ shift
+ ;;
+ -e | --exit-on-error)
+ exit_on_error=1
+ shift
+ ;;
+ -v | --valgrind)
+ v="valgrind --leak-check=full"
+ shift
+ ;;
+ --)
+ shift
+ break
+ ;;
+ -*)
+ echo "$1: unknown option" >&2
+ usage $0
+ exit 1
+ ;;
+ *)
+ obtained_args=$(($obtained_args+1))
+ set_arg $obtained_args $1
+ shift
+ ;;
+ esac
+done
+
+if [[ $obtained_args -ne $required_args ]]; then
+ echo "error: missing argument"
+ usage $0 ;
+ exit 1
+fi
+
+if [[ "$OPTS_STORE" != "" ]]; then
+ test_opts="$test_opts $OPTS_STORE"
+fi
+
+test_opts_a="$test_opts"
+test_opts_b="$test_opts"
+
+if [[ "$OPTS_STORE_A" != "" ]]; then
+ test_opts_a="$test_opts_a $OPTS_STORE_A"
+fi
+if [[ "$OPTS_STORE_B" != "" ]]; then
+ test_opts_b="$test_opts_b $OPTS_STORE_B"
+fi
+
+echo seed $seed
+echo kill at $killat
+
+# run forever, until $killat...
+to=1000000000
+
+#
+# store names
+#
+# We need these for two reasons:
+# 1) if we are running the tests on a btrfs volume, then we need to use
+# a seq number for each run. Being on btrfs means we will fail when
+#    removing the store's directories, and it's far simpler to just
+#    specify different store names such as 'a.$seq' or 'b.$seq'.
+#
+# 2) unless the '--no-journal-test' option is specified, we will run
+# three additional tests for each store, and we will reuse the same
+# command for each one of the runs, but varying the store's name and
+# arguments.
+#
+store_a="a"
+store_b="b"
+
+if [[ $on_btrfs -eq 1 ]]; then
+ store_a="$store_a.$on_btrfs_seq"
+ store_b="$store_b.$on_btrfs_seq"
+fi
+
+total_runs=1
+
+if [[ $journal_test -eq 1 ]]; then
+ total_runs=$(($total_runs + 3))
+fi
+
+num_runs=0
+
+opt_min_sync="--filestore-min-sync-interval $min_sync_interval"
+opt_max_sync="--filestore-max-sync-interval $max_sync_interval"
+
+ret=0
+
+while [[ $num_runs -lt $total_runs ]];
+do
+ tmp_name_a=$store_a
+ tmp_name_b=$store_b
+ tmp_opts_a=$test_opts_a
+ tmp_opts_b=$test_opts_b
+
+ #
+ # We have already tested whether there are diffs when both journals
+ # are properly working. Now let's try on three other scenarios:
+  # 1) journal sync'ing for both stores is as good as disabled
+ # (we call it '00')
+ # 2) journal sync'ing for store A is as good as disabled
+ # (we call it '01')
+ # 3) journal sync'ing for store B is as good as disabled
+ # (we call it '10')
+ #
+ if [[ $num_runs -gt 0 && $journal_test -eq 1 ]]; then
+ echo "run #$num_runs"
+ case $num_runs in
+ 1)
+ tmp_name_a="$tmp_name_a.00"
+ tmp_name_b="$tmp_name_b.00"
+ tmp_opts_a="$tmp_opts_a $opt_min_sync $opt_max_sync"
+ tmp_opts_b="$tmp_opts_b $opt_min_sync $opt_max_sync"
+ ;;
+ 2)
+ tmp_name_a="$tmp_name_a.01"
+ tmp_name_b="$tmp_name_b.01"
+ tmp_opts_a="$tmp_opts_a $opt_min_sync $opt_max_sync"
+ ;;
+ 3)
+ tmp_name_a="$tmp_name_a.10"
+ tmp_name_b="$tmp_name_b.10"
+ tmp_opts_b="$tmp_opts_b $opt_min_sync $opt_max_sync"
+ ;;
+ esac
+ fi
+
+ do_rm $tmp_name_a $tmp_name_a.fail $tmp_name_a.recover
+ $v ceph_test_filestore_idempotent_sequence run-sequence-to $to \
+ $tmp_name_a $tmp_name_a/journal \
+ --test-seed $seed --osd-journal-size 100 \
+ --filestore-kill-at $killat $tmp_opts_a \
+ --log-file $tmp_name_a.fail --debug-filestore 20 --no-log-to-stderr || true
+
+ stop_at=`ceph_test_filestore_idempotent_sequence get-last-op \
+ $tmp_name_a $tmp_name_a/journal \
+ --log-file $tmp_name_a.recover \
+ --debug-filestore 20 --debug-journal 20 --no-log-to-stderr`
+
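+  # sanity check: expr only yields 0 here if $stop_at is a plain number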
+ if [[ "`expr $stop_at - $stop_at 2>/dev/null`" != "0" ]]; then
+ echo "error: get-last-op returned '$stop_at'"
+ exit 1
+ fi
+
+ echo stopped at $stop_at
+
+ do_rm $tmp_name_b $tmp_name_b.clean
+ $v ceph_test_filestore_idempotent_sequence run-sequence-to \
+ $stop_at $tmp_name_b $tmp_name_b/journal \
+ --test-seed $seed --osd-journal-size 100 \
+ --log-file $tmp_name_b.clean --debug-filestore 20 --no-log-to-stderr \
+ $tmp_opts_b
+
+ if $v ceph_test_filestore_idempotent_sequence diff \
+ $tmp_name_a $tmp_name_a/journal $tmp_name_b $tmp_name_b/journal --no-log-to-stderr --log-file $tmp_name_a.diff.log --debug-filestore 20 ; then
+ echo OK
+ else
+ echo "FAIL"
+ echo " see:"
+ echo " $tmp_name_a.fail -- leading up to failure"
+ echo " $tmp_name_a.recover -- journal replay"
+ echo " $tmp_name_b.clean -- the clean reference"
+
+ ret=1
+ if [[ $exit_on_error -eq 1 ]]; then
+ exit 1
+ fi
+ fi
+
+ num_runs=$(($num_runs+1))
+done
+
+exit $ret
diff --git a/src/test/objectstore/run_seed_to_range.sh b/src/test/objectstore/run_seed_to_range.sh
new file mode 100755
index 000000000..7af2e59ce
--- /dev/null
+++ b/src/test/objectstore/run_seed_to_range.sh
@@ -0,0 +1,24 @@
+#!/bin/sh
+
+set -x
+set -e
+
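+# usage: run_seed_to_range.sh <seed> <from> <to> <evidence-dir>
+# Runs run_seed_to.sh for every kill-at point in [from, to]; on the first
+# failure, copies the working directory to <evidence-dir> (if it exists).
+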
+seed=$1
+from=$2
+to=$3
+dir=$4
+
+mydir=`dirname $0`
+
+for f in `seq $from $to`
+do
+ if ! $mydir/run_seed_to.sh -o 10 -e $seed $f; then
+ if [ -d "$dir" ]; then
+ echo copying evidence to $dir
+ cp -a . $dir
+ else
+ echo no dir provided for evidence disposal
+ fi
+ exit 1
+ fi
+done
diff --git a/src/test/objectstore/run_smr_bluestore_test.sh b/src/test/objectstore/run_smr_bluestore_test.sh
new file mode 100644
index 000000000..d689cf2c5
--- /dev/null
+++ b/src/test/objectstore/run_smr_bluestore_test.sh
@@ -0,0 +1,48 @@
+#!/bin/bash -ex
+
+# 1) run_smr_bluestore_test.sh
+# Setup smr device, run all tests
+
+# 2) run_smr_bluestore_test.sh --smr
+# Setup smr device but skip tests failing on smr
+
+
+before_creation=$(mktemp)
+lsscsi > $before_creation
+
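+# Emulate a host-managed SMR disk via the tcmu-runner user:zbc backstore.
+# Reading of the cfgstring below (assumed, per tcmu-runner's zbc handler):
+# model-HM = host-managed model, zsize-256 = 256M zones, conv-10 = 10
+# conventional zones, backed by the 20G file zbc0.raw; the LUN is then
+# exposed through a loopback target so it shows up as a local SCSI device.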
+echo "cd /backstores/user:zbc
+create name=zbc0 size=20G cfgstring=model-HM/zsize-256/conv-10@zbc0.raw
+/loopback create
+cd /loopback
+create naa.50014055e5f25aa0
+cd naa.50014055e5f25aa0/luns
+create /backstores/user:zbc/zbc0 0
+" | sudo targetcli
+
+sleep 1 # if we go too fast, the device does not show up
+after_creation=$(mktemp)
+lsscsi > $after_creation
+if [[ $(diff $before_creation $after_creation | wc -l ) != 2 ]]
+then
+ echo New zbc device not created
+ false
+fi
+
+function cleanup() {
+ echo "cd /loopback
+delete naa.50014055e5f25aa0
+cd /backstores/user:zbc
+delete zbc0" | sudo targetcli
+ sudo rm -f zbc0.raw
+ rm -f $before_creation $after_creation
+}
+trap cleanup EXIT
+
+DEV=$(diff $before_creation $after_creation |grep zbc |sed "s@.* /@/@")
+sudo chmod 666 $DEV
+# Need sudo
+# https://patchwork.kernel.org/project/linux-block/patch/20210811110505.29649-3-Niklas.Cassel@wdc.com/
+sudo ceph_test_objectstore \
+ --bluestore-block-path $DEV \
+ --gtest_filter=*/2 \
+ $*
diff --git a/src/test/objectstore/run_test_deferred.sh b/src/test/objectstore/run_test_deferred.sh
new file mode 100755
index 000000000..1be4d9104
--- /dev/null
+++ b/src/test/objectstore/run_test_deferred.sh
@@ -0,0 +1,52 @@
+#!/bin/bash
+
+
+if [[ ! (-x ./bin/unittest_deferred) || ! (-x ./bin/ceph-kvstore-tool) || ! (-x ./bin/ceph-bluestore-tool) ]]
+then
+ echo Test must be run from ceph build directory
+ echo with unittest_deferred, ceph-kvstore-tool and ceph-bluestore-tool compiled
+ exit 1
+fi
+
+# Create BlueStore, only main block device, 4K AU, forced deferred 4K, 64K AU for BlueFS
+
+# Create a file, zapchajdziura ("stopgap" in Polish), that is 0xe000 bytes in size.
+# Together with the BlueStore superblock at 0x0000 - 0x1000 and the BlueFS
+# superblock at 0x1000 - 0x2000, this fills the whole 0x00000 - 0x10000 range,
+# aligning nicely with the 64K BlueFS allocation requirements.
+
+# Prefill 10 objects Object-0 .. Object-9, each 64K. Sync to disk.
+# Do transactions like:
+# - fill Object-x+1 16 times at offsets 0x0000, 0x1000, ... 0xf000 with 8 bytes, triggering deferred writes
+# - fill Object-x with 64K data
+# Repeat for Object-0 to Object-8.
+
+# Right after getting notification on_complete for all 9 transactions, immediately exit(1).
+./bin/unittest_deferred --log-to-stderr=false
+
+# Now we should have a considerable amount of pending deferred writes.
+# They do refer disk regions that do not belong to any object.
+
+# Perform compaction on RocksDB
+# This initializes BlueFS, but does not replay deferred writes.
+# It jiggles RocksDB files around. CURRENT and MANIFEST are recreated, with some .sst files too.
+# The hope here is that newly created RocksDB files will occupy space that is free,
+# but targeted by pending deferred writes.
+./bin/ceph-kvstore-tool bluestore-kv bluestore.test_temp_dir/ compact --log-to-stderr=false
+
+# In this step we (hopefully) get RocksDB files overwritten.
+# We initialize BlueFS and RocksDB, there should be no problem here.
+# Then we apply deferred writes. Now some of RocksDB files might get corrupted.
+# It is very likely that this will not cause any problems, since CURRENT and MANIFEST are only read at bootup.
+./bin/ceph-bluestore-tool --path bluestore.test_temp_dir/ --command fsck --deep 1 --debug-bluestore=30/30 --debug-bdev=30/30 --log-file=log-bs-corrupts.txt --log-to-file --log-to-stderr=false
+
+# If we were lucky, this command now fails
+./bin/ceph-bluestore-tool --path bluestore.test_temp_dir/ --command fsck --deep 1 --debug-bluestore=30/30 --debug-bdev=30/30 --log-file=log-bs-crash.txt --log-to-file --log-to-stderr=false
+if [[ $? != 0 ]]
+then
+ echo "Deferred writes corruption successfully created !"
+else
+ echo "No deferred write problems detected."
+fi
+
+#cleanup
+rm -rf bluestore.test_temp_dir/
diff --git a/src/test/objectstore/store_test.cc b/src/test/objectstore/store_test.cc
new file mode 100644
index 000000000..9edfebd6b
--- /dev/null
+++ b/src/test/objectstore/store_test.cc
@@ -0,0 +1,10932 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <glob.h>
+#include <stdio.h>
+#include <string.h>
+#include <iostream>
+#include <memory>
+#include <time.h>
+#include <sys/mount.h>
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_int.hpp>
+#include <boost/random/binomial_distribution.hpp>
+#include <fmt/format.h>
+#include <gtest/gtest.h>
+
+#include "os/ObjectStore.h"
+#if defined(WITH_BLUESTORE)
+#include "os/bluestore/BlueStore.h"
+#include "os/bluestore/BlueFS.h"
+#endif
+#include "include/Context.h"
+#include "common/buffer_instrumentation.h"
+#include "common/ceph_argparse.h"
+#include "common/admin_socket.h"
+#include "global/global_init.h"
+#include "common/ceph_mutex.h"
+#include "common/Cond.h"
+#include "common/errno.h"
+#include "common/options.h" // for the size literals
+#include "common/pretty_binary.h"
+#include "include/stringify.h"
+#include "include/coredumpctl.h"
+#include "include/unordered_map.h"
+#include "os/kv.h"
+#include "store_test_fixture.h"
+
+
+using namespace std;
+using namespace std::placeholders;
+
+typedef boost::mt11213b gen_type;
+
+const uint64_t DEF_STORE_TEST_BLOCKDEV_SIZE = 10240000000;
+#define dout_context g_ceph_context
+
+bool smr = false;
+
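+// Compare two bufferlists; on mismatch, print the first and last differing
+// offsets plus hexdumps of both buffers to ease debugging.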
+static bool bl_eq(bufferlist& expected, bufferlist& actual)
+{
+ if (expected.contents_equal(actual))
+ return true;
+
+ unsigned first = 0;
+  if (expected.length() != actual.length()) {
+ cout << "--- buffer lengths mismatch " << std::hex
+ << "expected 0x" << expected.length() << " != actual 0x"
+ << actual.length() << std::dec << std::endl;
+ derr << "--- buffer lengths mismatch " << std::hex
+ << "expected 0x" << expected.length() << " != actual 0x"
+ << actual.length() << std::dec << dendl;
+ }
+ auto len = std::min(expected.length(), actual.length());
+  while (first < len && expected[first] == actual[first])
+ ++first;
+ unsigned last = len;
+ while (last > 0 && expected[last-1] == actual[last-1])
+ --last;
+  if (len > 0) {
+ cout << "--- buffer mismatch between offset 0x" << std::hex << first
+ << " and 0x" << last << ", total 0x" << len << std::dec
+ << std::endl;
+ derr << "--- buffer mismatch between offset 0x" << std::hex << first
+ << " and 0x" << last << ", total 0x" << len << std::dec
+ << dendl;
+ cout << "--- expected:\n";
+ expected.hexdump(cout);
+ cout << "--- actual:\n";
+ actual.hexdump(cout);
+ }
+ return false;
+}
+
+
+
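+// Helper that randomly routes a transaction either directly to the store or
+// through Transaction::append() into a fresh transaction first, so both the
+// plain and the appended encoding paths get exercised across the suite.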
+template <typename T>
+int queue_transaction(
+ T &store,
+ ObjectStore::CollectionHandle ch,
+ ObjectStore::Transaction &&t) {
+ if (rand() % 2) {
+ ObjectStore::Transaction t2;
+ t2.append(t);
+ return store->queue_transaction(ch, std::move(t2));
+ } else {
+ return store->queue_transaction(ch, std::move(t));
+ }
+}
+
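+// Similarly, randomly alternate between collection_list() and the legacy
+// listing path, unless the caller explicitly disables the legacy variant.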
+template <typename T>
+int collection_list(T &store, ObjectStore::CollectionHandle &c,
+ const ghobject_t& start, const ghobject_t& end, int max,
+ vector<ghobject_t> *ls, ghobject_t *pnext,
+ bool disable_legacy = false) {
+ if (disable_legacy || rand() % 2) {
+ return store->collection_list(c, start, end, max, ls, pnext);
+ } else {
+ return store->collection_list_legacy(c, start, end, max, ls, pnext);
+ }
+}
+
+bool sorted(const vector<ghobject_t> &in) {
+ ghobject_t start;
+ for (vector<ghobject_t>::const_iterator i = in.begin();
+ i != in.end();
+ ++i) {
+ if (start > *i) {
+ cout << start << " should follow " << *i << std::endl;
+ return false;
+ }
+ start = *i;
+ }
+ return true;
+}
+
+class StoreTest : public StoreTestFixture,
+ public ::testing::WithParamInterface<const char*> {
+public:
+ StoreTest()
+ : StoreTestFixture(GetParam())
+ {}
+ void doCompressionTest();
+ void doSyntheticTest(
+ int num_ops,
+ uint64_t max_obj, uint64_t max_wr, uint64_t align);
+};
+
+class StoreTestDeferredSetup : public StoreTest {
+ void SetUp() override {
+    // do nothing here: the store is mounted later via DeferredSetup(),
+    // after the test has applied its config overrides
+ }
+
+protected:
+ void DeferredSetup() {
+ StoreTest::SetUp();
+ }
+
+public:
+};
+
+
+class StoreTestSpecificAUSize : public StoreTestDeferredSetup {
+
+public:
+ typedef
+ std::function<void(
+ uint64_t num_ops,
+ uint64_t max_obj,
+ uint64_t max_wr,
+ uint64_t align)> MatrixTest;
+
+ void StartDeferred(size_t min_alloc_size) {
+ SetVal(g_conf(), "bluestore_min_alloc_size", stringify(min_alloc_size).c_str());
+ DeferredSetup();
+ }
+
+private:
+ // bluestore matrix testing
+ uint64_t max_write = 40 * 1024;
+ uint64_t max_size = 400 * 1024;
+ uint64_t alignment = 0;
+ uint64_t num_ops = 10000;
+
+protected:
+ string matrix_get(const char *k) {
+ if (string(k) == "max_write") {
+ return stringify(max_write);
+ } else if (string(k) == "max_size") {
+ return stringify(max_size);
+ } else if (string(k) == "alignment") {
+ return stringify(alignment);
+ } else if (string(k) == "num_ops") {
+ return stringify(num_ops);
+ } else {
+ char *buf;
+ g_conf().get_val(k, &buf, -1);
+ string v = buf;
+ free(buf);
+ return v;
+ }
+ }
+
+ void matrix_set(const char *k, const char *v) {
+ if (string(k) == "max_write") {
+ max_write = atoll(v);
+ } else if (string(k) == "max_size") {
+ max_size = atoll(v);
+ } else if (string(k) == "alignment") {
+ alignment = atoll(v);
+ } else if (string(k) == "num_ops") {
+ num_ops = atoll(v);
+ } else {
+ SetVal(g_conf(), k, v);
+ }
+ }
+
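+  // Recursively walks the cartesian product of all option rows: row i's
+  // chosen value is applied via matrix_set(), then recursion moves on to
+  // row i + 1; once past the last row, the accumulated combination is
+  // printed and the test callback runs. pos/num track the combination
+  // index purely for progress output.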
+ void do_matrix_choose(const char *matrix[][10],
+ int i, int pos, int num,
+ MatrixTest fn) {
+ if (matrix[i][0]) {
+ int count;
+ for (count = 0; matrix[i][count+1]; ++count) ;
+ for (int j = 1; matrix[i][j]; ++j) {
+ matrix_set(matrix[i][0], matrix[i][j]);
+ do_matrix_choose(matrix,
+ i + 1,
+ pos * count + j - 1,
+ num * count,
+ fn);
+ }
+ } else {
+ cout << "---------------------- " << (pos + 1) << " / " << num
+ << " ----------------------" << std::endl;
+ for (unsigned k=0; matrix[k][0]; ++k) {
+ cout << " " << matrix[k][0] << " = " << matrix_get(matrix[k][0])
+ << std::endl;
+ }
+ g_ceph_context->_conf.apply_changes(nullptr);
+ fn(num_ops, max_size, max_write, alignment);
+ }
+ }
+
+ void do_matrix(const char *matrix[][10],
+ MatrixTest fn) {
+
+ if (strcmp(matrix[0][0], "bluestore_min_alloc_size") == 0) {
+ int count;
+ for (count = 0; matrix[0][count+1]; ++count) ;
+ for (size_t j = 1; matrix[0][j]; ++j) {
+ if (j > 1) {
+ TearDown();
+ }
+ StartDeferred(strtoll(matrix[0][j], NULL, 10));
+ do_matrix_choose(matrix, 1, j - 1, count, fn);
+ }
+ } else {
+ StartDeferred(0);
+ do_matrix_choose(matrix, 0, 0, 1, fn);
+ }
+ }
+
+};
+
+class StoreTestOmapUpgrade : public StoreTestDeferredSetup {
+protected:
+ void StartDeferred() {
+ DeferredSetup();
+ }
+
+public:
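+  // Logistic map x -> r*x*(1-x) in its chaotic regime, used as a cheap,
+  // seedable and fully deterministic pseudo-random source so generated
+  // omap data can be regenerated bit-for-bit during verification.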
+ struct generator {
+ double r = 3.6;
+ double x = 0.5;
+ double operator()(){
+ double v = x;
+ x = r * x * (1 - x);
+ return v;
+ }
+ };
+
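+  // Deterministically produces the i-th of SUM names such that names come
+  // out in increasing lexicographic order for increasing i; make_omap_data
+  // and check_omap_data rely on this to recreate the exact same key set.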
+ std::string generate_monotonic_name(uint32_t SUM, uint32_t i, double r, double x)
+ {
+ generator gen{r, x};
+ //std::cout << "r=" << r << " x=" << x << std::endl;
+ std::string s;
+ while (SUM > 1) {
+ uint32_t lo = 0;
+ uint32_t hi = 1 + gen() * 10;
+ uint32_t start = ('z' - 'a' + 1 - hi) * gen();
+ while (hi - lo > 0) {
+ uint32_t mid = (lo + hi + 1 + (SUM&1)) / 2; // round up or down, depending on SUM
+ // std::cout << "SUM=" << SUM << " x=" << gen.x << std::endl;
+ uint32_t mid_val = gen() * (SUM - 1) + 1;
+ // LEFT = lo .. mid - 1
+ // RIGHT = mid .. hi
+ // std::cout << "lo=" << lo << " hi=" << hi << " mid=" << mid
+ // << " SUM=" << SUM << " i=" << i << " x=" << gen.x << " mid_val=" << mid_val << std::endl;
+ if (i < mid_val) {
+ hi = mid - 1;
+ SUM = mid_val;
+ } else {
+ lo = mid;
+ SUM = SUM - mid_val;
+ i = i - mid_val;
+ }
+ }
+ //std::cout << "lo=" << lo << " hi=" << hi
+ // << " SUM=" << SUM << " i=" << i << std::endl;
+
+ s.push_back('a' + lo + start); // to keep alphabetic order
+ uint32_t cnt = gen() * 8;
+ for (uint32_t j = 0; j < cnt; j++) {
+ s.push_back('a' + ('z' - 'a' + 1) * gen());
+ }
+ s.push_back('.');
+ }
+ return s;
+ }
+
+ std::string gen_string(size_t size, generator& gen) {
+ std::string s;
+ for (size_t i = 0; i < size; i++) {
+ s.push_back('a' + ('z' - 'a' + 1 ) * gen());
+ }
+ return s;
+ }
+
+ void make_omap_data(size_t object_count,
+ int64_t poolid,
+ coll_t cid) {
+ int r;
+ ObjectStore::CollectionHandle ch = store->open_collection(cid);
+ for (size_t o = 0; o < object_count; o++)
+ {
+ ObjectStore::Transaction t;
+ std::string oid = generate_monotonic_name(object_count, o, 3.71, 0.5);
+ ghobject_t hoid(hobject_t(oid, "", CEPH_NOSNAP, 0, poolid, ""));
+ t.touch(cid, hoid);
+ generator gen{3.85 + 0.1 * o / object_count, 1 - double(o) / object_count};
+
+ map<string, bufferlist> start_set;
+ size_t omap_count = 1 + gen() * 20;
+ bool do_omap_header = gen() > 0.5;
+ if (do_omap_header) {
+ bufferlist header;
+ header.append(gen_string(50, gen));
+ t.omap_setheader(cid, hoid, header);
+ }
+ for (size_t i = 0; i < omap_count; i++) {
+ std::string name = generate_monotonic_name(omap_count, i, 3.66 + 0.22 * o / object_count, 0.5);
+ bufferlist val;
+ val.append(gen_string(100, gen));
+ start_set.emplace(name, val);
+ }
+ t.omap_setkeys(cid, hoid, start_set);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ }
+
+ void check_omap_data(size_t object_count,
+ int64_t poolid,
+ coll_t cid) {
+ int r;
+ ObjectStore::CollectionHandle ch = store->open_collection(cid);
+
+ for (size_t o = 0; o < object_count; o++)
+ {
+ ObjectStore::Transaction t;
+ std::string oid = generate_monotonic_name(object_count, o, 3.71, 0.5);
+ ghobject_t hoid(hobject_t(oid, "", CEPH_NOSNAP, 0, poolid, ""));
+ generator gen{3.85 + 0.1 * o / object_count, 1 - double(o) / object_count};
+
+ bufferlist omap_header;
+ map<string, bufferlist> omap_set;
+ r = store->omap_get(ch, hoid, &omap_header, &omap_set);
+ ASSERT_EQ(r, 0);
+ size_t omap_count = 1 + gen() * 20;
+ bool do_omap_header = gen() > 0.5;
+ if (do_omap_header) {
+ std::string header_str = gen_string(50, gen);
+ ASSERT_EQ(header_str, omap_header.to_str());
+ }
+ auto it = omap_set.begin();
+ for (size_t i = 0; i < omap_count; i++) {
+ ASSERT_TRUE(it != omap_set.end());
+ std::string name = generate_monotonic_name(omap_count, i, 3.66 + 0.22 * o / object_count, 0.5);
+ std::string val_gen = gen_string(100, gen);
+ ASSERT_EQ(it->first, name);
+ ASSERT_EQ(it->second.to_str(), val_gen);
+ ++it;
+ }
+ }
+ }
+};
+
+TEST_P(StoreTest, collect_metadata) {
+ map<string,string> pm;
+ store->collect_metadata(&pm);
+ if (GetParam() == string("filestore")) {
+ ASSERT_NE(pm.count("filestore_backend"), 0u);
+ ASSERT_NE(pm.count("filestore_f_type"), 0u);
+ ASSERT_NE(pm.count("backend_filestore_partition_path"), 0u);
+ ASSERT_NE(pm.count("backend_filestore_dev_node"), 0u);
+ }
+}
+
+TEST_P(StoreTest, Trivial) {
+}
+
+TEST_P(StoreTest, TrivialRemount) {
+ int r = store->umount();
+ ASSERT_EQ(0, r);
+ r = store->mount();
+ ASSERT_EQ(0, r);
+}
+
+TEST_P(StoreTest, TrivialRemountFsck) {
+ if(string(GetParam()) != "bluestore")
+ return;
+ int r = store->umount();
+ ASSERT_EQ(0, r);
+ r = store->fsck(false);
+ ASSERT_EQ(0, r);
+ r = store->mount();
+ ASSERT_EQ(0, r);
+}
+
+TEST_P(StoreTest, SimpleRemount) {
+ coll_t cid;
+ ghobject_t hoid(hobject_t(sobject_t("Object 1", CEPH_NOSNAP)));
+ ghobject_t hoid2(hobject_t(sobject_t("Object 2", CEPH_NOSNAP)));
+ bufferlist bl;
+ bl.append("1234512345");
+ int r;
+ auto ch = store->create_new_collection(cid);
+ {
+ cerr << "create collection + write" << std::endl;
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ t.write(cid, hoid, 0, bl.length(), bl);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ ch.reset();
+ r = store->umount();
+ ASSERT_EQ(0, r);
+ r = store->mount();
+ ASSERT_EQ(0, r);
+ ch = store->open_collection(cid);
+ {
+ ObjectStore::Transaction t;
+ t.write(cid, hoid2, 0, bl.length(), bl);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+ ObjectStore::Transaction t;
+ t.remove(cid, hoid);
+ t.remove(cid, hoid2);
+ t.remove_collection(cid);
+ cerr << "remove collection" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ ch.reset();
+ r = store->umount();
+ ASSERT_EQ(0, r);
+ r = store->mount();
+ ASSERT_EQ(0, r);
+ ch = store->create_new_collection(cid);
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ bool exists = store->exists(ch, hoid);
+ ASSERT_TRUE(!exists);
+ }
+ {
+ ObjectStore::Transaction t;
+ t.remove_collection(cid);
+ cerr << "remove collection" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+}
+
+TEST_P(StoreTest, IORemount) {
+ coll_t cid;
+ bufferlist bl;
+ bl.append("1234512345");
+ int r;
+ auto ch = store->create_new_collection(cid);
+ {
+ cerr << "create collection + objects" << std::endl;
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ for (int n=1; n<=100; ++n) {
+ ghobject_t hoid(hobject_t(sobject_t("Object " + stringify(n), CEPH_NOSNAP)));
+ t.write(cid, hoid, 0, bl.length(), bl);
+ }
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ // overwrites
+ {
+ cout << "overwrites" << std::endl;
+ for (int n=1; n<=100; ++n) {
+ ObjectStore::Transaction t;
+ ghobject_t hoid(hobject_t(sobject_t("Object " + stringify(n), CEPH_NOSNAP)));
+ t.write(cid, hoid, 1, bl.length(), bl);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ }
+ ch.reset();
+ r = store->umount();
+ ASSERT_EQ(0, r);
+ r = store->mount();
+ ASSERT_EQ(0, r);
+ {
+ ObjectStore::Transaction t;
+ for (int n=1; n<=100; ++n) {
+ ghobject_t hoid(hobject_t(sobject_t("Object " + stringify(n), CEPH_NOSNAP)));
+ t.remove(cid, hoid);
+ }
+ t.remove_collection(cid);
+ auto ch = store->open_collection(cid);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+}
+
+TEST_P(StoreTest, UnprintableCharsName) {
+ coll_t cid;
+ string name = "funnychars_";
+ for (unsigned i = 0; i < 256; ++i) {
+ name.push_back(i);
+ }
+ ghobject_t oid(hobject_t(sobject_t(name, CEPH_NOSNAP)));
+ int r;
+ auto ch = store->create_new_collection(cid);
+ {
+ cerr << "create collection + object" << std::endl;
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ t.touch(cid, oid);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ ch.reset();
+ r = store->umount();
+ ASSERT_EQ(0, r);
+ r = store->mount();
+ ASSERT_EQ(0, r);
+ {
+ cout << "removing" << std::endl;
+ ObjectStore::Transaction t;
+ t.remove(cid, oid);
+ t.remove_collection(cid);
+ auto ch = store->open_collection(cid);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+}
+
+TEST_P(StoreTest, FiemapEmpty) {
+ coll_t cid;
+ int r = 0;
+ ghobject_t oid(hobject_t(sobject_t("fiemap_object", CEPH_NOSNAP)));
+ auto ch = store->create_new_collection(cid);
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ t.touch(cid, oid);
+ t.truncate(cid, oid, 100000);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+ bufferlist bl;
+ store->fiemap(ch, oid, 0, 100000, bl);
+ map<uint64_t,uint64_t> m, e;
+ auto p = bl.cbegin();
+ decode(m, p);
+ cout << " got " << m << std::endl;
+ e[0] = 100000;
+ EXPECT_TRUE(m == e || m.empty());
+ }
+ {
+ ObjectStore::Transaction t;
+ t.remove(cid, oid);
+ t.remove_collection(cid);
+ cerr << "remove collection" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+}
+
+TEST_P(StoreTest, FiemapHoles) {
+ const uint64_t MAX_EXTENTS = 4000;
+ const uint64_t SKIP_STEP = 65536;
+ coll_t cid;
+ int r = 0;
+ ghobject_t oid(hobject_t(sobject_t("fiemap_object", CEPH_NOSNAP)));
+ bufferlist bl;
+ bl.append("foo");
+ auto ch = store->create_new_collection(cid);
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ t.touch(cid, oid);
+ for (uint64_t i = 0; i < MAX_EXTENTS; i++)
+ t.write(cid, oid, SKIP_STEP * i, 3, bl);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+    // fiemap test from 0 to SKIP_STEP * (MAX_EXTENTS - 1) + 3
+ bufferlist bl;
+ store->fiemap(ch, oid, 0, SKIP_STEP * (MAX_EXTENTS - 1) + 3, bl);
+ map<uint64_t,uint64_t> m, e;
+ auto p = bl.cbegin();
+ decode(m, p);
+ cout << " got " << m << std::endl;
+ ASSERT_TRUE(!m.empty());
+ ASSERT_GE(m[0], 3u);
+ auto last = m.crbegin();
+ if (m.size() == 1) {
+ ASSERT_EQ(0u, last->first);
+ } else if (m.size() == MAX_EXTENTS) {
+ for (uint64_t i = 0; i < MAX_EXTENTS; i++) {
+ ASSERT_TRUE(m.count(SKIP_STEP * i));
+ }
+ }
+ ASSERT_GT(last->first + last->second, SKIP_STEP * (MAX_EXTENTS - 1));
+ }
+ {
+ // fiemap test from SKIP_STEP to SKIP_STEP * (MAX_EXTENTS - 2) + 3
+ bufferlist bl;
+ store->fiemap(ch, oid, SKIP_STEP, SKIP_STEP * (MAX_EXTENTS - 2) + 3, bl);
+ map<uint64_t,uint64_t> m, e;
+ auto p = bl.cbegin();
+ decode(m, p);
+ cout << " got " << m << std::endl;
+ ASSERT_TRUE(!m.empty());
+ // kstore always returns [0, object_size] regardless of offset and length
+ // FIXME: if fiemap logic in kstore is refined
+ if (string(GetParam()) != "kstore") {
+ ASSERT_GE(m[SKIP_STEP], 3u);
+ auto last = m.crbegin();
+ if (m.size() == 1) {
+ ASSERT_EQ(SKIP_STEP, last->first);
+ } else if (m.size() == MAX_EXTENTS - 2) {
+ for (uint64_t i = 1; i < MAX_EXTENTS - 1; i++) {
+ ASSERT_TRUE(m.count(SKIP_STEP*i));
+ }
+ }
+ ASSERT_GT(last->first + last->second, SKIP_STEP * (MAX_EXTENTS - 1));
+ }
+ }
+ {
+ ObjectStore::Transaction t;
+ t.remove(cid, oid);
+ t.remove_collection(cid);
+ cerr << "remove collection" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+}
+
+TEST_P(StoreTest, SimpleMetaColTest) {
+ coll_t cid;
+ int r = 0;
+ {
+ auto ch = store->create_new_collection(cid);
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ cerr << "create collection" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+ ObjectStore::Transaction t;
+ t.remove_collection(cid);
+ cerr << "remove collection" << std::endl;
+ auto ch = store->open_collection(cid);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+ auto ch = store->create_new_collection(cid);
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ cerr << "add collection" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+ ObjectStore::Transaction t;
+ t.remove_collection(cid);
+ cerr << "remove collection" << std::endl;
+ auto ch = store->open_collection(cid);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+}
+
+TEST_P(StoreTest, SimplePGColTest) {
+ coll_t cid(spg_t(pg_t(1,2), shard_id_t::NO_SHARD));
+ int r = 0;
+ {
+ ObjectStore::Transaction t;
+ auto ch = store->create_new_collection(cid);
+ t.create_collection(cid, 4);
+ cerr << "create collection" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+ ObjectStore::Transaction t;
+ t.remove_collection(cid);
+ cerr << "remove collection" << std::endl;
+ auto ch = store->open_collection(cid);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 4);
+ cerr << "add collection" << std::endl;
+ auto ch = store->create_new_collection(cid);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+ ObjectStore::Transaction t;
+ t.remove_collection(cid);
+ cerr << "remove collection" << std::endl;
+ auto ch = store->open_collection(cid);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+}
+
+TEST_P(StoreTest, SimpleColPreHashTest) {
+  // First we need to revert the merge threshold value to make sure
+  // the collection hint actually works
+ int merge_threshold = g_ceph_context->_conf->filestore_merge_threshold;
+ std::ostringstream oss;
+ if (merge_threshold > 0) {
+ oss << "-" << merge_threshold;
+ SetVal(g_conf(), "filestore_merge_threshold", oss.str().c_str());
+ }
+
+ uint32_t pg_num = 128;
+
+ boost::uniform_int<> pg_id_range(0, pg_num);
+ gen_type rng(time(NULL));
+ int pg_id = pg_id_range(rng);
+
+ int objs_per_folder = abs(merge_threshold) * 16 * g_ceph_context->_conf->filestore_split_multiple;
+ boost::uniform_int<> folders_range(5, 256);
+ uint64_t expected_num_objs = (uint64_t)objs_per_folder * (uint64_t)folders_range(rng);
+
+ coll_t cid(spg_t(pg_t(pg_id, 15), shard_id_t::NO_SHARD));
+ int r;
+ auto ch = store->create_new_collection(cid);
+ {
+ // Create a collection along with a hint
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 5);
+ cerr << "create collection" << std::endl;
+ bufferlist hint;
+ encode(pg_num, hint);
+ encode(expected_num_objs, hint);
+ t.collection_hint(cid, ObjectStore::Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS, hint);
+ cerr << "collection hint" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+ // Remove the collection
+ ObjectStore::Transaction t;
+ t.remove_collection(cid);
+ cerr << "remove collection" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+}
+
+TEST_P(StoreTest, SmallBlockWrites) {
+ int r;
+ coll_t cid;
+ auto ch = store->create_new_collection(cid);
+ ghobject_t hoid(hobject_t(sobject_t("foo", CEPH_NOSNAP)));
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ cerr << "Creating collection " << cid << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ bufferlist a;
+ bufferptr ap(0x1000);
+ memset(ap.c_str(), 'a', 0x1000);
+ a.append(ap);
+ bufferlist b;
+ bufferptr bp(0x1000);
+ memset(bp.c_str(), 'b', 0x1000);
+ b.append(bp);
+ bufferlist c;
+ bufferptr cp(0x1000);
+ memset(cp.c_str(), 'c', 0x1000);
+ c.append(cp);
+ bufferptr zp(0x1000);
+ zp.zero();
+ bufferlist z;
+ z.append(zp);
+ {
+ ObjectStore::Transaction t;
+ t.write(cid, hoid, 0, 0x1000, a);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+
+ bufferlist in, exp;
+ r = store->read(ch, hoid, 0, 0x4000, in);
+ ASSERT_EQ(0x1000, r);
+ exp.append(a);
+ ASSERT_TRUE(bl_eq(exp, in));
+ }
+ {
+ ObjectStore::Transaction t;
+ t.write(cid, hoid, 0x1000, 0x1000, b);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+
+ bufferlist in, exp;
+ r = store->read(ch, hoid, 0, 0x4000, in);
+ ASSERT_EQ(0x2000, r);
+ exp.append(a);
+ exp.append(b);
+ ASSERT_TRUE(bl_eq(exp, in));
+ }
+ {
+ ObjectStore::Transaction t;
+ t.write(cid, hoid, 0x3000, 0x1000, c);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+
+ bufferlist in, exp;
+ r = store->read(ch, hoid, 0, 0x4000, in);
+ ASSERT_EQ(0x4000, r);
+ exp.append(a);
+ exp.append(b);
+ exp.append(z);
+ exp.append(c);
+ ASSERT_TRUE(bl_eq(exp, in));
+ }
+ {
+ ObjectStore::Transaction t;
+ t.write(cid, hoid, 0x2000, 0x1000, a);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+
+ bufferlist in, exp;
+ r = store->read(ch, hoid, 0, 0x4000, in);
+ ASSERT_EQ(0x4000, r);
+ exp.append(a);
+ exp.append(b);
+ exp.append(a);
+ exp.append(c);
+ ASSERT_TRUE(bl_eq(exp, in));
+ }
+ {
+ ObjectStore::Transaction t;
+ t.write(cid, hoid, 0, 0x1000, c);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+ bufferlist in, exp;
+ r = store->read(ch, hoid, 0, 0x4000, in);
+ ASSERT_EQ(0x4000, r);
+ exp.append(c);
+ exp.append(b);
+ exp.append(a);
+ exp.append(c);
+ ASSERT_TRUE(bl_eq(exp, in));
+ }
+ {
+ ObjectStore::Transaction t;
+ t.remove(cid, hoid);
+ t.remove_collection(cid);
+ cerr << "Cleaning" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+}
+
+TEST_P(StoreTest, BufferCacheReadTest) {
+ int r;
+ coll_t cid;
+ ghobject_t hoid(hobject_t(sobject_t("Object 1", CEPH_NOSNAP)));
+ {
+ auto ch = store->open_collection(cid);
+ ASSERT_FALSE(ch);
+ }
+ auto ch = store->create_new_collection(cid);
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ cerr << "Creating collection " << cid << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+ bool exists = store->exists(ch, hoid);
+ ASSERT_TRUE(!exists);
+
+ ObjectStore::Transaction t;
+ t.touch(cid, hoid);
+ cerr << "Creating object " << hoid << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+
+ exists = store->exists(ch, hoid);
+ ASSERT_EQ(true, exists);
+ }
+ {
+ ObjectStore::Transaction t;
+ bufferlist bl, newdata;
+ bl.append("abcde");
+ t.write(cid, hoid, 0, 5, bl);
+ t.write(cid, hoid, 10, 5, bl);
+ cerr << "TwinWrite" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+
+ r = store->read(ch, hoid, 0, 15, newdata);
+ ASSERT_EQ(r, 15);
+ {
+ bufferlist expected;
+ expected.append(bl);
+ expected.append_zero(5);
+ expected.append(bl);
+ ASSERT_TRUE(bl_eq(expected, newdata));
+ }
+ }
+  // overwrite the same extents
+ {
+ ObjectStore::Transaction t;
+ bufferlist bl, newdata;
+ bl.append("edcba");
+ t.write(cid, hoid, 0, 5, bl);
+ t.write(cid, hoid, 10, 5, bl);
+ cerr << "TwinWrite" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+
+ r = store->read(ch, hoid, 0, 15, newdata);
+ ASSERT_EQ(r, 15);
+ {
+ bufferlist expected;
+ expected.append(bl);
+ expected.append_zero(5);
+ expected.append(bl);
+ ASSERT_TRUE(bl_eq(expected, newdata));
+ }
+ }
+  // additional write to an unused region of some blob
+ {
+ ObjectStore::Transaction t;
+ bufferlist bl2, newdata;
+ bl2.append("1234567890");
+
+ t.write(cid, hoid, 20, bl2.length(), bl2);
+ cerr << "Append" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+
+ r = store->read(ch, hoid, 0, 30, newdata);
+ ASSERT_EQ(r, 30);
+ {
+ bufferlist expected;
+ expected.append("edcba");
+ expected.append_zero(5);
+ expected.append("edcba");
+ expected.append_zero(5);
+ expected.append(bl2);
+
+ ASSERT_TRUE(bl_eq(expected, newdata));
+ }
+ }
+  // additional write to an unused region of some blob and a partial overwrite of existing extents
+ {
+ ObjectStore::Transaction t;
+ bufferlist bl, bl2, bl3, newdata;
+ bl.append("DCB");
+ bl2.append("1234567890");
+ bl3.append("BA");
+
+ t.write(cid, hoid, 30, bl2.length(), bl2);
+ t.write(cid, hoid, 1, bl.length(), bl);
+ t.write(cid, hoid, 13, bl3.length(), bl3);
+ cerr << "TripleWrite" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+
+ r = store->read(ch, hoid, 0, 40, newdata);
+ ASSERT_EQ(r, 40);
+ {
+ bufferlist expected;
+ expected.append("eDCBa");
+ expected.append_zero(5);
+ expected.append("edcBA");
+ expected.append_zero(5);
+ expected.append(bl2);
+ expected.append(bl2);
+
+ ASSERT_TRUE(bl_eq(expected, newdata));
+ }
+ }
+}
+
+void StoreTest::doCompressionTest()
+{
+ int r;
+ coll_t cid;
+ ghobject_t hoid(hobject_t(sobject_t("Object 1", CEPH_NOSNAP)));
+ {
+ auto ch = store->open_collection(cid);
+ ASSERT_FALSE(ch);
+ }
+ auto ch = store->create_new_collection(cid);
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ cerr << "Creating collection " << cid << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+ bool exists = store->exists(ch, hoid);
+ ASSERT_TRUE(!exists);
+
+ ObjectStore::Transaction t;
+ t.touch(cid, hoid);
+ cerr << "Creating object " << hoid << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+
+ exists = store->exists(ch, hoid);
+ ASSERT_EQ(true, exists);
+ }
+ std::string data;
+ data.resize(0x10000 * 4);
+ for (size_t i = 0; i < data.size(); i++)
+ data[i] = i / 256;
+ {
+ ObjectStore::Transaction t;
+ bufferlist bl, newdata;
+ bl.append(data);
+ t.write(cid, hoid, 0, bl.length(), bl);
+ cerr << "CompressibleData (4xAU) Write" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+
+ r = store->read(ch, hoid, 0, data.size(), newdata);
+
+ ASSERT_EQ(r, (int)data.size());
+ {
+ bufferlist expected;
+ expected.append(data);
+ ASSERT_TRUE(bl_eq(expected, newdata));
+ }
+ newdata.clear();
+ r = store->read(ch, hoid, 0, 711, newdata);
+ ASSERT_EQ(r, 711);
+ {
+ bufferlist expected;
+ expected.append(data.substr(0,711));
+ ASSERT_TRUE(bl_eq(expected, newdata));
+ }
+ newdata.clear();
+ r = store->read(ch, hoid, 0xf00f, data.size(), newdata);
+ ASSERT_EQ(r, int(data.size() - 0xf00f));
+ {
+ bufferlist expected;
+ expected.append(data.substr(0xf00f));
+ ASSERT_TRUE(bl_eq(expected, newdata));
+ }
+ {
+ struct store_statfs_t statfs;
+ int r = store->statfs(&statfs);
+ ASSERT_EQ(r, 0);
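+ // with compression forced, the compressed size and its allocation should not exceed the logical size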
+ ASSERT_EQ(statfs.data_stored, (unsigned)data.size());
+ ASSERT_LE(statfs.data_compressed, (unsigned)data.size());
+ ASSERT_EQ(statfs.data_compressed_original, (unsigned)data.size());
+ ASSERT_LE(statfs.data_compressed_allocated, (unsigned)data.size());
+ }
+ }
+ std::string data2;
+ data2.resize(0x10000 * 4 - 0x9000);
+ for (size_t i = 0; i < data2.size(); i++)
+ data2[i] = (i+1) / 256;
+ {
+ ObjectStore::Transaction t;
+ bufferlist bl, newdata;
+ bl.append(data2);
+ t.write(cid, hoid, 0x8000, bl.length(), bl);
+ cerr << "CompressibleData partial overwrite" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+
+ r = store->read(ch, hoid, 0, 0x10000, newdata);
+ ASSERT_EQ(r, (int)0x10000);
+ {
+ bufferlist expected;
+ expected.append(data.substr(0, 0x8000));
+ expected.append(data2.substr(0, 0x8000));
+ ASSERT_TRUE(bl_eq(expected, newdata));
+ }
+ newdata.clear();
+ r = store->read(ch, hoid, 0x9000, 711, newdata);
+ ASSERT_EQ(r, 711);
+ {
+ bufferlist expected;
+ expected.append(data2.substr(0x1000,711));
+ ASSERT_TRUE(bl_eq(expected, newdata));
+ }
+ newdata.clear();
+ r = store->read(ch, hoid, 0x0, 0x40000, newdata);
+ ASSERT_EQ(r, int(0x40000));
+ {
+ bufferlist expected;
+ expected.append(data.substr(0, 0x8000));
+ expected.append(data2.substr(0, 0x37000));
+ expected.append(data.substr(0x3f000, 0x1000));
+ ASSERT_TRUE(bl_eq(expected, newdata));
+ }
+ }
+ data2.resize(0x3f000);
+ for (size_t i = 0; i < data2.size(); i++)
+ data2[i] = (i+2) / 256;
+ {
+ ObjectStore::Transaction t;
+ bufferlist bl, newdata;
+ bl.append(data2);
+ t.write(cid, hoid, 0, bl.length(), bl);
+ cerr << "CompressibleData partial overwrite, two extents overlapped, single one to be removed" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+
+ r = store->read(ch, hoid, 0, 0x3e000 - 1, newdata);
+ ASSERT_EQ(r, (int)0x3e000 - 1);
+ {
+ bufferlist expected;
+ expected.append(data2.substr(0, 0x3e000 - 1));
+ ASSERT_TRUE(bl_eq(expected, newdata));
+ }
+ newdata.clear();
+ r = store->read(ch, hoid, 0x3e000-1, 0x2001, newdata);
+ ASSERT_EQ(r, 0x2001);
+ {
+ bufferlist expected;
+ expected.append(data2.substr(0x3e000-1, 0x1001));
+ expected.append(data.substr(0x3f000, 0x1000));
+ ASSERT_TRUE(bl_eq(expected, newdata));
+ }
+ newdata.clear();
+ r = store->read(ch, hoid, 0x0, 0x40000, newdata);
+ ASSERT_EQ(r, int(0x40000));
+ {
+ bufferlist expected;
+ expected.append(data2.substr(0, 0x3f000));
+ expected.append(data.substr(0x3f000, 0x1000));
+ ASSERT_TRUE(bl_eq(expected, newdata));
+ }
+ }
+ data.resize(0x1001);
+ for (size_t i = 0; i < data.size(); i++)
+ data[i] = (i+3) / 256;
+ {
+ ObjectStore::Transaction t;
+ bufferlist bl, newdata;
+ bl.append(data);
+ t.write(cid, hoid, 0x3f000-1, bl.length(), bl);
+ cerr << "Small chunk partial overwrite, two extents overlapped, single one to be removed" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+
+ r = store->read(ch, hoid, 0x3e000, 0x2000, newdata);
+ ASSERT_EQ(r, (int)0x2000);
+ {
+ bufferlist expected;
+ expected.append(data2.substr(0x3e000, 0x1000 - 1));
+ expected.append(data.substr(0, 0x1001));
+ ASSERT_TRUE(bl_eq(expected, newdata));
+ }
+ }
+ {
+ ObjectStore::Transaction t;
+ t.remove(cid, hoid);
+ cerr << "Cleaning object" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ //force fsck
+ ch.reset();
+ EXPECT_EQ(store->umount(), 0);
+ ASSERT_EQ(store->fsck(false), 0); // do fsck explicitly
+ EXPECT_EQ(store->mount(), 0);
+ ch = store->open_collection(cid);
+ auto settingsBookmark = BookmarkSettings();
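+ // raise the minimum compression blob size so the next 6*0x10000 write gets compressed as large blobs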
+ SetVal(g_conf(), "bluestore_compression_min_blob_size", "262144");
+ g_ceph_context->_conf.apply_changes(nullptr);
+ {
+ data.resize(0x10000*6);
+
+ for (size_t i = 0; i < data.size(); i++)
+ data[i] = i / 256;
+ ObjectStore::Transaction t;
+ bufferlist bl, newdata;
+ bl.append(data);
+ t.write(cid, hoid, 0, bl.length(), bl);
+ cerr << "CompressibleData large blob" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ //force fsck
+ ch.reset();
+ EXPECT_EQ(store->umount(), 0);
+ ASSERT_EQ(store->fsck(false), 0); // do fsck explicitly
+ EXPECT_EQ(store->mount(), 0);
+ ch = store->open_collection(cid);
+ {
+ ObjectStore::Transaction t;
+ t.remove(cid, hoid);
+ t.remove_collection(cid);
+ cerr << "Cleaning" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+}
+
+TEST_P(StoreTest, CompressionTest) {
+ if (string(GetParam()) != "bluestore")
+ return;
+ if (smr) {
+ cout << "TODO: need to adjust statfs check for smr" << std::endl;
+ return;
+ }
+
+ SetVal(g_conf(), "bluestore_compression_algorithm", "snappy");
+ SetVal(g_conf(), "bluestore_compression_mode", "force");
+ g_ceph_context->_conf.apply_changes(nullptr);
+ doCompressionTest();
+
+ SetVal(g_conf(), "bluestore_compression_algorithm", "zlib");
+ SetVal(g_conf(), "bluestore_compression_mode", "aggressive");
+ g_ceph_context->_conf.apply_changes(nullptr);
+ doCompressionTest();
+}
+
+TEST_P(StoreTest, SimpleObjectTest) {
+ int r;
+ coll_t cid;
+ ghobject_t hoid(hobject_t(sobject_t("Object 1", CEPH_NOSNAP)));
+ {
+ auto ch = store->open_collection(cid);
+ ASSERT_FALSE(ch);
+ }
+ auto ch = store->create_new_collection(cid);
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ cerr << "Creating collection " << cid << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+ bool exists = store->exists(ch, hoid);
+ ASSERT_TRUE(!exists);
+
+ ObjectStore::Transaction t;
+ t.touch(cid, hoid);
+ cerr << "Creating object " << hoid << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+
+ exists = store->exists(ch, hoid);
+ ASSERT_EQ(true, exists);
+ }
+ {
+ ObjectStore::Transaction t;
+ t.remove(cid, hoid);
+ t.touch(cid, hoid);
+ cerr << "Remove then create" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+ ObjectStore::Transaction t;
+ bufferlist bl, orig;
+ bl.append("abcde");
+ orig = bl;
+ t.remove(cid, hoid);
+ t.write(cid, hoid, 0, 5, bl);
+ cerr << "Remove then create" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+
+ bufferlist in;
+ r = store->read(ch, hoid, 0, 5, in);
+ ASSERT_EQ(5, r);
+ ASSERT_TRUE(bl_eq(orig, in));
+ }
+ {
+ ObjectStore::Transaction t;
+ bufferlist bl, exp;
+ bl.append("abcde");
+ exp = bl;
+ exp.append(bl);
+ t.write(cid, hoid, 5, 5, bl);
+ cerr << "Append" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+
+ bufferlist in;
+ r = store->read(ch, hoid, 0, 10, in);
+ ASSERT_EQ(10, r);
+ ASSERT_TRUE(bl_eq(exp, in));
+ }
+ {
+ ObjectStore::Transaction t;
+ bufferlist bl, exp;
+ bl.append("abcdeabcde");
+ exp = bl;
+ t.write(cid, hoid, 0, 10, bl);
+ cerr << "Full overwrite" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+
+ bufferlist in;
+ r = store->read(ch, hoid, 0, 10, in);
+ ASSERT_EQ(10, r);
+ ASSERT_TRUE(bl_eq(exp, in));
+ }
+ {
+ ObjectStore::Transaction t;
+ bufferlist bl;
+ bl.append("abcde");
+ t.write(cid, hoid, 3, 5, bl);
+ cerr << "Partial overwrite" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+
+ bufferlist in, exp;
+ exp.append("abcabcdede");
+ r = store->read(ch, hoid, 0, 10, in);
+ ASSERT_EQ(10, r);
+ in.hexdump(cout);
+ ASSERT_TRUE(bl_eq(exp, in));
+ }
+ {
+ {
+ ObjectStore::Transaction t;
+ bufferlist bl;
+ bl.append("fghij");
+ t.truncate(cid, hoid, 0);
+ t.write(cid, hoid, 5, 5, bl);
+ cerr << "Truncate + hole" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+ ObjectStore::Transaction t;
+ bufferlist bl;
+ bl.append("abcde");
+ t.write(cid, hoid, 0, 5, bl);
+ cerr << "Reverse fill-in" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+
+ bufferlist in, exp;
+ exp.append("abcdefghij");
+ r = store->read(ch, hoid, 0, 10, in);
+ ASSERT_EQ(10, r);
+ in.hexdump(cout);
+ ASSERT_TRUE(bl_eq(exp, in));
+ }
+ {
+ ObjectStore::Transaction t;
+ bufferlist bl;
+ bl.append("abcde01234012340123401234abcde01234012340123401234abcde01234012340123401234abcde01234012340123401234");
+ t.write(cid, hoid, 0, bl.length(), bl);
+ cerr << "larger overwrite" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+
+ bufferlist in;
+ r = store->read(ch, hoid, 0, bl.length(), in);
+ ASSERT_EQ((int)bl.length(), r);
+ in.hexdump(cout);
+ ASSERT_TRUE(bl_eq(bl, in));
+ }
+ {
+ bufferlist bl;
+ bl.append("abcde01234012340123401234abcde01234012340123401234abcde01234012340123401234abcde01234012340123401234");
+
+ //test: offset=len=0 means read all data
+ bufferlist in;
+ r = store->read(ch, hoid, 0, 0, in);
+ ASSERT_EQ((int)bl.length(), r);
+ in.hexdump(cout);
+ ASSERT_TRUE(bl_eq(bl, in));
+ }
+ {
+ //verifying unaligned csums
+ std::string s1("1"), s2(0x1000, '2'), s3("00");
+ {
+ ObjectStore::Transaction t;
+ bufferlist bl;
+ bl.append(s1);
+ bl.append(s2);
+ t.truncate(cid, hoid, 0);
+ t.write(cid, hoid, 0x1000-1, bl.length(), bl);
+ cerr << "Write unaligned csum, stage 1" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+
+ bufferlist in, exp1, exp2, exp3;
+ exp1.append(s1);
+ exp2.append(s2);
+ exp3.append(s3);
+ r = store->read(ch, hoid, 0x1000-1, 1, in);
+ ASSERT_EQ(1, r);
+ ASSERT_TRUE(bl_eq(exp1, in));
+ in.clear();
+ r = store->read(ch, hoid, 0x1000, 0x1000, in);
+ ASSERT_EQ(0x1000, r);
+ ASSERT_TRUE(bl_eq(exp2, in));
+
+ {
+ ObjectStore::Transaction t;
+ bufferlist bl;
+ bl.append(s3);
+ t.write(cid, hoid, 1, bl.length(), bl);
+ cerr << "Write unaligned csum, stage 2" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ in.clear();
+ r = store->read(ch, hoid, 1, 2, in);
+ ASSERT_EQ(2, r);
+ ASSERT_TRUE(bl_eq(exp3, in));
+ in.clear();
+ r = store->read(ch, hoid, 0x1000-1, 1, in);
+ ASSERT_EQ(1, r);
+ ASSERT_TRUE(bl_eq(exp1, in));
+ in.clear();
+ r = store->read(ch, hoid, 0x1000, 0x1000, in);
+ ASSERT_EQ(0x1000, r);
+ ASSERT_TRUE(bl_eq(exp2, in));
+
+ }
+
+ {
+ ObjectStore::Transaction t;
+ t.remove(cid, hoid);
+ t.remove_collection(cid);
+ cerr << "Cleaning" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+}
+
+#if defined(WITH_BLUESTORE)
+
+TEST_P(StoreTestSpecificAUSize, ReproBug41901Test) {
+ if (string(GetParam()) != "bluestore")
+ return;
+ if (smr) {
+ cout << "SKIP (smr)" << std::endl;
+ return;
+ }
+
+ SetVal(g_conf(), "bluestore_max_blob_size", "524288");
+ SetVal(g_conf(), "bluestore_debug_enforce_settings", "hdd");
+ g_conf().apply_changes(nullptr);
+ StartDeferred(65536);
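+ // 64KiB allocation units: the small writes below land in unused regions of already-allocated blobs,
+ // which is what the l_bluestore_write_small_unused counter tracks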
+
+ int r;
+ coll_t cid;
+ ghobject_t hoid(hobject_t(sobject_t("Object 1", CEPH_NOSNAP)));
+ const PerfCounters* logger = store->get_perf_counters();
+ auto ch = store->create_new_collection(cid);
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ cerr << "Creating collection " << cid << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+ bool exists = store->exists(ch, hoid);
+ ASSERT_TRUE(!exists);
+
+ ObjectStore::Transaction t;
+ t.touch(cid, hoid);
+ cerr << "Creating object " << hoid << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+
+ exists = store->exists(ch, hoid);
+ ASSERT_EQ(true, exists);
+ }
+ {
+ ObjectStore::Transaction t;
+ bufferlist bl, orig;
+ string s(4096, 'a');
+ bl.append(s);
+ t.write(cid, hoid, 0x11000, bl.length(), bl);
+ cerr << "write1" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+ ObjectStore::Transaction t;
+ bufferlist bl, orig;
+ string s(4096 * 3, 'a');
+ bl.append(s);
+ t.write(cid, hoid, 0x15000, bl.length(), bl);
+ cerr << "write2" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ ASSERT_EQ(logger->get(l_bluestore_write_small), 2u);
+ ASSERT_EQ(logger->get(l_bluestore_write_small_unused), 1u);
+
+ {
+ ObjectStore::Transaction t;
+ bufferlist bl, orig;
+ string s(4096 * 2, 'a');
+ bl.append(s);
+ t.write(cid, hoid, 0xe000, bl.length(), bl);
+ cerr << "write3" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ ASSERT_EQ(logger->get(l_bluestore_write_small), 3u);
+ ASSERT_EQ(logger->get(l_bluestore_write_small_unused), 2u);
+
+
+ {
+ ObjectStore::Transaction t;
+ bufferlist bl, orig;
+ string s(4096, 'a');
+ bl.append(s);
+ t.write(cid, hoid, 0xf000, bl.length(), bl);
+ t.write(cid, hoid, 0x10000, bl.length(), bl);
+ cerr << "write3" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ ASSERT_EQ(logger->get(l_bluestore_write_small), 5u);
+ ASSERT_EQ(logger->get(l_bluestore_write_small_unused), 2u);
+ {
+ ObjectStore::Transaction t;
+ t.remove(cid, hoid);
+ t.remove_collection(cid);
+ cerr << "Cleaning" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+}
+
+
+TEST_P(StoreTestSpecificAUSize, BluestoreStatFSTest) {
+ if (string(GetParam()) != "bluestore")
+ return;
+ if (smr) {
+ cout << "TODO: fix this for smr" << std::endl;
+ return;
+ }
+ SetVal(g_conf(), "bluestore_block_db_path", "");
+ StartDeferred(65536);
+ SetVal(g_conf(), "bluestore_compression_mode", "force");
+ SetVal(g_conf(), "bluestore_max_blob_size", "524288");
+ // just a big number to disable gc
+ SetVal(g_conf(), "bluestore_gc_enable_total_threshold", "100000");
+ SetVal(g_conf(), "bluestore_fsck_on_umount", "true");
+ g_conf().apply_changes(nullptr);
+ int r;
+
+ int poolid = 4373;
+ coll_t cid = coll_t(spg_t(pg_t(0, poolid), shard_id_t::NO_SHARD));
+ ghobject_t hoid(hobject_t(sobject_t("Object 1", CEPH_NOSNAP),
+ string(),
+ 0,
+ poolid,
+ string()));
+ ghobject_t hoid2 = hoid;
+ hoid2.hobj.snap = 1;
+ {
+ auto ch = store->open_collection(cid);
+ ASSERT_FALSE(ch);
+ }
+ auto ch = store->create_new_collection(cid);
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ cerr << "Creating collection " << cid << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+ bool exists = store->exists(ch, hoid);
+ ASSERT_TRUE(!exists);
+
+ ObjectStore::Transaction t;
+ t.touch(cid, hoid);
+ cerr << "Creating object " << hoid << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+
+ exists = store->exists(ch, hoid);
+ ASSERT_EQ(true, exists);
+ }
+ {
+ struct store_statfs_t statfs;
+ int r = store->statfs(&statfs);
+ ASSERT_EQ(r, 0);
+ ASSERT_EQ( 0u, statfs.allocated);
+ ASSERT_EQ( 0u, statfs.data_stored);
+ ASSERT_EQ(g_conf()->bluestore_block_size, statfs.total);
+ ASSERT_TRUE(statfs.available > 0u && statfs.available < g_conf()->bluestore_block_size);
+
+ struct store_statfs_t statfs_pool;
+ bool per_pool_omap;
+ r = store->pool_statfs(poolid, &statfs_pool, &per_pool_omap);
+ ASSERT_EQ(r, 0);
+ ASSERT_EQ( 0u, statfs_pool.allocated);
+ ASSERT_EQ( 0u, statfs_pool.data_stored);
+
+ //force fsck
+ ch.reset();
+ EXPECT_EQ(store->umount(), 0);
+ ASSERT_EQ(store->fsck(false), 0); // do fsck explicitly
+ EXPECT_EQ(store->mount(), 0);
+ ch = store->open_collection(cid);
+ }
+ {
+ ObjectStore::Transaction t;
+ bufferlist bl;
+ bl.append("abcde");
+ t.write(cid, hoid, 0, 5, bl);
+ cerr << "Append 5 bytes" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+
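+ // min_alloc_size is 64KiB here, so even a 5-byte object consumes one full allocation unit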
+ struct store_statfs_t statfs;
+ int r = store->statfs(&statfs);
+ ASSERT_EQ(r, 0);
+ ASSERT_EQ(5, statfs.data_stored);
+ ASSERT_EQ(0x10000, statfs.allocated);
+ ASSERT_EQ(0, statfs.data_compressed);
+ ASSERT_EQ(0, statfs.data_compressed_original);
+ ASSERT_EQ(0, statfs.data_compressed_allocated);
+
+ struct store_statfs_t statfs_pool;
+ bool per_pool_omap;
+ r = store->pool_statfs(poolid, &statfs_pool, &per_pool_omap);
+ ASSERT_EQ(r, 0);
+ ASSERT_EQ(5, statfs_pool.data_stored);
+ ASSERT_EQ(0x10000, statfs_pool.allocated);
+ ASSERT_EQ(0, statfs_pool.data_compressed);
+ ASSERT_EQ(0, statfs_pool.data_compressed_original);
+ ASSERT_EQ(0, statfs_pool.data_compressed_allocated);
+
+ // accessing unknown pool
+ r = store->pool_statfs(poolid + 1, &statfs_pool, &per_pool_omap);
+ ASSERT_EQ(r, 0);
+ ASSERT_EQ(0, statfs_pool.data_stored);
+ ASSERT_EQ(0, statfs_pool.allocated);
+ ASSERT_EQ(0, statfs_pool.data_compressed);
+ ASSERT_EQ(0, statfs_pool.data_compressed_original);
+ ASSERT_EQ(0, statfs_pool.data_compressed_allocated);
+
+ //force fsck
+ ch.reset();
+ EXPECT_EQ(store->umount(), 0);
+ ASSERT_EQ(store->fsck(false), 0); // do fsck explicitly
+ EXPECT_EQ(store->mount(), 0);
+ ch = store->open_collection(cid);
+ }
+ {
+ ObjectStore::Transaction t;
+ std::string s(0x30000, 'a');
+ bufferlist bl;
+ bl.append(s);
+ t.write(cid, hoid, 0x10000, bl.length(), bl);
+ cerr << "Append 0x30000 compressible bytes" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+
+ struct store_statfs_t statfs;
+ int r = store->statfs(&statfs);
+ ASSERT_EQ(r, 0);
+ ASSERT_EQ(0x30005, statfs.data_stored);
+ ASSERT_EQ(0x30000, statfs.allocated);
+ ASSERT_LE(statfs.data_compressed, 0x10000);
+ ASSERT_EQ(0x20000, statfs.data_compressed_original);
+ ASSERT_EQ(statfs.data_compressed_allocated, 0x10000);
+
+ struct store_statfs_t statfs_pool;
+ bool per_pool_omap;
+ r = store->pool_statfs(poolid, &statfs_pool, &per_pool_omap);
+ ASSERT_EQ(r, 0);
+ ASSERT_EQ(0x30005, statfs_pool.data_stored);
+ ASSERT_EQ(0x30000, statfs_pool.allocated);
+ ASSERT_LE(statfs_pool.data_compressed, 0x10000);
+ ASSERT_EQ(0x20000, statfs_pool.data_compressed_original);
+ ASSERT_EQ(statfs_pool.data_compressed_allocated, 0x10000);
+ //force fsck
+ ch.reset();
+ EXPECT_EQ(store->umount(), 0);
+ ASSERT_EQ(store->fsck(false), 0); // do fsck explicitly
+ EXPECT_EQ(store->mount(), 0);
+ ch = store->open_collection(cid);
+ }
+ {
+ ObjectStore::Transaction t;
+ t.zero(cid, hoid, 1, 3);
+ t.zero(cid, hoid, 0x20000, 9);
+ cerr << "Punch hole at 1~3, 0x20000~9" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+
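+ // both holes are smaller than an allocation unit: stored bytes shrink but allocated space stays at 0x30000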
+ struct store_statfs_t statfs;
+ int r = store->statfs(&statfs);
+ ASSERT_EQ(r, 0);
+ ASSERT_EQ(0x30005 - 3 - 9, statfs.data_stored);
+ ASSERT_EQ(0x30000, statfs.allocated);
+ ASSERT_LE(statfs.data_compressed, 0x10000);
+ ASSERT_EQ(0x20000 - 9, statfs.data_compressed_original);
+ ASSERT_EQ(statfs.data_compressed_allocated, 0x10000);
+
+ struct store_statfs_t statfs_pool;
+ bool per_pool_omap;
+ r = store->pool_statfs(poolid, &statfs_pool, &per_pool_omap);
+ ASSERT_EQ(r, 0);
+ ASSERT_EQ(0x30005 - 3 - 9, statfs_pool.data_stored);
+ ASSERT_EQ(0x30000, statfs_pool.allocated);
+ ASSERT_LE(statfs_pool.data_compressed, 0x10000);
+ ASSERT_EQ(0x20000 - 9, statfs_pool.data_compressed_original);
+ ASSERT_EQ(statfs_pool.data_compressed_allocated, 0x10000);
+ //force fsck
+ ch.reset();
+ EXPECT_EQ(store->umount(), 0);
+ ASSERT_EQ(store->fsck(false), 0); // do fsck explicitly
+ EXPECT_EQ(store->mount(), 0);
+ ch = store->open_collection(cid);
+ }
+ {
+ ObjectStore::Transaction t;
+ std::string s(0x1000, 'b');
+ bufferlist bl;
+ bl.append(s);
+ t.write(cid, hoid, 1, bl.length(), bl);
+ t.write(cid, hoid, 0x10001, bl.length(), bl);
+ cerr << "Overwrite first and second(compressible) extents" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+
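+ // the overwrites allocate one more AU (0x40000 total) and shrink the compressed original by the overwritten 0x1000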
+ struct store_statfs_t statfs;
+ int r = store->statfs(&statfs);
+ ASSERT_EQ(r, 0);
+ ASSERT_EQ(0x30001 - 9 + 0x1000, statfs.data_stored);
+ ASSERT_EQ(0x40000, statfs.allocated);
+ ASSERT_LE(statfs.data_compressed, 0x10000);
+ ASSERT_EQ(0x20000 - 9 - 0x1000, statfs.data_compressed_original);
+ ASSERT_EQ(statfs.data_compressed_allocated, 0x10000);
+
+ struct store_statfs_t statfs_pool;
+ bool per_pool_omap;
+ r = store->pool_statfs(poolid, &statfs_pool, &per_pool_omap);
+ ASSERT_EQ(r, 0);
+ ASSERT_EQ(0x30001 - 9 + 0x1000, statfs_pool.data_stored);
+ ASSERT_EQ(0x40000, statfs_pool.allocated);
+ ASSERT_LE(statfs_pool.data_compressed, 0x10000);
+ ASSERT_EQ(0x20000 - 9 - 0x1000, statfs_pool.data_compressed_original);
+ ASSERT_EQ(statfs_pool.data_compressed_allocated, 0x10000);
+ //force fsck
+ ch.reset();
+ EXPECT_EQ(store->umount(), 0);
+ ASSERT_EQ(store->fsck(false), 0); // do fsck explicitly
+ EXPECT_EQ(store->mount(), 0);
+ ch = store->open_collection(cid);
+ }
+ {
+ ObjectStore::Transaction t;
+ std::string s(0x10000, 'c');
+ bufferlist bl;
+ bl.append(s);
+ t.write(cid, hoid, 0x10000, bl.length(), bl);
+ t.write(cid, hoid, 0x20000, bl.length(), bl);
+ t.write(cid, hoid, 0x30000, bl.length(), bl);
+ cerr << "Overwrite compressed extent with 3 uncompressible ones" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+
+ struct store_statfs_t statfs;
+ int r = store->statfs(&statfs);
+ ASSERT_EQ(r, 0);
+ ASSERT_EQ(0x30000 + 0x1001, statfs.data_stored);
+ ASSERT_EQ(0x40000, statfs.allocated);
+ ASSERT_LE(statfs.data_compressed, 0);
+ ASSERT_EQ(0, statfs.data_compressed_original);
+ ASSERT_EQ(0, statfs.data_compressed_allocated);
+
+ struct store_statfs_t statfs_pool;
+ bool per_pool_omap;
+ r = store->pool_statfs(poolid, &statfs_pool, &per_pool_omap);
+ ASSERT_EQ(r, 0);
+ ASSERT_EQ(0x30000 + 0x1001, statfs_pool.data_stored);
+ ASSERT_EQ(0x40000, statfs_pool.allocated);
+ ASSERT_LE(statfs_pool.data_compressed, 0);
+ ASSERT_EQ(0, statfs_pool.data_compressed_original);
+ ASSERT_EQ(0, statfs_pool.data_compressed_allocated);
+ //force fsck
+ ch.reset();
+ EXPECT_EQ(store->umount(), 0);
+ ASSERT_EQ(store->fsck(false), 0); // do fsck explicitly
+ EXPECT_EQ(store->mount(), 0);
+ ch = store->open_collection(cid);
+ }
+ {
+ ObjectStore::Transaction t;
+ t.zero(cid, hoid, 0, 0x40000);
+ cerr << "Zero object" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
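+ // zeroing the whole object should release every allocation and bring all stats back to zero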
+ struct store_statfs_t statfs;
+ int r = store->statfs(&statfs);
+ ASSERT_EQ(r, 0);
+ ASSERT_EQ(0u, statfs.allocated);
+ ASSERT_EQ(0u, statfs.data_stored);
+ ASSERT_EQ(0u, statfs.data_compressed_original);
+ ASSERT_EQ(0u, statfs.data_compressed);
+ ASSERT_EQ(0u, statfs.data_compressed_allocated);
+
+ struct store_statfs_t statfs_pool;
+ bool per_pool_omap;
+ r = store->pool_statfs(poolid, &statfs_pool, &per_pool_omap);
+ ASSERT_EQ(r, 0);
+ ASSERT_EQ(0u, statfs_pool.allocated);
+ ASSERT_EQ(0u, statfs_pool.data_stored);
+ ASSERT_EQ(0u, statfs_pool.data_compressed_original);
+ ASSERT_EQ(0u, statfs_pool.data_compressed);
+ ASSERT_EQ(0u, statfs_pool.data_compressed_allocated);
+ //force fsck
+ ch.reset();
+ EXPECT_EQ(store->umount(), 0);
+ ASSERT_EQ(store->fsck(false), 0); // do fsck explicitly
+ EXPECT_EQ(store->mount(), 0);
+ ch = store->open_collection(cid);
+ }
+ {
+ ObjectStore::Transaction t;
+ std::string s(0x10000, 'c');
+ bufferlist bl;
+ bl.append(s);
+ bl.append(s);
+ bl.append(s);
+ bl.append(s.substr(0, 0x10000-2));
+ t.write(cid, hoid, 0, bl.length(), bl);
+ cerr << "Yet another compressible write" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ struct store_statfs_t statfs;
+ r = store->statfs(&statfs);
+ ASSERT_EQ(r, 0);
+ ASSERT_EQ(0x40000 - 2, statfs.data_stored);
+ ASSERT_EQ(0x30000, statfs.allocated);
+ ASSERT_LE(statfs.data_compressed, 0x10000);
+ ASSERT_EQ(0x20000, statfs.data_compressed_original);
+ ASSERT_EQ(0x10000, statfs.data_compressed_allocated);
+
+ struct store_statfs_t statfs_pool;
+ bool per_pool_omap;
+ r = store->pool_statfs(poolid, &statfs_pool, &per_pool_omap);
+ ASSERT_EQ(r, 0);
+ ASSERT_EQ(0x40000 - 2, statfs_pool.data_stored);
+ ASSERT_EQ(0x30000, statfs_pool.allocated);
+ ASSERT_LE(statfs_pool.data_compressed, 0x10000);
+ ASSERT_EQ(0x20000, statfs_pool.data_compressed_original);
+ ASSERT_EQ(0x10000, statfs_pool.data_compressed_allocated);
+ //force fsck
+ ch.reset();
+ EXPECT_EQ(store->umount(), 0);
+ ASSERT_EQ(store->fsck(false), 0); // do fsck explicitly
+ EXPECT_EQ(store->mount(), 0);
+ ch = store->open_collection(cid);
+ }
+ {
+ struct store_statfs_t statfs;
+ r = store->statfs(&statfs);
+ ASSERT_EQ(r, 0);
+
+ struct store_statfs_t statfs_pool;
+ bool per_pool_omap;
+ r = store->pool_statfs(poolid, &statfs_pool, &per_pool_omap);
+ ASSERT_EQ(r, 0);
+
+ ObjectStore::Transaction t;
+ t.clone(cid, hoid, hoid2);
+ cerr << "Clone compressed objecte" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ struct store_statfs_t statfs2;
+ r = store->statfs(&statfs2);
+ ASSERT_EQ(r, 0);
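+ // the clone shares the underlying blobs: logical stats grow while allocated space (including compressed allocation) is unchanged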
+ ASSERT_GT(statfs2.data_stored, statfs.data_stored);
+ ASSERT_EQ(statfs2.allocated, statfs.allocated);
+ ASSERT_GT(statfs2.data_compressed, statfs.data_compressed);
+ ASSERT_GT(statfs2.data_compressed_original, statfs.data_compressed_original);
+ ASSERT_EQ(statfs2.data_compressed_allocated, statfs.data_compressed_allocated);
+
+ struct store_statfs_t statfs2_pool;
+ r = store->pool_statfs(poolid, &statfs2_pool, &per_pool_omap);
+ ASSERT_EQ(r, 0);
+ ASSERT_GT(statfs2_pool.data_stored, statfs_pool.data_stored);
+ ASSERT_EQ(statfs2_pool.allocated, statfs_pool.allocated);
+ ASSERT_GT(statfs2_pool.data_compressed, statfs_pool.data_compressed);
+ ASSERT_GT(statfs2_pool.data_compressed_original,
+ statfs_pool.data_compressed_original);
+ ASSERT_EQ(statfs2_pool.data_compressed_allocated,
+ statfs_pool.data_compressed_allocated);
+ }
+
+ {
+ // verify that writes in a different pool don't affect this pool's statfs
+ auto poolid2 = poolid + 1;
+ coll_t cid2 = coll_t(spg_t(pg_t(20, poolid2), shard_id_t::NO_SHARD));
+ ghobject_t hoid(hobject_t(sobject_t("Object 2", CEPH_NOSNAP),
+ string(),
+ 0,
+ poolid2,
+ string()));
+ auto ch = store->create_new_collection(cid2);
+
+ {
+
+ struct store_statfs_t statfs1_pool;
+ bool per_pool_omap;
+ int r = store->pool_statfs(poolid, &statfs1_pool, &per_pool_omap);
+ ASSERT_EQ(r, 0);
+
+ cerr << "Creating second collection " << cid2 << std::endl;
+ ObjectStore::Transaction t;
+ t.create_collection(cid2, 0);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+
+ t = ObjectStore::Transaction();
+ bufferlist bl;
+ bl.append("abcde");
+ t.write(cid2, hoid, 0, 5, bl);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+
+ struct store_statfs_t statfs2_pool;
+ r = store->pool_statfs(poolid2, &statfs2_pool, &per_pool_omap);
+ ASSERT_EQ(r, 0);
+ ASSERT_EQ(5, statfs2_pool.data_stored);
+ ASSERT_EQ(0x10000, statfs2_pool.allocated);
+ ASSERT_EQ(0, statfs2_pool.data_compressed);
+ ASSERT_EQ(0, statfs2_pool.data_compressed_original);
+ ASSERT_EQ(0, statfs2_pool.data_compressed_allocated);
+
+ struct store_statfs_t statfs1_pool_again;
+ r = store->pool_statfs(poolid, &statfs1_pool_again, &per_pool_omap);
+ ASSERT_EQ(r, 0);
+ // adjust 'available' since it has changed
+ statfs1_pool_again.available = statfs1_pool.available;
+ ASSERT_EQ(statfs1_pool_again, statfs1_pool);
+
+ t = ObjectStore::Transaction();
+ t.remove(cid2, hoid);
+ t.remove_collection(cid2);
+ cerr << "Cleaning" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ }
+
+ {
+ // verify ops on temporary object
+
+ auto poolid3 = poolid + 2;
+ coll_t cid3 = coll_t(spg_t(pg_t(20, poolid3), shard_id_t::NO_SHARD));
+ ghobject_t hoid3(hobject_t(sobject_t("Object 3", CEPH_NOSNAP),
+ string(),
+ 0,
+ poolid3,
+ string()));
+ ghobject_t hoid3_temp;
+ hoid3_temp.hobj = hoid3.hobj.make_temp_hobject("Object 3 temp");
+ auto ch3 = store->create_new_collection(cid3);
+ {
+ struct store_statfs_t statfs1_pool;
+ bool per_pool_omap;
+ int r = store->pool_statfs(poolid, &statfs1_pool, &per_pool_omap);
+ ASSERT_EQ(r, 0);
+
+ cerr << "Creating third collection " << cid3 << std::endl;
+ ObjectStore::Transaction t;
+ t.create_collection(cid3, 0);
+ r = queue_transaction(store, ch3, std::move(t));
+ ASSERT_EQ(r, 0);
+
+ t = ObjectStore::Transaction();
+ bufferlist bl;
+ bl.append("abcde");
+ t.write(cid3, hoid3_temp, 0, 5, bl);
+ r = queue_transaction(store, ch3, std::move(t));
+ ASSERT_EQ(r, 0);
+
+ struct store_statfs_t statfs3_pool;
+ r = store->pool_statfs(poolid3, &statfs3_pool, &per_pool_omap);
+ ASSERT_EQ(r, 0);
+ ASSERT_EQ(5, statfs3_pool.data_stored);
+ ASSERT_EQ(0x10000, statfs3_pool.allocated);
+ ASSERT_EQ(0, statfs3_pool.data_compressed);
+ ASSERT_EQ(0, statfs3_pool.data_compressed_original);
+ ASSERT_EQ(0, statfs3_pool.data_compressed_allocated);
+
+ struct store_statfs_t statfs1_pool_again;
+ r = store->pool_statfs(poolid, &statfs1_pool_again, &per_pool_omap);
+ ASSERT_EQ(r, 0);
+ // adjust 'available' since it has changed
+ statfs1_pool_again.available = statfs1_pool.available;
+ ASSERT_EQ(statfs1_pool_again, statfs1_pool);
+
+ //force fsck
+ ch.reset();
+ ch3.reset();
+ EXPECT_EQ(store->umount(), 0);
+ EXPECT_EQ(store->mount(), 0);
+ ch = store->open_collection(cid);
+ ch3 = store->open_collection(cid3);
+
+ t = ObjectStore::Transaction();
+ t.collection_move_rename(
+ cid3, hoid3_temp,
+ cid3, hoid3);
+ r = queue_transaction(store, ch3, std::move(t));
+ ASSERT_EQ(r, 0);
+
+ struct store_statfs_t statfs3_pool_again;
+ r = store->pool_statfs(poolid3, &statfs3_pool_again, &per_pool_omap);
+ ASSERT_EQ(r, 0);
+ ASSERT_EQ(statfs3_pool_again, statfs3_pool);
+
+ //force fsck
+ ch.reset();
+ ch3.reset();
+ EXPECT_EQ(store->umount(), 0);
+ EXPECT_EQ(store->mount(), 0);
+ ch = store->open_collection(cid);
+ ch3 = store->open_collection(cid3);
+
+ t = ObjectStore::Transaction();
+ t.remove(cid3, hoid3);
+ t.remove_collection(cid3);
+ cerr << "Cleaning" << std::endl;
+ r = queue_transaction(store, ch3, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ }
+
+ {
+ ObjectStore::Transaction t;
+ t.remove(cid, hoid);
+ t.remove(cid, hoid2);
+ t.remove_collection(cid);
+ cerr << "Cleaning" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+
+ struct store_statfs_t statfs;
+ r = store->statfs(&statfs);
+ ASSERT_EQ(r, 0);
+ ASSERT_EQ( 0u, statfs.allocated);
+ ASSERT_EQ( 0u, statfs.data_stored);
+ ASSERT_EQ( 0u, statfs.data_compressed_original);
+ ASSERT_EQ( 0u, statfs.data_compressed);
+ ASSERT_EQ( 0u, statfs.data_compressed_allocated);
+
+ struct store_statfs_t statfs_pool;
+ bool per_pool_omap;
+ r = store->pool_statfs(poolid, &statfs_pool, &per_pool_omap);
+ ASSERT_EQ(r, 0);
+ ASSERT_EQ( 0u, statfs_pool.allocated);
+ ASSERT_EQ( 0u, statfs_pool.data_stored);
+ ASSERT_EQ( 0u, statfs_pool.data_compressed_original);
+ ASSERT_EQ( 0u, statfs_pool.data_compressed);
+ ASSERT_EQ( 0u, statfs_pool.data_compressed_allocated);
+ }
+}
+
+TEST_P(StoreTestSpecificAUSize, BluestoreFragmentedBlobTest) {
+ if (string(GetParam()) != "bluestore")
+ return;
+ if (smr) {
+ cout << "TODO: fix this for smr" << std::endl;
+ return;
+ }
+ SetVal(g_conf(), "bluestore_block_db_path", "");
+ StartDeferred(0x10000);
+
+ int r;
+ coll_t cid;
+ ghobject_t hoid(hobject_t(sobject_t("Object 1", CEPH_NOSNAP)));
+ auto ch = store->create_new_collection(cid);
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ cerr << "Creating collection " << cid << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+ bool exists = store->exists(ch, hoid);
+ ASSERT_TRUE(!exists);
+
+ ObjectStore::Transaction t;
+ t.touch(cid, hoid);
+ cerr << "Creating object " << hoid << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+
+ exists = store->exists(ch, hoid);
+ ASSERT_EQ(true, exists);
+ }
+ {
+ struct store_statfs_t statfs;
+ int r = store->statfs(&statfs);
+ ASSERT_EQ(r, 0);
+ ASSERT_EQ(g_conf()->bluestore_block_size, statfs.total);
+ ASSERT_EQ(0u, statfs.allocated);
+ ASSERT_EQ(0u, statfs.data_stored);
+ ASSERT_TRUE(statfs.available > 0u && statfs.available < g_conf()->bluestore_block_size);
+ }
+ std::string data;
+ data.resize(0x10000 * 3);
+ {
+ ObjectStore::Transaction t;
+ for (size_t i = 0; i < data.size(); i++)
+ data[i] = i / 256 + 1;
+ bufferlist bl, newdata;
+ bl.append(data);
+ t.write(cid, hoid, 0, bl.length(), bl);
+ t.zero(cid, hoid, 0x10000, 0x10000);
+ cerr << "Append 3*0x10000 bytes and punch a hole 0x10000~10000" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+
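+ // the punched hole covers exactly the middle AU, so only 2 of the 3 written AUs remain allocated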
+ struct store_statfs_t statfs;
+ int r = store->statfs(&statfs);
+ ASSERT_EQ(r, 0);
+ ASSERT_EQ(0x20000, statfs.data_stored);
+ ASSERT_EQ(0x20000, statfs.allocated);
+
+ r = store->read(ch, hoid, 0, data.size(), newdata);
+ ASSERT_EQ(r, (int)data.size());
+ {
+ bufferlist expected;
+ expected.append(data.substr(0, 0x10000));
+ expected.append(string(0x10000, 0));
+ expected.append(data.substr(0x20000, 0x10000));
+ ASSERT_TRUE(bl_eq(expected, newdata));
+ }
+ newdata.clear();
+
+ r = store->read(ch, hoid, 1, data.size()-2, newdata);
+ ASSERT_EQ(r, (int)data.size()-2);
+ {
+ bufferlist expected;
+ expected.append(data.substr(1, 0x10000-1));
+ expected.append(string(0x10000, 0));
+ expected.append(data.substr(0x20000, 0x10000 - 1));
+ ASSERT_TRUE(bl_eq(expected, newdata));
+ }
+ newdata.clear();
+ }
+ //force fsck
+ ch.reset();
+ EXPECT_EQ(store->umount(), 0);
+ ASSERT_EQ(store->fsck(false), 0); // do fsck explicitly
+ EXPECT_EQ(store->mount(), 0);
+ ch = store->open_collection(cid);
+
+ {
+ ObjectStore::Transaction t;
+ std::string data2(3, 'b');
+ bufferlist bl, newdata;
+ bl.append(data2);
+ t.write(cid, hoid, 0x20000, bl.length(), bl);
+ cerr << "Write 3 bytes after the hole" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+
+ struct store_statfs_t statfs;
+ int r = store->statfs(&statfs);
+ ASSERT_EQ(r, 0);
+ ASSERT_EQ(0x20000, statfs.allocated);
+ ASSERT_EQ(0x20000, statfs.data_stored);
+
+ r = store->read(ch, hoid, 0x20000-1, 21, newdata);
+ ASSERT_EQ(r, (int)21);
+ {
+ bufferlist expected;
+ expected.append(string(0x1, 0));
+ expected.append(string(data2));
+ expected.append(data.substr(0x20003, 21-4));
+ ASSERT_TRUE(bl_eq(expected, newdata));
+ }
+ newdata.clear();
+ }
+ //force fsck
+ ch.reset();
+ EXPECT_EQ(store->umount(), 0);
+ ASSERT_EQ(store->fsck(false), 0); // do fsck explicitly
+ EXPECT_EQ(store->mount(), 0);
+ ch = store->open_collection(cid);
+
+ {
+ ObjectStore::Transaction t;
+ std::string data2(3, 'a');
+ bufferlist bl, newdata;
+ bl.append(data2);
+ t.write(cid, hoid, 0x10000+1, bl.length(), bl);
+ cerr << "Write 3 bytes to the hole" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+
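+ // writing into the hole allocates a third AU while stored bytes grow only by the 3 bytes written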
+ struct store_statfs_t statfs;
+ int r = store->statfs(&statfs);
+ ASSERT_EQ(r, 0);
+ ASSERT_EQ(0x30000, statfs.allocated);
+ ASSERT_EQ(0x20003, statfs.data_stored);
+
+ r = store->read(ch, hoid, 0x10000-1, 0x10000+22, newdata);
+ ASSERT_EQ(r, (int)0x10000+22);
+ {
+ bufferlist expected;
+ expected.append(data.substr(0x10000-1, 1));
+ expected.append(string(0x1, 0));
+ expected.append(data2);
+ expected.append(string(0x10000-4, 0));
+ expected.append(string(0x3, 'b'));
+ expected.append(data.substr(0x20003, 21-3));
+ ASSERT_TRUE(bl_eq(expected, newdata));
+ }
+ newdata.clear();
+ }
+ {
+ ObjectStore::Transaction t;
+ bufferlist bl, newdata;
+ bl.append(string(0x30000, 'c'));
+ t.write(cid, hoid, 0, 0x30000, bl);
+ t.zero(cid, hoid, 0, 0x10000);
+ t.zero(cid, hoid, 0x20000, 0x10000);
+ cerr << "Rewrite an object and create two holes at the beginning and the end" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+
+ struct store_statfs_t statfs;
+ int r = store->statfs(&statfs);
+ ASSERT_EQ(r, 0);
+ ASSERT_EQ(0x10000, statfs.allocated);
+ ASSERT_EQ(0x10000, statfs.data_stored);
+
+ r = store->read(ch, hoid, 0, 0x30000, newdata);
+ ASSERT_EQ(r, (int)0x30000);
+ {
+ bufferlist expected;
+ expected.append(string(0x10000, 0));
+ expected.append(string(0x10000, 'c'));
+ expected.append(string(0x10000, 0));
+ ASSERT_TRUE(bl_eq(expected, newdata));
+ }
+ newdata.clear();
+ }
+
+ //force fsck
+ ch.reset();
+ EXPECT_EQ(store->umount(), 0);
+ ASSERT_EQ(store->fsck(false), 0); // do fsck explicitly
+ EXPECT_EQ(store->mount(), 0);
+ ch = store->open_collection(cid);
+
+ {
+ ObjectStore::Transaction t;
+ t.remove(cid, hoid);
+ t.remove_collection(cid);
+ cerr << "Cleaning" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+
+ struct store_statfs_t statfs;
+ r = store->statfs(&statfs);
+ ASSERT_EQ(r, 0);
+ ASSERT_EQ( 0u, statfs.allocated);
+ ASSERT_EQ( 0u, statfs.data_stored);
+ ASSERT_EQ( 0u, statfs.data_compressed_original);
+ ASSERT_EQ( 0u, statfs.data_compressed);
+ ASSERT_EQ( 0u, statfs.data_compressed_allocated);
+ }
+}
+#endif
+
+TEST_P(StoreTest, ManySmallWrite) {
+ int r;
+ coll_t cid;
+ ghobject_t a(hobject_t(sobject_t("Object 1", CEPH_NOSNAP)));
+ ghobject_t b(hobject_t(sobject_t("Object 2", CEPH_NOSNAP)));
+ auto ch = store->create_new_collection(cid);
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ cerr << "Creating collection " << cid << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ bufferlist bl;
+ bufferptr bp(4096);
+ bp.zero();
+ bl.append(bp);
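+ // 100 sequential 4KiB writes to object a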
+ for (int i=0; i<100; ++i) {
+ ObjectStore::Transaction t;
+ t.write(cid, a, i*4096, 4096, bl, 0);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
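+ // 100 random-offset 4KiB writes to object b within a ~4MiB range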
+ for (int i=0; i<100; ++i) {
+ ObjectStore::Transaction t;
+ t.write(cid, b, (rand() % 1024)*4096, 4096, bl, 0);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+ ObjectStore::Transaction t;
+ t.remove(cid, a);
+ t.remove(cid, b);
+ t.remove_collection(cid);
+ cerr << "Cleaning" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+}
+
+TEST_P(StoreTest, MultiSmallWriteSameBlock) {
+ int r;
+ coll_t cid;
+ ghobject_t a(hobject_t(sobject_t("Object 1", CEPH_NOSNAP)));
+ auto ch = store->create_new_collection(cid);
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ cerr << "Creating collection " << cid << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ bufferlist bl;
+ bl.append("short");
+ C_SaferCond c, d;
+ // touch the same block within a single transaction, within one transaction list, and across pipelined transactions
+ {
+ ObjectStore::Transaction t, u;
+ t.write(cid, a, 0, 5, bl, 0);
+ t.write(cid, a, 5, 5, bl, 0);
+ t.write(cid, a, 4094, 5, bl, 0);
+ t.write(cid, a, 9000, 5, bl, 0);
+ u.write(cid, a, 10, 5, bl, 0);
+ u.write(cid, a, 7000, 5, bl, 0);
+ t.register_on_commit(&c);
+ vector<ObjectStore::Transaction> v = {t, u};
+ store->queue_transactions(ch, v);
+ }
+ {
+ ObjectStore::Transaction t, u;
+ t.write(cid, a, 40, 5, bl, 0);
+ t.write(cid, a, 45, 5, bl, 0);
+ t.write(cid, a, 4094, 5, bl, 0);
+ t.write(cid, a, 6000, 5, bl, 0);
+ u.write(cid, a, 610, 5, bl, 0);
+ u.write(cid, a, 11000, 5, bl, 0);
+ t.register_on_commit(&d);
+ vector<ObjectStore::Transaction> v = {t, u};
+ store->queue_transactions(ch, v);
+ }
+ c.wait();
+ d.wait();
+ {
+ bufferlist bl2;
+ r = store->read(ch, a, 0, 16000, bl2);
+ ASSERT_GE(r, 0);
+ }
+ {
+ ObjectStore::Transaction t;
+ t.remove(cid, a);
+ t.remove_collection(cid);
+ cerr << "Cleaning" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+}
+
+TEST_P(StoreTest, SmallSkipFront) {
+ int r;
+ coll_t cid;
+ ghobject_t a(hobject_t(sobject_t("Object 1", CEPH_NOSNAP)));
+ auto ch = store->create_new_collection(cid);
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ cerr << "Creating collection " << cid << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+ ObjectStore::Transaction t;
+ t.touch(cid, a);
+ t.truncate(cid, a, 3000);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
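+ // the object is now a 3000-byte hole; writing the second 4KiB block below must leave the first block reading back as zeros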
+ {
+ bufferlist bl;
+ bufferptr bp(4096);
+ memset(bp.c_str(), 1, 4096);
+ bl.append(bp);
+ ObjectStore::Transaction t;
+ t.write(cid, a, 4096, 4096, bl);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+ bufferlist bl;
+ ASSERT_EQ(8192, store->read(ch, a, 0, 8192, bl));
+ for (unsigned i=0; i<4096; ++i)
+ ASSERT_EQ(0, bl[i]);
+ for (unsigned i=4096; i<8192; ++i)
+ ASSERT_EQ(1, bl[i]);
+ }
+ {
+ ObjectStore::Transaction t;
+ t.remove(cid, a);
+ t.remove_collection(cid);
+ cerr << "Cleaning" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+}
+
+TEST_P(StoreTest, AppendDeferredVsTailCache) {
+ int r;
+ coll_t cid;
+ ghobject_t a(hobject_t(sobject_t("fooo", CEPH_NOSNAP)));
+ auto ch = store->create_new_collection(cid);
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ cerr << "Creating collection " << cid << std::endl;
+ r = store->queue_transaction(ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ unsigned min_alloc = g_conf()->bluestore_min_alloc_size;
+ unsigned size = min_alloc / 3;
+ bufferptr bpa(size);
+ memset(bpa.c_str(), 1, bpa.length());
+ bufferlist bla;
+ bla.append(bpa);
+ {
+ ObjectStore::Transaction t;
+ t.write(cid, a, 0, bla.length(), bla, 0);
+ r = store->queue_transaction(ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+
+ // force cached tail to clear ...
+ {
+ ch.reset();
+ int r = store->umount();
+ ASSERT_EQ(0, r);
+ r = store->mount();
+ ASSERT_EQ(0, r);
+ ch = store->open_collection(cid);
+ }
+
+ bufferptr bpb(size);
+ memset(bpb.c_str(), 2, bpb.length());
+ bufferlist blb;
+ blb.append(bpb);
+ {
+ ObjectStore::Transaction t;
+ t.write(cid, a, bla.length(), blb.length(), blb, 0);
+ r = store->queue_transaction(ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ bufferptr bpc(size);
+ memset(bpc.c_str(), 3, bpc.length());
+ bufferlist blc;
+ blc.append(bpc);
+ {
+ ObjectStore::Transaction t;
+ t.write(cid, a, bla.length() + blb.length(), blc.length(), blc, 0);
+ r = store->queue_transaction(ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ bufferlist final;
+ final.append(bla);
+ final.append(blb);
+ final.append(blc);
+ bufferlist actual;
+ {
+ ASSERT_EQ((int)final.length(),
+ store->read(ch, a, 0, final.length(), actual));
+ ASSERT_TRUE(bl_eq(final, actual));
+ }
+ {
+ ObjectStore::Transaction t;
+ t.remove(cid, a);
+ t.remove_collection(cid);
+ cerr << "Cleaning" << std::endl;
+ r = store->queue_transaction(ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+}
+
+TEST_P(StoreTest, AppendZeroTrailingSharedBlock) {
+ int r;
+ coll_t cid;
+ ghobject_t a(hobject_t(sobject_t("fooo", CEPH_NOSNAP)));
+ ghobject_t b = a;
+ b.hobj.snap = 1;
+ auto ch = store->create_new_collection(cid);
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ cerr << "Creating collection " << cid << std::endl;
+ r = store->queue_transaction(ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ unsigned min_alloc = g_conf()->bluestore_min_alloc_size;
+ unsigned size = min_alloc / 3;
+ bufferptr bpa(size);
+ memset(bpa.c_str(), 1, bpa.length());
+ bufferlist bla;
+ bla.append(bpa);
+ // make sure there is some trailing gunk in the last block
+ {
+ bufferlist bt;
+ bt.append(bla);
+ bt.append("BADBADBADBAD");
+ ObjectStore::Transaction t;
+ t.write(cid, a, 0, bt.length(), bt, 0);
+ r = store->queue_transaction(ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+ ObjectStore::Transaction t;
+ t.truncate(cid, a, size);
+ r = store->queue_transaction(ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+
+ // clone
+ {
+ ObjectStore::Transaction t;
+ t.clone(cid, a, b);
+ r = store->queue_transaction(ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+
+ // append with implicit zeroing
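+ // the write starts past EOF in a tail block shared with the clone; the gunk truncated away above must read back as zeros, not leak through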
+ bufferptr bpb(size);
+ memset(bpb.c_str(), 2, bpb.length());
+ bufferlist blb;
+ blb.append(bpb);
+ {
+ ObjectStore::Transaction t;
+ t.write(cid, a, min_alloc * 3, blb.length(), blb, 0);
+ r = store->queue_transaction(ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ bufferlist final;
+ final.append(bla);
+ bufferlist zeros;
+ zeros.append_zero(min_alloc * 3 - size);
+ final.append(zeros);
+ final.append(blb);
+ bufferlist actual;
+ {
+ ASSERT_EQ((int)final.length(),
+ store->read(ch, a, 0, final.length(), actual));
+ final.hexdump(cout);
+ actual.hexdump(cout);
+ ASSERT_TRUE(bl_eq(final, actual));
+ }
+ {
+ ObjectStore::Transaction t;
+ t.remove(cid, a);
+ t.remove(cid, b);
+ t.remove_collection(cid);
+ cerr << "Cleaning" << std::endl;
+ r = store->queue_transaction(ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+}
+
+TEST_P(StoreTest, SmallSequentialUnaligned) {
+ int r;
+ coll_t cid;
+ ghobject_t a(hobject_t(sobject_t("Object 1", CEPH_NOSNAP)));
+ auto ch = store->create_new_collection(cid);
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ cerr << "Creating collection " << cid << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ bufferlist bl;
+ int len = 1000;
+ bufferptr bp(len);
+ bp.zero();
+ bl.append(bp);
+ for (int i=0; i<1000; ++i) {
+ ObjectStore::Transaction t;
+ t.write(cid, a, i*len, len, bl, 0);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+ ObjectStore::Transaction t;
+ t.remove(cid, a);
+ t.remove_collection(cid);
+ cerr << "Cleaning" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+}
+
+TEST_P(StoreTest, ManyBigWrite) {
+ int r;
+ coll_t cid;
+ ghobject_t a(hobject_t(sobject_t("Object 1", CEPH_NOSNAP)));
+ ghobject_t b(hobject_t(sobject_t("Object 2", CEPH_NOSNAP)));
+ auto ch = store->create_new_collection(cid);
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ cerr << "Creating collection " << cid << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ bufferlist bl;
+ bufferptr bp(4 * 1048576);
+ bp.zero();
+ bl.append(bp);
+ for (int i=0; i<10; ++i) {
+ ObjectStore::Transaction t;
+ t.write(cid, a, i*4*1048576, 4*1048576, bl, 0);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ // aligned
+ for (int i=0; i<10; ++i) {
+ ObjectStore::Transaction t;
+ t.write(cid, b, (rand() % 256)*4*1048576, 4*1048576, bl, 0);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ // unaligned
+ for (int i=0; i<10; ++i) {
+ ObjectStore::Transaction t;
+ t.write(cid, b, (rand() % (256*4096))*1024, 4*1048576, bl, 0);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ // do some zeros
+ for (int i=0; i<10; ++i) {
+ ObjectStore::Transaction t;
+ t.zero(cid, b, (rand() % (256*4096))*1024, 16*1048576);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+ ObjectStore::Transaction t;
+ t.remove(cid, a);
+ t.remove(cid, b);
+ t.remove_collection(cid);
+ cerr << "Cleaning" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+}
+
+TEST_P(StoreTest, BigWriteBigZero) {
+ int r;
+ coll_t cid;
+ ghobject_t a(hobject_t(sobject_t("foo", CEPH_NOSNAP)));
+ auto ch = store->create_new_collection(cid);
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ bufferlist bl;
+ bufferptr bp(1048576);
+ memset(bp.c_str(), 'b', bp.length());
+ bl.append(bp);
+ bufferlist s;
+ bufferptr sp(4096);
+ memset(sp.c_str(), 's', sp.length());
+ s.append(sp);
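+ // write a 1MiB extent, zero out the middle half, then drop a small write into the zeroed range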
+ {
+ ObjectStore::Transaction t;
+ t.write(cid, a, 0, bl.length(), bl);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+ ObjectStore::Transaction t;
+ t.zero(cid, a, bl.length() / 4, bl.length() / 2);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+ ObjectStore::Transaction t;
+ t.write(cid, a, bl.length() / 2, s.length(), s);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+ ObjectStore::Transaction t;
+ t.remove(cid, a);
+ t.remove_collection(cid);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+}
+
+TEST_P(StoreTest, MiscFragmentTests) {
+ int r;
+ coll_t cid;
+ ghobject_t a(hobject_t(sobject_t("Object 1", CEPH_NOSNAP)));
+ auto ch = store->create_new_collection(cid);
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ cerr << "Creating collection " << cid << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ bufferlist bl;
+ bufferptr bp(524288);
+ bp.zero();
+ bl.append(bp);
+ {
+ ObjectStore::Transaction t;
+ t.write(cid, a, 0, 524288, bl, 0);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+ ObjectStore::Transaction t;
+ t.write(cid, a, 1048576, 524288, bl, 0);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+ bufferlist inbl;
+ int r = store->read(ch, a, 524288 + 131072, 1024, inbl);
+ ASSERT_EQ(r, 1024);
+ ASSERT_EQ(inbl.length(), 1024u);
+ ASSERT_TRUE(inbl.is_zero());
+ }
+ {
+ ObjectStore::Transaction t;
+ t.write(cid, a, 1048576 - 4096, 524288, bl, 0);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+ ObjectStore::Transaction t;
+ t.remove(cid, a);
+ t.remove_collection(cid);
+ cerr << "Cleaning" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+
+}
+
+TEST_P(StoreTest, ZeroVsObjectSize) {
+ int r;
+ coll_t cid;
+ struct stat stat;
+ ghobject_t hoid(hobject_t(sobject_t("foo", CEPH_NOSNAP)));
+ auto ch = store->create_new_collection(cid);
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ cerr << "Creating collection " << cid << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ bufferlist a;
+ a.append("stuff");
+ {
+ ObjectStore::Transaction t;
+ t.write(cid, hoid, 0, 5, a);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ ASSERT_EQ(0, store->stat(ch, hoid, &stat));
+ ASSERT_EQ(5, stat.st_size);
+ {
+ ObjectStore::Transaction t;
+ t.zero(cid, hoid, 1, 2);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ ASSERT_EQ(0, store->stat(ch, hoid, &stat));
+ ASSERT_EQ(5, stat.st_size);
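+ // zero inside the object leaves the size alone; the zeroes below extend past EOF and must grow it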
+ {
+ ObjectStore::Transaction t;
+ t.zero(cid, hoid, 3, 200);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ ASSERT_EQ(0, store->stat(ch, hoid, &stat));
+ ASSERT_EQ(203, stat.st_size);
+ {
+ ObjectStore::Transaction t;
+ t.zero(cid, hoid, 100000, 200);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ ASSERT_EQ(0, store->stat(ch, hoid, &stat));
+ ASSERT_EQ(100200, stat.st_size);
+}
+
+TEST_P(StoreTest, ZeroLengthWrite) {
+ int r;
+ coll_t cid;
+ ghobject_t hoid(hobject_t(sobject_t("foo", CEPH_NOSNAP)));
+ auto ch = store->create_new_collection(cid);
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ t.touch(cid, hoid);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+ ObjectStore::Transaction t;
+ bufferlist empty;
+ t.write(cid, hoid, 1048576, 0, empty);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ struct stat stat;
+ r = store->stat(ch, hoid, &stat);
+ ASSERT_EQ(0, r);
+ ASSERT_EQ(0, stat.st_size);
+
+ bufferlist newdata;
+ r = store->read(ch, hoid, 0, 1048576, newdata);
+ ASSERT_EQ(0, r);
+}
+
+TEST_P(StoreTest, ZeroLengthZero) {
+ int r;
+ coll_t cid;
+ ghobject_t hoid(hobject_t(sobject_t("foo", CEPH_NOSNAP)));
+ auto ch = store->create_new_collection(cid);
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ t.touch(cid, hoid);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(0, r);
+ }
+ {
+ ObjectStore::Transaction t;
+ t.zero(cid, hoid, 1048576, 0);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(0, r);
+ }
+ struct stat stat;
+ r = store->stat(ch, hoid, &stat);
+ ASSERT_EQ(0, r);
+ ASSERT_EQ(0, stat.st_size);
+
+ bufferlist newdata;
+ r = store->read(ch, hoid, 0, 1048576, newdata);
+ ASSERT_EQ(0, r);
+}
+
+TEST_P(StoreTest, SimpleAttrTest) {
+ int r;
+ coll_t cid;
+ ghobject_t hoid(hobject_t(sobject_t("attr object 1", CEPH_NOSNAP)));
+ bufferlist val, val2;
+ val.append("value");
+ val.append("value2");
+ {
+ auto ch = store->open_collection(cid);
+ ASSERT_FALSE(ch);
+ }
+ auto ch = store->create_new_collection(cid);
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+ bool empty;
+ int r = store->collection_empty(ch, &empty);
+ ASSERT_EQ(0, r);
+ ASSERT_TRUE(empty);
+ }
+ {
+ bufferptr bp;
+ r = store->getattr(ch, hoid, "nofoo", bp);
+ ASSERT_EQ(-ENOENT, r);
+ }
+ {
+ ObjectStore::Transaction t;
+ t.touch(cid, hoid);
+ t.setattr(cid, hoid, "foo", val);
+ t.setattr(cid, hoid, "bar", val2);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+ bool empty;
+ int r = store->collection_empty(ch, &empty);
+ ASSERT_EQ(0, r);
+ ASSERT_TRUE(!empty);
+ }
+ {
+ bufferptr bp;
+ r = store->getattr(ch, hoid, "nofoo", bp);
+ ASSERT_EQ(-ENODATA, r);
+
+ r = store->getattr(ch, hoid, "foo", bp);
+ ASSERT_EQ(0, r);
+ bufferlist bl;
+ bl.append(bp);
+ ASSERT_TRUE(bl_eq(val, bl));
+
+ map<string,bufferptr,less<>> bm;
+ r = store->getattrs(ch, hoid, bm);
+ ASSERT_EQ(0, r);
+
+ }
+ {
+ ObjectStore::Transaction t;
+ t.remove(cid, hoid);
+ t.remove_collection(cid);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+}
+
+TEST_P(StoreTest, SimpleListTest) {
+ int r;
+ coll_t cid(spg_t(pg_t(0, 1), shard_id_t(1)));
+ auto ch = store->create_new_collection(cid);
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ cerr << "Creating collection " << cid << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ set<ghobject_t> all;
+ {
+ ObjectStore::Transaction t;
+ for (int i=0; i<200; ++i) {
+ string name("object_");
+ name += stringify(i);
+ ghobject_t hoid(hobject_t(sobject_t(name, CEPH_NOSNAP)),
+ ghobject_t::NO_GEN, shard_id_t(1));
+ hoid.hobj.pool = 1;
+ all.insert(hoid);
+ t.touch(cid, hoid);
+ cerr << "Creating object " << hoid << std::endl;
+ }
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+ set<ghobject_t> saw;
+ vector<ghobject_t> objects;
+ ghobject_t next, current;
+ while (!next.is_max()) {
+ int r = collection_list(store, ch, current, ghobject_t::get_max(), 50,
+ &objects, &next);
+ ASSERT_EQ(r, 0);
+ ASSERT_TRUE(sorted(objects));
+ cout << " got " << objects.size() << " next " << next << std::endl;
+ for (vector<ghobject_t>::iterator p = objects.begin(); p != objects.end();
+ ++p) {
+ if (saw.count(*p)) {
+ cout << "got DUP " << *p << std::endl;
+ } else {
+ //cout << "got new " << *p << std::endl;
+ }
+ saw.insert(*p);
+ }
+ objects.clear();
+ current = next;
+ }
+ ASSERT_EQ(saw.size(), all.size());
+ ASSERT_EQ(saw, all);
+ }
+ {
+ ObjectStore::Transaction t;
+ for (set<ghobject_t>::iterator p = all.begin(); p != all.end(); ++p)
+ t.remove(cid, *p);
+ t.remove_collection(cid);
+ cerr << "Cleaning" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+}
+
+TEST_P(StoreTest, ListEndTest) {
+ int r;
+ coll_t cid(spg_t(pg_t(0, 1), shard_id_t(1)));
+ auto ch = store->create_new_collection(cid);
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ cerr << "Creating collection " << cid << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ set<ghobject_t> all;
+ {
+ ObjectStore::Transaction t;
+ for (int i=0; i<200; ++i) {
+ string name("object_");
+ name += stringify(i);
+ ghobject_t hoid(hobject_t(sobject_t(name, CEPH_NOSNAP)),
+ ghobject_t::NO_GEN, shard_id_t(1));
+ hoid.hobj.pool = 1;
+ all.insert(hoid);
+ t.touch(cid, hoid);
+ cerr << "Creating object " << hoid << std::endl;
+ }
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+ ghobject_t end(hobject_t(sobject_t("object_100", CEPH_NOSNAP)),
+ ghobject_t::NO_GEN, shard_id_t(1));
+ end.hobj.pool = 1;
+ vector<ghobject_t> objects;
+ ghobject_t next;
+ int r = collection_list(store, ch, ghobject_t(), end, 500, &objects, &next);
+ ASSERT_EQ(r, 0);
+ for (auto &p : objects) {
+ ASSERT_NE(p, end);
+ }
+ }
+ {
+ ObjectStore::Transaction t;
+ for (set<ghobject_t>::iterator p = all.begin(); p != all.end(); ++p)
+ t.remove(cid, *p);
+ t.remove_collection(cid);
+ cerr << "Cleaning" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+}
+
+TEST_P(StoreTest, List_0xfffffff_Hash_Test_in_meta) {
+ int r = 0;
+ coll_t cid;
+ auto ch = store->create_new_collection(cid);
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
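+ // touch an object at the maximum hash value (0xffffffff) so listing has to
+ // cope with the upper boundary of the hash space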
+ {
+ ObjectStore::Transaction t;
+ ghobject_t hoid(hobject_t(sobject_t("obj", CEPH_NOSNAP),
+ "", UINT32_C(0xffffffff), -1, "nspace"));
+ t.touch(cid, hoid);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+ vector<ghobject_t> objects;
+ r = collection_list(store, ch, ghobject_t(), ghobject_t::get_max(), INT_MAX,
+ &objects, nullptr, true);
+ ASSERT_EQ(r, 0);
+ ASSERT_EQ(objects.size(), 1);
+ }
+}
+
+TEST_P(StoreTest, List_0xfffffff_Hash_Test_in_PG) {
+ int r = 0;
+ const int64_t poolid = 1;
+ coll_t cid(spg_t(pg_t(0, poolid), shard_id_t::NO_SHARD));
+ auto ch = store->create_new_collection(cid);
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+ ObjectStore::Transaction t;
+ ghobject_t hoid(hobject_t(sobject_t("obj", CEPH_NOSNAP),
+ "", UINT32_C(0xffffffff), poolid, "nspace"));
+ t.touch(cid, hoid);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+ vector<ghobject_t> objects;
+ r = collection_list(store, ch, ghobject_t(), ghobject_t::get_max(), INT_MAX,
+ &objects, nullptr, true);
+ ASSERT_EQ(r, 0);
+ ASSERT_EQ(objects.size(), 1);
+ }
+}
+
+TEST_P(StoreTest, Sort) {
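+ // ordering is dominated by the pool id; the object name is only compared
+ // when the pools match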
+ {
+ hobject_t a(sobject_t("a", CEPH_NOSNAP));
+ hobject_t b = a;
+ ASSERT_EQ(a, b);
+ b.oid.name = "b";
+ ASSERT_NE(a, b);
+ ASSERT_TRUE(a < b);
+ a.pool = 1;
+ b.pool = 2;
+ ASSERT_TRUE(a < b);
+ a.pool = 3;
+ ASSERT_TRUE(a > b);
+ }
+ {
+ ghobject_t a(hobject_t(sobject_t("a", CEPH_NOSNAP)));
+ ghobject_t b(hobject_t(sobject_t("b", CEPH_NOSNAP)));
+ a.hobj.pool = 1;
+ b.hobj.pool = 1;
+ ASSERT_TRUE(a < b);
+ a.hobj.pool = -3;
+ ASSERT_TRUE(a < b);
+ a.hobj.pool = 1;
+ b.hobj.pool = -3;
+ ASSERT_TRUE(a > b);
+ }
+}
+
+TEST_P(StoreTest, MultipoolListTest) {
+ int r;
+ int poolid = 4373;
+ coll_t cid = coll_t(spg_t(pg_t(0, poolid), shard_id_t::NO_SHARD));
+ auto ch = store->create_new_collection(cid);
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ cerr << "Creating collection " << cid << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ set<ghobject_t> all, saw;
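+ // half the objects land in the PG's pool and half in the corresponding
+ // negative (temp) pool id; a single listing pass must surface both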
+ {
+ ObjectStore::Transaction t;
+ for (int i=0; i<200; ++i) {
+ string name("object_");
+ name += stringify(i);
+ ghobject_t hoid(hobject_t(sobject_t(name, CEPH_NOSNAP)));
+ if (rand() & 1)
+ hoid.hobj.pool = -2 - poolid;
+ else
+ hoid.hobj.pool = poolid;
+ all.insert(hoid);
+ t.touch(cid, hoid);
+ cerr << "Creating object " << hoid << std::endl;
+ }
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+ vector<ghobject_t> objects;
+ ghobject_t next, current;
+ while (!next.is_max()) {
+ int r = collection_list(store, ch, current, ghobject_t::get_max(), 50,
+ &objects, &next);
+ ASSERT_EQ(r, 0);
+ cout << " got " << objects.size() << " next " << next << std::endl;
+ for (vector<ghobject_t>::iterator p = objects.begin(); p != objects.end();
+ ++p) {
+ saw.insert(*p);
+ }
+ objects.clear();
+ current = next;
+ }
+ ASSERT_EQ(saw, all);
+ }
+ {
+ ObjectStore::Transaction t;
+ for (set<ghobject_t>::iterator p = all.begin(); p != all.end(); ++p)
+ t.remove(cid, *p);
+ t.remove_collection(cid);
+ cerr << "Cleaning" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+}
+
+TEST_P(StoreTest, SimpleCloneTest) {
+ int r;
+ coll_t cid;
+
+ SetDeathTestStyle("threadsafe");
+
+ auto ch = store->create_new_collection(cid);
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ cerr << "Creating collection " << cid << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ ghobject_t hoid(hobject_t(sobject_t("Object 1", CEPH_NOSNAP),
+ "key", 123, -1, ""));
+ bufferlist small, large, xlarge, newdata, attr;
+ small.append("small");
+ large.append("large");
+ xlarge.append("xlarge");
+ {
+ ObjectStore::Transaction t;
+ t.touch(cid, hoid);
+ t.setattr(cid, hoid, "attr1", small);
+ t.setattr(cid, hoid, "attr2", large);
+ t.setattr(cid, hoid, "attr3", xlarge);
+ t.write(cid, hoid, 0, small.length(), small);
+ t.write(cid, hoid, 10, small.length(), small);
+ cerr << "Creating object and set attr " << hoid << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+
+ ghobject_t hoid2(hobject_t(sobject_t("Object 2", CEPH_NOSNAP),
+ "key", 123, -1, ""));
+ ghobject_t hoid3(hobject_t(sobject_t("Object 3", CEPH_NOSNAP)));
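+ // clone hoid into hoid2, then overwrite the source; reads must see the new
+ // data on hoid while hoid2 keeps the pre-clone data and attrs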
+ {
+ ObjectStore::Transaction t;
+ t.clone(cid, hoid, hoid2);
+ t.setattr(cid, hoid2, "attr2", small);
+ t.rmattr(cid, hoid2, "attr1");
+ t.write(cid, hoid, 10, large.length(), large);
+ t.setattr(cid, hoid, "attr1", large);
+ t.setattr(cid, hoid, "attr2", small);
+ cerr << "Clone object and rm attr" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+
+ r = store->read(ch, hoid, 10, 5, newdata);
+ ASSERT_EQ(r, 5);
+ ASSERT_TRUE(bl_eq(large, newdata));
+
+ newdata.clear();
+ r = store->read(ch, hoid, 0, 5, newdata);
+ ASSERT_EQ(r, 5);
+ ASSERT_TRUE(bl_eq(small, newdata));
+
+ newdata.clear();
+ r = store->read(ch, hoid2, 10, 5, newdata);
+ ASSERT_EQ(r, 5);
+ ASSERT_TRUE(bl_eq(small, newdata));
+
+ r = store->getattr(ch, hoid2, "attr2", attr);
+ ASSERT_EQ(r, 0);
+ ASSERT_TRUE(bl_eq(small, attr));
+
+ attr.clear();
+ r = store->getattr(ch, hoid2, "attr3", attr);
+ ASSERT_EQ(r, 0);
+ ASSERT_TRUE(bl_eq(xlarge, attr));
+
+ attr.clear();
+ r = store->getattr(ch, hoid, "attr1", attr);
+ ASSERT_EQ(r, 0);
+ ASSERT_TRUE(bl_eq(large, attr));
+ }
+ {
+ ObjectStore::Transaction t;
+ t.remove(cid, hoid);
+ t.remove(cid, hoid2);
+ ASSERT_EQ(0, queue_transaction(store, ch, std::move(t)));
+ }
+ {
+ bufferlist final;
+ bufferptr p(16384);
+ memset(p.c_str(), 1, p.length());
+ bufferlist pl;
+ pl.append(p);
+ final.append(p);
+ ObjectStore::Transaction t;
+ t.write(cid, hoid, 0, pl.length(), pl);
+ t.clone(cid, hoid, hoid2);
+ bufferptr a(4096);
+ memset(a.c_str(), 2, a.length());
+ bufferlist al;
+ al.append(a);
+ final.append(a);
+ t.write(cid, hoid, pl.length(), a.length(), al);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ bufferlist rl;
+ ASSERT_EQ((int)final.length(),
+ store->read(ch, hoid, 0, final.length(), rl));
+ ASSERT_TRUE(bl_eq(rl, final));
+ }
+ {
+ ObjectStore::Transaction t;
+ t.remove(cid, hoid);
+ t.remove(cid, hoid2);
+ ASSERT_EQ(0, queue_transaction(store, ch, std::move(t)));
+ }
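+ // as above, but the post-clone write lands one 4K block past the source's
+ // end, leaving a zero-filled hole that the read-back must reproduce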
+ {
+ bufferlist final;
+ bufferptr p(16384);
+ memset(p.c_str(), 111, p.length());
+ bufferlist pl;
+ pl.append(p);
+ final.append(p);
+ ObjectStore::Transaction t;
+ t.write(cid, hoid, 0, pl.length(), pl);
+ t.clone(cid, hoid, hoid2);
+ bufferptr z(4096);
+ z.zero();
+ final.append(z);
+ bufferptr a(4096);
+ memset(a.c_str(), 112, a.length());
+ bufferlist al;
+ al.append(a);
+ final.append(a);
+ t.write(cid, hoid, pl.length() + z.length(), a.length(), al);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ bufferlist rl;
+ ASSERT_EQ((int)final.length(),
+ store->read(ch, hoid, 0, final.length(), rl));
+ ASSERT_TRUE(bl_eq(rl, final));
+ }
+ {
+ ObjectStore::Transaction t;
+ t.remove(cid, hoid);
+ t.remove(cid, hoid2);
+ ASSERT_EQ(0, queue_transaction(store, ch, std::move(t)));
+ }
+ {
+ bufferlist final;
+ bufferptr p(16000);
+ memset(p.c_str(), 5, p.length());
+ bufferlist pl;
+ pl.append(p);
+ final.append(p);
+ ObjectStore::Transaction t;
+ t.write(cid, hoid, 0, pl.length(), pl);
+ t.clone(cid, hoid, hoid2);
+ bufferptr z(1000);
+ z.zero();
+ final.append(z);
+ bufferptr a(8000);
+ memset(a.c_str(), 6, a.length());
+ bufferlist al;
+ al.append(a);
+ final.append(a);
+ t.write(cid, hoid, 17000, a.length(), al);
+ ASSERT_EQ(0, queue_transaction(store, ch, std::move(t)));
+ bufferlist rl;
+ ASSERT_EQ((int)final.length(),
+ store->read(ch, hoid, 0, final.length(), rl));
+ /*cout << "expected:\n";
+ final.hexdump(cout);
+ cout << "got:\n";
+ rl.hexdump(cout);*/
+ ASSERT_TRUE(bl_eq(rl, final));
+ }
+ {
+ ObjectStore::Transaction t;
+ t.remove(cid, hoid);
+ t.remove(cid, hoid2);
+ ASSERT_EQ(0, queue_transaction(store, ch, std::move(t)));
+ }
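+ // clone a 1MB object, then overwrite 64K in the middle of the source; the
+ // source must read back as original head + new extent + original tail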
+ {
+ bufferptr p(1048576);
+ memset(p.c_str(), 3, p.length());
+ bufferlist pl;
+ pl.append(p);
+ ObjectStore::Transaction t;
+ t.write(cid, hoid, 0, pl.length(), pl);
+ t.clone(cid, hoid, hoid2);
+ bufferptr a(65536);
+ memset(a.c_str(), 4, a.length());
+ bufferlist al;
+ al.append(a);
+ t.write(cid, hoid, a.length(), a.length(), al);
+ ASSERT_EQ(0, queue_transaction(store, ch, std::move(t)));
+ bufferlist rl;
+ bufferlist final;
+ final.substr_of(pl, 0, al.length());
+ final.append(al);
+ bufferlist end;
+ end.substr_of(pl, al.length()*2, pl.length() - al.length()*2);
+ final.append(end);
+ ASSERT_EQ((int)final.length(),
+ store->read(ch, hoid, 0, final.length(), rl));
+ /*cout << "expected:\n";
+ final.hexdump(cout);
+ cout << "got:\n";
+ rl.hexdump(cout);*/
+ ASSERT_TRUE(bl_eq(rl, final));
+ }
+ {
+ ObjectStore::Transaction t;
+ t.remove(cid, hoid);
+ t.remove(cid, hoid2);
+ ASSERT_EQ(0, queue_transaction(store, ch, std::move(t)));
+ }
+ {
+ bufferptr p(65536);
+ memset(p.c_str(), 7, p.length());
+ bufferlist pl;
+ pl.append(p);
+ ObjectStore::Transaction t;
+ t.write(cid, hoid, 0, pl.length(), pl);
+ t.clone(cid, hoid, hoid2);
+ bufferptr a(4096);
+ memset(a.c_str(), 8, a.length());
+ bufferlist al;
+ al.append(a);
+ t.write(cid, hoid, 32768, a.length(), al);
+ ASSERT_EQ(0, queue_transaction(store, ch, std::move(t)));
+ bufferlist rl;
+ bufferlist final;
+ final.substr_of(pl, 0, 32768);
+ final.append(al);
+ bufferlist end;
+ end.substr_of(pl, final.length(), pl.length() - final.length());
+ final.append(end);
+ ASSERT_EQ((int)final.length(),
+ store->read(ch, hoid, 0, final.length(), rl));
+ /*cout << "expected:\n";
+ final.hexdump(cout);
+ cout << "got:\n";
+ rl.hexdump(cout);*/
+ ASSERT_TRUE(bl_eq(rl, final));
+ }
+ {
+ ObjectStore::Transaction t;
+ t.remove(cid, hoid);
+ t.remove(cid, hoid2);
+ ASSERT_EQ(0, queue_transaction(store, ch, std::move(t)));
+ }
+ {
+ bufferptr p(65536);
+ memset(p.c_str(), 9, p.length());
+ bufferlist pl;
+ pl.append(p);
+ ObjectStore::Transaction t;
+ t.write(cid, hoid, 0, pl.length(), pl);
+ t.clone(cid, hoid, hoid2);
+ bufferptr a(4096);
+ memset(a.c_str(), 10, a.length());
+ bufferlist al;
+ al.append(a);
+ t.write(cid, hoid, 33768, a.length(), al);
+ ASSERT_EQ(0, queue_transaction(store, ch, std::move(t)));
+ bufferlist rl;
+ bufferlist final;
+ final.substr_of(pl, 0, 33768);
+ final.append(al);
+ bufferlist end;
+ end.substr_of(pl, final.length(), pl.length() - final.length());
+ final.append(end);
+ ASSERT_EQ((int)final.length(),
+ store->read(ch, hoid, 0, final.length(), rl));
+ /*cout << "expected:\n";
+ final.hexdump(cout);
+ cout << "got:\n";
+ rl.hexdump(cout);*/
+ ASSERT_TRUE(bl_eq(rl, final));
+ }
+
+ {
+ // verify that a non-empty collection is properly handled after store reload
+ ch.reset();
+ r = store->umount();
+ ASSERT_EQ(r, 0);
+ r = store->mount();
+ ASSERT_EQ(r, 0);
+ ch = store->open_collection(cid);
+
+ ObjectStore::Transaction t;
+ t.remove_collection(cid);
+ cerr << "Invalid rm coll" << std::endl;
+ PrCtl unset_dumpable;
+ EXPECT_DEATH(queue_transaction(store, ch, std::move(t)), "");
+ }
+ {
+ ObjectStore::Transaction t;
+ t.touch(cid, hoid3); //new record in db
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+ ObjectStore::Transaction t;
+ // verify that a non-empty collection is properly handled when there are pending removes and live records in the db
+ cerr << "Invalid rm coll again" << std::endl;
+ ch.reset();
+ r = store->umount();
+ ASSERT_EQ(r, 0);
+ r = store->mount();
+ ASSERT_EQ(r, 0);
+ ch = store->open_collection(cid);
+
+ t.remove(cid, hoid);
+ t.remove(cid, hoid2);
+ t.remove_collection(cid);
+ PrCtl unset_dumpable;
+ EXPECT_DEATH(queue_transaction(store, ch, std::move(t)), "");
+ }
+ {
+ ObjectStore::Transaction t;
+ t.remove(cid, hoid);
+ t.remove(cid, hoid2);
+ t.remove(cid, hoid3);
+ t.remove_collection(cid);
+ cerr << "Cleaning" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+}
+
+TEST_P(StoreTest, OmapSimple) {
+ int r;
+ coll_t cid;
+ auto ch = store->create_new_collection(cid);
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ cerr << "Creating collection " << cid << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ ghobject_t hoid(hobject_t(sobject_t("omap_obj", CEPH_NOSNAP),
+ "key", 123, -1, ""));
+ bufferlist small;
+ small.append("small");
+ map<string,bufferlist> km;
+ km["foo"] = small;
+ km["bar"].append("asdfjkasdkjdfsjkafskjsfdj");
+ bufferlist header;
+ header.append("this is a header");
+ {
+ ObjectStore::Transaction t;
+ t.touch(cid, hoid);
+ t.omap_setkeys(cid, hoid, km);
+ t.omap_setheader(cid, hoid, header);
+ cerr << "Creating object and set omap " << hoid << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ // get header, keys
+ {
+ bufferlist h;
+ map<string,bufferlist> r;
+ store->omap_get(ch, hoid, &h, &r);
+ ASSERT_TRUE(bl_eq(header, h));
+ ASSERT_EQ(r.size(), km.size());
+ cout << "r: " << r << std::endl;
+ }
+ // test iterator with seek_to_first
+ {
+ map<string,bufferlist> r;
+ ObjectMap::ObjectMapIterator iter = store->get_omap_iterator(ch, hoid);
+ for (iter->seek_to_first(); iter->valid(); iter->next()) {
+ r[iter->key()] = iter->value();
+ }
+ cout << "r: " << r << std::endl;
+ ASSERT_EQ(r.size(), km.size());
+ }
+ // test iterator with initial lower_bound
+ {
+ map<string,bufferlist> r;
+ ObjectMap::ObjectMapIterator iter = store->get_omap_iterator(ch, hoid);
+ for (iter->lower_bound(string()); iter->valid(); iter->next()) {
+ r[iter->key()] = iter->value();
+ }
+ cout << "r: " << r << std::endl;
+ ASSERT_EQ(r.size(), km.size());
+ }
+ {
+ ObjectStore::Transaction t;
+ t.remove(cid, hoid);
+ t.remove_collection(cid);
+ cerr << "Cleaning" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+}
+
+TEST_P(StoreTest, OmapCloneTest) {
+ int r;
+ coll_t cid;
+ auto ch = store->create_new_collection(cid);
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ cerr << "Creating collection " << cid << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ ghobject_t hoid(hobject_t(sobject_t("Object 1", CEPH_NOSNAP),
+ "key", 123, -1, ""));
+ bufferlist small;
+ small.append("small");
+ map<string,bufferlist> km;
+ km["foo"] = small;
+ km["bar"].append("asdfjkasdkjdfsjkafskjsfdj");
+ bufferlist header;
+ header.append("this is a header");
+ {
+ ObjectStore::Transaction t;
+ t.touch(cid, hoid);
+ t.omap_setkeys(cid, hoid, km);
+ t.omap_setheader(cid, hoid, header);
+ cerr << "Creating object and set omap " << hoid << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ ghobject_t hoid2(hobject_t(sobject_t("Object 2", CEPH_NOSNAP),
+ "key", 123, -1, ""));
+ {
+ ObjectStore::Transaction t;
+ t.clone(cid, hoid, hoid2);
+ cerr << "Clone object" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+ map<string,bufferlist> r;
+ bufferlist h;
+ store->omap_get(ch, hoid2, &h, &r);
+ ASSERT_TRUE(bl_eq(header, h));
+ ASSERT_EQ(r.size(), km.size());
+ }
+ {
+ ObjectStore::Transaction t;
+ t.remove(cid, hoid);
+ t.remove(cid, hoid2);
+ t.remove_collection(cid);
+ cerr << "Cleaning" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+}
+
+TEST_P(StoreTest, SimpleCloneRangeTest) {
+ int r;
+ coll_t cid;
+ auto ch = store->create_new_collection(cid);
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ cerr << "Creating collection " << cid << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ ghobject_t hoid(hobject_t(sobject_t("Object 1", CEPH_NOSNAP)));
+ hoid.hobj.pool = -1;
+ bufferlist small, newdata;
+ small.append("small");
+ {
+ ObjectStore::Transaction t;
+ t.write(cid, hoid, 10, 5, small);
+ cerr << "Creating object and write bl " << hoid << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ ghobject_t hoid2(hobject_t(sobject_t("Object 2", CEPH_NOSNAP)));
+ hoid2.hobj.pool = -1;
+ {
+ ObjectStore::Transaction t;
+ t.clone_range(cid, hoid, hoid2, 10, 5, 10);
+ cerr << "Clone range object" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ r = store->read(ch, hoid2, 10, 5, newdata);
+ ASSERT_EQ(r, 5);
+ ASSERT_TRUE(bl_eq(small, newdata));
+ }
+ {
+ ObjectStore::Transaction t;
+ t.truncate(cid, hoid, 1024*1024);
+ t.clone_range(cid, hoid, hoid2, 0, 1024*1024, 0);
+ cerr << "Clone range object" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ struct stat stat, stat2;
+ r = store->stat(ch, hoid, &stat);
+ r = store->stat(ch, hoid2, &stat2);
+ ASSERT_EQ(stat.st_size, stat2.st_size);
+ ASSERT_EQ(1024*1024, stat2.st_size);
+ }
+ {
+ ObjectStore::Transaction t;
+ t.remove(cid, hoid);
+ t.remove(cid, hoid2);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+}
+
+#if defined(WITH_BLUESTORE)
+TEST_P(StoreTest, BlueStoreUnshareBlobTest) {
+ if (string(GetParam()) != "bluestore")
+ return;
+ if (smr) {
+ cout << "SKIP: non-deterministic behavior with smr" << std::endl;
+ return;
+ }
+ int r;
+ coll_t cid;
+ auto ch = store->create_new_collection(cid);
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ cerr << "Creating collection " << cid << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ ghobject_t hoid(hobject_t(sobject_t("Object 1", CEPH_NOSNAP)));
+ hoid.hobj.pool = -1;
+ ghobject_t hoid2(hobject_t(sobject_t("Object 1", CEPH_NOSNAP)));
+ hoid2.hobj.pool = -1;
+ hoid2.generation = 2;
+ {
+ // check if blob is unshared properly
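+ // clone_range makes the underlying blob shared between hoid and hoid2; once
+ // hoid2 is removed, the shared-blob keyspace is expected to end up empty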
+ bufferlist data, newdata;
+ data.append(string(8192, 'a'));
+
+ ObjectStore::Transaction t;
+ t.write(cid, hoid, 0, data.length(), data);
+ cerr << "Creating object and write 8K " << hoid << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+
+ ObjectStore::Transaction t2;
+ t2.clone_range(cid, hoid, hoid2, 0, 4096, 0);
+ cerr << "Clone range object" << std::endl;
+ r = queue_transaction(store, ch, std::move(t2));
+ ASSERT_EQ(r, 0);
+
+ data.clear();
+ data.append(string(4096, 'b'));
+
+ ObjectStore::Transaction t3;
+ t3.write(cid, hoid, 0, data.length(), data);
+ cerr << "Writing 4k to source object " << hoid << std::endl;
+ r = queue_transaction(store, ch, std::move(t3));
+ ASSERT_EQ(r, 0);
+
+ {
+ // this evicts hoid's onode from the onode cache
+ EXPECT_EQ(store->umount(), 0);
+ EXPECT_EQ(store->mount(), 0);
+ ch = store->open_collection(cid);
+ }
+
+ ObjectStore::Transaction t4;
+ t4.remove(cid, hoid2);
+ cerr << "Deleting dest object" << hoid2 << std::endl;
+ r = queue_transaction(store, ch, std::move(t4));
+ ASSERT_EQ(r, 0);
+
+ {
+ // this ensures the remove operation was submitted to the kv store
+ EXPECT_EQ(store->umount(), 0);
+ EXPECT_EQ(store->mount(), 0);
+ ch = store->open_collection(cid);
+ }
+
+ bufferlist resdata;
+ r = store->read(ch, hoid, 0, 0x2000, resdata);
+ ASSERT_EQ(r, 0x2000);
+
+ {
+ BlueStore* bstore = dynamic_cast<BlueStore*> (store.get());
+ auto* kv = bstore->get_kv();
+
+ // must stay in sync with PREFIX_SHARED_BLOB in BlueStore.cc
+ const string PREFIX_SHARED_BLOB = "X";
+
+ size_t cnt = 0;
+ auto it = kv->get_iterator(PREFIX_SHARED_BLOB);
+ ceph_assert(it);
+ for (it->lower_bound(string()); it->valid(); it->next()) {
+ ++cnt;
+ }
+ ASSERT_EQ(cnt, 0);
+ }
+ }
+ {
+ ObjectStore::Transaction t;
+ t.remove(cid, hoid);
+ t.remove_collection(cid);
+ cerr << "Cleaning" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+}
+
+TEST_P(StoreTest, BlueStoreUnshareBlobBugTest) {
+ if (string(GetParam()) != "bluestore")
+ return;
+ int r;
+ coll_t cid;
+ auto ch = store->create_new_collection(cid);
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ cerr << "Creating collection " << cid << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ ghobject_t hoid(hobject_t(sobject_t("Object 1", CEPH_NOSNAP)));
+ hoid.hobj.pool = -1;
+ ghobject_t hoid2(hobject_t(sobject_t("Object 1", CEPH_NOSNAP)));
+ hoid2.hobj.pool = -1;
+ hoid2.generation = 2;
+ {
+ // check if blob is unshared properly
+ bufferlist data, newdata;
+ data.append(string(8192, 'a'));
+
+ ObjectStore::Transaction t;
+ t.write(cid, hoid, 0, data.length(), data);
+ cerr << "Creating object and write 8K " << hoid << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+
+ ObjectStore::Transaction t2;
+ t2.clone_range(cid, hoid, hoid2, 0, 4096, 0);
+ cerr << "Clone range object" << std::endl;
+ r = queue_transaction(store, ch, std::move(t2));
+ ASSERT_EQ(r, 0);
+
+ data.clear();
+ data.append(string(4096, 'b'));
+
+ ObjectStore::Transaction t3;
+ t3.write(cid, hoid, 0, data.length(), data);
+ cerr << "Writing 4k to source object " << hoid << std::endl;
+ r = queue_transaction(store, ch, std::move(t3));
+ ASSERT_EQ(r, 0);
+
+ {
+ // this evicts hoid's onode from the onode cache
+ EXPECT_EQ(store->umount(), 0);
+ EXPECT_EQ(store->mount(), 0);
+ ch = store->open_collection(cid);
+ }
+
+ ObjectStore::Transaction t4;
+ t4.write(cid, hoid2, 0, data.length(), data);
+ cerr << "Writing 4k to second object " << hoid2 << std::endl;
+ r = queue_transaction(store, ch, std::move(t4));
+ ASSERT_EQ(r, 0);
+
+ bufferlist resdata;
+ r = store->read(ch, hoid, 0, 0x2000, resdata);
+ ASSERT_EQ(r, 0x2000);
+
+ {
+ BlueStore* bstore = dynamic_cast<BlueStore*> (store.get());
+ auto* kv = bstore->get_kv();
+
+ // must stay in sync with PREFIX_SHARED_BLOB in BlueStore.cc
+ const string PREFIX_SHARED_BLOB = "X";
+
+ size_t cnt = 0;
+ auto it = kv->get_iterator(PREFIX_SHARED_BLOB);
+ ceph_assert(it);
+ for (it->lower_bound(string()); it->valid(); it->next()) {
+ ++cnt;
+ }
+ // This demonstrates a bug in blob unsharing: after writing 0x0~1000 to
+ // hoid2, the shared blob on hoid should become unshared, but the current
+ // implementation leaves it shared.
+ ASSERT_EQ(cnt, 1);
+ }
+ }
+ {
+ ObjectStore::Transaction t;
+ t.remove(cid, hoid);
+ t.remove(cid, hoid2);
+ t.remove_collection(cid);
+ cerr << "Cleaning" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+}
+#endif
+
+TEST_P(StoreTest, SimpleObjectLongnameTest) {
+ int r;
+ coll_t cid;
+ auto ch = store->create_new_collection(cid);
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ cerr << "Creating collection " << cid << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ ghobject_t hoid(hobject_t(sobject_t("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaObjectaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa 1", CEPH_NOSNAP)));
+ {
+ ObjectStore::Transaction t;
+ t.touch(cid, hoid);
+ cerr << "Creating object " << hoid << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+ ObjectStore::Transaction t;
+ t.remove(cid, hoid);
+ t.remove_collection(cid);
+ cerr << "Cleaning" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+}
+
+ghobject_t generate_long_name(unsigned i)
+{
+ stringstream name;
+ name << "object id " << i << " ";
+ for (unsigned j = 0; j < 500; ++j) name << 'a';
+ ghobject_t hoid(hobject_t(sobject_t(name.str(), CEPH_NOSNAP)));
+ hoid.hobj.set_hash(i % 2);
+ return hoid;
+}
+
+TEST_P(StoreTest, LongnameSplitTest) {
+ int r;
+ coll_t cid;
+ auto ch = store->create_new_collection(cid);
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ cerr << "Creating collection " << cid << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(0, r);
+ }
+ for (unsigned i = 0; i < 320; ++i) {
+ ObjectStore::Transaction t;
+ ghobject_t hoid = generate_long_name(i);
+ t.touch(cid, hoid);
+ cerr << "Creating object " << hoid << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(0, r);
+ }
+
+ ghobject_t test_obj = generate_long_name(319);
+ ghobject_t test_obj_2 = test_obj;
+ test_obj_2.generation = 0;
+ {
+ ObjectStore::Transaction t;
+ // should cause a split
+ t.collection_move_rename(
+ cid, test_obj,
+ cid, test_obj_2);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(0, r);
+ }
+
+ for (unsigned i = 0; i < 319; ++i) {
+ ObjectStore::Transaction t;
+ ghobject_t hoid = generate_long_name(i);
+ t.remove(cid, hoid);
+ cerr << "Removing object " << hoid << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(0, r);
+ }
+ {
+ ObjectStore::Transaction t;
+ t.remove(cid, test_obj_2);
+ t.remove_collection(cid);
+ cerr << "Cleaning" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(0, r);
+ }
+}
+
+TEST_P(StoreTest, ManyObjectTest) {
+ int NUM_OBJS = 2000;
+ int r = 0;
+ coll_t cid;
+ string base = "";
+ for (int i = 0; i < 100; ++i) base.append("aaaaa");
+ set<ghobject_t> created;
+ auto ch = store->create_new_collection(cid);
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ for (int i = 0; i < NUM_OBJS; ++i) {
+ if (!(i % 5)) {
+ cerr << "Object " << i << std::endl;
+ }
+ ObjectStore::Transaction t;
+ char buf[100];
+ snprintf(buf, sizeof(buf), "%d", i);
+ ghobject_t hoid(hobject_t(sobject_t(string(buf) + base, CEPH_NOSNAP)));
+ t.touch(cid, hoid);
+ created.insert(hoid);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+
+ for (set<ghobject_t>::iterator i = created.begin();
+ i != created.end();
+ ++i) {
+ struct stat buf;
+ ASSERT_TRUE(!store->stat(ch, *i, &buf));
+ }
+
+ set<ghobject_t> listed, listed2;
+ vector<ghobject_t> objects;
+ r = collection_list(store, ch, ghobject_t(), ghobject_t::get_max(), INT_MAX,
+ &objects, 0);
+ ASSERT_EQ(r, 0);
+
+ cerr << "objects.size() is " << objects.size() << std::endl;
+ for (vector<ghobject_t> ::iterator i = objects.begin();
+ i != objects.end();
+ ++i) {
+ listed.insert(*i);
+ ASSERT_TRUE(created.count(*i));
+ }
+ ASSERT_TRUE(listed.size() == created.size());
+
+ ghobject_t start, next;
+ objects.clear();
+ r = collection_list(
+ store,
+ ch,
+ ghobject_t::get_max(),
+ ghobject_t::get_max(),
+ 50,
+ &objects,
+ &next
+ );
+ ASSERT_EQ(r, 0);
+ ASSERT_TRUE(objects.empty());
+
+ objects.clear();
+ listed.clear();
+ ghobject_t start2, next2;
+ while (1) {
+ r = collection_list(store, ch, start, ghobject_t::get_max(), 50, &objects,
+ &next);
+ ASSERT_TRUE(sorted(objects));
+ ASSERT_EQ(r, 0);
+ listed.insert(objects.begin(), objects.end());
+ if (objects.size() < 50) {
+ ASSERT_TRUE(next.is_max());
+ break;
+ }
+ objects.clear();
+
+ start = next;
+ }
+ cerr << "listed.size() is " << listed.size() << std::endl;
+ ASSERT_TRUE(listed.size() == created.size());
+ if (listed2.size()) {
+ ASSERT_EQ(listed.size(), listed2.size());
+ }
+ for (set<ghobject_t>::iterator i = listed.begin();
+ i != listed.end();
+ ++i) {
+ ASSERT_TRUE(created.count(*i));
+ }
+
+ for (set<ghobject_t>::iterator i = created.begin();
+ i != created.end();
+ ++i) {
+ ObjectStore::Transaction t;
+ t.remove(cid, *i);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ cerr << "cleaning up" << std::endl;
+ {
+ ObjectStore::Transaction t;
+ t.remove_collection(cid);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+}
+
+
+class ObjectGenerator {
+public:
+ virtual ghobject_t create_object(gen_type *gen) = 0;
+ virtual ~ObjectGenerator() {}
+};
+
+class MixedGenerator : public ObjectGenerator {
+public:
+ unsigned seq;
+ int64_t poolid;
+ explicit MixedGenerator(int64_t p) : seq(0), poolid(p) {}
+ ghobject_t create_object(gen_type *gen) override {
+ char buf[100];
+ snprintf(buf, sizeof(buf), "OBJ_%u", seq);
+ string name(buf);
+ if (seq % 2) {
+ for (unsigned i = 0; i < 300; ++i) {
+ name.push_back('a');
+ }
+ }
+ ++seq;
+ return ghobject_t(
+ hobject_t(
+ name, string(), rand() & 2 ? CEPH_NOSNAP : rand(),
+ (((seq / 1024) % 2) * 0xF00 ) +
+ (seq & 0xFF),
+ poolid, ""));
+ }
+};
+
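+// Randomized workload driver: every mutation is mirrored into the in-memory
+// 'contents' map so that completion callbacks and readers can verify what
+// the store returns against the expected object data and attrs.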
+class SyntheticWorkloadState {
+ struct Object {
+ bufferlist data;
+ map<string, bufferlist> attrs;
+ };
+public:
+ static const unsigned max_in_flight = 16;
+ static const unsigned max_objects = 3000;
+ static const unsigned max_attr_size = 5;
+ static const unsigned max_attr_name_len = 100;
+ static const unsigned max_attr_value_len = 1024 * 64;
+ coll_t cid;
+ unsigned write_alignment;
+ unsigned max_object_len, max_write_len;
+ unsigned in_flight;
+ map<ghobject_t, Object> contents;
+ set<ghobject_t> available_objects;
+ set<ghobject_t>::iterator next_available_object;
+ set<ghobject_t> in_flight_objects;
+ ObjectGenerator *object_gen;
+ gen_type *rng;
+ ObjectStore *store;
+ ObjectStore::CollectionHandle ch;
+
+ ceph::mutex lock = ceph::make_mutex("State lock");
+ ceph::condition_variable cond;
+
+ struct EnterExit {
+ const char *msg;
+ explicit EnterExit(const char *m) : msg(m) {
+ //cout << pthread_self() << " enter " << msg << std::endl;
+ }
+ ~EnterExit() {
+ //cout << pthread_self() << " exit " << msg << std::endl;
+ }
+ };
+
+ class C_SyntheticOnReadable : public Context {
+ public:
+ SyntheticWorkloadState *state;
+ ghobject_t hoid;
+ C_SyntheticOnReadable(SyntheticWorkloadState *state, ghobject_t hoid)
+ : state(state), hoid(hoid) {}
+
+ void finish(int r) override {
+ std::lock_guard locker{state->lock};
+ EnterExit ee("onreadable finish");
+ ASSERT_TRUE(state->in_flight_objects.count(hoid));
+ ASSERT_EQ(r, 0);
+ state->in_flight_objects.erase(hoid);
+ if (state->contents.count(hoid))
+ state->available_objects.insert(hoid);
+ --(state->in_flight);
+ state->cond.notify_all();
+
+ bufferlist r2;
+ r = state->store->read(state->ch, hoid, 0, state->contents[hoid].data.length(), r2);
+ ceph_assert(bl_eq(state->contents[hoid].data, r2));
+ state->cond.notify_all();
+ }
+ };
+
+ class C_SyntheticOnStash : public Context {
+ public:
+ SyntheticWorkloadState *state;
+ ghobject_t oid, noid;
+
+ C_SyntheticOnStash(SyntheticWorkloadState *state,
+ ghobject_t oid, ghobject_t noid)
+ : state(state), oid(oid), noid(noid) {}
+
+ void finish(int r) override {
+ std::lock_guard locker{state->lock};
+ EnterExit ee("stash finish");
+ ASSERT_TRUE(state->in_flight_objects.count(oid));
+ ASSERT_EQ(r, 0);
+ state->in_flight_objects.erase(oid);
+ if (state->contents.count(noid))
+ state->available_objects.insert(noid);
+ --(state->in_flight);
+ bufferlist r2;
+ r = state->store->read(
+ state->ch, noid, 0,
+ state->contents[noid].data.length(), r2);
+ ceph_assert(bl_eq(state->contents[noid].data, r2));
+ state->cond.notify_all();
+ }
+ };
+
+ class C_SyntheticOnClone : public Context {
+ public:
+ SyntheticWorkloadState *state;
+ ghobject_t oid, noid;
+
+ C_SyntheticOnClone(SyntheticWorkloadState *state,
+ ghobject_t oid, ghobject_t noid)
+ : state(state), oid(oid), noid(noid) {}
+
+ void finish(int r) override {
+ std::lock_guard locker{state->lock};
+ EnterExit ee("clone finish");
+ ASSERT_TRUE(state->in_flight_objects.count(oid));
+ ASSERT_EQ(r, 0);
+ state->in_flight_objects.erase(oid);
+ if (state->contents.count(oid))
+ state->available_objects.insert(oid);
+ if (state->contents.count(noid))
+ state->available_objects.insert(noid);
+ --(state->in_flight);
+ bufferlist r2;
+ r = state->store->read(state->ch, noid, 0, state->contents[noid].data.length(), r2);
+ ceph_assert(bl_eq(state->contents[noid].data, r2));
+ state->cond.notify_all();
+ }
+ };
+
+ static void filled_byte_array(bufferlist& bl, size_t size)
+ {
+ static const char alphanum[] = "0123456789"
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+ "abcdefghijklmnopqrstuvwxyz";
+ if (!size) {
+ return;
+ }
+ bufferptr bp(size);
+ for (unsigned int i = 0; i < size - 1; i++) {
+ // severely limit entropy so we can compress...
+ bp[i] = alphanum[rand() % 10]; //(sizeof(alphanum) - 1)];
+ }
+ bp[size - 1] = '\0';
+
+ bl.append(bp);
+ }
+
+ SyntheticWorkloadState(ObjectStore *store,
+ ObjectGenerator *gen,
+ gen_type *rng,
+ coll_t cid,
+ unsigned max_size,
+ unsigned max_write,
+ unsigned alignment)
+ : cid(cid), write_alignment(alignment), max_object_len(max_size),
+ max_write_len(max_write), in_flight(0),
+ next_available_object(available_objects.end()),
+ object_gen(gen), rng(rng), store(store) {}
+
+ int init() {
+ ObjectStore::Transaction t;
+ ch = store->create_new_collection(cid);
+ t.create_collection(cid, 0);
+ return queue_transaction(store, ch, std::move(t));
+ }
+ void shutdown() {
+ ghobject_t next;
+ while (1) {
+ vector<ghobject_t> objects;
+ int r = collection_list(store, ch, next, ghobject_t::get_max(), 10,
+ &objects, &next);
+ ceph_assert(r >= 0);
+ if (objects.size() == 0)
+ break;
+ ObjectStore::Transaction t;
+ std::map<std::string, ceph::buffer::list> attrset;
+ for (vector<ghobject_t>::iterator p = objects.begin();
+ p != objects.end(); ++p) {
+ t.remove(cid, *p);
+ }
+ queue_transaction(store, ch, std::move(t));
+ }
+ ObjectStore::Transaction t;
+ t.remove_collection(cid);
+ queue_transaction(store, ch, std::move(t));
+ }
+ void statfs(store_statfs_t& stat) {
+ store->statfs(&stat);
+ }
+
+ ghobject_t get_uniform_random_object(std::unique_lock<ceph::mutex>& locker) {
+ cond.wait(locker, [this] {
+ return in_flight < max_in_flight && !available_objects.empty();
+ });
+ boost::uniform_int<> choose(0, available_objects.size() - 1);
+ int index = choose(*rng);
+ set<ghobject_t>::iterator i = available_objects.begin();
+ for ( ; index > 0; --index, ++i) ;
+ ghobject_t ret = *i;
+ return ret;
+ }
+
+ ghobject_t get_next_object(std::unique_lock<ceph::mutex>& locker) {
+ cond.wait(locker, [this] {
+ return in_flight < max_in_flight && !available_objects.empty();
+ });
+
+ if (next_available_object == available_objects.end()) {
+ next_available_object = available_objects.begin();
+ }
+
+ ghobject_t ret = *next_available_object;
+ ++next_available_object;
+ return ret;
+ }
+
+ void wait_for_ready(std::unique_lock<ceph::mutex>& locker) {
+ cond.wait(locker, [this] { return in_flight < max_in_flight; });
+ }
+
+ void wait_for_done() {
+ std::unique_lock locker{lock};
+ cond.wait(locker, [this] { return in_flight == 0; });
+ }
+
+ bool can_create() {
+ return (available_objects.size() + in_flight_objects.size()) < max_objects;
+ }
+
+ bool can_unlink() {
+ return (available_objects.size() + in_flight_objects.size()) > 0;
+ }
+
+ unsigned get_random_alloc_hints() {
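+ // build a random combination of CEPH_OSD_ALLOC_HINT flags, drawing at most
+ // one hint from each of the write/read/lifetime/compressibility pairs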
+ unsigned f = 0;
+ {
+ boost::uniform_int<> u(0, 3);
+ switch (u(*rng)) {
+ case 1:
+ f |= CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_WRITE;
+ break;
+ case 2:
+ f |= CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_WRITE;
+ break;
+ }
+ }
+ {
+ boost::uniform_int<> u(0, 3);
+ switch (u(*rng)) {
+ case 1:
+ f |= CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_READ;
+ break;
+ case 2:
+ f |= CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_READ;
+ break;
+ }
+ }
+ {
+ // append_only, immutable
+ boost::uniform_int<> u(0, 4);
+ f |= u(*rng) << 4;
+ }
+ {
+ boost::uniform_int<> u(0, 3);
+ switch (u(*rng)) {
+ case 1:
+ f |= CEPH_OSD_ALLOC_HINT_FLAG_SHORTLIVED;
+ break;
+ case 2:
+ f |= CEPH_OSD_ALLOC_HINT_FLAG_LONGLIVED;
+ break;
+ }
+ }
+ {
+ boost::uniform_int<> u(0, 3);
+ switch (u(*rng)) {
+ case 1:
+ f |= CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE;
+ break;
+ case 2:
+ f |= CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE;
+ break;
+ }
+ }
+ return f;
+ }
+
+ int touch() {
+ std::unique_lock locker{lock};
+ EnterExit ee("touch");
+ if (!can_create())
+ return -ENOSPC;
+ wait_for_ready(locker);
+ ghobject_t new_obj = object_gen->create_object(rng);
+ available_objects.erase(new_obj);
+ ObjectStore::Transaction t;
+ t.touch(cid, new_obj);
+ boost::uniform_int<> u(17, 22);
+ boost::uniform_int<> v(12, 17);
+ t.set_alloc_hint(cid, new_obj,
+ 1ull << u(*rng),
+ 1ull << v(*rng),
+ get_random_alloc_hints());
+ ++in_flight;
+ in_flight_objects.insert(new_obj);
+ if (!contents.count(new_obj))
+ contents[new_obj] = Object();
+ t.register_on_applied(new C_SyntheticOnReadable(this, new_obj));
+ int status = store->queue_transaction(ch, std::move(t));
+ return status;
+ }
+
+ int stash() {
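+ // move an object aside under an incremented generation via
+ // collection_move_rename, carrying its mirrored data and attrs along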
+ std::unique_lock locker{lock};
+ EnterExit ee("stash");
+ if (!can_unlink())
+ return -ENOENT;
+ if (!can_create())
+ return -ENOSPC;
+ wait_for_ready(locker);
+
+ ghobject_t old_obj;
+ int max = 20;
+ do {
+ old_obj = get_uniform_random_object(locker);
+ } while (--max && !contents[old_obj].data.length());
+ available_objects.erase(old_obj);
+ ghobject_t new_obj = old_obj;
+ new_obj.generation++;
+ available_objects.erase(new_obj);
+
+ ObjectStore::Transaction t;
+ t.collection_move_rename(cid, old_obj, cid, new_obj);
+ ++in_flight;
+ in_flight_objects.insert(old_obj);
+
+ contents[new_obj].attrs = contents[old_obj].attrs;
+ contents[new_obj].data = contents[old_obj].data;
+ contents.erase(old_obj);
+ t.register_on_applied(new C_SyntheticOnStash(this, old_obj, new_obj));
+ int status = store->queue_transaction(ch, std::move(t));
+ return status;
+ }
+
+ int clone() {
+ std::unique_lock locker{lock};
+ EnterExit ee("clone");
+ if (!can_unlink())
+ return -ENOENT;
+ if (!can_create())
+ return -ENOSPC;
+ wait_for_ready(locker);
+
+ ghobject_t old_obj;
+ int max = 20;
+ do {
+ old_obj = get_uniform_random_object(locker);
+ } while (--max && !contents[old_obj].data.length());
+ available_objects.erase(old_obj);
+ ghobject_t new_obj = object_gen->create_object(rng);
+ // make the hash match
+ new_obj.hobj.set_hash(old_obj.hobj.get_hash());
+ available_objects.erase(new_obj);
+
+ ObjectStore::Transaction t;
+ t.clone(cid, old_obj, new_obj);
+ ++in_flight;
+ in_flight_objects.insert(old_obj);
+
+ contents[new_obj].attrs = contents[old_obj].attrs;
+ contents[new_obj].data = contents[old_obj].data;
+
+ t.register_on_applied(new C_SyntheticOnClone(this, old_obj, new_obj));
+ int status = store->queue_transaction(ch, std::move(t));
+ return status;
+ }
+
+ int clone_range() {
+ std::unique_lock locker{lock};
+ EnterExit ee("clone_range");
+ if (!can_unlink())
+ return -ENOENT;
+ if (!can_create())
+ return -ENOSPC;
+ wait_for_ready(locker);
+
+ ghobject_t old_obj;
+ int max = 20;
+ do {
+ old_obj = get_uniform_random_object(locker);
+ } while (--max && !contents[old_obj].data.length());
+ bufferlist &srcdata = contents[old_obj].data;
+ if (srcdata.length() == 0) {
+ return 0;
+ }
+ available_objects.erase(old_obj);
+ ghobject_t new_obj = get_uniform_random_object(locker);
+ available_objects.erase(new_obj);
+
+ boost::uniform_int<> u1(0, max_object_len - max_write_len);
+ boost::uniform_int<> u2(0, max_write_len);
+ uint64_t srcoff = u1(*rng);
+ // make src and dst offsets match, since that's what the osd does
+ uint64_t dstoff = srcoff; //u1(*rng);
+ uint64_t len = u2(*rng);
+ if (write_alignment) {
+ srcoff = round_up_to(srcoff, write_alignment);
+ dstoff = round_up_to(dstoff, write_alignment);
+ len = round_up_to(len, write_alignment);
+ }
+
+ if (srcoff > srcdata.length() - 1) {
+ srcoff = srcdata.length() - 1;
+ }
+ if (srcoff + len > srcdata.length()) {
+ len = srcdata.length() - srcoff;
+ }
+ if (0)
+ cout << __func__ << " from " << srcoff << "~" << len
+ << " (size " << srcdata.length() << ") to "
+ << dstoff << "~" << len << std::endl;
+
+ ObjectStore::Transaction t;
+ t.clone_range(cid, old_obj, new_obj, srcoff, len, dstoff);
+ ++in_flight;
+ in_flight_objects.insert(old_obj);
+
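+ // mirror the clone_range in the local contents map: splice the copied bytes
+ // into dstdata at dstoff, zero-padding if the destination was shorter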
+ bufferlist bl;
+ if (srcoff < srcdata.length()) {
+ if (srcoff + len > srcdata.length()) {
+ bl.substr_of(srcdata, srcoff, srcdata.length() - srcoff);
+ } else {
+ bl.substr_of(srcdata, srcoff, len);
+ }
+ }
+
+ bufferlist& dstdata = contents[new_obj].data;
+ if (dstdata.length() <= dstoff) {
+ if (bl.length() > 0) {
+ dstdata.append_zero(dstoff - dstdata.length());
+ dstdata.append(bl);
+ }
+ } else {
+ bufferlist value;
+ ceph_assert(dstdata.length() > dstoff);
+ dstdata.cbegin().copy(dstoff, value);
+ value.append(bl);
+ if (value.length() < dstdata.length())
+ dstdata.cbegin(value.length()).copy(
+ dstdata.length() - value.length(), value);
+ value.swap(dstdata);
+ }
+
+ t.register_on_applied(new C_SyntheticOnClone(this, old_obj, new_obj));
+ int status = store->queue_transaction(ch, std::move(t));
+ return status;
+ }
+
+
+ int write() {
+ std::unique_lock locker{lock};
+ EnterExit ee("write");
+ if (!can_unlink())
+ return -ENOENT;
+ wait_for_ready(locker);
+
+ ghobject_t new_obj = get_uniform_random_object(locker);
+ available_objects.erase(new_obj);
+ ObjectStore::Transaction t;
+
+ boost::uniform_int<> u1(0, max_object_len - max_write_len);
+ boost::uniform_int<> u2(0, max_write_len);
+ uint64_t offset = u1(*rng);
+ uint64_t len = u2(*rng);
+ bufferlist bl;
+ if (write_alignment) {
+ offset = round_up_to(offset, write_alignment);
+ len = round_up_to(len, write_alignment);
+ }
+
+ filled_byte_array(bl, len);
+
+ bufferlist& data = contents[new_obj].data;
+ if (data.length() <= offset) {
+ if (len > 0) {
+ data.append_zero(offset-data.length());
+ data.append(bl);
+ }
+ } else {
+ bufferlist value;
+ ceph_assert(data.length() > offset);
+ data.cbegin().copy(offset, value);
+ value.append(bl);
+ if (value.length() < data.length())
+ data.cbegin(value.length()).copy(
+ data.length()-value.length(), value);
+ value.swap(data);
+ }
+
+ t.write(cid, new_obj, offset, len, bl);
+ ++in_flight;
+ in_flight_objects.insert(new_obj);
+ t.register_on_applied(new C_SyntheticOnReadable(this, new_obj));
+ int status = store->queue_transaction(ch, std::move(t));
+ return status;
+ }
+
+ int truncate() {
+ std::unique_lock locker{lock};
+ EnterExit ee("truncate");
+ if (!can_unlink())
+ return -ENOENT;
+ wait_for_ready(locker);
+
+ ghobject_t obj = get_uniform_random_object(locker);
+ available_objects.erase(obj);
+ ObjectStore::Transaction t;
+
+ boost::uniform_int<> choose(0, max_object_len);
+ size_t len = choose(*rng);
+ if (write_alignment) {
+ len = round_up_to(len, write_alignment);
+ }
+
+ t.truncate(cid, obj, len);
+ ++in_flight;
+ in_flight_objects.insert(obj);
+ bufferlist& data = contents[obj].data;
+ if (data.length() <= len) {
+ data.append_zero(len - data.length());
+ } else {
+ bufferlist bl;
+ data.cbegin().copy(len, bl);
+ bl.swap(data);
+ }
+
+ t.register_on_applied(new C_SyntheticOnReadable(this, obj));
+ int status = store->queue_transaction(ch, std::move(t));
+ return status;
+ }
+
+ int zero() {
+ std::unique_lock locker{lock};
+ EnterExit ee("zero");
+ if (!can_unlink())
+ return -ENOENT;
+ wait_for_ready(locker);
+
+ ghobject_t new_obj = get_uniform_random_object(locker);
+ available_objects.erase(new_obj);
+ ObjectStore::Transaction t;
+
+ boost::uniform_int<> u1(0, max_object_len - max_write_len);
+ boost::uniform_int<> u2(0, max_write_len);
+ uint64_t offset = u1(*rng);
+ uint64_t len = u2(*rng);
+ if (write_alignment) {
+ offset = round_up_to(offset, write_alignment);
+ len = round_up_to(len, write_alignment);
+ }
+
+ if (len > 0) {
+ auto& data = contents[new_obj].data;
+ if (data.length() < offset + len) {
+ data.append_zero(offset+len-data.length());
+ }
+ bufferlist n;
+ n.substr_of(data, 0, offset);
+ n.append_zero(len);
+ if (data.length() > offset + len)
+ data.cbegin(offset + len).copy(data.length() - offset - len, n);
+ data.swap(n);
+ }
+
+ t.zero(cid, new_obj, offset, len);
+ ++in_flight;
+ in_flight_objects.insert(new_obj);
+ t.register_on_applied(new C_SyntheticOnReadable(this, new_obj));
+ int status = store->queue_transaction(ch, std::move(t));
+ return status;
+ }
+
+ void read() {
+ EnterExit ee("read");
+ boost::uniform_int<> u1(0, max_object_len/2);
+ boost::uniform_int<> u2(0, max_object_len);
+ uint64_t offset = u1(*rng);
+ uint64_t len = u2(*rng);
+ if (offset > len)
+ swap(offset, len);
+
+ ghobject_t obj;
+ bufferlist expected;
+ int r;
+ {
+ std::unique_lock locker{lock};
+ EnterExit ee("read locked");
+ if (!can_unlink())
+ return ;
+ wait_for_ready(locker);
+
+ obj = get_uniform_random_object(locker);
+ expected = contents[obj].data;
+ }
+ bufferlist bl, result;
+ if (0) cout << " obj " << obj
+ << " size " << expected.length()
+ << " offset " << offset
+ << " len " << len << std::endl;
+ r = store->read(ch, obj, offset, len, result);
+ if (offset >= expected.length()) {
+ ASSERT_EQ(r, 0);
+ } else {
+ size_t max_len = expected.length() - offset;
+ if (len > max_len)
+ len = max_len;
+ ceph_assert(len == result.length());
+ ASSERT_EQ(len, result.length());
+ expected.cbegin(offset).copy(len, bl);
+ ASSERT_EQ(r, (int)len);
+ ASSERT_TRUE(bl_eq(bl, result));
+ }
+ }
+
+ int setattrs() {
+ std::unique_lock locker{lock};
+ EnterExit ee("setattrs");
+ if (!can_unlink())
+ return -ENOENT;
+ wait_for_ready(locker);
+
+ ghobject_t obj = get_uniform_random_object(locker);
+ available_objects.erase(obj);
+ ObjectStore::Transaction t;
+
+ boost::uniform_int<> u0(1, max_attr_size);
+ boost::uniform_int<> u1(4, max_attr_name_len);
+ boost::uniform_int<> u2(4, max_attr_value_len);
+ boost::uniform_int<> u3(0, 100);
+ uint64_t size = u0(*rng);
+ uint64_t name_len;
+ map<string, bufferlist, less<>> attrs;
+ set<string> keys;
+ for (map<string, bufferlist>::iterator it = contents[obj].attrs.begin();
+ it != contents[obj].attrs.end(); ++it)
+ keys.insert(it->first);
+
+ while (size--) {
+ bufferlist name, value;
+ uint64_t get_exist = u3(*rng);
+ uint64_t value_len = u2(*rng);
+ filled_byte_array(value, value_len);
+ if (get_exist < 50 && keys.size()) {
+ set<string>::iterator k = keys.begin();
+ attrs[*k] = value;
+ contents[obj].attrs[*k] = value;
+ keys.erase(k);
+ } else {
+ name_len = u1(*rng);
+ filled_byte_array(name, name_len);
+ attrs[name.c_str()] = value;
+ contents[obj].attrs[name.c_str()] = value;
+ }
+ }
+ t.setattrs(cid, obj, attrs);
+ ++in_flight;
+ in_flight_objects.insert(obj);
+ t.register_on_applied(new C_SyntheticOnReadable(this, obj));
+ int status = store->queue_transaction(ch, std::move(t));
+ return status;
+ }
+
+ int set_fixed_attrs(size_t entries, size_t key_size, size_t val_size) {
+ std::unique_lock locker{ lock };
+ EnterExit ee("setattrs");
+ if (!can_unlink())
+ return -ENOENT;
+ wait_for_ready(locker);
+
+ ghobject_t obj = get_next_object(locker);
+ available_objects.erase(obj);
+ ObjectStore::Transaction t;
+
+ map<string, bufferlist, less<>> attrs;
+ set<string> keys;
+
+ while (entries--) {
+ bufferlist name, value;
+ filled_byte_array(value, val_size);
+ filled_byte_array(name, key_size);
+ attrs[name.c_str()] = value;
+ contents[obj].attrs[name.c_str()] = value;
+ }
+ t.setattrs(cid, obj, attrs);
+ ++in_flight;
+ in_flight_objects.insert(obj);
+ t.register_on_applied(new C_SyntheticOnReadable(this, obj));
+ int status = store->queue_transaction(ch, std::move(t));
+ return status;
+ }
+
+ void getattrs() {
+ EnterExit ee("getattrs");
+ ghobject_t obj;
+ map<string, bufferlist> expected;
+ {
+ std::unique_lock locker{lock};
+ EnterExit ee("getattrs locked");
+ if (!can_unlink())
+ return ;
+ wait_for_ready(locker);
+
+ int retry = 10;
+ do {
+ obj = get_uniform_random_object(locker);
+ if (!--retry)
+ return ;
+ } while (contents[obj].attrs.empty());
+ expected = contents[obj].attrs;
+ }
+ map<string, bufferlist, less<>> attrs;
+ int r = store->getattrs(ch, obj, attrs);
+ ASSERT_TRUE(r == 0);
+ ASSERT_TRUE(attrs.size() == expected.size());
+ for (map<string, bufferlist>::iterator it = expected.begin();
+ it != expected.end(); ++it) {
+ ASSERT_TRUE(bl_eq(attrs[it->first], it->second));
+ }
+ }
+
+ void getattr() {
+ EnterExit ee("getattr");
+ ghobject_t obj;
+ int r;
+ int retry;
+ map<string, bufferlist> expected;
+ {
+ std::unique_lock locker{lock};
+ EnterExit ee("getattr locked");
+ if (!can_unlink())
+ return ;
+ wait_for_ready(locker);
+
+ retry = 10;
+ do {
+ obj = get_uniform_random_object(locker);
+ if (!--retry)
+ return ;
+ } while (contents[obj].attrs.empty());
+ expected = contents[obj].attrs;
+ }
+ boost::uniform_int<> u(0, expected.size()-1);
+ retry = u(*rng);
+ map<string, bufferlist>::iterator it = expected.begin();
+ while (retry) {
+ retry--;
+ ++it;
+ }
+
+ bufferlist bl;
+ r = store->getattr(ch, obj, it->first, bl);
+ ASSERT_EQ(r, 0);
+ ASSERT_TRUE(bl_eq(it->second, bl));
+ }
+
+ int rmattr() {
+ std::unique_lock locker{lock};
+ EnterExit ee("rmattr");
+ if (!can_unlink())
+ return -ENOENT;
+ wait_for_ready(locker);
+
+ ghobject_t obj;
+ int retry = 10;
+ do {
+ obj = get_uniform_random_object(locker);
+ if (!--retry)
+ return 0;
+ } while (contents[obj].attrs.empty());
+
+ boost::uniform_int<> u(0, contents[obj].attrs.size()-1);
+ retry = u(*rng);
+ map<string, bufferlist>::iterator it = contents[obj].attrs.begin();
+ while (retry) {
+ retry--;
+ ++it;
+ }
+
+ available_objects.erase(obj);
+ ObjectStore::Transaction t;
+ t.rmattr(cid, obj, it->first);
+
+ contents[obj].attrs.erase(it->first);
+ ++in_flight;
+ in_flight_objects.insert(obj);
+ t.register_on_applied(new C_SyntheticOnReadable(this, obj));
+ int status = store->queue_transaction(ch, std::move(t));
+ return status;
+ }
+
+ void fsck(bool deep) {
+ std::unique_lock locker{lock};
+ EnterExit ee("fsck");
+ cond.wait(locker, [this] { return in_flight == 0; });
+ ch.reset();
+ store->umount();
+ int r = store->fsck(deep);
+ ceph_assert(r == 0 || r == -EOPNOTSUPP);
+ store->mount();
+ ch = store->open_collection(cid);
+ }
+
+ void scan() {
+ std::unique_lock locker{lock};
+ EnterExit ee("scan");
+ cond.wait(locker, [this] { return in_flight == 0; });
+ vector<ghobject_t> objects;
+ set<ghobject_t> objects_set, objects_set2;
+ ghobject_t next, current;
+ while (1) {
+ //cerr << "scanning..." << std::endl;
+ int r = collection_list(store, ch, current, ghobject_t::get_max(), 100,
+ &objects, &next);
+ ASSERT_EQ(r, 0);
+ ASSERT_TRUE(sorted(objects));
+ objects_set.insert(objects.begin(), objects.end());
+ objects.clear();
+ if (next.is_max()) break;
+ current = next;
+ }
+ if (objects_set.size() != available_objects.size()) {
+ for (set<ghobject_t>::iterator p = objects_set.begin();
+ p != objects_set.end();
+ ++p)
+ if (available_objects.count(*p) == 0) {
+ cerr << "+ " << *p << std::endl;
+ ceph_abort();
+ }
+ for (set<ghobject_t>::iterator p = available_objects.begin();
+ p != available_objects.end();
+ ++p)
+ if (objects_set.count(*p) == 0)
+ cerr << "- " << *p << std::endl;
+ //cerr << " objects_set: " << objects_set << std::endl;
+ //cerr << " available_set: " << available_objects << std::endl;
+ ceph_abort_msg("badness");
+ }
+
+ ASSERT_EQ(objects_set.size(), available_objects.size());
+ for (set<ghobject_t>::iterator i = objects_set.begin();
+ i != objects_set.end();
+ ++i) {
+ ASSERT_GT(available_objects.count(*i), (unsigned)0);
+ }
+
+ int r = collection_list(store, ch, ghobject_t(), ghobject_t::get_max(),
+ INT_MAX, &objects, 0);
+ ASSERT_EQ(r, 0);
+ objects_set2.insert(objects.begin(), objects.end());
+ ASSERT_EQ(objects_set2.size(), available_objects.size());
+ for (set<ghobject_t>::iterator i = objects_set2.begin();
+ i != objects_set2.end();
+ ++i) {
+ ASSERT_GT(available_objects.count(*i), (unsigned)0);
+ if (available_objects.count(*i) == 0) {
+ cerr << "+ " << *i << std::endl;
+ }
+ }
+ }
+
+ void stat() {
+ EnterExit ee("stat");
+ ghobject_t hoid;
+ uint64_t expected;
+ {
+ std::unique_lock locker{lock};
+ EnterExit ee("stat lock1");
+ if (!can_unlink())
+ return ;
+ hoid = get_uniform_random_object(locker);
+ in_flight_objects.insert(hoid);
+ available_objects.erase(hoid);
+ ++in_flight;
+ expected = contents[hoid].data.length();
+ }
+ struct stat buf;
+ int r = store->stat(ch, hoid, &buf);
+ ASSERT_EQ(0, r);
+ ceph_assert((uint64_t)buf.st_size == expected);
+ ASSERT_TRUE((uint64_t)buf.st_size == expected);
+ {
+ std::lock_guard locker{lock};
+ EnterExit ee("stat lock2");
+ --in_flight;
+ cond.notify_all();
+ in_flight_objects.erase(hoid);
+ available_objects.insert(hoid);
+ }
+ }
+
+ int unlink() {
+ std::unique_lock locker{lock};
+ EnterExit ee("unlink");
+ if (!can_unlink())
+ return -ENOENT;
+ ghobject_t to_remove = get_uniform_random_object(locker);
+ ObjectStore::Transaction t;
+ t.remove(cid, to_remove);
+ ++in_flight;
+ available_objects.erase(to_remove);
+ in_flight_objects.insert(to_remove);
+ contents.erase(to_remove);
+ t.register_on_applied(new C_SyntheticOnReadable(this, to_remove));
+ int status = store->queue_transaction(ch, std::move(t));
+ return status;
+ }
+
+ void print_internal_state() {
+ std::lock_guard locker{lock};
+ cerr << "available_objects: " << available_objects.size()
+ << " in_flight_objects: " << in_flight_objects.size()
+ << " total objects: " << in_flight_objects.size() + available_objects.size()
+ << " in_flight " << in_flight << std::endl;
+ }
+};
+
+
+void StoreTest::doSyntheticTest(
+ int num_ops,
+ uint64_t max_obj, uint64_t max_wr, uint64_t align)
+{
+ MixedGenerator gen(555);
+ gen_type rng(time(NULL));
+ coll_t cid(spg_t(pg_t(0,555), shard_id_t::NO_SHARD));
+
+ SetVal(g_conf(), "bluestore_fsck_on_mount", "false");
+ SetVal(g_conf(), "bluestore_fsck_on_umount", "false");
+ g_ceph_context->_conf.apply_changes(nullptr);
+
+ SyntheticWorkloadState test_obj(store.get(), &gen, &rng, cid,
+ max_obj, max_wr, align);
+ test_obj.init();
+ for (int i = 0; i < num_ops/10; ++i) {
+ if (!(i % 500)) cerr << "seeding object " << i << std::endl;
+ test_obj.touch();
+ }
+ for (int i = 0; i < num_ops; ++i) {
+ if (!(i % 1000)) {
+ cerr << "Op " << i << std::endl;
+ test_obj.print_internal_state();
+ }
+ boost::uniform_int<> true_false(0, 999);
+ int val = true_false(rng);
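+ // per-mille op mix: 1 deep fsck, 1 shallow fsck, 27 scan, 20 stat, 100 zero,
+ // 50 unlink, 250 write, 50 clone, 50 clone_range, 150 stash, 200 read,
+ // 101 truncate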
+ if (val > 998) {
+ test_obj.fsck(true);
+ } else if (val > 997) {
+ test_obj.fsck(false);
+ } else if (val > 970) {
+ test_obj.scan();
+ } else if (val > 950) {
+ test_obj.stat();
+ } else if (val > 850) {
+ test_obj.zero();
+ } else if (val > 800) {
+ test_obj.unlink();
+ } else if (val > 550) {
+ test_obj.write();
+ } else if (val > 500) {
+ test_obj.clone();
+ } else if (val > 450) {
+ test_obj.clone_range();
+ } else if (val > 300) {
+ test_obj.stash();
+ } else if (val > 100) {
+ test_obj.read();
+ } else {
+ test_obj.truncate();
+ }
+ }
+ test_obj.wait_for_done();
+ test_obj.shutdown();
+}
+
+TEST_P(StoreTest, Synthetic) {
+ doSyntheticTest(10000, 400*1024, 40*1024, 0);
+}
+
+#if defined(WITH_BLUESTORE)
+TEST_P(StoreTestSpecificAUSize, SyntheticMatrixSharding) {
+ if (string(GetParam()) != "bluestore")
+ return;
+
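+ // each matrix row is a config key followed by the values to sweep,
+ // NULL-terminated; do_matrix runs the synthetic test for every combination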
+ const char *m[][10] = {
+ { "bluestore_min_alloc_size", "4096", 0 }, // must be the first!
+ { "num_ops", "50000", 0 },
+ { "max_write", "65536", 0 },
+ { "max_size", "262144", 0 },
+ { "alignment", "4096", 0 },
+ { "bluestore_max_blob_size", "65536", 0 },
+ { "bluestore_extent_map_shard_min_size", "60", 0 },
+ { "bluestore_extent_map_shard_max_size", "300", 0 },
+ { "bluestore_extent_map_shard_target_size", "150", 0 },
+ { "bluestore_default_buffered_read", "true", 0 },
+ { "bluestore_default_buffered_write", "true", 0 },
+ { 0 },
+ };
+ do_matrix(m, std::bind(&StoreTest::doSyntheticTest, this, _1, _2, _3, _4));
+}
+
+TEST_P(StoreTestSpecificAUSize, ZipperPatternSharded) {
+ if(string(GetParam()) != "bluestore")
+ return;
+ StartDeferred(4096);
+
+ int r;
+ coll_t cid;
+ ghobject_t a(hobject_t(sobject_t("Object 1", CEPH_NOSNAP)));
+ auto ch = store->create_new_collection(cid);
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ cerr << "Creating collection " << cid << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ bufferlist bl;
+ int len = 4096;
+ bufferptr bp(len);
+ bp.zero();
+ bl.append(bp);
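+ // first pass: 4K writes at even 4K slots; second pass repeats them shifted
+ // by one byte, zipping unaligned overwrites across the existing extents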
+ for (int i=0; i<1000; ++i) {
+ ObjectStore::Transaction t;
+ t.write(cid, a, i*2*len, len, bl, 0);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ for (int i=0; i<1000; ++i) {
+ ObjectStore::Transaction t;
+ t.write(cid, a, i*2*len + 1, len, bl, 0);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+ ObjectStore::Transaction t;
+ t.remove(cid, a);
+ t.remove_collection(cid);
+ cerr << "Cleaning" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+}
+
+TEST_P(StoreTestSpecificAUSize, SyntheticMatrixCsumAlgorithm) {
+ if (string(GetParam()) != "bluestore")
+ return;
+
+ const char *m[][10] = {
+ { "bluestore_min_alloc_size", "65536", 0 }, // must be the first!
+ { "max_write", "65536", 0 },
+ { "max_size", "1048576", 0 },
+ { "alignment", "16", 0 },
+ { "bluestore_csum_type", "crc32c", "crc32c_16", "crc32c_8", "xxhash32",
+ "xxhash64", "none", 0 },
+ { "bluestore_default_buffered_write", "false", 0 },
+ { 0 },
+ };
+ do_matrix(m, std::bind(&StoreTest::doSyntheticTest, this, _1, _2, _3, _4));
+}
+
+TEST_P(StoreTestSpecificAUSize, SyntheticMatrixCsumVsCompression) {
+ if (string(GetParam()) != "bluestore")
+ return;
+
+ const char *m[][10] = {
+ { "bluestore_min_alloc_size", "4096", "16384", 0 }, //to be the first!
+ { "max_write", "131072", 0 },
+ { "max_size", "262144", 0 },
+ { "alignment", "512", 0 },
+ { "bluestore_compression_mode", "force", 0},
+ { "bluestore_compression_algorithm", "snappy", "zlib", 0 },
+ { "bluestore_csum_type", "crc32c", 0 },
+ { "bluestore_default_buffered_read", "true", "false", 0 },
+ { "bluestore_default_buffered_write", "true", "false", 0 },
+ { "bluestore_sync_submit_transaction", "false", 0 },
+ { 0 },
+ };
+ do_matrix(m, std::bind(&StoreTest::doSyntheticTest, this, _1, _2, _3, _4));
+}
+
+TEST_P(StoreTestSpecificAUSize, SyntheticMatrixCompression) {
+ if (string(GetParam()) != "bluestore")
+ return;
+
+ const char *m[][10] = {
+ { "bluestore_min_alloc_size", "4096", "65536", 0 }, // to be the first!
+ { "max_write", "1048576", 0 },
+ { "max_size", "4194304", 0 },
+ { "alignment", "65536", 0 },
+ { "bluestore_compression_mode", "force", "aggressive", "passive", "none", 0},
+ { "bluestore_default_buffered_write", "false", 0 },
+ { "bluestore_sync_submit_transaction", "true", 0 },
+ { 0 },
+ };
+ do_matrix(m, std::bind(&StoreTest::doSyntheticTest, this, _1, _2, _3, _4));
+}
+
+TEST_P(StoreTestSpecificAUSize, SyntheticMatrixCompressionAlgorithm) {
+ if (string(GetParam()) != "bluestore")
+ return;
+
+ const char *m[][10] = {
+ { "bluestore_min_alloc_size", "4096", "65536", 0 }, // to be the first!
+ { "max_write", "1048576", 0 },
+ { "max_size", "4194304", 0 },
+ { "alignment", "65536", 0 },
+ { "bluestore_compression_algorithm", "zlib", "snappy", 0 },
+ { "bluestore_compression_mode", "force", 0 },
+ { "bluestore_default_buffered_write", "false", 0 },
+ { 0 },
+ };
+ do_matrix(m, std::bind(&StoreTest::doSyntheticTest, this, _1, _2, _3, _4));
+}
+
+TEST_P(StoreTestSpecificAUSize, SyntheticMatrixNoCsum) {
+ if (string(GetParam()) != "bluestore")
+ return;
+
+ const char *m[][10] = {
+ { "bluestore_min_alloc_size", "4096", "65536", 0 }, // to be the first!
+ { "max_write", "65536", 0 },
+ { "max_size", "1048576", 0 },
+ { "alignment", "512", 0 },
+ { "bluestore_max_blob_size", "262144", 0 },
+ { "bluestore_compression_mode", "force", "none", 0},
+ { "bluestore_csum_type", "none", 0},
+ { "bluestore_default_buffered_read", "true", "false", 0 },
+ { "bluestore_default_buffered_write", "true", 0 },
+ { "bluestore_sync_submit_transaction", "true", "false", 0 },
+ { 0 },
+ };
+ do_matrix(m, std::bind(&StoreTest::doSyntheticTest, this, _1, _2, _3, _4));
+}
+
+TEST_P(StoreTestSpecificAUSize, SyntheticMatrixPreferDeferred) {
+ if (string(GetParam()) != "bluestore")
+ return;
+
+ const char *m[][10] = {
+ { "bluestore_min_alloc_size", "4096", "65536", 0 }, // to be the first!
+ { "max_write", "65536", 0 },
+ { "max_size", "1048576", 0 },
+ { "alignment", "512", 0 },
+ { "bluestore_max_blob_size", "262144", 0 },
+ { "bluestore_compression_mode", "force", "none", 0},
+ { "bluestore_prefer_deferred_size", "32768", "0", 0},
+ { 0 },
+ };
+ do_matrix(m, std::bind(&StoreTest::doSyntheticTest, this, _1, _2, _3, _4));
+}
+#endif // WITH_BLUESTORE
+
+TEST_P(StoreTest, AttrSynthetic) {
+ MixedGenerator gen(447);
+ gen_type rng(time(NULL));
+ coll_t cid(spg_t(pg_t(0,447),shard_id_t::NO_SHARD));
+
+ SyntheticWorkloadState test_obj(store.get(), &gen, &rng, cid, 40*1024, 4*1024, 0);
+ test_obj.init();
+ for (int i = 0; i < 500; ++i) {
+ if (!(i % 10)) cerr << "seeding object " << i << std::endl;
+ test_obj.touch();
+ }
+ for (int i = 0; i < 1000; ++i) {
+ if (!(i % 100)) {
+ cerr << "Op " << i << std::endl;
+ test_obj.print_internal_state();
+ }
+ boost::uniform_int<> true_false(0, 99);
+ int val = true_false(rng);
+ if (val > 97) {
+ test_obj.scan();
+ } else if (val > 93) {
+ test_obj.stat();
+ } else if (val > 75) {
+ test_obj.rmattr();
+ } else if (val > 47) {
+ test_obj.setattrs();
+ } else if (val > 45) {
+ test_obj.clone();
+ } else if (val > 37) {
+ test_obj.stash();
+ } else if (val > 30) {
+ test_obj.getattrs();
+ } else {
+ test_obj.getattr();
+ }
+ }
+ test_obj.wait_for_done();
+ test_obj.shutdown();
+}
+
+TEST_P(StoreTest, HashCollisionTest) {
+ int64_t poolid = 11;
+ coll_t cid(spg_t(pg_t(0,poolid),shard_id_t::NO_SHARD));
+ int r;
+ auto ch = store->create_new_collection(cid);
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ string base = "";
+ for (int i = 0; i < 100; ++i) base.append("aaaaa");
+ set<ghobject_t> created;
+ for (int n = 0; n < 10; ++n) {
+ char nbuf[100];
+    snprintf(nbuf, sizeof(nbuf), "n%d", n);
+ for (int i = 0; i < 1000; ++i) {
+ char buf[100];
+      snprintf(buf, sizeof(buf), "%d", i);
+ if (!(i % 100)) {
+ cerr << "Object n" << n << " "<< i << std::endl;
+ }
+ ghobject_t hoid(hobject_t(string(buf) + base, string(), CEPH_NOSNAP, 0, poolid, string(nbuf)));
+ {
+ ObjectStore::Transaction t;
+ t.touch(cid, hoid);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ created.insert(hoid);
+ }
+ }
+ vector<ghobject_t> objects;
+ r = collection_list(store, ch, ghobject_t(), ghobject_t::get_max(), INT_MAX,
+ &objects, 0);
+ ASSERT_EQ(r, 0);
+ set<ghobject_t> listed(objects.begin(), objects.end());
+ cerr << "listed.size() is " << listed.size() << " and created.size() is " << created.size() << std::endl;
+ ASSERT_TRUE(listed.size() == created.size());
+ objects.clear();
+ listed.clear();
+ ghobject_t current, next;
+ while (1) {
+ r = collection_list(store, ch, current, ghobject_t::get_max(), 60, &objects,
+ &next);
+ ASSERT_EQ(r, 0);
+ ASSERT_TRUE(sorted(objects));
+ for (vector<ghobject_t>::iterator i = objects.begin();
+ i != objects.end();
+ ++i) {
+ if (listed.count(*i))
+ cerr << *i << " repeated" << std::endl;
+ listed.insert(*i);
+ }
+ if (objects.size() < 50) {
+ ASSERT_TRUE(next.is_max());
+ break;
+ }
+ objects.clear();
+ current = next;
+ }
+ cerr << "listed.size() is " << listed.size() << std::endl;
+ ASSERT_TRUE(listed.size() == created.size());
+ for (set<ghobject_t>::iterator i = listed.begin();
+ i != listed.end();
+ ++i) {
+ ASSERT_TRUE(created.count(*i));
+ }
+
+ for (set<ghobject_t>::iterator i = created.begin();
+ i != created.end();
+ ++i) {
+ ObjectStore::Transaction t;
+ t.remove(cid, *i);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ ObjectStore::Transaction t;
+ t.remove_collection(cid);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+}
+
+TEST_P(StoreTest, HashCollisionSorting) {
+ bool disable_legacy = (string(GetParam()) == "bluestore");
+
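+  // Hand-crafted object names (raw byte strings); each group below
+  // hashes to the named 32-bit value (121664318 or 121666222), so the
+  // test exercises listing order among hash-colliding names.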
+ char buf121664318_1[] = {18, -119, -121, -111, 0};
+ char buf121664318_2[] = {19, 127, -121, 32, 0};
+ char buf121664318_3[] = {19, -118, 15, 19, 0};
+ char buf121664318_4[] = {28, 27, -116, -113, 0};
+ char buf121664318_5[] = {28, 27, -115, -124, 0};
+
+ char buf121666222_1[] = {18, -119, -120, -111, 0};
+ char buf121666222_2[] = {19, 127, -120, 32, 0};
+ char buf121666222_3[] = {19, -118, 15, 30, 0};
+ char buf121666222_4[] = {29, 17, -126, -113, 0};
+ char buf121666222_5[] = {29, 17, -125, -124, 0};
+
+ std::map<uint32_t, std::vector<std::string>> object_names = {
+ {121664318, {{buf121664318_1},
+ {buf121664318_2},
+ {buf121664318_3},
+ {buf121664318_4},
+ {buf121664318_5}}},
+ {121666222, {{buf121666222_1},
+ {buf121666222_2},
+ {buf121666222_3},
+ {buf121666222_4},
+ {buf121666222_5}}}};
+
+ int64_t poolid = 111;
+ coll_t cid = coll_t(spg_t(pg_t(0, poolid), shard_id_t::NO_SHARD));
+ auto ch = store->create_new_collection(cid);
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ int r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+
+ std::set<ghobject_t> created;
+ for (auto &[hash, names] : object_names) {
+ for (auto &name : names) {
+ ghobject_t hoid(hobject_t(sobject_t(name, CEPH_NOSNAP),
+ string(),
+ hash,
+ poolid,
+ string()));
+ ASSERT_EQ(hash, hoid.hobj.get_hash());
+ ObjectStore::Transaction t;
+ t.touch(cid, hoid);
+ int r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ created.insert(hoid);
+ }
+ }
+
+ vector<ghobject_t> objects;
+ int r = collection_list(store, ch, ghobject_t(), ghobject_t::get_max(),
+ INT_MAX, &objects, 0, disable_legacy);
+ ASSERT_EQ(r, 0);
+ ASSERT_EQ(created.size(), objects.size());
+ auto it = objects.begin();
+ for (auto &hoid : created) {
+ ASSERT_EQ(hoid, *it);
+ it++;
+ }
+
+ for (auto i = created.begin(); i != created.end(); i++) {
+ auto j = i;
+ for (j++; j != created.end(); j++) {
+ std::set<ghobject_t> created_sub(i, j);
+ objects.clear();
+ ghobject_t next;
+ r = collection_list(store, ch, *i, ghobject_t::get_max(),
+ created_sub.size(), &objects, &next, disable_legacy);
+ ASSERT_EQ(r, 0);
+ ASSERT_EQ(created_sub.size(), objects.size());
+ it = objects.begin();
+ for (auto &hoid : created_sub) {
+ ASSERT_EQ(hoid, *it);
+ it++;
+ }
+ if (j == created.end()) {
+ ASSERT_TRUE(next.is_max());
+ } else {
+ ASSERT_EQ(*j, next);
+ }
+ }
+ }
+
+ for (auto i = created.begin(); i != created.end(); i++) {
+ auto j = i;
+ for (j++; j != created.end(); j++) {
+ std::set<ghobject_t> created_sub(i, j);
+ objects.clear();
+ ghobject_t next;
+ r = collection_list(store, ch, *i, *j, INT_MAX, &objects, &next,
+ disable_legacy);
+ ASSERT_EQ(r, 0);
+ ASSERT_EQ(created_sub.size(), objects.size());
+ it = objects.begin();
+ for (auto &hoid : created_sub) {
+ ASSERT_EQ(hoid, *it);
+ it++;
+ }
+ if (j == created.end()) {
+ ASSERT_TRUE(next.is_max());
+ } else {
+ ASSERT_EQ(*j, next);
+ }
+ }
+ }
+}
+
+TEST_P(StoreTest, ScrubTest) {
+ int64_t poolid = 111;
+ coll_t cid(spg_t(pg_t(0, poolid),shard_id_t(1)));
+ int r;
+ auto ch = store->create_new_collection(cid);
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ string base = "aaaaa";
+ set<ghobject_t> created;
+ for (int i = 0; i < 1000; ++i) {
+ char buf[100];
+    snprintf(buf, sizeof(buf), "%d", i);
+ if (!(i % 5)) {
+ cerr << "Object " << i << std::endl;
+ }
+ ghobject_t hoid(hobject_t(string(buf) + base, string(), CEPH_NOSNAP, i,
+ poolid, ""),
+ ghobject_t::NO_GEN, shard_id_t(1));
+ {
+ ObjectStore::Transaction t;
+ t.touch(cid, hoid);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ created.insert(hoid);
+ }
+
+ // Add same hobject_t but different generation
+ {
+ ghobject_t hoid1(hobject_t("same-object", string(), CEPH_NOSNAP, 0, poolid, ""),
+ ghobject_t::NO_GEN, shard_id_t(1));
+ ghobject_t hoid2(hobject_t("same-object", string(), CEPH_NOSNAP, 0, poolid, ""), (gen_t)1, shard_id_t(1));
+ ghobject_t hoid3(hobject_t("same-object", string(), CEPH_NOSNAP, 0, poolid, ""), (gen_t)2, shard_id_t(1));
+ ObjectStore::Transaction t;
+ t.touch(cid, hoid1);
+ t.touch(cid, hoid2);
+ t.touch(cid, hoid3);
+ r = queue_transaction(store, ch, std::move(t));
+ created.insert(hoid1);
+ created.insert(hoid2);
+ created.insert(hoid3);
+ ASSERT_EQ(r, 0);
+ }
+
+ vector<ghobject_t> objects;
+ r = collection_list(store, ch, ghobject_t(), ghobject_t::get_max(), INT_MAX,
+ &objects, 0);
+ ASSERT_EQ(r, 0);
+ set<ghobject_t> listed(objects.begin(), objects.end());
+ cerr << "listed.size() is " << listed.size() << " and created.size() is " << created.size() << std::endl;
+ ASSERT_TRUE(listed.size() == created.size());
+ objects.clear();
+ listed.clear();
+ ghobject_t current, next;
+ while (1) {
+ r = collection_list(store, ch, current, ghobject_t::get_max(), 60, &objects,
+ &next);
+ ASSERT_EQ(r, 0);
+ ASSERT_TRUE(sorted(objects));
+ for (vector<ghobject_t>::iterator i = objects.begin();
+ i != objects.end(); ++i) {
+ if (listed.count(*i))
+ cerr << *i << " repeated" << std::endl;
+ listed.insert(*i);
+ }
+ if (objects.size() < 50) {
+ ASSERT_TRUE(next.is_max());
+ break;
+ }
+ objects.clear();
+ current = next.get_boundary();
+ }
+ cerr << "listed.size() is " << listed.size() << std::endl;
+ ASSERT_TRUE(listed.size() == created.size());
+ for (set<ghobject_t>::iterator i = listed.begin();
+ i != listed.end();
+ ++i) {
+ ASSERT_TRUE(created.count(*i));
+ }
+
+ for (set<ghobject_t>::iterator i = created.begin();
+ i != created.end();
+ ++i) {
+ ObjectStore::Transaction t;
+ t.remove(cid, *i);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ ObjectStore::Transaction t;
+ t.remove_collection(cid);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+}
+
+
+TEST_P(StoreTest, OMapTest) {
+ coll_t cid;
+ ghobject_t hoid(hobject_t("tesomap", "", CEPH_NOSNAP, 0, 0, ""));
+ auto ch = store->create_new_collection(cid);
+ int r;
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+
+ map<string, bufferlist> attrs;
+ {
+ ObjectStore::Transaction t;
+ t.touch(cid, hoid);
+ t.omap_clear(cid, hoid);
+ map<string, bufferlist> start_set;
+ t.omap_setkeys(cid, hoid, start_set);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+
+ for (int i = 0; i < 100; i++) {
+ if (!(i%5)) {
+ std::cout << "On iteration " << i << std::endl;
+ }
+ ObjectStore::Transaction t;
+ bufferlist bl;
+ map<string, bufferlist> cur_attrs;
+ r = store->omap_get(ch, hoid, &bl, &cur_attrs);
+ ASSERT_EQ(r, 0);
+ for (map<string, bufferlist>::iterator j = attrs.begin();
+ j != attrs.end();
+ ++j) {
+ bool correct = cur_attrs.count(j->first) && string(cur_attrs[j->first].c_str()) == string(j->second.c_str());
+ if (!correct) {
+ std::cout << j->first << " is present in cur_attrs " << cur_attrs.count(j->first) << " times " << std::endl;
+ if (cur_attrs.count(j->first) > 0) {
+ std::cout << j->second.c_str() << " : " << cur_attrs[j->first].c_str() << std::endl;
+ }
+ }
+ ASSERT_EQ(correct, true);
+ }
+ ASSERT_EQ(attrs.size(), cur_attrs.size());
+
+ char buf[100];
+ snprintf(buf, sizeof(buf), "%d", i);
+ bl.clear();
+ bufferptr bp(buf, strlen(buf) + 1);
+ bl.append(bp);
+ map<string, bufferlist> to_add;
+ to_add.insert(pair<string, bufferlist>("key-" + string(buf), bl));
+ attrs.insert(pair<string, bufferlist>("key-" + string(buf), bl));
+ t.omap_setkeys(cid, hoid, to_add);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+
+ int i = 0;
+ while (attrs.size()) {
+ if (!(i%5)) {
+ std::cout << "removal: On iteration " << i << std::endl;
+ }
+ ObjectStore::Transaction t;
+ bufferlist bl;
+ map<string, bufferlist> cur_attrs;
+ r = store->omap_get(ch, hoid, &bl, &cur_attrs);
+ ASSERT_EQ(r, 0);
+ for (map<string, bufferlist>::iterator j = attrs.begin();
+ j != attrs.end();
+ ++j) {
+ bool correct = cur_attrs.count(j->first) && string(cur_attrs[j->first].c_str()) == string(j->second.c_str());
+ if (!correct) {
+ std::cout << j->first << " is present in cur_attrs " << cur_attrs.count(j->first) << " times " << std::endl;
+ if (cur_attrs.count(j->first) > 0) {
+ std::cout << j->second.c_str() << " : " << cur_attrs[j->first].c_str() << std::endl;
+ }
+ }
+ ASSERT_EQ(correct, true);
+ }
+
+ string to_remove = attrs.begin()->first;
+ t.omap_rmkey(cid, hoid, to_remove);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+
+ attrs.erase(to_remove);
+
+ ++i;
+ }
+
+ {
+ bufferlist bl1;
+ bl1.append("omap_header");
+ ObjectStore::Transaction t;
+ t.omap_setheader(cid, hoid, bl1);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
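+    // queue_transaction() consumed t via std::move; reset it before reuse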
+ t = ObjectStore::Transaction();
+
+ bufferlist bl2;
+ bl2.append("value");
+ map<string, bufferlist> to_add;
+ to_add.insert(pair<string, bufferlist>("key", bl2));
+ t.omap_setkeys(cid, hoid, to_add);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+
+ bufferlist bl3;
+ map<string, bufferlist> cur_attrs;
+ r = store->omap_get(ch, hoid, &bl3, &cur_attrs);
+ ASSERT_EQ(r, 0);
+ ASSERT_EQ(cur_attrs.size(), size_t(1));
+ ASSERT_TRUE(bl_eq(bl1, bl3));
+
+ set<string> keys;
+ r = store->omap_get_keys(ch, hoid, &keys);
+ ASSERT_EQ(r, 0);
+ ASSERT_EQ(keys.size(), size_t(1));
+ }
+
+ // test omap_clear, omap_rmkey_range
+ {
+ {
+ map<string,bufferlist> to_set;
+ for (int n=0; n<10; ++n) {
+ to_set[stringify(n)].append("foo");
+ }
+ bufferlist h;
+ h.append("header");
+ ObjectStore::Transaction t;
+ t.remove(cid, hoid);
+ t.touch(cid, hoid);
+ t.omap_setheader(cid, hoid, h);
+ t.omap_setkeys(cid, hoid, to_set);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+ ObjectStore::Transaction t;
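+      // rmkeyrange removes keys in the half-open range ["3", "7"):
+      // "3".."6" are dropped, "7" survives (verified below)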
+ t.omap_rmkeyrange(cid, hoid, "3", "7");
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+ bufferlist hdr;
+ map<string,bufferlist> m;
+ store->omap_get(ch, hoid, &hdr, &m);
+ ASSERT_EQ(6u, hdr.length());
+ ASSERT_TRUE(m.count("2"));
+ ASSERT_TRUE(!m.count("3"));
+ ASSERT_TRUE(!m.count("6"));
+ ASSERT_TRUE(m.count("7"));
+ ASSERT_TRUE(m.count("8"));
+ //cout << m << std::endl;
+ ASSERT_EQ(6u, m.size());
+ }
+ {
+ ObjectStore::Transaction t;
+ t.omap_clear(cid, hoid);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+ bufferlist hdr;
+ map<string,bufferlist> m;
+ store->omap_get(ch, hoid, &hdr, &m);
+ ASSERT_EQ(0u, hdr.length());
+ ASSERT_EQ(0u, m.size());
+ }
+ }
+
+ ObjectStore::Transaction t;
+ t.remove(cid, hoid);
+ t.remove_collection(cid);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+}
+
+TEST_P(StoreTest, OMapIterator) {
+ coll_t cid;
+ ghobject_t hoid(hobject_t("tesomap", "", CEPH_NOSNAP, 0, 0, ""));
+ int count = 0;
+ auto ch = store->create_new_collection(cid);
+ int r;
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+
+ map<string, bufferlist> attrs;
+ {
+ ObjectStore::Transaction t;
+ t.touch(cid, hoid);
+ t.omap_clear(cid, hoid);
+ map<string, bufferlist> start_set;
+ t.omap_setkeys(cid, hoid, start_set);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ ObjectMap::ObjectMapIterator iter;
+ bool correct;
+ //basic iteration
+ for (int i = 0; i < 100; i++) {
+ if (!(i%5)) {
+ std::cout << "On iteration " << i << std::endl;
+ }
+ bufferlist bl;
+
+ // FileStore may deadlock two active iterators over the same data
+ iter = ObjectMap::ObjectMapIterator();
+
+ iter = store->get_omap_iterator(ch, hoid);
+ for (iter->seek_to_first(), count=0; iter->valid(); iter->next(), count++) {
+ string key = iter->key();
+ bufferlist value = iter->value();
+ correct = attrs.count(key) && (string(value.c_str()) == string(attrs[key].c_str()));
+ if (!correct) {
+ if (attrs.count(key) > 0) {
+	  std::cout << "key " << key << " in omap, " << value.c_str() << " : " << attrs[key].c_str() << std::endl;
+ }
+ else
+	  std::cout << "key " << key << " should not exist in omap" << std::endl;
+ }
+ ASSERT_EQ(correct, true);
+ }
+ ASSERT_EQ((int)attrs.size(), count);
+
+ // FileStore may deadlock an active iterator vs queue_transaction
+ iter = ObjectMap::ObjectMapIterator();
+
+ char buf[100];
+ snprintf(buf, sizeof(buf), "%d", i);
+ bl.clear();
+ bufferptr bp(buf, strlen(buf) + 1);
+ bl.append(bp);
+ map<string, bufferlist> to_add;
+ to_add.insert(pair<string, bufferlist>("key-" + string(buf), bl));
+ attrs.insert(pair<string, bufferlist>("key-" + string(buf), bl));
+ ObjectStore::Transaction t;
+ t.omap_setkeys(cid, hoid, to_add);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+
+ iter = store->get_omap_iterator(ch, hoid);
+ //lower bound
+ string bound_key = "key-5";
+ iter->lower_bound(bound_key);
+ correct = bound_key <= iter->key();
+ if (!correct) {
+ std::cout << "lower bound, bound key is " << bound_key << " < iter key is " << iter->key() << std::endl;
+ }
+ ASSERT_EQ(correct, true);
+ //upper bound
+ iter->upper_bound(bound_key);
+ correct = iter->key() > bound_key;
+ if (!correct) {
+ std::cout << "upper bound, bound key is " << bound_key << " >= iter key is " << iter->key() << std::endl;
+ }
+ ASSERT_EQ(correct, true);
+
+ // FileStore may deadlock an active iterator vs queue_transaction
+ iter = ObjectMap::ObjectMapIterator();
+ {
+ ObjectStore::Transaction t;
+ t.remove(cid, hoid);
+ t.remove_collection(cid);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+}
+
+TEST_P(StoreTest, XattrTest) {
+ coll_t cid;
+ ghobject_t hoid(hobject_t("tesomap", "", CEPH_NOSNAP, 0, 0, ""));
+ bufferlist big;
+ for (unsigned i = 0; i < 10000; ++i) {
+ big.append('\0');
+ }
+ bufferlist small;
+ for (unsigned i = 0; i < 10; ++i) {
+ small.append('\0');
+ }
+ int r;
+ auto ch = store->create_new_collection(cid);
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ t.touch(cid, hoid);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+
+ map<string, bufferlist> attrs;
+ {
+ ObjectStore::Transaction t;
+ t.setattr(cid, hoid, "attr1", small);
+ attrs["attr1"] = small;
+ t.setattr(cid, hoid, "attr2", big);
+ attrs["attr2"] = big;
+ t.setattr(cid, hoid, "attr3", small);
+ attrs["attr3"] = small;
+ t.setattr(cid, hoid, "attr1", small);
+ attrs["attr1"] = small;
+ t.setattr(cid, hoid, "attr4", big);
+ attrs["attr4"] = big;
+ t.setattr(cid, hoid, "attr3", big);
+ attrs["attr3"] = big;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+
+ map<string, bufferptr, less<>> aset;
+ store->getattrs(ch, hoid, aset);
+ ASSERT_EQ(aset.size(), attrs.size());
+ for (map<string, bufferptr>::iterator i = aset.begin();
+ i != aset.end();
+ ++i) {
+ bufferlist bl;
+ bl.push_back(i->second);
+ ASSERT_TRUE(attrs[i->first] == bl);
+ }
+
+ {
+ ObjectStore::Transaction t;
+ t.rmattr(cid, hoid, "attr2");
+ attrs.erase("attr2");
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+
+ aset.clear();
+ store->getattrs(ch, hoid, aset);
+ ASSERT_EQ(aset.size(), attrs.size());
+ for (map<string, bufferptr>::iterator i = aset.begin();
+ i != aset.end();
+ ++i) {
+ bufferlist bl;
+ bl.push_back(i->second);
+ ASSERT_TRUE(attrs[i->first] == bl);
+ }
+
+ bufferptr bp;
+ r = store->getattr(ch, hoid, "attr2", bp);
+ ASSERT_EQ(r, -ENODATA);
+
+ r = store->getattr(ch, hoid, "attr3", bp);
+ ASSERT_EQ(r, 0);
+ bufferlist bl2;
+ bl2.push_back(bp);
+ ASSERT_TRUE(bl2 == attrs["attr3"]);
+
+ ObjectStore::Transaction t;
+ t.remove(cid, hoid);
+ t.remove_collection(cid);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+}
+
+void colsplittest(
+ ObjectStore *store,
+ unsigned num_objects,
+ unsigned common_suffix_size,
+ bool clones
+ ) {
+ coll_t cid(spg_t(pg_t(0,52),shard_id_t::NO_SHARD));
+ coll_t tid(spg_t(pg_t(1<<common_suffix_size,52),shard_id_t::NO_SHARD));
+ auto ch = store->create_new_collection(cid);
+ auto tch = store->create_new_collection(tid);
+ int r = 0;
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, common_suffix_size);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ bufferlist small;
+ small.append("small");
+ {
+ ObjectStore::Transaction t;
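+    // Create 2*num_objects in total: without clones the loop runs
+    // 2*num_objects times, with clones each of num_objects iterations
+    // writes an object plus its clone.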
+ for (uint32_t i = 0; i < (2 - (int)clones)*num_objects; ++i) {
+ stringstream objname;
+ objname << "obj" << i;
+ ghobject_t a(hobject_t(
+ objname.str(),
+ "",
+ CEPH_NOSNAP,
+ i<<common_suffix_size,
+ 52, ""));
+ t.write(cid, a, 0, small.length(), small,
+ CEPH_OSD_OP_FLAG_FADVISE_WILLNEED);
+ if (clones) {
+ objname << "-clone";
+ ghobject_t b(hobject_t(
+ objname.str(),
+ "",
+ CEPH_NOSNAP,
+ i<<common_suffix_size,
+ 52, ""));
+ t.clone(cid, a, b);
+ }
+ if (i % 100) {
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ t = ObjectStore::Transaction();
+ }
+ }
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(tid, common_suffix_size + 1);
+ t.split_collection(cid, common_suffix_size+1, 1<<common_suffix_size, tid);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ ch->flush();
+
+ // check
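+  // split_collection moved every object whose hash has bit
+  // common_suffix_size set (i.e. odd i above) into tid; the even half
+  // stayed in cid, so each collection should now hold num_objects.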
+ vector<ghobject_t> objects;
+ r = collection_list(store, ch, ghobject_t(), ghobject_t::get_max(), INT_MAX,
+ &objects, 0);
+ ASSERT_EQ(r, 0);
+ ASSERT_EQ(objects.size(), num_objects);
+ for (vector<ghobject_t>::iterator i = objects.begin();
+ i != objects.end();
+ ++i) {
+ ASSERT_EQ(!!(i->hobj.get_hash() & (1<<common_suffix_size)), 0u);
+ }
+
+ objects.clear();
+ r = collection_list(store, tch, ghobject_t(), ghobject_t::get_max(), INT_MAX,
+ &objects, 0);
+ ASSERT_EQ(r, 0);
+ ASSERT_EQ(objects.size(), num_objects);
+ for (vector<ghobject_t>::iterator i = objects.begin();
+ i != objects.end();
+ ++i) {
+ ASSERT_EQ(!(i->hobj.get_hash() & (1<<common_suffix_size)), 0u);
+ }
+
+ // merge them again!
+ {
+ ObjectStore::Transaction t;
+ t.merge_collection(tid, cid, common_suffix_size);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+
+ // check and clean up
+ ObjectStore::Transaction t;
+ {
+ vector<ghobject_t> objects;
+ r = collection_list(store, ch, ghobject_t(), ghobject_t::get_max(), INT_MAX,
+ &objects, 0);
+ ASSERT_EQ(r, 0);
+ ASSERT_EQ(objects.size(), num_objects * 2); // both halves
+ unsigned size = 0;
+ for (vector<ghobject_t>::iterator i = objects.begin();
+ i != objects.end();
+ ++i) {
+ t.remove(cid, *i);
+ if (++size > 100) {
+ size = 0;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ t = ObjectStore::Transaction();
+ }
+ }
+ }
+ t.remove_collection(cid);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+
+ ch->flush();
+ ASSERT_TRUE(!store->collection_exists(tid));
+}
+
+TEST_P(StoreTest, ColSplitTest0) {
+ colsplittest(store.get(), 10, 5, false);
+}
+TEST_P(StoreTest, ColSplitTest1) {
+ colsplittest(store.get(), 10000, 11, false);
+}
+TEST_P(StoreTest, ColSplitTest1Clones) {
+ colsplittest(store.get(), 10000, 11, true);
+}
+TEST_P(StoreTest, ColSplitTest2) {
+ colsplittest(store.get(), 100, 7, false);
+}
+TEST_P(StoreTest, ColSplitTest2Clones) {
+ colsplittest(store.get(), 100, 7, true);
+}
+
+#if 0
+TEST_P(StoreTest, ColSplitTest3) {
+  colsplittest(store.get(), 100000, 25, false);
+}
+#endif
+
+void test_merge_skewed(ObjectStore *store,
+ unsigned base, unsigned bits,
+ unsigned anum, unsigned bnum)
+{
+ cout << __func__ << " 0x" << std::hex << base << std::dec
+ << " bits " << bits
+ << " anum " << anum << " bnum " << bnum << std::endl;
+ /*
+    Make the merge source PGs have radically different numbers of
+    objects in them, which should trigger different splitting in
+    FileStore, and verify that post-merge all objects are accessible.
+  */
+ int r;
+ coll_t a(spg_t(pg_t(base, 0), shard_id_t::NO_SHARD));
+ coll_t b(spg_t(pg_t(base | (1<<bits), 0), shard_id_t::NO_SHARD));
+
+ auto cha = store->create_new_collection(a);
+ auto chb = store->create_new_collection(b);
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(a, bits + 1);
+ r = queue_transaction(store, cha, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(b, bits + 1);
+ r = queue_transaction(store, chb, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+
+ bufferlist small;
+ small.append("small");
+ string suffix = "ooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooaaaaaaaaaa";
+ set<ghobject_t> aobjects, bobjects;
+ {
+ // fill a
+ ObjectStore::Transaction t;
+ for (unsigned i = 0; i < 1000; ++i) {
+ string objname = "a" + stringify(i) + suffix;
+ ghobject_t o(hobject_t(
+ objname,
+ "",
+ CEPH_NOSNAP,
+ i<<(bits+1) | base,
+ 52, ""));
+ aobjects.insert(o);
+ t.write(a, o, 0, small.length(), small, 0);
+ if (i % 100) {
+ r = queue_transaction(store, cha, std::move(t));
+ ASSERT_EQ(r, 0);
+ t = ObjectStore::Transaction();
+ }
+ }
+ r = queue_transaction(store, cha, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+ // fill b
+ ObjectStore::Transaction t;
+ for (unsigned i = 0; i < 10; ++i) {
+ string objname = "b" + stringify(i) + suffix;
+ ghobject_t o(hobject_t(
+ objname,
+ "",
+ CEPH_NOSNAP,
+ (i<<(base+1)) | base | (1<<bits),
+ 52, ""));
+ bobjects.insert(o);
+ t.write(b, o, 0, small.length(), small, 0);
+ if (i % 100) {
+ r = queue_transaction(store, chb, std::move(t));
+ ASSERT_EQ(r, 0);
+ t = ObjectStore::Transaction();
+ }
+ }
+ r = queue_transaction(store, chb, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+
+ // merge b->a
+ {
+ ObjectStore::Transaction t;
+ t.merge_collection(b, a, bits);
+ r = queue_transaction(store, cha, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+
+ // verify
+ {
+ vector<ghobject_t> got;
+ collection_list(store, cha, ghobject_t(), ghobject_t::get_max(), INT_MAX,
+ &got, 0);
+ set<ghobject_t> gotset;
+ for (auto& o : got) {
+ ASSERT_TRUE(aobjects.count(o) || bobjects.count(o));
+ gotset.insert(o);
+ }
+ // check both listing and stat-ability (different code paths!)
+ struct stat st;
+ for (auto& o : aobjects) {
+ ASSERT_TRUE(gotset.count(o));
+ int r = store->stat(cha, o, &st, false);
+ ASSERT_EQ(r, 0);
+ }
+ for (auto& o : bobjects) {
+ ASSERT_TRUE(gotset.count(o));
+ int r = store->stat(cha, o, &st, false);
+ ASSERT_EQ(r, 0);
+ }
+ }
+
+ // clean up
+ {
+ ObjectStore::Transaction t;
+ for (auto &o : aobjects) {
+ t.remove(a, o);
+ }
+ r = queue_transaction(store, cha, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+ ObjectStore::Transaction t;
+ for (auto &o : bobjects) {
+ t.remove(a, o);
+ }
+ t.remove_collection(a);
+ r = queue_transaction(store, cha, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+}
+
+TEST_P(StoreTest, MergeSkewed) {
+ if (string(GetParam()) != "filestore")
+ return;
+
+ // this is sufficient to exercise merges with different hashing levels
+ test_merge_skewed(store.get(), 0xf, 4, 10, 10000);
+ test_merge_skewed(store.get(), 0xf, 4, 10000, 10);
+
+ /*
+ // this covers a zillion variations that all boil down to the same thing
+ for (unsigned base = 3; base < 0x1000; base *= 5) {
+ unsigned bits;
+ unsigned t = base;
+ for (bits = 0; t; t >>= 1) {
+ ++bits;
+ }
+ for (unsigned b = bits; b < bits + 10; b += 3) {
+ for (auto anum : { 10, 1000, 10000 }) {
+ for (auto bnum : { 10, 1000, 10000 }) {
+ if (anum == bnum) {
+ continue;
+ }
+ test_merge_skewed(store.get(), base, b, anum, bnum);
+ }
+ }
+ }
+ }
+ */
+}
+
+
+/**
+ * This test adds two different groups of objects, each with one
+ * common prefix and one different prefix. We then remove half of
+ * them in order to verify that the merging correctly stops at the
+ * common prefix subdir. See bug #5273.
+ */
+TEST_P(StoreTest, TwoHash) {
+ coll_t cid;
+ int r;
+ auto ch = store->create_new_collection(cid);
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ std::cout << "Making objects" << std::endl;
+ for (int i = 0; i < 360; ++i) {
+ ObjectStore::Transaction t;
+ ghobject_t o;
+ o.hobj.pool = -1;
+ if (i < 8) {
+ o.hobj.set_hash((i << 16) | 0xA1);
+ t.touch(cid, o);
+ }
+ o.hobj.set_hash((i << 16) | 0xB1);
+ t.touch(cid, o);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ std::cout << "Removing half" << std::endl;
+ for (int i = 1; i < 8; ++i) {
+ ObjectStore::Transaction t;
+ ghobject_t o;
+ o.hobj.pool = -1;
+ o.hobj.set_hash((i << 16) | 0xA1);
+ t.remove(cid, o);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ std::cout << "Checking" << std::endl;
+ for (int i = 1; i < 8; ++i) {
+ ObjectStore::Transaction t;
+ ghobject_t o;
+ o.hobj.set_hash((i << 16) | 0xA1);
+ o.hobj.pool = -1;
+ bool exists = store->exists(ch, o);
+ ASSERT_EQ(exists, false);
+ }
+ {
+ ghobject_t o;
+ o.hobj.set_hash(0xA1);
+ o.hobj.pool = -1;
+ bool exists = store->exists(ch, o);
+ ASSERT_EQ(exists, true);
+ }
+ std::cout << "Cleanup" << std::endl;
+ for (int i = 0; i < 360; ++i) {
+ ObjectStore::Transaction t;
+ ghobject_t o;
+ o.hobj.set_hash((i << 16) | 0xA1);
+ o.hobj.pool = -1;
+ t.remove(cid, o);
+ o.hobj.set_hash((i << 16) | 0xB1);
+ t.remove(cid, o);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ ObjectStore::Transaction t;
+ t.remove_collection(cid);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+}
+
+TEST_P(StoreTest, Rename) {
+ coll_t cid(spg_t(pg_t(0, 2122),shard_id_t::NO_SHARD));
+ ghobject_t srcoid(hobject_t("src_oid", "", CEPH_NOSNAP, 0, 0, ""));
+ ghobject_t dstoid(hobject_t("dest_oid", "", CEPH_NOSNAP, 0, 0, ""));
+ bufferlist a, b;
+ a.append("foo");
+ b.append("bar");
+ auto ch = store->create_new_collection(cid);
+ int r;
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ t.write(cid, srcoid, 0, a.length(), a);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ ASSERT_TRUE(store->exists(ch, srcoid));
+ {
+ ObjectStore::Transaction t;
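+    // In a single transaction: move src to dst, then recreate src with
+    // new content. The reads below verify dst kept the old data ("foo")
+    // while the recreated src holds the new data ("bar").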
+ t.collection_move_rename(cid, srcoid, cid, dstoid);
+ t.write(cid, srcoid, 0, b.length(), b);
+ t.setattr(cid, srcoid, "attr", b);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ ASSERT_TRUE(store->exists(ch, srcoid));
+ ASSERT_TRUE(store->exists(ch, dstoid));
+ {
+ bufferlist bl;
+ store->read(ch, srcoid, 0, 3, bl);
+ ASSERT_TRUE(bl_eq(b, bl));
+ store->read(ch, dstoid, 0, 3, bl);
+ ASSERT_TRUE(bl_eq(a, bl));
+ }
+ {
+ ObjectStore::Transaction t;
+ t.remove(cid, dstoid);
+ t.collection_move_rename(cid, srcoid, cid, dstoid);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ ASSERT_TRUE(store->exists(ch, dstoid));
+ ASSERT_FALSE(store->exists(ch, srcoid));
+ {
+ bufferlist bl;
+ store->read(ch, dstoid, 0, 3, bl);
+ ASSERT_TRUE(bl_eq(b, bl));
+ }
+ {
+ ObjectStore::Transaction t;
+ t.remove(cid, dstoid);
+ t.remove_collection(cid);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+}
+
+TEST_P(StoreTest, MoveRename) {
+ coll_t cid(spg_t(pg_t(0, 212),shard_id_t::NO_SHARD));
+ ghobject_t temp_oid(hobject_t("tmp_oid", "", CEPH_NOSNAP, 0, 0, ""));
+ ghobject_t oid(hobject_t("dest_oid", "", CEPH_NOSNAP, 0, 0, ""));
+ auto ch = store->create_new_collection(cid);
+ int r;
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ t.touch(cid, oid);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ ASSERT_TRUE(store->exists(ch, oid));
+ bufferlist data, attr;
+ map<string, bufferlist> omap;
+ data.append("data payload");
+ attr.append("attr value");
+ omap["omap_key"].append("omap value");
+ {
+ ObjectStore::Transaction t;
+ t.touch(cid, temp_oid);
+ t.write(cid, temp_oid, 0, data.length(), data);
+ t.setattr(cid, temp_oid, "attr", attr);
+ t.omap_setkeys(cid, temp_oid, omap);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ ASSERT_TRUE(store->exists(ch, temp_oid));
+ {
+ ObjectStore::Transaction t;
+ t.remove(cid, oid);
+ t.collection_move_rename(cid, temp_oid, cid, oid);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ ASSERT_TRUE(store->exists(ch, oid));
+ ASSERT_FALSE(store->exists(ch, temp_oid));
+ {
+ bufferlist newdata;
+ r = store->read(ch, oid, 0, 1000, newdata);
+ ASSERT_GE(r, 0);
+ ASSERT_TRUE(bl_eq(data, newdata));
+ bufferlist newattr;
+ r = store->getattr(ch, oid, "attr", newattr);
+ ASSERT_EQ(r, 0);
+ ASSERT_TRUE(bl_eq(attr, newattr));
+ set<string> keys;
+ keys.insert("omap_key");
+ map<string, bufferlist> newomap;
+ r = store->omap_get_values(ch, oid, keys, &newomap);
+ ASSERT_GE(r, 0);
+ ASSERT_EQ(1u, newomap.size());
+ ASSERT_TRUE(newomap.count("omap_key"));
+ ASSERT_TRUE(bl_eq(omap["omap_key"], newomap["omap_key"]));
+ }
+ {
+ ObjectStore::Transaction t;
+ t.remove(cid, oid);
+ t.remove_collection(cid);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+}
+
+TEST_P(StoreTest, BigRGWObjectName) {
+ coll_t cid(spg_t(pg_t(0,12),shard_id_t::NO_SHARD));
+ ghobject_t oid(
+ hobject_t(
+ "default.4106.50_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "",
+ CEPH_NOSNAP,
+ 0x81920472,
+ 12,
+ ""),
+ 15,
+ shard_id_t::NO_SHARD);
+ ghobject_t oid2(oid);
+ oid2.generation = 17;
+ ghobject_t oidhead(oid);
+ oidhead.generation = ghobject_t::NO_GEN;
+
+ auto ch = store->create_new_collection(cid);
+
+ int r;
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ t.touch(cid, oidhead);
+ t.collection_move_rename(cid, oidhead, cid, oid);
+ t.touch(cid, oidhead);
+ t.collection_move_rename(cid, oidhead, cid, oid2);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+
+ {
+ ObjectStore::Transaction t;
+ t.remove(cid, oid);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+
+ {
+ vector<ghobject_t> objects;
+ r = collection_list(store, ch, ghobject_t(), ghobject_t::get_max(), INT_MAX,
+ &objects, 0);
+ ASSERT_EQ(r, 0);
+ ASSERT_EQ(objects.size(), 1u);
+ ASSERT_EQ(objects[0], oid2);
+ }
+
+ ASSERT_FALSE(store->exists(ch, oid));
+
+ {
+ ObjectStore::Transaction t;
+ t.remove(cid, oid2);
+ t.remove_collection(cid);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+
+ }
+}
+
+TEST_P(StoreTest, SetAllocHint) {
+ coll_t cid;
+ ghobject_t hoid(hobject_t("test_hint", "", CEPH_NOSNAP, 0, 0, ""));
+ auto ch = store->create_new_collection(cid);
+ int r;
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ t.touch(cid, hoid);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+ ObjectStore::Transaction t;
+ t.set_alloc_hint(cid, hoid, 4*1024*1024, 1024*4, 0);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+ ObjectStore::Transaction t;
+ t.remove(cid, hoid);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+ ObjectStore::Transaction t;
+ t.set_alloc_hint(cid, hoid, 4*1024*1024, 1024*4, 0);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+ ObjectStore::Transaction t;
+ t.remove_collection(cid);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+}
+
+TEST_P(StoreTest, TryMoveRename) {
+ coll_t cid;
+ ghobject_t hoid(hobject_t("test_hint", "", CEPH_NOSNAP, 0, -1, ""));
+ ghobject_t hoid2(hobject_t("test_hint2", "", CEPH_NOSNAP, 0, -1, ""));
+ auto ch = store->create_new_collection(cid);
+ int r;
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+ ObjectStore::Transaction t;
+ t.try_rename(cid, hoid, hoid2);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+ ObjectStore::Transaction t;
+ t.touch(cid, hoid);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+ ObjectStore::Transaction t;
+ t.try_rename(cid, hoid, hoid2);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ struct stat st;
+ ASSERT_EQ(store->stat(ch, hoid, &st), -ENOENT);
+ ASSERT_EQ(store->stat(ch, hoid2, &st), 0);
+}
+
+#if defined(WITH_BLUESTORE)
+TEST_P(StoreTest, BluestoreOnOffCSumTest) {
+ if (string(GetParam()) != "bluestore")
+ return;
+ SetVal(g_conf(), "bluestore_csum_type", "crc32c");
+ g_conf().apply_changes(nullptr);
+
+ int r;
+ coll_t cid;
+ ghobject_t hoid(hobject_t(sobject_t("Object 1", CEPH_NOSNAP)));
+ {
+ auto ch = store->open_collection(cid);
+ ASSERT_FALSE(ch);
+ }
+ auto ch = store->create_new_collection(cid);
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ cerr << "Creating collection " << cid << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+ //write with csum enabled followed by read with csum disabled
+ size_t block_size = 64*1024;
+ ObjectStore::Transaction t;
+ bufferlist bl, orig;
+ bl.append(std::string(block_size, 'a'));
+ orig = bl;
+ t.remove(cid, hoid);
+ t.set_alloc_hint(cid, hoid, 4*1024*1024, 1024*8, 0);
+ t.write(cid, hoid, 0, bl.length(), bl);
+ cerr << "Remove then create" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+
+ SetVal(g_conf(), "bluestore_csum_type", "none");
+ g_conf().apply_changes(nullptr);
+
+ bufferlist in;
+ r = store->read(ch, hoid, 0, block_size, in);
+ ASSERT_EQ((int)block_size, r);
+ ASSERT_TRUE(bl_eq(orig, in));
+
+ }
+ {
+ //write with csum disabled followed by read with csum enabled
+
+ size_t block_size = 64*1024;
+ ObjectStore::Transaction t;
+ bufferlist bl, orig;
+ bl.append(std::string(block_size, 'a'));
+ orig = bl;
+ t.remove(cid, hoid);
+ t.set_alloc_hint(cid, hoid, 4*1024*1024, 1024*8, 0);
+ t.write(cid, hoid, 0, bl.length(), bl);
+ cerr << "Remove then create" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+
+ SetVal(g_conf(), "bluestore_csum_type", "crc32c");
+ g_conf().apply_changes(nullptr);
+
+ bufferlist in;
+ r = store->read(ch, hoid, 0, block_size, in);
+ ASSERT_EQ((int)block_size, r);
+ ASSERT_TRUE(bl_eq(orig, in));
+ }
+ {
+ //'mixed' non-overlapping writes to the same blob
+
+ ObjectStore::Transaction t;
+ bufferlist bl, orig;
+ size_t block_size = 8000;
+ bl.append(std::string(block_size, 'a'));
+ orig = bl;
+ t.remove(cid, hoid);
+ t.write(cid, hoid, 0, bl.length(), bl);
+ cerr << "Remove then create" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+
+ SetVal(g_conf(), "bluestore_csum_type", "none");
+ g_conf().apply_changes(nullptr);
+
+ ObjectStore::Transaction t2;
+ t2.write(cid, hoid, block_size*2, bl.length(), bl);
+ cerr << "Append 'unprotected'" << std::endl;
+ r = queue_transaction(store, ch, std::move(t2));
+ ASSERT_EQ(r, 0);
+
+ bufferlist in;
+ r = store->read(ch, hoid, 0, block_size, in);
+ ASSERT_EQ((int)block_size, r);
+ ASSERT_TRUE(bl_eq(orig, in));
+ in.clear();
+ r = store->read(ch, hoid, block_size*2, block_size, in);
+ ASSERT_EQ((int)block_size, r);
+ ASSERT_TRUE(bl_eq(orig, in));
+
+ SetVal(g_conf(), "bluestore_csum_type", "crc32c");
+ g_conf().apply_changes(nullptr);
+ in.clear();
+ r = store->read(ch, hoid, 0, block_size, in);
+ ASSERT_EQ((int)block_size, r);
+ ASSERT_TRUE(bl_eq(orig, in));
+ in.clear();
+ r = store->read(ch, hoid, block_size*2, block_size, in);
+ ASSERT_EQ((int)block_size, r);
+ ASSERT_TRUE(bl_eq(orig, in));
+ }
+ {
+ //partially blob overwrite under a different csum enablement mode
+
+ ObjectStore::Transaction t;
+ bufferlist bl, orig, orig2;
+ size_t block_size0 = 0x10000;
+ size_t block_size = 9000;
+ size_t block_size2 = 5000;
+ bl.append(std::string(block_size0, 'a'));
+ t.remove(cid, hoid);
+ t.set_alloc_hint(cid, hoid, 4*1024*1024, 1024*8, 0);
+ t.write(cid, hoid, 0, bl.length(), bl);
+ cerr << "Remove then create" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+
+ SetVal(g_conf(), "bluestore_csum_type", "none");
+ g_conf().apply_changes(nullptr);
+
+ ObjectStore::Transaction t2;
+ bl.clear();
+ bl.append(std::string(block_size, 'b'));
+ t2.write(cid, hoid, 0, bl.length(), bl);
+ t2.write(cid, hoid, block_size0, bl.length(), bl);
+ cerr << "Overwrite with unprotected data" << std::endl;
+ r = queue_transaction(store, ch, std::move(t2));
+ ASSERT_EQ(r, 0);
+
+ orig = bl;
+ orig2 = bl;
+ orig.append( std::string(block_size0 - block_size, 'a'));
+
+ bufferlist in;
+ r = store->read(ch, hoid, 0, block_size0, in);
+ ASSERT_EQ((int)block_size0, r);
+ ASSERT_TRUE(bl_eq(orig, in));
+
+ r = store->read(ch, hoid, block_size0, block_size, in);
+ ASSERT_EQ((int)block_size, r);
+ ASSERT_TRUE(bl_eq(orig2, in));
+
+ SetVal(g_conf(), "bluestore_csum_type", "crc32c");
+ g_conf().apply_changes(nullptr);
+
+ ObjectStore::Transaction t3;
+ bl.clear();
+ bl.append(std::string(block_size2, 'c'));
+ t3.write(cid, hoid, block_size0, bl.length(), bl);
+ cerr << "Overwrite with protected data" << std::endl;
+ r = queue_transaction(store, ch, std::move(t3));
+ ASSERT_EQ(r, 0);
+
+ in.clear();
+ orig = bl;
+ orig.append( std::string(block_size - block_size2, 'b'));
+ r = store->read(ch, hoid, block_size0, block_size, in);
+ ASSERT_EQ((int)block_size, r);
+ ASSERT_TRUE(bl_eq(orig, in));
+ }
+
+ {
+ ObjectStore::Transaction t;
+ t.remove(cid, hoid);
+ t.remove_collection(cid);
+ cerr << "Cleaning" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+}
+#endif
+
+INSTANTIATE_TEST_SUITE_P(
+ ObjectStore,
+ StoreTest,
+ ::testing::Values(
+ "memstore",
+#if defined(WITH_BLUESTORE)
+ "bluestore",
+#endif
+ "kstore"));
+
+// Note: all stores are instantiated here only to preserve store numbering order
+INSTANTIATE_TEST_SUITE_P(
+ ObjectStore,
+ StoreTestSpecificAUSize,
+ ::testing::Values(
+ "memstore",
+#if defined(WITH_BLUESTORE)
+ "bluestore",
+#endif
+ "kstore"));
+
+// Note: all stores are instantiated here only to preserve store numbering order
+INSTANTIATE_TEST_SUITE_P(
+ ObjectStore,
+ StoreTestOmapUpgrade,
+ ::testing::Values(
+ "memstore",
+#if defined(WITH_BLUESTORE)
+ "bluestore",
+#endif
+ "kstore"));
+
+#if defined(WITH_BLUESTORE)
+INSTANTIATE_TEST_SUITE_P(
+ ObjectStore,
+ StoreTestDeferredSetup,
+ ::testing::Values(
+ "bluestore"));
+#endif
+
+
+struct deferred_test_t {
+ uint32_t bdev_block_size;
+ uint32_t min_alloc_size;
+ uint32_t max_blob_size;
+ uint32_t prefer_deferred_size;
+};
+
+void PrintTo(const deferred_test_t& t, ::std::ostream* os)
+{
+ *os << t.bdev_block_size << "/" << t.min_alloc_size << "/"
+ << t.max_blob_size << "/" << t.prefer_deferred_size;
+}
+
+class DeferredWriteTest : public StoreTestFixture,
+ public ::testing::WithParamInterface<deferred_test_t> {
+public:
+ DeferredWriteTest()
+ : StoreTestFixture("bluestore")
+ {}
+ void SetUp() override {
+ //do nothing
+ }
+protected:
+ void DeferredSetup() {
+ StoreTestFixture::SetUp();
+ }
+public:
+ std::vector<uint32_t> offsets = {0, 3000, 4096, 20000, 32768, 65000, 65536, 80000, 128 * 1024};
+ std::vector<uint32_t> lengths = {1, 1000, 4096, 12000, 32768, 30000, 80000, 128 * 1024};
+};
+
+TEST_P(DeferredWriteTest, NewData) {
+ const bool print = false;
+ deferred_test_t t = GetParam();
+ SetVal(g_conf(), "bdev_block_size", stringify(t.bdev_block_size).c_str());
+ SetVal(g_conf(), "bluestore_min_alloc_size", stringify(t.min_alloc_size).c_str());
+ SetVal(g_conf(), "bluestore_max_blob_size", stringify(t.max_blob_size).c_str());
+ SetVal(g_conf(), "bluestore_prefer_deferred_size", stringify(t.prefer_deferred_size).c_str());
+ g_conf().apply_changes(nullptr);
+ DeferredSetup();
+
+ int r;
+ coll_t cid;
+ const PerfCounters* logger = store->get_perf_counters();
+ ObjectStore::CollectionHandle ch = store->create_new_collection(cid);
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+ for (auto offset:offsets) {
+ for (auto length:lengths) {
+ std::string hname = fmt::format("test-{}-{}", offset, length);
+ ghobject_t hoid(hobject_t(hname, "", CEPH_NOSNAP, 0, -1, ""));
+ {
+ ObjectStore::Transaction t;
+ t.touch(cid, hoid);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ if (print)
+ std::cout << hname << std::endl;
+
+ auto w_new = logger->get(l_bluestore_write_new);
+ auto w_big_deferred = logger->get(l_bluestore_write_big_deferred);
+ auto i_deferred_w = logger->get(l_bluestore_issued_deferred_writes);
+ {
+ ObjectStore::Transaction t;
+ bufferlist bl;
+ bl.append(std::string(length, 'x'));
+ t.write(cid, hoid, offset, bl.length(), bl,
+ CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
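+        // Device blocks touched by the write: e.g. with a 4 KiB bdev
+        // block, offset=3000 and length=4096 end at byte 7095, so
+        // first_db=0, last_db=1 and write_size = 2 * 4096 = 8192.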
+ uint32_t first_db = offset / t.bdev_block_size;
+ uint32_t last_db = (offset + length - 1) / t.bdev_block_size;
+
+ uint32_t write_size = (last_db - first_db + 1) * t.bdev_block_size;
+ if (write_size < t.prefer_deferred_size) {
+ // expect no direct writes
+ ASSERT_EQ(w_new , logger->get(l_bluestore_write_new));
+ } else {
+ // expect no deferred
+ ASSERT_EQ(w_big_deferred , logger->get(l_bluestore_write_big_deferred));
+ ASSERT_EQ(i_deferred_w , logger->get(l_bluestore_issued_deferred_writes));
+ }
+ }
+ }
+ }
+}
+
+#if defined(WITH_BLUESTORE)
+INSTANTIATE_TEST_SUITE_P(
+ BlueStore,
+ DeferredWriteTest,
+ ::testing::Values(
+ // bdev alloc blob deferred
+ deferred_test_t{4 * 1024, 4 * 1024, 16 * 1024, 32 * 1024},
+ deferred_test_t{4 * 1024, 16 * 1024, 64 * 1024, 64 * 1024},
+ deferred_test_t{4 * 1024, 64 * 1024, 64 * 1024, 4 * 1024},
+ deferred_test_t{4 * 1024, 4 * 1024, 64 * 1024, 0 * 1024},
+ deferred_test_t{4 * 1024, 16 * 1024, 32 * 1024, 32 * 1024},
+ deferred_test_t{4 * 1024, 16 * 1024, 64 * 1024, 128 * 1024}
+ ));
+#endif
+
+void doMany4KWritesTest(ObjectStore* store,
+ unsigned max_objects,
+ unsigned max_ops,
+ unsigned max_object_size,
+ unsigned max_write_size,
+ unsigned write_alignment)
+{
+ MixedGenerator gen(555);
+ gen_type rng(time(NULL));
+ coll_t cid(spg_t(pg_t(0,555), shard_id_t::NO_SHARD));
+ store_statfs_t res_stat;
+
+ SyntheticWorkloadState test_obj(store,
+ &gen,
+ &rng,
+ cid,
+ max_object_size,
+ max_write_size,
+ write_alignment);
+ test_obj.init();
+ for (unsigned i = 0; i < max_objects; ++i) {
+ if (!(i % 500)) cerr << "seeding object " << i << std::endl;
+ test_obj.touch();
+ }
+ for (unsigned i = 0; i < max_ops; ++i) {
+ if (!(i % 200)) {
+ cerr << "Op " << i << std::endl;
+ test_obj.print_internal_state();
+ }
+ test_obj.write();
+ }
+ test_obj.wait_for_done();
+ test_obj.statfs(res_stat);
+ if (!(res_stat.data_stored <= max_object_size) ||
+ !(res_stat.allocated <= max_object_size)) {
+    // this provides more insight into the mismatch and helps to
+    // avoid any races during stats collection
+    test_obj.fsck(false);
+    // retrieve stats once again and assert if still broken
+ test_obj.statfs(res_stat);
+ ASSERT_LE(res_stat.data_stored, max_object_size);
+ ASSERT_LE(res_stat.allocated, max_object_size);
+ }
+ test_obj.shutdown();
+}
+
+TEST_P(StoreTestSpecificAUSize, Many4KWritesTest) {
+ if (string(GetParam()) != "bluestore")
+ return;
+ if (smr) {
+ cout << "SKIP: no deferred; assertions around res_stat.allocated don't apply"
+ << std::endl;
+ return;
+ }
+
+ StartDeferred(0x10000);
+
+ const unsigned max_object = 4*1024*1024;
+ doMany4KWritesTest(store.get(), 1, 1000, max_object, 4*1024, 0);
+}
+
+TEST_P(StoreTestSpecificAUSize, Many4KWritesNoCSumTest) {
+ if (string(GetParam()) != "bluestore")
+ return;
+ if (smr) {
+ cout << "SKIP: no deferred; assertions around res_stat.allocated don't apply"
+ << std::endl;
+ return;
+ }
+ StartDeferred(0x10000);
+ SetVal(g_conf(), "bluestore_csum_type", "none");
+ g_ceph_context->_conf.apply_changes(nullptr);
+ const unsigned max_object = 4*1024*1024;
+
+ doMany4KWritesTest(store.get(), 1, 1000, max_object, 4*1024, 0 );
+}
+
+TEST_P(StoreTestSpecificAUSize, TooManyBlobsTest) {
+ if (string(GetParam()) != "bluestore")
+ return;
+ if (smr) {
+ cout << "SKIP: no deferred; assertions around res_stat.allocated don't apply"
+ << std::endl;
+ return;
+ }
+ StartDeferred(0x10000);
+ const unsigned max_object = 4*1024*1024;
+ doMany4KWritesTest(store.get(), 1, 1000, max_object, 4*1024, 0);
+}
+
+#if defined(WITH_BLUESTORE)
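+// Sums BlueStore cache mempool usage: *total_bytes covers the
+// meta/onode/other pools, while *total_items reports only onode items.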
+void get_mempool_stats(uint64_t* total_bytes, uint64_t* total_items)
+{
+ uint64_t meta_allocated = mempool::bluestore_cache_meta::allocated_bytes();
+ uint64_t onode_allocated = mempool::bluestore_cache_onode::allocated_bytes();
+ uint64_t other_allocated = mempool::bluestore_cache_other::allocated_bytes();
+
+ uint64_t meta_items = mempool::bluestore_cache_meta::allocated_items();
+ uint64_t onode_items = mempool::bluestore_cache_onode::allocated_items();
+ uint64_t other_items = mempool::bluestore_cache_other::allocated_items();
+ cout << "meta(" << meta_allocated << "/" << meta_items
+ << ") onode(" << onode_allocated << "/" << onode_items
+ << ") other(" << other_allocated << "/" << other_items
+ << ")" << std::endl;
+ *total_bytes = meta_allocated + onode_allocated + other_allocated;
+ *total_items = onode_items;
+}
+
+TEST_P(StoreTestSpecificAUSize, OnodeSizeTracking) {
+
+ if (string(GetParam()) != "bluestore")
+ return;
+
+ size_t block_size = 4096;
+ StartDeferred(block_size);
+ SetVal(g_conf(), "bluestore_compression_mode", "none");
+ SetVal(g_conf(), "bluestore_csum_type", "none");
+ SetVal(g_conf(), "bluestore_cache_size_hdd", "400000000");
+ SetVal(g_conf(), "bluestore_cache_size_ssd", "400000000");
+ g_conf().apply_changes(nullptr);
+
+ int r;
+ coll_t cid;
+ ghobject_t hoid(hobject_t("test_hint", "", CEPH_NOSNAP, 0, -1, ""));
+ size_t obj_size = 4 * 1024 * 1024;
+ uint64_t total_bytes_prev;
+ uint64_t total_bytes, total_bytes2;
+ uint64_t total_onodes;
+ get_mempool_stats(&total_bytes, &total_onodes);
+ total_bytes_prev = total_bytes;
+ // 5u for onode_cache_shards vector
+ ASSERT_EQ(total_onodes, 5u);
+ ASSERT_EQ(total_bytes, 40u);
+
+ auto ch = store->create_new_collection(cid);
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+ ObjectStore::Transaction t;
+ bufferlist bl, orig, orig2;
+
+ bl.append(std::string(obj_size, 'a'));
+ t.write(cid, hoid, 0, bl.length(), bl);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ get_mempool_stats(&total_bytes, &total_onodes);
+ ASSERT_GT(total_bytes - total_bytes_prev, 0u);
+ ASSERT_EQ(total_onodes, 6u);
+
+ {
+ ObjectStore::Transaction t;
+ t.truncate(cid, hoid, 0);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+
+  for (size_t i = 0; i < 1; ++i) {
+    bufferlist bl;
+    bl.append(std::string(block_size * (i+1), 'a'));
+    for (size_t j = 0; j < obj_size; j += bl.length()) {
+ ObjectStore::Transaction t;
+ t.write(cid, hoid, j, bl.length(), bl);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ get_mempool_stats(&total_bytes2, &total_onodes);
+ ASSERT_NE(total_bytes2, 0u);
+ ASSERT_EQ(total_onodes, 6u);
+ }
+ {
+ cout <<" mempool dump:\n";
+ JSONFormatter f(true);
+ f.open_object_section("transaction");
+ mempool::dump(&f);
+ f.close_section();
+ f.flush(cout);
+ cout << std::endl;
+ }
+ {
+ bufferlist bl;
+ for (size_t i = 0; i < obj_size; i += 0x1000) {
+ store->read(ch, hoid, i, 0x1000, bl);
+ }
+ }
+ get_mempool_stats(&total_bytes, &total_onodes);
+ ASSERT_NE(total_bytes, 0u);
+ ASSERT_EQ(total_onodes, 6u);
+
+ {
+ cout <<" mempool dump:\n";
+ JSONFormatter f(true);
+ f.open_object_section("transaction");
+ mempool::dump(&f);
+ f.close_section();
+ f.flush(cout);
+ cout << std::endl;
+ }
+ {
+ ObjectStore::Transaction t;
+ t.remove(cid, hoid);
+ t.remove_collection(cid);
+ cerr << "Cleaning" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+}
+
+TEST_P(StoreTestSpecificAUSize, BlobReuseOnOverwrite) {
+
+ if (string(GetParam()) != "bluestore")
+ return;
+
+ size_t block_size = 4096;
+ StartDeferred(block_size);
+ SetVal(g_conf(), "bluestore_max_blob_size", "65536");
+ g_conf().apply_changes(nullptr);
+
+ int r;
+ coll_t cid;
+ ghobject_t hoid(hobject_t("test_hint", "", CEPH_NOSNAP, 0, -1, ""));
+
+ const PerfCounters* logger = store->get_perf_counters();
+
+ auto ch = store->create_new_collection(cid);
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+ ObjectStore::Transaction t;
+ bufferlist bl;
+
+ bl.append(std::string(block_size * 2, 'a'));
+ t.write(cid, hoid, 0, bl.length(), bl, CEPH_OSD_OP_FLAG_FADVISE_WILLNEED);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+ // overwrite at the beginning
+ ObjectStore::Transaction t;
+ bufferlist bl;
+
+ bl.append(std::string(block_size, 'b'));
+ t.write(cid, hoid, 0, bl.length(), bl, CEPH_OSD_OP_FLAG_FADVISE_WILLNEED);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+ // append
+ ObjectStore::Transaction t;
+ bufferlist bl;
+
+ bl.append(std::string(block_size * 2, 'c'));
+ t.write(cid, hoid, block_size * 2, bl.length(), bl, CEPH_OSD_OP_FLAG_FADVISE_WILLNEED);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+ // append with a gap
+ ObjectStore::Transaction t;
+ bufferlist bl;
+
+ bl.append(std::string(block_size * 2, 'd'));
+ t.write(cid, hoid, block_size * 5, bl.length(), bl, CEPH_OSD_OP_FLAG_FADVISE_WILLNEED);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+    // We need to issue a read to trigger the cache stat update that
+    // refreshes perf counters. Additionally we need to wait some time for
+    // the mempool thread to update stats.
+ sleep(1);
+ bufferlist bl, expected;
+ r = store->read(ch, hoid, 0, block_size, bl);
+ ASSERT_EQ(r, (int)block_size);
+ expected.append(string(block_size, 'b'));
+ ASSERT_TRUE(bl_eq(expected, bl));
+ ASSERT_EQ(logger->get(l_bluestore_blobs), 1u);
+ ASSERT_EQ(logger->get(l_bluestore_extents), 2u);
+ }
+ {
+ // overwrite at end
+ ObjectStore::Transaction t;
+ bufferlist bl;
+
+ bl.append(std::string(block_size * 2, 'e'));
+
+    // Currently we are unable to reuse a blob when overwriting in a single step
+ t.write(cid, hoid, block_size * 6, bl.length(), bl, CEPH_OSD_OP_FLAG_FADVISE_WILLNEED);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+    // We need to issue a read to trigger the cache stat update that
+    // refreshes perf counters. Additionally we need to wait some time for
+    // the mempool thread to update stats.
+ sleep(1);
+ bufferlist bl, expected;
+ r = store->read(ch, hoid, 0, block_size, bl);
+ ASSERT_EQ(r, (int)block_size);
+ expected.append(string(block_size, 'b'));
+ ASSERT_TRUE(bl_eq(expected, bl));
+ ASSERT_EQ(logger->get(l_bluestore_blobs), 1u);
+ ASSERT_EQ(logger->get(l_bluestore_extents), 2u);
+ }
+ {
+ // fill the gap
+ ObjectStore::Transaction t;
+ bufferlist bl;
+
+ bl.append(std::string(block_size, 'f'));
+
+ t.write(cid, hoid, block_size * 4, bl.length(), bl, CEPH_OSD_OP_FLAG_FADVISE_WILLNEED);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+    // We need to wait some time for the mempool thread to update stats
+    // so that blob/extent numbers can be checked via perf counters.
+ sleep(1);
+
+ bufferlist bl, expected;
+ r = store->read(ch, hoid, 0, block_size, bl);
+ ASSERT_EQ(r, (int)block_size);
+ expected.append(string(block_size, 'b'));
+ ASSERT_TRUE(bl_eq(expected, bl));
+
+ bl.clear();
+ expected.clear();
+ r = store->read(ch, hoid, block_size, block_size, bl);
+ ASSERT_EQ(r, (int)block_size);
+ expected.append(string(block_size, 'a'));
+ ASSERT_TRUE(bl_eq(expected, bl));
+
+ bl.clear();
+ expected.clear();
+ r = store->read(ch, hoid, block_size * 2, block_size * 2, bl);
+ ASSERT_EQ(r, (int)block_size * 2);
+ expected.append(string(block_size * 2, 'c'));
+ ASSERT_TRUE(bl_eq(expected, bl));
+
+ bl.clear();
+ expected.clear();
+ r = store->read(ch, hoid, block_size * 4, block_size, bl);
+ ASSERT_EQ(r, (int)block_size);
+ expected.append(string(block_size, 'f'));
+ ASSERT_TRUE(bl_eq(expected, bl));
+
+ bl.clear();
+ expected.clear();
+ r = store->read(ch, hoid, block_size * 5, block_size, bl);
+ ASSERT_EQ(r, (int)block_size);
+ expected.append(string(block_size, 'd'));
+ ASSERT_TRUE(bl_eq(expected, bl));
+
+ bl.clear();
+ expected.clear();
+ r = store->read(ch, hoid, block_size * 5, block_size * 3, bl);
+ ASSERT_EQ(r, (int)block_size * 3);
+ expected.append(string(block_size, 'd'));
+ expected.append(string(block_size * 2, 'e'));
+ ASSERT_TRUE(bl_eq(expected, bl));
+ }
+ ASSERT_EQ(logger->get(l_bluestore_blobs), 1u);
+ ASSERT_EQ(logger->get(l_bluestore_extents), 1u);
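+  // Apparently filling the gap at block 4 leaves blocks 0..7 fully written,
+  // so the logical map collapses to a single extent within one reused blob.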
+
+
+ {
+ ObjectStore::Transaction t;
+ t.remove(cid, hoid);
+ t.remove_collection(cid);
+ cerr << "Cleaning" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+}
+
+TEST_P(StoreTestSpecificAUSize, ZeroBlockDetectionSmallAppend) {
+ CephContext *cct = (new CephContext(CEPH_ENTITY_TYPE_CLIENT))->get();
+ if (string(GetParam()) != "bluestore" || !cct->_conf->bluestore_zero_block_detection) {
+ GTEST_SKIP() << "not bluestore or bluestore_zero_block_detection=false, skipping";
+ }
+
+ size_t block_size = 65536;
+ StartDeferred(block_size);
+
+ int r;
+ coll_t cid;
+ ghobject_t hoid(hobject_t("test", "", CEPH_NOSNAP, 0, -1, ""));
+
+ const PerfCounters* logger = store->get_perf_counters();
+
+ auto ch = store->create_new_collection(cid);
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+ // [1] append zeros
+ ObjectStore::Transaction t;
+ bufferlist bl;
+
+ bl.append_zero(4096);
+
+ t.write(cid, hoid, 0, bl.length(), bl);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ ASSERT_EQ(logger->get(l_bluestore_write_small), 1u);
+ ASSERT_EQ(logger->get(l_bluestore_write_small_bytes), 4096u);
+ ASSERT_EQ(logger->get(l_bluestore_write_small_skipped), 1u);
+ ASSERT_EQ(logger->get(l_bluestore_write_small_skipped_bytes), 4096u);
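+    // The all-zero small write is counted but skipped: presumably no data
+    // reaches disk, yet the object length grows so the read below returns
+    // 4096 zeros.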
+
+ bufferlist in;
+ r = store->read(ch, hoid, 0, 0x4000, in);
+ ASSERT_EQ(4096, r);
+ ASSERT_TRUE(in.is_zero());
+ }
+
+ {
+ // [2] append non-zeros
+ ObjectStore::Transaction t;
+ bufferlist bl;
+
+ bl.append(std::string(4096, 'c'));
+
+ t.write(cid, hoid, 4096, bl.length(), bl);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ ASSERT_EQ(logger->get(l_bluestore_write_small), 2u);
+ ASSERT_EQ(logger->get(l_bluestore_write_small_bytes), 4096u*2);
+ ASSERT_EQ(logger->get(l_bluestore_write_small_skipped), 1u);
+ ASSERT_EQ(logger->get(l_bluestore_write_small_skipped_bytes), 4096u);
+
+ bufferlist in, _exp;
+ r = store->read(ch, hoid, 0, 0x4000, in);
+ ASSERT_EQ(4096 * 2, r);
+ _exp.append_zero(4096);
+ _exp.append(bl);
+ ASSERT_TRUE(bl_eq(_exp, in));
+ }
+
+ {
+ ObjectStore::Transaction t;
+ t.remove(cid, hoid);
+ t.remove_collection(cid);
+ cerr << "Cleaning" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+}
+
+TEST_P(StoreTestSpecificAUSize, ZeroBlockDetectionSmallOverwrite) {
+ CephContext *cct = (new CephContext(CEPH_ENTITY_TYPE_CLIENT))->get();
+ if (string(GetParam()) != "bluestore" || !cct->_conf->bluestore_zero_block_detection) {
+ GTEST_SKIP() << "not bluestore or bluestore_zero_block_detection=false, skipping";
+ }
+ if (smr) {
+ GTEST_SKIP() << "smr, skipping";
+ }
+
+ size_t block_size = 65536;
+ StartDeferred(block_size);
+
+ int r;
+ coll_t cid;
+ ghobject_t hoid(hobject_t("test", "", CEPH_NOSNAP, 0, -1, ""));
+
+ const PerfCounters* logger = store->get_perf_counters();
+
+ auto ch = store->create_new_collection(cid);
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+
+ {
+ // {setting up the scenario} append non-zeros
+ ObjectStore::Transaction t;
+ bufferlist bl;
+
+ bl.append(std::string(4096, 'c'));
+
+ t.write(cid, hoid, 0, bl.length(), bl);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ ASSERT_EQ(logger->get(l_bluestore_write_small), 1u);
+ ASSERT_EQ(logger->get(l_bluestore_write_small_bytes), 4096u);
+ ASSERT_EQ(logger->get(l_bluestore_write_small_skipped), 0u);
+ ASSERT_EQ(logger->get(l_bluestore_write_small_skipped_bytes), 0u);
+
+ bufferlist in, _exp;
+ r = store->read(ch, hoid, 0, 0x4000, in);
+ ASSERT_EQ(4096, r);
+ _exp.append(bl);
+ ASSERT_TRUE(bl_eq(_exp, in));
+ }
+
+ {
+ // [1] overwrite non-zeros with zeros
+ ObjectStore::Transaction t;
+ bufferlist bl;
+
+ bl.append_zero(4096);
+
+ t.write(cid, hoid, 0, bl.length(), bl);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ ASSERT_EQ(logger->get(l_bluestore_write_small), 2u);
+ ASSERT_EQ(logger->get(l_bluestore_write_small_bytes), 4096u*2);
+ ASSERT_EQ(logger->get(l_bluestore_write_small_skipped), 0u);
+ ASSERT_EQ(logger->get(l_bluestore_write_small_skipped_bytes), 0u);
+
+ bufferlist in;
+ r = store->read(ch, hoid, 0, 0x4000, in);
+ ASSERT_EQ(4096, r);
+ ASSERT_TRUE(in.is_zero());
+ }
+
+ {
+ // [2] overwrite zeros with non-zeros
+ ObjectStore::Transaction t;
+ bufferlist bl;
+
+ bl.append(std::string(4096, 'c'));
+
+ t.write(cid, hoid, 0, bl.length(), bl);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ ASSERT_EQ(logger->get(l_bluestore_write_small), 3u);
+ ASSERT_EQ(logger->get(l_bluestore_write_small_bytes), 4096u*3);
+ ASSERT_EQ(logger->get(l_bluestore_write_small_skipped), 0u);
+ ASSERT_EQ(logger->get(l_bluestore_write_small_skipped_bytes), 0u);
+
+ bufferlist in, _exp;
+ r = store->read(ch, hoid, 0, 0x4000, in);
+ ASSERT_EQ(4096, r);
+ _exp.append(bl);
+ ASSERT_TRUE(bl_eq(_exp, in));
+ }
+
+ {
+ ObjectStore::Transaction t;
+ t.remove(cid, hoid);
+ t.remove_collection(cid);
+ cerr << "Cleaning" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+}
+
+TEST_P(StoreTestSpecificAUSize, ZeroBlockDetectionBigAppend) {
+ CephContext *cct = (new CephContext(CEPH_ENTITY_TYPE_CLIENT))->get();
+ if (string(GetParam()) != "bluestore" || !cct->_conf->bluestore_zero_block_detection) {
+ GTEST_SKIP() << "not bluestore or bluestore_zero_block_detection=false, skipping";
+ }
+
+ size_t block_size = 4096;
+ StartDeferred(block_size);
+
+ int r;
+ coll_t cid;
+ ghobject_t hoid(hobject_t("test", "", CEPH_NOSNAP, 0, -1, ""));
+
+ const PerfCounters* logger = store->get_perf_counters();
+
+ auto ch = store->create_new_collection(cid);
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+
+ {
+ // [1] append zeros
+ ObjectStore::Transaction t;
+ bufferlist bl;
+
+ bl.append_zero(block_size * 2);
+
+ t.write(cid, hoid, 0, bl.length(), bl);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ ASSERT_EQ(logger->get(l_bluestore_write_big), 1u);
+ ASSERT_EQ(logger->get(l_bluestore_write_big_bytes), 4096u*2);
+ ASSERT_EQ(logger->get(l_bluestore_write_big_blobs), 0u);
+ ASSERT_EQ(logger->get(l_bluestore_write_big_skipped_blobs), 1u);
+ ASSERT_EQ(logger->get(l_bluestore_write_big_skipped_bytes), 4096u*2);
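+    // Zero detection on the big-write path skips the whole blob
+    // (write_big_blobs stays 0), so presumably nothing is allocated even
+    // though the object logically grows to 2 blocks.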
+
+ bufferlist in;
+ r = store->read(ch, hoid, 0, block_size * 8, in);
+ ASSERT_EQ(block_size * 2, r);
+ ASSERT_TRUE(in.is_zero());
+ }
+
+ {
+ // [2] append non-zeros
+ ObjectStore::Transaction t;
+ bufferlist bl;
+
+ bl.append(std::string(block_size * 2, 'c'));
+
+ t.write(cid, hoid, block_size * 2, bl.length(), bl);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ ASSERT_EQ(logger->get(l_bluestore_write_big), 2u);
+ ASSERT_EQ(logger->get(l_bluestore_write_big_bytes), 4096u*4);
+ ASSERT_EQ(logger->get(l_bluestore_write_big_blobs), 1u);
+ ASSERT_EQ(logger->get(l_bluestore_write_big_skipped_blobs), 1u);
+ ASSERT_EQ(logger->get(l_bluestore_write_big_skipped_bytes), 4096u*2);
+
+ bufferlist in, _exp;
+ r = store->read(ch, hoid, 0, block_size * 8, in);
+ ASSERT_EQ(block_size * 4, r);
+ _exp.append_zero(block_size * 2);
+ _exp.append(bl);
+ ASSERT_TRUE(bl_eq(_exp, in));
+ }
+
+ {
+ ObjectStore::Transaction t;
+ t.remove(cid, hoid);
+ t.remove_collection(cid);
+ cerr << "Cleaning" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+}
+
+TEST_P(StoreTestSpecificAUSize, ZeroBlockDetectionBigOverwrite) {
+ CephContext *cct = (new CephContext(CEPH_ENTITY_TYPE_CLIENT))->get();
+ if (string(GetParam()) != "bluestore" || !cct->_conf->bluestore_zero_block_detection) {
+ GTEST_SKIP() << "not bluestore or bluestore_zero_block_detection=false, skipping";
+ }
+ if (smr) {
+ GTEST_SKIP() << "smr, skipping";
+ }
+
+ size_t block_size = 4096;
+ StartDeferred(block_size);
+
+ int r;
+ coll_t cid;
+ ghobject_t hoid(hobject_t("test", "", CEPH_NOSNAP, 0, -1, ""));
+
+ const PerfCounters* logger = store->get_perf_counters();
+
+ auto ch = store->create_new_collection(cid);
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+
+ {
+ // {setting up the scenario} append non-zeros
+ ObjectStore::Transaction t;
+ bufferlist bl;
+
+ bl.append(std::string(block_size * 2, 'c'));
+
+ t.write(cid, hoid, 0, bl.length(), bl);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ ASSERT_EQ(logger->get(l_bluestore_write_big), 1u);
+ ASSERT_EQ(logger->get(l_bluestore_write_big_bytes), 4096u*2);
+ ASSERT_EQ(logger->get(l_bluestore_write_big_blobs), 1u);
+ ASSERT_EQ(logger->get(l_bluestore_write_big_skipped_blobs), 0u);
+ ASSERT_EQ(logger->get(l_bluestore_write_big_skipped_bytes), 0u);
+
+ bufferlist in, _exp;
+ r = store->read(ch, hoid, 0, block_size * 8, in);
+ ASSERT_EQ(block_size * 2, r);
+ _exp.append(bl);
+ ASSERT_TRUE(bl_eq(_exp, in));
+ }
+
+ {
+ // [1] overwrite non-zeros with zeros
+ ObjectStore::Transaction t;
+ bufferlist bl;
+
+ bl.append_zero(block_size * 2);
+
+ t.write(cid, hoid, 0, bl.length(), bl);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ ASSERT_EQ(logger->get(l_bluestore_write_big), 2u);
+ ASSERT_EQ(logger->get(l_bluestore_write_big_bytes), 4096u*4);
+ ASSERT_EQ(logger->get(l_bluestore_write_big_blobs), 1u);
+ ASSERT_EQ(logger->get(l_bluestore_write_big_skipped_blobs), 1u);
+ ASSERT_EQ(logger->get(l_bluestore_write_big_skipped_bytes), 4096u*2);
+
+ bufferlist in;
+ r = store->read(ch, hoid, 0, block_size * 8, in);
+ ASSERT_EQ(block_size * 2, r);
+ ASSERT_TRUE(in.is_zero());
+ }
+
+ {
+ // [2] overwrite zeros with non-zeros
+ ObjectStore::Transaction t;
+ bufferlist bl;
+
+ bl.append(std::string(block_size * 2, 'c'));
+
+ t.write(cid, hoid, 0, bl.length(), bl);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ ASSERT_EQ(logger->get(l_bluestore_write_big), 3u);
+ ASSERT_EQ(logger->get(l_bluestore_write_big_bytes), 4096u*6);
+ ASSERT_EQ(logger->get(l_bluestore_write_big_blobs), 2u);
+ ASSERT_EQ(logger->get(l_bluestore_write_big_skipped_blobs), 1u);
+ ASSERT_EQ(logger->get(l_bluestore_write_big_skipped_bytes), 4096u*2);
+
+ bufferlist in, _exp;
+ r = store->read(ch, hoid, 0, block_size * 8, in);
+ ASSERT_EQ(block_size * 2, r);
+ _exp.append(bl);
+ ASSERT_TRUE(bl_eq(_exp, in));
+ }
+
+ {
+ ObjectStore::Transaction t;
+ t.remove(cid, hoid);
+ t.remove_collection(cid);
+ cerr << "Cleaning" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+}
+
+TEST_P(StoreTestSpecificAUSize, DeferredOnBigOverwrite) {
+
+ if (string(GetParam()) != "bluestore")
+ return;
+ if (smr) {
+ cout << "SKIP: no deferred" << std::endl;
+ return;
+ }
+
+ size_t block_size = 4096;
+ StartDeferred(block_size);
+ SetVal(g_conf(), "bluestore_max_blob_size", "131072");
+ SetVal(g_conf(), "bluestore_prefer_deferred_size", "65536");
+
+ g_conf().apply_changes(nullptr);
+
+ int r;
+ coll_t cid;
+ ghobject_t hoid(hobject_t("test", "", CEPH_NOSNAP, 0, -1, ""));
+ ghobject_t hoid2(hobject_t("test2", "", CEPH_NOSNAP, 0, -1, ""));
+
+ PerfCounters* logger = const_cast<PerfCounters*>(store->get_perf_counters());
+
+ auto ch = store->create_new_collection(cid);
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+ ObjectStore::Transaction t;
+ bufferlist bl, bl2;
+
+ bl.append(std::string(block_size * 2, 'c'));
+ bl2.append(std::string(block_size * 3, 'd'));
+
+ t.write(cid, hoid, 0, bl.length(), bl, CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
+ t.set_alloc_hint(cid, hoid2, block_size * 4, block_size * 4,
+ CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_READ);
+ t.write(cid, hoid2, 0, bl2.length(), bl2, CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ ASSERT_EQ(logger->get(l_bluestore_write_big), 2u);
+ ASSERT_EQ(logger->get(l_bluestore_write_big_deferred), 0u);
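+  // Presumably nothing is deferred here: both writes allocate fresh space,
+  // and only overwrites of already-allocated regions take the deferred path.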
+
+ {
+ struct store_statfs_t statfs;
+ int r = store->statfs(&statfs);
+ ASSERT_EQ(r, 0);
+ ASSERT_EQ(statfs.data_stored, (unsigned)block_size * 5);
+ ASSERT_LE(statfs.allocated, (unsigned)block_size * 5);
+ }
+
+ // overwrite at the beginning, 4K alignment
+ {
+ ObjectStore::Transaction t;
+ bufferlist bl;
+
+ bl.append(std::string(block_size, 'b'));
+ t.write(cid, hoid, 0, bl.length(), bl, CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ ASSERT_EQ(logger->get(l_bluestore_write_big), 3u);
+ ASSERT_EQ(logger->get(l_bluestore_write_big_deferred), 1u);
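+  // The 4K overwrite rewrites an already allocated block and is smaller
+  // than bluestore_prefer_deferred_size (64K), hence it goes deferred.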
+
+ {
+ bufferlist bl, expected;
+ r = store->read(ch, hoid, 0, block_size, bl);
+ ASSERT_EQ(r, (int)block_size);
+ expected.append(string(block_size, 'b'));
+ ASSERT_TRUE(bl_eq(expected, bl));
+ }
+ {
+ bufferlist bl, expected;
+ r = store->read(ch, hoid, block_size, block_size, bl);
+ ASSERT_EQ(r, (int)block_size);
+ expected.append(string(block_size, 'c'));
+ ASSERT_TRUE(bl_eq(expected, bl));
+ }
+
+ // overwrite at the end, 4K alignment
+ {
+ ObjectStore::Transaction t;
+ bufferlist bl;
+
+ bl.append(std::string(block_size, 'g'));
+ t.write(cid, hoid, block_size, bl.length(), bl, CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ ASSERT_EQ(logger->get(l_bluestore_write_big), 4u);
+ ASSERT_EQ(logger->get(l_bluestore_write_big_deferred), 2u);
+
+ {
+ bufferlist bl, expected;
+ r = store->read(ch, hoid, 0, block_size, bl);
+ ASSERT_EQ(r, (int)block_size);
+ expected.append(string(block_size, 'b'));
+ ASSERT_TRUE(bl_eq(expected, bl));
+ }
+ {
+ bufferlist bl, expected;
+ r = store->read(ch, hoid, block_size, block_size, bl);
+ ASSERT_EQ(r, (int)block_size);
+ expected.append(string(block_size, 'g'));
+ ASSERT_TRUE(bl_eq(expected, bl));
+ }
+
+ // overwrite at 4K, 12K alignment
+ {
+ ObjectStore::Transaction t;
+ bufferlist bl;
+
+ bl.append(std::string(block_size, 'e'));
+ t.write(cid, hoid2, block_size , bl.length(), bl, CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ ASSERT_EQ(logger->get(l_bluestore_write_big), 5u);
+ ASSERT_EQ(logger->get(l_bluestore_write_big_deferred), 3u);
+
+  // make sure deferred writes have been submitted,
+  // then do all the checks again
+ sleep(g_conf().get_val<double>("bluestore_max_defer_interval") + 2);
+
+ ASSERT_EQ(logger->get(l_bluestore_write_big), 5u);
+ ASSERT_EQ(logger->get(l_bluestore_write_big_deferred), 3u);
+
+ {
+ bufferlist bl, expected;
+ r = store->read(ch, hoid, 0, block_size, bl);
+ ASSERT_EQ(r, (int)block_size);
+ expected.append(string(block_size, 'b'));
+ ASSERT_TRUE(bl_eq(expected, bl));
+ }
+ {
+ bufferlist bl, expected;
+ r = store->read(ch, hoid, block_size, block_size, bl);
+ ASSERT_EQ(r, (int)block_size);
+ expected.append(string(block_size, 'g'));
+ ASSERT_TRUE(bl_eq(expected, bl));
+ }
+ {
+ bufferlist bl, expected;
+ r = store->read(ch, hoid2, 0, block_size, bl);
+ ASSERT_EQ(r, (int)block_size);
+ expected.append(string(block_size, 'd'));
+ ASSERT_TRUE(bl_eq(expected, bl));
+ }
+ {
+ bufferlist bl, expected;
+ r = store->read(ch, hoid2, block_size, block_size, bl);
+ ASSERT_EQ(r, (int)block_size);
+ expected.append(string(block_size, 'e'));
+ ASSERT_TRUE(bl_eq(expected, bl));
+ }
+ {
+ bufferlist bl, expected;
+ r = store->read(ch, hoid2, block_size * 2, block_size, bl);
+ ASSERT_EQ(r, (int)block_size);
+ expected.append(string(block_size, 'd'));
+ ASSERT_TRUE(bl_eq(expected, bl));
+ }
+
+ {
+ struct store_statfs_t statfs;
+ int r = store->statfs(&statfs);
+ ASSERT_EQ(r, 0);
+ ASSERT_EQ(statfs.data_stored, (unsigned)block_size * 5);
+ ASSERT_LE(statfs.allocated, (unsigned)block_size * 5);
+ }
+ ASSERT_EQ(logger->get(l_bluestore_blobs), 2u);
+ ASSERT_EQ(logger->get(l_bluestore_extents), 2u);
+
+ {
+ ObjectStore::Transaction t;
+ t.remove(cid, hoid);
+ t.remove(cid, hoid2);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+
+ {
+ ObjectStore::Transaction t;
+ bufferlist bl;
+ bl.append(std::string(block_size * 2, 'f'));
+
+ t.write(cid, hoid, 0, bl.length(), bl, CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ ASSERT_EQ(logger->get(l_bluestore_write_big), 6u);
+ ASSERT_EQ(logger->get(l_bluestore_write_big_deferred), 3u);
+
+ {
+ ObjectStore::Transaction t;
+ t.zero(cid, hoid, 0, 100);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+ bufferlist bl, expected;
+ r = store->read(ch, hoid, 0, 100, bl);
+ ASSERT_EQ(r, (int)100);
+ expected.append(string(100, 0));
+ ASSERT_TRUE(bl_eq(expected, bl));
+ }
+ {
+ bufferlist bl, expected;
+ r = store->read(ch, hoid, 100, block_size * 2 - 100, bl);
+ ASSERT_EQ(r, (int)block_size * 2 - 100);
+ expected.append(string(block_size * 2 - 100, 'f'));
+ ASSERT_TRUE(bl_eq(expected, bl));
+ }
+ sleep(2);
+ {
+ struct store_statfs_t statfs;
+ int r = store->statfs(&statfs);
+ ASSERT_EQ(r, 0);
+ ASSERT_EQ(statfs.data_stored, (unsigned)block_size * 2 - 100);
+ ASSERT_LE(statfs.allocated, (unsigned)block_size * 2);
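+    // Presumably zeroing the first 100 bytes reduces the stored-bytes
+    // accounting by exactly 100 while the underlying 4K block may remain
+    // allocated.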
+ }
+ ASSERT_EQ(logger->get(l_bluestore_blobs), 1u);
+ ASSERT_EQ(logger->get(l_bluestore_extents), 1u);
+
+ {
+ ObjectStore::Transaction t;
+ bufferlist bl;
+ bl.append(std::string(block_size, 'g'));
+
+ t.write(cid, hoid, 0, bl.length(), bl, CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ ASSERT_EQ(logger->get(l_bluestore_write_big), 7u);
+ ASSERT_EQ(logger->get(l_bluestore_write_big_deferred), 4u);
+ {
+ bufferlist bl, expected;
+ r = store->read(ch, hoid, 0, block_size, bl);
+ ASSERT_EQ(r, (int)block_size);
+ expected.append(string(block_size, 'g'));
+ ASSERT_TRUE(bl_eq(expected, bl));
+ }
+ {
+ bufferlist bl, expected;
+ r = store->read(ch, hoid, block_size, block_size, bl);
+ ASSERT_EQ(r, (int)block_size);
+ expected.append(string(block_size, 'f'));
+ ASSERT_TRUE(bl_eq(expected, bl));
+ }
+
+ {
+ struct store_statfs_t statfs;
+ int r = store->statfs(&statfs);
+ ASSERT_EQ(r, 0);
+ ASSERT_EQ(statfs.data_stored, (unsigned)block_size * 2);
+ ASSERT_LE(statfs.allocated, (unsigned)block_size * 2);
+ }
+ ASSERT_EQ(logger->get(l_bluestore_blobs), 1u);
+ ASSERT_EQ(logger->get(l_bluestore_extents), 1u);
+
+  // check whether a full overwrite bypasses deferred
+ {
+ ObjectStore::Transaction t;
+ bufferlist bl;
+ bl.append(std::string(block_size * 2, 'h'));
+
+ t.write(cid, hoid, 0, bl.length(), bl, CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ ASSERT_EQ(logger->get(l_bluestore_write_big), 8u);
+ ASSERT_EQ(logger->get(l_bluestore_write_big_deferred), 4u);
+
+ {
+ bufferlist bl, expected;
+ r = store->read(ch, hoid, 0, block_size * 2, bl);
+ ASSERT_EQ(r, (int)block_size * 2);
+ expected.append(string(block_size * 2, 'h'));
+ ASSERT_TRUE(bl_eq(expected, bl));
+ }
+
+ {
+ struct store_statfs_t statfs;
+ int r = store->statfs(&statfs);
+ ASSERT_EQ(r, 0);
+ ASSERT_EQ(statfs.data_stored, (unsigned)block_size * 2);
+ ASSERT_LE(statfs.allocated, (unsigned)block_size * 2);
+ }
+
+ {
+ ObjectStore::Transaction t;
+ t.remove(cid, hoid);
+ t.remove(cid, hoid2);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+
+ {
+ ObjectStore::Transaction t;
+ bufferlist bl;
+ bl.append(std::string(block_size * 32, 'a'));
+
+ // this will create two 128K aligned blobs
+ t.write(cid, hoid, 0, bl.length(), bl, CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
+ t.write(cid, hoid, bl.length(), bl.length(), bl, CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ ASSERT_EQ(logger->get(l_bluestore_write_big), 10u);
+ ASSERT_EQ(logger->get(l_bluestore_write_big_deferred), 4u);
+
+  // check whether an overwrite (smaller than prefer_deferred_size) that
+  // partially overlaps two adjacent blobs goes deferred
+ {
+ ObjectStore::Transaction t;
+ bufferlist bl;
+ bl.append(std::string(block_size * 3, 'b'));
+
+ t.write(cid, hoid, 0x20000 - block_size, bl.length(), bl, CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ ASSERT_EQ(logger->get(l_bluestore_write_big), 11u);
+ ASSERT_EQ(logger->get(l_bluestore_write_big_deferred), 6u);
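+  // Two more deferred ops: the 12K overwrite at 0x1f000 straddles the two
+  // 128K blobs, and both overlapped fragments (4K tail + 8K head) are below
+  // prefer_deferred_size.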
+
+ {
+ bufferlist bl, expected;
+ r = store->read(ch, hoid, 0, 0x20000 - block_size, bl);
+ ASSERT_EQ(r, 0x20000 - block_size);
+ expected.append(string(r, 'a'));
+ ASSERT_TRUE(bl_eq(expected, bl));
+ expected.clear();
+
+ r = store->read(ch, hoid, 0x20000 - block_size, block_size * 3, bl);
+ ASSERT_EQ(r, 3 * block_size);
+ expected.append(string(r, 'b'));
+ ASSERT_TRUE(bl_eq(expected, bl));
+ expected.clear();
+
+ r = store->read(ch, hoid, 0x20000 + 2 * block_size, block_size * 30, bl);
+ ASSERT_EQ(r, 30 * block_size);
+ expected.append(string(r, 'a'));
+ ASSERT_TRUE(bl_eq(expected, bl));
+ expected.clear();
+ }
+
+ {
+ struct store_statfs_t statfs;
+ int r = store->statfs(&statfs);
+ ASSERT_EQ(r, 0);
+ ASSERT_EQ(statfs.data_stored, (unsigned)block_size * 64);
+ ASSERT_LE(statfs.allocated, (unsigned)block_size * 64);
+ }
+
+  // check whether an overwrite (larger than prefer_deferred_size) that
+  // partially overlaps two adjacent blobs goes deferred
+ {
+ ObjectStore::Transaction t;
+ bufferlist bl;
+ bl.append(std::string(block_size * 30, 'c'));
+
+ t.write(cid, hoid, 0x10000 + block_size, bl.length(), bl, CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ sleep(2);
+ ASSERT_EQ(logger->get(l_bluestore_write_big), 12u);
+ ASSERT_EQ(logger->get(l_bluestore_write_big_deferred), 8u);
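+  // Again two deferred ops: the 120K overwrite at 0x11000 overlaps each of
+  // the two 128K blobs with a 60K fragment, and both fragments are below
+  // prefer_deferred_size (64K).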
+
+ {
+ bufferlist bl, expected;
+ r = store->read(ch, hoid, 0, 0x11000, bl);
+ ASSERT_EQ(r, 0x11000);
+ expected.append(string(r, 'a'));
+ ASSERT_TRUE(bl_eq(expected, bl));
+ expected.clear();
+
+ r = store->read(ch, hoid, 0x11000, block_size * 30, bl);
+ ASSERT_EQ(r, block_size * 30);
+ expected.append(string(r, 'c'));
+ ASSERT_TRUE(bl_eq(expected, bl));
+ expected.clear();
+
+ r = store->read(ch, hoid, block_size * 47, 0x10000 + block_size, bl);
+ ASSERT_EQ(r, 0x10000 + block_size);
+ expected.append(string(r, 'a'));
+ ASSERT_TRUE(bl_eq(expected, bl));
+ expected.clear();
+ }
+
+ {
+ struct store_statfs_t statfs;
+ int r = store->statfs(&statfs);
+ ASSERT_EQ(r, 0);
+ ASSERT_EQ(statfs.data_stored, (unsigned)block_size * 64);
+ ASSERT_LE(statfs.allocated, (unsigned)block_size * 64);
+ }
+
+ logger->reset();
+  // check whether an overwrite (prefer_deferred_size < 120K < 2 *
+  // prefer_deferred_size) that partially overlaps two adjacent blobs goes
+  // partly deferred
+ {
+ ObjectStore::Transaction t;
+ bufferlist bl;
+ bl.append(std::string(block_size * 30, 'e'));
+
+ t.write(cid, hoid, 0x20000 - block_size, bl.length(), bl, CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ sleep(2);
+ ASSERT_EQ(logger->get(l_bluestore_write_big), 1u);
+ ASSERT_EQ(logger->get(l_bluestore_write_big_deferred), 1u);
+ ASSERT_EQ(logger->get(l_bluestore_issued_deferred_writes), 1u);
+ ASSERT_EQ(logger->get(l_bluestore_issued_deferred_write_bytes), block_size);
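+  // Only the 4K head fragment (0x1f000..0x20000) overlapping the first
+  // blob goes deferred; the remaining 116K falls into the second blob,
+  // exceeds prefer_deferred_size, and is presumably written directly.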
+
+ {
+ struct store_statfs_t statfs;
+ int r = store->statfs(&statfs);
+ ASSERT_EQ(r, 0);
+ ASSERT_EQ(statfs.data_stored, (unsigned)block_size * 64);
+ ASSERT_LE(statfs.allocated, (unsigned)block_size * 64);
+ }
+
+ {
+ ObjectStore::Transaction t;
+ t.remove(cid, hoid);
+ t.remove(cid, hoid2);
+ t.remove_collection(cid);
+ cerr << "Cleaning" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+}
+
+TEST_P(StoreTestSpecificAUSize, DeferredOnBigOverwrite2) {
+
+ if (string(GetParam()) != "bluestore")
+ return;
+ if (smr) {
+ cout << "SKIP: no deferred" << std::endl;
+ return;
+ }
+
+ size_t block_size = 4096;
+ StartDeferred(block_size);
+ SetVal(g_conf(), "bluestore_max_blob_size", "65536");
+ SetVal(g_conf(), "bluestore_prefer_deferred_size", "65536");
+
+ g_conf().apply_changes(nullptr);
+
+ int r;
+ coll_t cid;
+ ghobject_t hoid(hobject_t("test", "", CEPH_NOSNAP, 0, -1, ""));
+
+ PerfCounters* logger = const_cast<PerfCounters*>(store->get_perf_counters());
+
+ auto ch = store->create_new_collection(cid);
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+ ObjectStore::Transaction t;
+ bufferlist bl;
+
+ bl.append(std::string(128 * 1024, 'c'));
+
+ t.write(cid, hoid, 0x1000, bl.length(), bl, CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ ASSERT_EQ(logger->get(l_bluestore_write_big), 1u);
+ ASSERT_EQ(logger->get(l_bluestore_write_big_bytes), bl.length());
+ ASSERT_EQ(logger->get(l_bluestore_write_big_blobs), 3u);
+ ASSERT_EQ(logger->get(l_bluestore_write_big_deferred), 0u);
+ ASSERT_EQ(logger->get(l_bluestore_issued_deferred_writes), 0u);
+ ASSERT_EQ(logger->get(l_bluestore_issued_deferred_write_bytes), 0);
+ }
+
+ logger->reset();
+ {
+ ObjectStore::Transaction t;
+ bufferlist bl;
+
+ bl.append(std::string(128 * 1024, 'c'));
+
+ t.write(cid, hoid, 0x2000, bl.length(), bl, CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+
+ ASSERT_EQ(logger->get(l_bluestore_write_big), 1u);
+ ASSERT_EQ(logger->get(l_bluestore_write_big_bytes), bl.length());
+ ASSERT_EQ(logger->get(l_bluestore_write_big_blobs), 3u);
+ ASSERT_EQ(logger->get(l_bluestore_write_big_deferred), 1u);
+ ASSERT_EQ(logger->get(l_bluestore_issued_deferred_writes), 1u);
+ ASSERT_EQ(logger->get(l_bluestore_issued_deferred_write_bytes), 57344);
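+    // 57344 = 0xe000: the 128K write at 0x1000 produced blobs of
+    // 60K + 64K + 4K; the overwrite at 0x2000 partially overlaps only the
+    // first one (0x2000..0x10000), which goes deferred, while the other two
+    // are presumably fully overwritten and reallocated.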
+ }
+
+ {
+ ObjectStore::Transaction t;
+    t.remove(cid, hoid);
+ t.remove_collection(cid);
+ cerr << "Cleaning" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+}
+
+TEST_P(StoreTestSpecificAUSize, DeferredOnBigOverwrite3) {
+
+ if (string(GetParam()) != "bluestore")
+ return;
+ if (smr) {
+ cout << "SKIP: no deferred" << std::endl;
+ return;
+ }
+
+ size_t block_size = 4096;
+ StartDeferred(block_size);
+ SetVal(g_conf(), "bluestore_max_blob_size", "65536");
+ SetVal(g_conf(), "bluestore_prefer_deferred_size", "65536");
+
+ g_conf().apply_changes(nullptr);
+
+ int r;
+ coll_t cid;
+ ghobject_t hoid(hobject_t("test", "", CEPH_NOSNAP, 0, -1, ""));
+
+ PerfCounters* logger = const_cast<PerfCounters*>(store->get_perf_counters());
+
+ auto ch = store->create_new_collection(cid);
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+
+ logger->reset();
+ {
+ ObjectStore::Transaction t;
+ bufferlist bl;
+
+ bl.append(std::string(4096 * 1024, 'c'));
+
+ t.write(cid, hoid, 0, bl.length(), bl, CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+
+ ASSERT_EQ(logger->get(l_bluestore_write_big), 1u);
+ ASSERT_EQ(logger->get(l_bluestore_write_big_bytes), bl.length());
+ ASSERT_EQ(logger->get(l_bluestore_write_big_blobs), 64u);
+ ASSERT_EQ(logger->get(l_bluestore_write_big_deferred), 0u);
+ ASSERT_EQ(logger->get(l_bluestore_issued_deferred_writes), 0u);
+ ASSERT_EQ(logger->get(l_bluestore_issued_deferred_write_bytes), 0u);
+ }
+ logger->reset();
+ {
+ ObjectStore::Transaction t;
+ bufferlist bl;
+
+ bl.append(std::string(4096 * 1024, 'c'));
+
+ t.write(cid, hoid, 0x1000, bl.length(), bl, CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+
+ ASSERT_EQ(logger->get(l_bluestore_write_big), 1u);
+ ASSERT_EQ(logger->get(l_bluestore_write_big_bytes), bl.length());
+ ASSERT_EQ(logger->get(l_bluestore_write_big_blobs), 65u);
+ ASSERT_EQ(logger->get(l_bluestore_write_big_deferred), 1u);
+ ASSERT_EQ(logger->get(l_bluestore_issued_deferred_writes), 1u);
+ ASSERT_EQ(logger->get(l_bluestore_issued_deferred_write_bytes), 61440);
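+    // 61440 = 0xf000: the unaligned 4M overwrite at 0x1000 partially
+    // overlaps only the first 64K blob (fragment 0x1000..0x10000), which is
+    // deferred; the remaining blobs are presumably fully overwritten and
+    // written directly.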
+ }
+ {
+ ObjectStore::Transaction t;
+ t.remove(cid, hoid);
+ t.remove_collection(cid);
+ cerr << "Cleaning" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+}
+
+TEST_P(StoreTestSpecificAUSize, DeferredDifferentChunks) {
+
+ if (string(GetParam()) != "bluestore")
+ return;
+ if (smr) {
+ cout << "SKIP: no deferred" << std::endl;
+ return;
+ }
+
+ size_t alloc_size = 4096;
+ size_t large_object_size = 1 * 1024 * 1024;
+ size_t prefer_deferred_size = 65536;
+ StartDeferred(alloc_size);
+ SetVal(g_conf(), "bluestore_max_blob_size", "131072");
+ SetVal(g_conf(), "bluestore_prefer_deferred_size",
+ stringify(prefer_deferred_size).c_str());
+ g_conf().apply_changes(nullptr);
+
+ int r;
+ coll_t cid;
+ const PerfCounters* logger = store->get_perf_counters();
+ size_t exp_bluestore_write_big = 0;
+ size_t exp_bluestore_write_big_deferred = 0;
+
+ ObjectStore::CollectionHandle ch = store->create_new_collection(cid);
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ for (size_t expected_write_size = 1024; expected_write_size <= prefer_deferred_size; expected_write_size *= 2) {
+    // create an object with an alloc hint
+ ghobject_t hoid(hobject_t("test-"+to_string(expected_write_size), "", CEPH_NOSNAP, 0, -1, ""));
+ {
+ ObjectStore::Transaction t;
+ t.touch(cid, hoid);
+ t.set_alloc_hint(cid, hoid, large_object_size, expected_write_size,
+ CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_READ |
+ CEPH_OSD_ALLOC_HINT_FLAG_APPEND_ONLY);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+
+    // fill the object
+ {
+ ObjectStore::Transaction t;
+ bufferlist bl;
+ bl.append(std::string(large_object_size, 'h'));
+ t.write(cid, hoid, 0, bl.length(), bl,
+ CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
+ r = queue_transaction(store, ch, std::move(t));
+ ++exp_bluestore_write_big;
+ ASSERT_EQ(r, 0);
+ }
+ ASSERT_EQ(logger->get(l_bluestore_write_big), exp_bluestore_write_big);
+ ASSERT_EQ(logger->get(l_bluestore_write_big_deferred), exp_bluestore_write_big_deferred);
+
+ // check whether write will properly use deferred
+ {
+ ObjectStore::Transaction t;
+ bufferlist bl;
+ bl.append(std::string(alloc_size + 2, 'z'));
+ t.write(cid, hoid, large_object_size - 2 * alloc_size - 1, bl.length(), bl,
+ CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
+ r = queue_transaction(store, ch, std::move(t));
+ ++exp_bluestore_write_big;
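+      // Presumably an alloc hint below prefer_deferred_size yields blobs
+      // small enough for this overlapping overwrite to go deferred, while
+      // a hint equal to prefer_deferred_size makes it a direct write.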
+ if (expected_write_size < prefer_deferred_size)
+ ++exp_bluestore_write_big_deferred;
+ ASSERT_EQ(r, 0);
+ }
+ ASSERT_EQ(logger->get(l_bluestore_write_big), exp_bluestore_write_big);
+ ASSERT_EQ(logger->get(l_bluestore_write_big_deferred), exp_bluestore_write_big_deferred);
+ }
+ ch.reset(nullptr);
+ CloseAndReopen();
+ ch = store->open_collection(cid);
+ // check values
+ for (size_t expected_write_size = 1024; expected_write_size <= 65536; expected_write_size *= 2) {
+ ghobject_t hoid(hobject_t("test-"+to_string(expected_write_size), "", CEPH_NOSNAP, 0, -1, ""));
+ {
+ bufferlist bl, expected;
+ r = store->read(ch, hoid, 0, large_object_size, bl);
+ ASSERT_EQ(r, large_object_size);
+ expected.append(string(large_object_size - 2 * alloc_size - 1, 'h'));
+ expected.append(string(alloc_size + 2, 'z'));
+ expected.append(string(alloc_size - 1, 'h'));
+ ASSERT_TRUE(bl_eq(expected, bl));
+ }
+ }
+ {
+ ObjectStore::Transaction t;
+ for (size_t expected_write_size = 1024; expected_write_size <= 65536; expected_write_size *= 2) {
+ ghobject_t hoid(hobject_t("test-"+to_string(expected_write_size), "", CEPH_NOSNAP, 0, -1, ""));
+ t.remove(cid, hoid);
+ }
+ t.remove_collection(cid);
+ cerr << "Cleaning" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+}
+
+TEST_P(StoreTestSpecificAUSize, BlobReuseOnOverwriteReverse) {
+
+ if (string(GetParam()) != "bluestore")
+ return;
+ if (smr) {
+ cout << "SKIP: no overwrite" << std::endl;
+ return;
+ }
+
+ size_t block_size = 4096;
+ StartDeferred(block_size);
+ SetVal(g_conf(), "bluestore_max_blob_size", "65536");
+ g_conf().apply_changes(nullptr);
+
+ int r;
+ coll_t cid;
+ ghobject_t hoid(hobject_t("test_hint", "", CEPH_NOSNAP, 0, -1, ""));
+
+ auto ch = store->create_new_collection(cid);
+
+ const PerfCounters* logger = store->get_perf_counters();
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+ ObjectStore::Transaction t;
+ bufferlist bl;
+
+ bl.append(std::string(block_size * 2, 'a'));
+ t.write(cid, hoid, block_size * 10, bl.length(), bl,
+ CEPH_OSD_OP_FLAG_FADVISE_WILLNEED);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+ // prepend existing
+ ObjectStore::Transaction t;
+ bufferlist bl;
+
+ bl.append(std::string(block_size, 'b'));
+ t.write(cid, hoid, block_size * 9, bl.length(), bl,
+ CEPH_OSD_OP_FLAG_FADVISE_WILLNEED);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+    // We need to issue a read to trigger the cache stat update that
+    // refreshes perf counters. Additionally we need to wait some time for
+    // the mempool thread to update stats.
+ sleep(1);
+ bufferlist bl, expected;
+ r = store->read(ch, hoid, block_size * 9, block_size * 2, bl);
+ ASSERT_EQ(r, (int)block_size * 2);
+ expected.append(string(block_size, 'b'));
+ expected.append(string(block_size, 'a'));
+ ASSERT_TRUE(bl_eq(expected, bl));
+ ASSERT_EQ(logger->get(l_bluestore_blobs), 1u);
+ ASSERT_EQ(logger->get(l_bluestore_extents), 1u);
+ }
+
+
+ {
+ // prepend existing with a gap
+ ObjectStore::Transaction t;
+ bufferlist bl;
+
+ bl.append(std::string(block_size, 'c'));
+ t.write(cid, hoid, block_size * 7, bl.length(), bl,
+ CEPH_OSD_OP_FLAG_FADVISE_WILLNEED);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+    // We need to issue a read to trigger the cache stat update that
+    // refreshes perf counters. Additionally we need to wait some time for
+    // the mempool thread to update stats.
+ sleep(1);
+ bufferlist bl, expected;
+ r = store->read(ch, hoid, block_size * 7, block_size * 3, bl);
+ ASSERT_EQ(r, (int)block_size * 3);
+ expected.append(string(block_size, 'c'));
+ expected.append(string(block_size, 0));
+ expected.append(string(block_size, 'b'));
+ ASSERT_TRUE(bl_eq(expected, bl));
+ ASSERT_EQ(logger->get(l_bluestore_blobs), 1u);
+ ASSERT_EQ(logger->get(l_bluestore_extents), 2u);
+ }
+
+ {
+ // append after existing with a gap
+ ObjectStore::Transaction t;
+ bufferlist bl;
+
+ bl.append(std::string(block_size, 'd'));
+ t.write(cid, hoid, block_size * 13, bl.length(), bl,
+ CEPH_OSD_OP_FLAG_FADVISE_WILLNEED);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+    // We need to issue a read to trigger the cache stat update that
+    // refreshes perf counters. Additionally we need to wait some time for
+    // the mempool thread to update stats.
+ sleep(1);
+ bufferlist bl, expected;
+ r = store->read(ch, hoid, block_size * 11, block_size * 3, bl);
+ ASSERT_EQ(r, (int)block_size * 3);
+ expected.append(string(block_size, 'a'));
+ expected.append(string(block_size, 0));
+ expected.append(string(block_size, 'd'));
+ ASSERT_TRUE(bl_eq(expected, bl));
+ ASSERT_EQ(logger->get(l_bluestore_blobs), 1u);
+ ASSERT_EQ(logger->get(l_bluestore_extents), 3u);
+ }
+
+ {
+ // append twice to the next max_blob slot
+ ObjectStore::Transaction t;
+ bufferlist bl;
+
+ bl.append(std::string(block_size, 'e'));
+ t.write(cid, hoid, block_size * 17, bl.length(), bl,
+ CEPH_OSD_OP_FLAG_FADVISE_WILLNEED);
+ t.write(cid, hoid, block_size * 19, bl.length(), bl,
+ CEPH_OSD_OP_FLAG_FADVISE_WILLNEED);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+    // We need to issue a read to trigger the cache stat update that
+    // refreshes perf counters. Additionally we need to wait some time for
+    // the mempool thread to update stats.
+ sleep(1);
+ bufferlist bl, expected;
+ r = store->read(ch, hoid, block_size * 17, block_size * 3, bl);
+ ASSERT_EQ(r, (int)block_size * 3);
+ expected.append(string(block_size, 'e'));
+ expected.append(string(block_size, 0));
+ expected.append(string(block_size, 'e'));
+ ASSERT_TRUE(bl_eq(expected, bl));
+ ASSERT_EQ(logger->get(l_bluestore_blobs), 2u);
+ ASSERT_EQ(logger->get(l_bluestore_extents), 5u);
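+    // The appends at blocks 17 and 19 fall into the next 64K max-blob slot,
+    // adding a second blob; with the gaps the logical map now holds five
+    // extents: [7], [9..11], [13], [17], [19].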
+ }
+ {
+ // fill gaps at the second slot
+ ObjectStore::Transaction t;
+ bufferlist bl;
+
+ bl.append(std::string(block_size, 'f'));
+ t.write(cid, hoid, block_size * 16, bl.length(), bl,
+ CEPH_OSD_OP_FLAG_FADVISE_WILLNEED);
+ t.write(cid, hoid, block_size * 18, bl.length(), bl,
+ CEPH_OSD_OP_FLAG_FADVISE_WILLNEED);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+    // We need to issue a read to trigger the cache stat update that
+    // refreshes perf counters. Additionally we need to wait some time for
+    // the mempool thread to update stats.
+ sleep(1);
+ bufferlist bl, expected;
+ r = store->read(ch, hoid, block_size * 16, block_size * 4, bl);
+ ASSERT_EQ(r, (int)block_size * 4);
+ expected.append(string(block_size, 'f'));
+ expected.append(string(block_size, 'e'));
+ expected.append(string(block_size, 'f'));
+ expected.append(string(block_size, 'e'));
+ ASSERT_TRUE(bl_eq(expected, bl));
+ ASSERT_EQ(logger->get(l_bluestore_blobs), 2u);
+ ASSERT_EQ(logger->get(l_bluestore_extents), 4u);
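+    // Filling blocks 16 and 18 makes 16..19 contiguous, so the extents
+    // collapse to four: [7], [9..11], [13], [16..19].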
+ }
+ {
+ ObjectStore::Transaction t;
+ t.remove(cid, hoid);
+ t.remove_collection(cid);
+ cerr << "Cleaning" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+}
+
+TEST_P(StoreTestSpecificAUSize, BlobReuseOnSmallOverwrite) {
+
+ if (string(GetParam()) != "bluestore")
+ return;
+ if (smr) {
+ cout << "SKIP: no overwrite" << std::endl;
+ return;
+ }
+
+ size_t block_size = 4096;
+ StartDeferred(block_size);
+ SetVal(g_conf(), "bluestore_max_blob_size", "65536");
+ g_conf().apply_changes(nullptr);
+
+ int r;
+ coll_t cid;
+ ghobject_t hoid(hobject_t("test_hint", "", CEPH_NOSNAP, 0, -1, ""));
+
+ const PerfCounters* logger = store->get_perf_counters();
+ auto ch = store->create_new_collection(cid);
+
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+ ObjectStore::Transaction t;
+ bufferlist bl;
+
+ bl.append(std::string(block_size, 'a'));
+ t.write(cid, hoid, 0, bl.length(), bl, CEPH_OSD_OP_FLAG_FADVISE_WILLNEED);
+ t.write(cid, hoid, block_size * 2, bl.length(), bl,
+ CEPH_OSD_OP_FLAG_FADVISE_WILLNEED);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+ // write small into the gap
+ ObjectStore::Transaction t;
+ bufferlist bl;
+
+ bl.append(std::string(3, 'b'));
+ t.write(cid, hoid, block_size + 1, bl.length(), bl,
+ CEPH_OSD_OP_FLAG_FADVISE_WILLNEED);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+    // We need to issue a read to trigger the cache stat update that
+    // refreshes perf counters. Additionally we need to wait some time for
+    // the mempool thread to update stats.
+ sleep(1);
+ bufferlist bl, expected;
+ r = store->read(ch, hoid, 0, block_size * 3, bl);
+ ASSERT_EQ(r, (int)block_size * 3);
+ expected.append(string(block_size, 'a'));
+ expected.append(string(1, 0));
+ expected.append(string(3, 'b'));
+ expected.append(string(block_size - 4, 0));
+ expected.append(string(block_size, 'a'));
+ ASSERT_TRUE(bl_eq(expected, bl));
+
+ ASSERT_EQ(logger->get(l_bluestore_blobs), 1u);
+ ASSERT_EQ(logger->get(l_bluestore_extents), 3u);
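+    // The 3-byte write is presumably padded to the enclosing 4K block and
+    // placed into the same reused blob, leaving three logical extents.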
+ }
+ {
+ ObjectStore::Transaction t;
+ t.remove(cid, hoid);
+ t.remove_collection(cid);
+ cerr << "Cleaning" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+}
+
+// A test case reproducing an issue when a write happens to the zero space
+// between extents that share the same spanning blob while the shard map is
+// unloaded.
+// The second extent might be filled with zeros this way due to a wrong
+// result returned by the has_any_extents() call in do_write_small, which in
+// turn is caused by an incompletely loaded extent map.
+TEST_P(StoreTestSpecificAUSize, SmallWriteOnShardedExtents) {
+ if (string(GetParam()) != "bluestore")
+ return;
+
+ size_t block_size = 0x10000;
+ StartDeferred(block_size);
+
+ SetVal(g_conf(), "bluestore_csum_type", "xxhash64");
+ SetVal(g_conf(), "bluestore_max_blob_size", "524288"); // for sure
+
+ g_conf().apply_changes(nullptr);
+
+ int r;
+ coll_t cid;
+ ghobject_t hoid1(hobject_t(sobject_t("Object 1", CEPH_NOSNAP)));
+ auto ch = store->create_new_collection(cid);
+
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+    // do some tricks to end up with sharded extents / spanning blobs
+ ObjectStore::Transaction t;
+ bufferlist bl, bl2;
+
+ bl.append(std::string(0x80000, 'a'));
+ t.write(cid, hoid1, 0, bl.length(), bl, 0);
+ t.zero(cid, hoid1, 0x719e0, 0x75b0 );
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+
+ bl2.append(std::string(0x70000, 'b'));
+ t.write(cid, hoid1, 0, bl2.length(), bl2, 0);
+ t.zero(cid, hoid1, 0, 0x50000);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+
+ }
+ ch.reset();
+ store->umount();
+ store->mount();
+ ch = store->open_collection(cid);
+
+ {
+    // write into the zero space between extents sharing the same blob
+ ObjectStore::Transaction t;
+ bufferlist bl, bl2;
+
+ bl.append(std::string(0x6520, 'c'));
+ t.write(cid, hoid1, 0x71c00, bl.length(), bl, 0);
+
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+
+ {
+ ObjectStore::Transaction t;
+ bufferlist bl, expected;
+
+ r = store->read(ch, hoid1, 0x70000, 0x9c00, bl);
+ ASSERT_EQ(r, (int)0x9c00);
+ expected.append(string(0x19e0, 'a'));
+ expected.append(string(0x220, 0));
+ expected.append(string(0x6520, 'c'));
+ expected.append(string(0xe70, 0));
+ expected.append(string(0xc70, 'a'));
+ ASSERT_TRUE(bl_eq(expected, bl));
+ bl.clear();
+
+ }
+
+ {
+ ObjectStore::Transaction t;
+ t.remove(cid, hoid1);
+ t.remove_collection(cid);
+ cerr << "Cleaning" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+}
+
+TEST_P(StoreTestSpecificAUSize, ReproBug56488Test) {
+
+ if (string(GetParam()) != "bluestore")
+ return;
+ if (smr) {
+ cout << "SKIP: no deferred" << std::endl;
+ return;
+ }
+
+ size_t alloc_size = 65536;
+ size_t write_size = 4096;
+ SetVal(g_conf(), "bluestore_debug_enforce_settings", "hdd");
+ SetVal(g_conf(), "bluestore_block_db_create", "true");
+ SetVal(g_conf(), "bluestore_block_db_size", stringify(1 << 30).c_str());
+
+ g_conf().apply_changes(nullptr);
+ StartDeferred(alloc_size);
+
+ int r;
+ coll_t cid;
+ const PerfCounters* logger = store->get_perf_counters();
+
+ ObjectStore::CollectionHandle ch = store->create_new_collection(cid);
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+ ghobject_t hoid(hobject_t("test", "", CEPH_NOSNAP, 0, -1, ""));
+ {
+ ObjectStore::Transaction t;
+ t.touch(cid, hoid);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+
+ auto issued_dw = logger->get(l_bluestore_issued_deferred_writes);
+ auto issued_dw_bytes = logger->get(l_bluestore_issued_deferred_write_bytes);
+ {
+ ObjectStore::Transaction t;
+ bufferlist bl;
+ bl.append(std::string(write_size, 'x'));
+ t.write(cid, hoid, 0, bl.length(), bl,
+ CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ ASSERT_EQ(logger->get(l_bluestore_issued_deferred_writes), issued_dw + 1);
+ ASSERT_EQ(logger->get(l_bluestore_issued_deferred_write_bytes),
+ issued_dw_bytes + write_size);
+ }
+ {
+ ghobject_t hoid(hobject_t("test-a", "", CEPH_NOSNAP, 0, -1, ""));
+ {
+ ObjectStore::Transaction t;
+ t.touch(cid, hoid);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+
+ auto issued_dw = logger->get(l_bluestore_issued_deferred_writes);
+ auto issued_dw_bytes = logger->get(l_bluestore_issued_deferred_write_bytes);
+ {
+ ObjectStore::Transaction t;
+ bufferlist bl;
+ bl.append(std::string(write_size * 2, 'x'));
+ t.write(cid, hoid, alloc_size - write_size, bl.length(), bl,
+ CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
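+    // The 8K write above starts at alloc_size - write_size and straddles
+    // the 64K AU boundary, so it is presumably split into two deferred
+    // writes totalling 2 * write_size bytes, as asserted below.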
+ ASSERT_EQ(logger->get(l_bluestore_issued_deferred_writes), issued_dw + 2);
+ ASSERT_EQ(logger->get(l_bluestore_issued_deferred_write_bytes),
+ issued_dw_bytes + write_size * 2);
+ }
+ {
+ ObjectStore::Transaction t;
+ ghobject_t hoid(hobject_t("test", "", CEPH_NOSNAP, 0, -1, ""));
+ t.remove(cid, hoid);
+ ghobject_t hoid_a(hobject_t("test-a", "", CEPH_NOSNAP, 0, -1, ""));
+ t.remove(cid, hoid_a);
+ t.remove_collection(cid);
+ cerr << "Cleaning" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+}
+
+#endif //#if defined(WITH_BLUESTORE)
+
+TEST_P(StoreTest, KVDBHistogramTest) {
+ if (string(GetParam()) != "bluestore")
+ return;
+
+ int NUM_OBJS = 200;
+ int r = 0;
+ coll_t cid;
+ string base("testobj.");
+ bufferlist a;
+ bufferptr ap(0x1000);
+ memset(ap.c_str(), 'a', 0x1000);
+ a.append(ap);
+ auto ch = store->create_new_collection(cid);
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ for (int i = 0; i < NUM_OBJS; ++i) {
+ ObjectStore::Transaction t;
+ char buf[100];
+ snprintf(buf, sizeof(buf), "%d", i);
+ ghobject_t hoid(hobject_t(sobject_t(base + string(buf), CEPH_NOSNAP)));
+ t.write(cid, hoid, 0, 0x1000, a);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+
+ std::unique_ptr<Formatter> f(Formatter::create("store_test", "json-pretty", "json-pretty"));
+ store->generate_db_histogram(f.get());
+ f->flush(cout);
+ cout << std::endl;
+}
+
+TEST_P(StoreTest, KVDBStatsTest) {
+ if (string(GetParam()) != "bluestore")
+ return;
+
+ SetVal(g_conf(), "rocksdb_perf", "true");
+ SetVal(g_conf(), "rocksdb_collect_compaction_stats", "true");
+ SetVal(g_conf(), "rocksdb_collect_extended_stats","true");
+ SetVal(g_conf(), "rocksdb_collect_memory_stats","true");
+ g_ceph_context->_conf.apply_changes(nullptr);
+ int r = store->umount();
+ ASSERT_EQ(r, 0);
+  r = store->mount(); // remount to force rocksdb stats collection
+ ASSERT_EQ(r, 0);
+
+ int NUM_OBJS = 200;
+ coll_t cid;
+ string base("testobj.");
+ bufferlist a;
+ bufferptr ap(0x1000);
+ memset(ap.c_str(), 'a', 0x1000);
+ a.append(ap);
+ auto ch = store->create_new_collection(cid);
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ for (int i = 0; i < NUM_OBJS; ++i) {
+ ObjectStore::Transaction t;
+ char buf[100];
+ snprintf(buf, sizeof(buf), "%d", i);
+ ghobject_t hoid(hobject_t(sobject_t(base + string(buf), CEPH_NOSNAP)));
+ t.write(cid, hoid, 0, 0x1000, a);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+
+ std::unique_ptr<Formatter> f(Formatter::create("store_test", "json-pretty", "json-pretty"));
+ store->get_db_statistics(f.get());
+ f->flush(cout);
+ cout << std::endl;
+}
+
+#if defined(WITH_BLUESTORE)
+TEST_P(StoreTestSpecificAUSize, garbageCollection) {
+ int r;
+ coll_t cid;
+ int buf_len = 256 * 1024;
+ int overlap_offset = 64 * 1024;
+ int write_offset = buf_len;
+ if (string(GetParam()) != "bluestore")
+ return;
+ if (smr) {
+ cout << "SKIP: assertions about allocations need to be adjusted" << std::endl;
+ return;
+ }
+
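+// WRITE_AT writes exactly _length bytes, taken from the head of 'bl', at
+// 'offset', re-wrapping the buffer when _length differs from bl.length().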
+#define WRITE_AT(offset, _length) {\
+ ObjectStore::Transaction t;\
+ if ((uint64_t)_length != bl.length()) { \
+ buffer::ptr p(bl.c_str(), _length);\
+ bufferlist bl_tmp;\
+ bl_tmp.push_back(p);\
+ t.write(cid, hoid, offset, bl_tmp.length(), bl_tmp);\
+ } else {\
+ t.write(cid, hoid, offset, bl.length(), bl);\
+ }\
+ r = queue_transaction(store, ch, std::move(t));\
+ ASSERT_EQ(r, 0);\
+ }
+
+ StartDeferred(65536);
+
+ SetVal(g_conf(), "bluestore_compression_max_blob_size", "524288");
+ SetVal(g_conf(), "bluestore_compression_min_blob_size", "262144");
+ SetVal(g_conf(), "bluestore_max_blob_size", "524288");
+ SetVal(g_conf(), "bluestore_compression_mode", "force");
+ g_conf().apply_changes(nullptr);
+
+ auto ch = store->create_new_collection(cid);
+
+ ghobject_t hoid(hobject_t(sobject_t("Object 1", CEPH_NOSNAP)));
+ {
+ bufferlist in;
+ r = store->read(ch, hoid, 0, 5, in);
+ ASSERT_EQ(-ENOENT, r);
+ }
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ cerr << "Creating collection " << cid << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+
+ std::string data;
+ data.resize(buf_len);
+
+ {
+ {
+ bool exists = store->exists(ch, hoid);
+ ASSERT_TRUE(!exists);
+
+ ObjectStore::Transaction t;
+ t.touch(cid, hoid);
+ cerr << "Creating object " << hoid << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+
+ exists = store->exists(ch, hoid);
+ ASSERT_EQ(true, exists);
+ }
+ bufferlist bl;
+
+ for(size_t i = 0; i < data.size(); i++)
+ data[i] = i % 256;
+
+ bl.append(data);
+
+ {
+ struct store_statfs_t statfs;
+ WRITE_AT(0, buf_len);
+ int r = store->statfs(&statfs);
+ ASSERT_EQ(r, 0);
+ ASSERT_EQ(statfs.data_compressed_allocated, 0x10000);
+ }
+ {
+ struct store_statfs_t statfs;
+ WRITE_AT(write_offset - 2 * overlap_offset, buf_len);
+ int r = store->statfs(&statfs);
+ ASSERT_EQ(r, 0);
+ ASSERT_EQ(statfs.data_compressed_allocated, 0x20000);
+ const PerfCounters* counters = store->get_perf_counters();
+ ASSERT_EQ(counters->get(l_bluestore_gc_merged), 0u);
+ }
+
+ {
+ struct store_statfs_t statfs;
+ WRITE_AT(write_offset - overlap_offset, buf_len);
+ int r = store->statfs(&statfs);
+ ASSERT_EQ(r, 0);
+ ASSERT_EQ(statfs.data_compressed_allocated, 0x20000);
+ const PerfCounters* counters = store->get_perf_counters();
+ ASSERT_EQ(counters->get(l_bluestore_gc_merged), 0x10000u);
+ }
+ {
+ struct store_statfs_t statfs;
+ WRITE_AT(write_offset - 3 * overlap_offset, buf_len);
+ int r = store->statfs(&statfs);
+ ASSERT_EQ(r, 0);
+ ASSERT_EQ(statfs.data_compressed_allocated, 0x20000);
+ const PerfCounters* counters = store->get_perf_counters();
+ ASSERT_EQ(counters->get(l_bluestore_gc_merged), 0x20000u);
+ }
+ {
+ struct store_statfs_t statfs;
+ WRITE_AT(write_offset + 1, overlap_offset-1);
+ int r = store->statfs(&statfs);
+ ASSERT_EQ(r, 0);
+ ASSERT_EQ(statfs.data_compressed_allocated, 0x20000);
+ const PerfCounters* counters = store->get_perf_counters();
+ ASSERT_EQ(counters->get(l_bluestore_gc_merged), 0x20000u);
+ }
+ {
+ struct store_statfs_t statfs;
+ WRITE_AT(write_offset + 1, overlap_offset);
+ int r = store->statfs(&statfs);
+ ASSERT_EQ(r, 0);
+ ASSERT_EQ(statfs.data_compressed_allocated, 0x10000);
+ const PerfCounters* counters = store->get_perf_counters();
+ ASSERT_EQ(counters->get(l_bluestore_gc_merged), 0x3ffffu);
+ }
+ {
+ struct store_statfs_t statfs;
+ WRITE_AT(0, buf_len-1);
+ int r = store->statfs(&statfs);
+ ASSERT_EQ(r, 0);
+ ASSERT_EQ(statfs.data_compressed_allocated, 0x10000);
+ const PerfCounters* counters = store->get_perf_counters();
+ ASSERT_EQ(counters->get(l_bluestore_gc_merged), 0x40001u);
+ }
+ SetVal(g_conf(), "bluestore_gc_enable_total_threshold", "1"); //forbid GC when saving = 0
+ {
+ struct store_statfs_t statfs;
+ WRITE_AT(1, overlap_offset-2);
+ WRITE_AT(overlap_offset * 2 + 1, overlap_offset-2);
+ int r = store->statfs(&statfs);
+ ASSERT_EQ(r, 0);
+ ASSERT_EQ(statfs.data_compressed_allocated, 0x10000);
+ const PerfCounters* counters = store->get_perf_counters();
+ ASSERT_EQ(counters->get(l_bluestore_gc_merged), 0x40001u);
+ }
+ {
+ struct store_statfs_t statfs;
+ WRITE_AT(overlap_offset + 1, overlap_offset-2);
+ int r = store->statfs(&statfs);
+ ASSERT_EQ(r, 0);
+ ASSERT_EQ(statfs.data_compressed_allocated, 0x0);
+ const PerfCounters* counters = store->get_perf_counters();
+ ASSERT_EQ(counters->get(l_bluestore_gc_merged), 0x40007u);
+ }
+ {
+ ObjectStore::Transaction t;
+ t.remove(cid, hoid);
+ cerr << "Cleaning" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ }
+}
+
+TEST_P(StoreTestSpecificAUSize, fsckOnUnalignedDevice) {
+ if (string(GetParam()) != "bluestore")
+ return;
+
+ SetVal(g_conf(), "bluestore_block_size",
+    stringify(0x280005000).c_str()); // 10 GiB + 20K (0x5000)
+ SetVal(g_conf(), "bluestore_fsck_on_mount", "false");
+ SetVal(g_conf(), "bluestore_fsck_on_umount", "false");
+ StartDeferred(0x4000);
+ store->umount();
+ ASSERT_EQ(store->fsck(false), 0); // do fsck explicitly
+ store->mount();
+
+}
+
+TEST_P(StoreTestSpecificAUSize, fsckOnUnalignedDevice2) {
+ if (string(GetParam()) != "bluestore")
+ return;
+
+ SetVal(g_conf(), "bluestore_block_size",
+ stringify(0x280005000).c_str()); //10 Gb + 20K
+ SetVal(g_conf(), "bluestore_fsck_on_mount", "false");
+ SetVal(g_conf(), "bluestore_fsck_on_umount", "false");
+ StartDeferred(0x1000);
+ store->umount();
+ ASSERT_EQ(store->fsck(false), 0); // do fsck explicitly
+ store->mount();
+}
+
+namespace {
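+  // helper: build a ghobject_t whose hash is derived deterministically from
+  // the object name, so the same name/pool pair always maps to the same object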
+ ghobject_t make_object(const char* name, int64_t pool) {
+ sobject_t soid{name, CEPH_NOSNAP};
+ uint32_t hash = std::hash<sobject_t>{}(soid);
+ return ghobject_t{hobject_t{soid, "", hash, pool, ""}};
+ }
+}
+
+TEST_P(StoreTestSpecificAUSize, BluestoreRepairTest) {
+ if (string(GetParam()) != "bluestore")
+ return;
+ if (smr) {
+ cout << "TODO: repair mismatched write pointer (+ dead bytes mismatch)" << std::endl;
+ return;
+ }
+ const size_t offs_base = 65536 / 2;
+
+
+  // We need a standalone DB to pass the "false free fix" section below.
+  // With the new BlueFS allocation model (a single allocator for the main
+  // device), a "false free" blob could otherwise be overwritten by BlueFS/DB
+  // data, failing the test case and corrupting data.
+  //
+
+ SetVal(g_conf(), "bluestore_block_db_create", "true");
+ SetVal(g_conf(), "bluestore_block_db_size", "4294967296");
+
+ SetVal(g_conf(), "bluestore_fsck_on_mount", "false");
+ SetVal(g_conf(), "bluestore_fsck_on_umount", "false");
+ SetVal(g_conf(), "bluestore_max_blob_size",
+ stringify(2 * offs_base).c_str());
+ SetVal(g_conf(), "bluestore_extent_map_shard_max_size", "12000");
+
+ StartDeferred(0x10000);
+
+ BlueStore* bstore = dynamic_cast<BlueStore*> (store.get());
+
+ // fill the store with some data
+ const uint64_t pool = 555;
+ coll_t cid(spg_t(pg_t(0, pool), shard_id_t::NO_SHARD));
+ auto ch = store->create_new_collection(cid);
+
+ ghobject_t hoid = make_object("Object 1", pool);
+ ghobject_t hoid_dup = make_object("Object 1(dup)", pool);
+ ghobject_t hoid2 = make_object("Object 2", pool);
+ ghobject_t hoid_cloned = hoid2;
+ hoid_cloned.hobj.snap = 1;
+ ghobject_t hoid3 = make_object("Object 3", pool);
+ ghobject_t hoid3_cloned = hoid3;
+ hoid3_cloned.hobj.snap = 1;
+ bufferlist bl;
+ bl.append("1234512345");
+ int r;
+ const size_t repeats = 16;
+ {
+ auto ch = store->create_new_collection(cid);
+ cerr << "create collection + write" << std::endl;
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ for( auto i = 0ul; i < repeats; ++i ) {
+ t.write(cid, hoid, i * offs_base, bl.length(), bl);
+ t.write(cid, hoid_dup, i * offs_base, bl.length(), bl);
+ }
+ for( auto i = 0ul; i < repeats; ++i ) {
+ t.write(cid, hoid2, i * offs_base, bl.length(), bl);
+ }
+ t.clone(cid, hoid2, hoid_cloned);
+
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+
+ bstore->umount();
+ bool err_was_injected = false;
+ //////////// leaked pextent fix ////////////
+ cerr << "fix leaked pextents" << std::endl;
+ ASSERT_EQ(bstore->fsck(false), 0);
+ ASSERT_EQ(bstore->repair(false), 0);
+ bstore->mount();
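+  // has_null_manager() is true when BlueStore keeps allocation info outside
+  // RocksDB (the NCB/allocation-map-from-file mode, see the failure injection
+  // below); several injections in this test only make sense on the legacy
+  // freelist path, hence these checks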
+ if (!bstore->has_null_manager()) {
+ bstore->inject_leaked(0x30000);
+ err_was_injected = true;
+ }
+
+ bstore->umount();
+ if (err_was_injected) {
+ ASSERT_EQ(bstore->fsck(false), 1);
+ }
+ ASSERT_EQ(bstore->repair(false), 0);
+ ASSERT_EQ(bstore->fsck(false), 0);
+
+ //////////// false free fix ////////////
+ cerr << "fix false free pextents" << std::endl;
+ bstore->mount();
+ if (!bstore->has_null_manager()) {
+ bstore->inject_false_free(cid, hoid);
+ err_was_injected = true;
+ }
+ bstore->umount();
+ if (err_was_injected) {
+ ASSERT_EQ(bstore->fsck(false), 2);
+ ASSERT_EQ(bstore->repair(false), 0);
+ }
+ ASSERT_EQ(bstore->fsck(false), 0);
+
+
+ ///////// undecodable shared blob key / stray shared blob records ///////
+ bstore->mount();
+ cerr << "undecodable shared blob key" << std::endl;
+ bstore->inject_broken_shared_blob_key("undec1",
+ bufferlist());
+ bstore->inject_broken_shared_blob_key("undecodable key 2",
+ bufferlist());
+ bstore->inject_broken_shared_blob_key("undecodable key 3",
+ bufferlist());
+ bstore->umount();
+ ASSERT_EQ(bstore->fsck(false), 3);
+ ASSERT_EQ(bstore->repair(false), 0);
+ ASSERT_EQ(bstore->fsck(false), 0);
+
+ cerr << "misreferencing" << std::endl;
+ bstore->mount();
+ bstore->inject_misreference(cid, hoid, cid, hoid_dup, 0);
+ bstore->inject_misreference(cid, hoid, cid, hoid_dup, (offs_base * repeats) / 2);
+ bstore->inject_misreference(cid, hoid, cid, hoid_dup, offs_base * (repeats -1) );
+ int expected_errors = bstore->has_null_manager() ? 3 : 6;
+ bstore->umount();
+ ASSERT_EQ(bstore->fsck(false), expected_errors);
+ ASSERT_EQ(bstore->repair(false), 0);
+
+ ASSERT_EQ(bstore->fsck(true), 0);
+
+ // reproducing issues #21040 & 20983
+ SetVal(g_conf(), "bluestore_debug_inject_bug21040", "true");
+ g_ceph_context->_conf.apply_changes(nullptr);
+ bstore->mount();
+
+ cerr << "repro bug #21040" << std::endl;
+ {
+ auto ch = store->open_collection(cid);
+ {
+ ObjectStore::Transaction t;
+ bl.append("0123456789012345");
+ t.write(cid, hoid3, offs_base, bl.length(), bl);
+ bl.clear();
+ bl.append('!');
+ t.write(cid, hoid3, 0, bl.length(), bl);
+
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+ ObjectStore::Transaction t;
+ t.clone(cid, hoid3, hoid3_cloned);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+
+ bstore->umount();
+    // depending on statfs tracking we might or might not see the relevant
+    // error, hence error count >= 3
+ ASSERT_GE(bstore->fsck(false), 3);
+ ASSERT_LE(bstore->repair(false), 0);
+ ASSERT_EQ(bstore->fsck(false), 0);
+ }
+
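+  // a "zombie" spanning blob (as injected below) is, roughly, a spanning-blob
+  // record kept in the onode even though no extent references it any more;
+  // fsck is expected to flag each one and repair to drop them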
+ cerr << "Zombie spanning blob" << std::endl;
+ {
+ bstore->mount();
+ ghobject_t hoid4 = make_object("Object 4", pool);
+ auto ch = store->open_collection(cid);
+ {
+ bufferlist bl;
+ string s(0x1000, 'a');
+ bl.append(s);
+ ObjectStore::Transaction t;
+ for(size_t i = 0; i < 0x10; i++) {
+ t.write(cid, hoid4, i * bl.length(), bl.length(), bl);
+ }
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+    sleep(5); // need some time for the previous writes to land
+ {
+ bstore->inject_zombie_spanning_blob(cid, hoid4, 12345);
+ bstore->inject_zombie_spanning_blob(cid, hoid4, 23456);
+ bstore->inject_zombie_spanning_blob(cid, hoid4, 23457);
+ }
+
+ bstore->umount();
+ ASSERT_EQ(bstore->fsck(false), 1);
+ ASSERT_LE(bstore->repair(false), 0);
+ ASSERT_EQ(bstore->fsck(false), 0);
+ }
+
+ //////////// verify invalid statfs ///////////
+ cerr << "fix invalid statfs" << std::endl;
+ SetVal(g_conf(), "bluestore_fsck_error_on_no_per_pool_stats", "true");
+ SetVal(g_conf(),
+ "bluestore_debug_inject_allocation_from_file_failure", "1");
+ store_statfs_t statfs0;
+ store_statfs_t statfs;
+ bstore->mount();
+ ASSERT_EQ(bstore->statfs(&statfs0), 0);
+ statfs = statfs0;
+ statfs.allocated += 0x10000;
+ statfs.data_stored += 0x10000;
+ ASSERT_FALSE(statfs0 == statfs);
+ // this enforces global stats usage
+ bstore->inject_statfs("bluestore_statfs", statfs);
+ bstore->umount();
+
+  ASSERT_GE(bstore->fsck(false), 1); // the global stats mismatch might be
+                                     // omitted when NCB restore is applied,
+                                     // hence using >= for the error count
+ ASSERT_EQ(bstore->repair(false), 0);
+ ASSERT_EQ(bstore->fsck(false), 0);
+ ASSERT_EQ(bstore->mount(), 0);
+ ASSERT_EQ(bstore->statfs(&statfs), 0);
+  // adjust free/internal meta space so the comparison succeeds
+ statfs0.available = statfs.available;
+ statfs0.internal_metadata = statfs.internal_metadata;
+ ASSERT_EQ(statfs0, statfs);
+
+ SetVal(g_conf(),
+ "bluestore_debug_inject_allocation_from_file_failure", "0");
+ cerr << "fix invalid statfs2" << std::endl;
+ ASSERT_EQ(bstore->statfs(&statfs0), 0);
+ statfs = statfs0;
+ statfs.allocated += 0x20000;
+ statfs.data_stored += 0x20000;
+ ASSERT_FALSE(statfs0 == statfs);
+ // this enforces global stats usage
+ bstore->inject_statfs("bluestore_statfs", statfs);
+ bstore->umount();
+
+ ASSERT_EQ(bstore->fsck(false), 2);
+ ASSERT_EQ(bstore->repair(false), 0);
+ ASSERT_EQ(bstore->fsck(false), 0);
+ ASSERT_EQ(bstore->mount(), 0);
+ ASSERT_EQ(bstore->statfs(&statfs), 0);
+  // adjust free/internal meta space so the comparison succeeds
+ statfs0.available = statfs.available;
+ statfs0.internal_metadata = statfs.internal_metadata;
+ ASSERT_EQ(statfs0, statfs);
+
+ cerr << "Completing" << std::endl;
+}
+
+TEST_P(StoreTestSpecificAUSize, BluestoreBrokenZombieRepairTest) {
+ if (string(GetParam()) != "bluestore")
+ return;
+ if (smr) {
+ cout << "SKIP: smr repair is different" << std::endl;
+ return;
+ }
+ SetVal(g_conf(), "bluestore_fsck_on_mount", "false");
+ SetVal(g_conf(), "bluestore_fsck_on_umount", "false");
+
+ StartDeferred(0x10000);
+
+ BlueStore* bstore = dynamic_cast<BlueStore*> (store.get());
+
+ int r;
+
+ cerr << "initializing" << std::endl;
+ {
+ const size_t col_count = 16;
+ const size_t obj_count = 1024;
+ ObjectStore::CollectionHandle ch[col_count];
+ ghobject_t hoid[col_count][obj_count];
+
+ unique_ptr<coll_t> cid[col_count];
+
+ for (size_t i = 0; i < col_count; i++) {
+ cid[i].reset(new coll_t(spg_t(pg_t(0, i), shard_id_t::NO_SHARD)));
+ ch[i] = store->create_new_collection(*cid[i]);
+ for (size_t j = 0; j < obj_count; j++) {
+ hoid[i][j] = make_object(stringify(j).c_str(), i);
+ }
+ }
+
+ for (size_t i = 0; i < col_count; i++) {
+ ObjectStore::Transaction t;
+ t.create_collection(*cid[i], 0);
+ r = queue_transaction(store, ch[i], std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ cerr << "onode preparing" << std::endl;
+ bufferlist bl;
+ string s(0x1000, 'a');
+ bl.append(s);
+
+ for (size_t i = 0; i < col_count; i++) {
+ for (size_t j = 0; j < obj_count; j++) {
+ ObjectStore::Transaction t;
+ t.write(*cid[i], hoid[i][j], bl.length(), bl.length(), bl);
+ r = queue_transaction(store, ch[i], std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ }
+ cerr << "Zombie spanning blob injection" << std::endl;
+
+  sleep(5); // need some time for the previous writes to land
+
+ for (size_t i = 0; i < col_count; i++) {
+ for (size_t j = 0; j < obj_count; j++) {
+ bstore->inject_zombie_spanning_blob(*cid[i], hoid[i][j], 12345);
+ }
+ }
+
+ cerr << "fscking/fixing" << std::endl;
+ bstore->umount();
+ ASSERT_EQ(bstore->fsck(false), col_count * obj_count);
+ ASSERT_LE(bstore->quick_fix(), 0);
+ ASSERT_EQ(bstore->fsck(false), 0);
+ }
+
+ cerr << "Completing" << std::endl;
+ bstore->mount();
+}
+
+TEST_P(StoreTestSpecificAUSize, BluestoreRepairSharedBlobTest) {
+ if (string(GetParam()) != "bluestore")
+ return;
+ if (smr) {
+ cout << "TODO: repair mismatched write pointer (+ dead bytes mismatch)" << std::endl;
+ return;
+ }
+
+ SetVal(g_conf(), "bluestore_fsck_on_mount", "false");
+ SetVal(g_conf(), "bluestore_fsck_on_umount", "false");
+
+ const size_t block_size = 0x1000;
+ StartDeferred(block_size);
+
+ BlueStore* bstore = dynamic_cast<BlueStore*> (store.get());
+
+ // fill the store with some data
+ const uint64_t pool = 555;
+ coll_t cid(spg_t(pg_t(0, pool), shard_id_t::NO_SHARD));
+ auto ch = store->create_new_collection(cid);
+
+ ghobject_t hoid = make_object("Object 1", pool);
+ ghobject_t hoid_cloned = hoid;
+ hoid_cloned.hobj.snap = 1;
+ ghobject_t hoid2 = make_object("Object 2", pool);
+
+ string s(block_size, 1);
+ bufferlist bl;
+ bl.append(s);
+ int r;
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+
+  // check the scenario when a shared blob contains
+  // references to extents from two objects which don't overlap:
+  // o1 -> 0x2000~0x1000
+  // o2 -> 0x4000~0x1000
+  cerr << "introduce 2 non-overlapping extents in a shared blob"
+       << std::endl;
+ {
+ ObjectStore::Transaction t;
+ t.write(cid, hoid, 0, bl.length(), bl);
+ t.write(cid, hoid2, 0, bl.length(), bl); // to make a gap in allocations
+ t.write(cid, hoid, block_size * 2 , bl.length(), bl);
+ t.clone(cid, hoid, hoid_cloned);
+ t.zero(cid, hoid, 0, bl.length());
+ t.zero(cid, hoid_cloned, block_size * 2, bl.length());
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ bstore->umount();
+ bstore->mount();
+ {
+ string key;
+ _key_encode_u64(1, &key);
+ bluestore_shared_blob_t sb(1);
+ sb.ref_map.get(0x822000, block_size);
+ sb.ref_map.get(0x824000, block_size);
+ sb.ref_map.get(0x824000, block_size);
+ bufferlist bl;
+ encode(sb, bl);
+ bstore->inject_broken_shared_blob_key(key, bl);
+ }
+ bstore->umount();
+ ASSERT_EQ(bstore->fsck(false), 2);
+ ASSERT_EQ(bstore->repair(false), 0);
+ ASSERT_EQ(bstore->fsck(false), 0);
+
+ cerr << "Completing" << std::endl;
+ bstore->mount();
+}
+
+TEST_P(StoreTestSpecificAUSize, BluestoreBrokenNoSharedBlobRepairTest) {
+ if (string(GetParam()) != "bluestore")
+ return;
+ if (smr) {
+ cout << "SKIP: smr repair is different" << std::endl;
+ return;
+ }
+
+ SetVal(g_conf(), "bluestore_fsck_on_mount", "false");
+ SetVal(g_conf(), "bluestore_fsck_on_umount", "false");
+
+ StartDeferred(0x10000);
+
+ BlueStore* bstore = dynamic_cast<BlueStore*> (store.get());
+
+ int r;
+
+ // initializing
+ cerr << "initializing" << std::endl;
+ {
+ const uint64_t pool = 555;
+ coll_t cid(spg_t(pg_t(0, pool), shard_id_t::NO_SHARD));
+ auto ch = store->create_new_collection(cid);
+
+ ghobject_t hoid = make_object("Object", pool);
+ ghobject_t hoid_cloned = hoid;
+ hoid_cloned.hobj.snap = 1;
+
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+ ObjectStore::Transaction t;
+ bufferlist bl;
+ bl.append("0123456789012345");
+ t.write(cid, hoid, 0, bl.length(), bl);
+
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+ ObjectStore::Transaction t;
+ t.clone(cid, hoid, hoid_cloned);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ }
+ // injecting an error and checking
+ cerr << "injecting" << std::endl;
+ sleep(3); // need some time for the previous write to land
+ bstore->inject_no_shared_blob_key();
+ bstore->inject_stray_shared_blob_key(12345678);
+
+ {
+ cerr << "fscking/fixing" << std::endl;
+ // we need to check for null-manager before umount()
+ bool has_null_manager = bstore->has_null_manager();
+ bstore->umount();
+    // depending on the allocation map's source we may or may not
+    // observe an additional extent leak detection, hence the
+    // adjusted expected value
+ size_t expected_error_count =
+ has_null_manager ?
+ 4: // 4 sb ref mismatch errors [+ 1 optional statfs, hence ASSERT_GE]
+ 7; // 4 sb ref mismatch errors + 1 statfs + 1 block leak + 1 non-free
+ ASSERT_GE(bstore->fsck(false), expected_error_count);
+    // repair might report fewer errors than fsck above showed,
+    // as some errors, e.g. statfs mismatch, are implicitly fixed
+    // before being detected during the previous repair steps...
+ ASSERT_LE(bstore->repair(false), expected_error_count);
+ ASSERT_EQ(bstore->fsck(false), 0);
+ }
+
+ cerr << "Completing" << std::endl;
+ bstore->mount();
+}
+
+TEST_P(StoreTest, BluestoreRepairGlobalStats) {
+ if (string(GetParam()) != "bluestore")
+ return;
+ const size_t offs_base = 65536 / 2;
+
+ BlueStore* bstore = dynamic_cast<BlueStore*> (store.get());
+
+ // start with global stats
+ bstore->inject_global_statfs({});
+ bstore->umount();
+ SetVal(g_conf(), "bluestore_fsck_quick_fix_on_mount", "false");
+ bstore->mount();
+
+ // fill the store with some data
+ const uint64_t pool = 555;
+ coll_t cid(spg_t(pg_t(0, pool), shard_id_t::NO_SHARD));
+ auto ch = store->create_new_collection(cid);
+
+ ghobject_t hoid = make_object("Object 1", pool);
+ ghobject_t hoid_dup = make_object("Object 1(dup)", pool);
+ ghobject_t hoid2 = make_object("Object 2", pool);
+ ghobject_t hoid_cloned = hoid2;
+ hoid_cloned.hobj.snap = 1;
+ ghobject_t hoid3 = make_object("Object 3", pool);
+ ghobject_t hoid3_cloned = hoid3;
+ hoid3_cloned.hobj.snap = 1;
+ bufferlist bl;
+ bl.append("1234512345");
+ int r;
+ const size_t repeats = 16;
+ {
+ auto ch = store->create_new_collection(cid);
+ cerr << "create collection + write" << std::endl;
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ for( auto i = 0ul; i < repeats; ++i ) {
+ t.write(cid, hoid, i * offs_base, bl.length(), bl);
+ t.write(cid, hoid_dup, i * offs_base, bl.length(), bl);
+ }
+ for( auto i = 0ul; i < repeats; ++i ) {
+ t.write(cid, hoid2, i * offs_base, bl.length(), bl);
+ }
+ t.clone(cid, hoid2, hoid_cloned);
+
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+
+ bstore->umount();
+
+  // make missing per-pool stats an fsck error, hence causing fsck to fail
+ cerr << "per-pool statfs" << std::endl;
+ SetVal(g_conf(), "bluestore_fsck_error_on_no_per_pool_stats", "true");
+ g_ceph_context->_conf.apply_changes(nullptr);
+
+ ASSERT_EQ(bstore->fsck(false), 1);
+ ASSERT_EQ(bstore->repair(false), 0);
+ ASSERT_EQ(bstore->fsck(false), 0);
+
+ bstore->mount();
+}
+
+TEST_P(StoreTest, BluestoreRepairGlobalStatsFixOnMount) {
+ if (string(GetParam()) != "bluestore")
+ return;
+ const size_t offs_base = 65536 / 2;
+
+ BlueStore* bstore = dynamic_cast<BlueStore*> (store.get());
+
+ // start with global stats
+ bstore->inject_global_statfs({});
+ bstore->umount();
+ SetVal(g_conf(), "bluestore_fsck_quick_fix_on_mount", "false");
+ bstore->mount();
+
+ // fill the store with some data
+ const uint64_t pool = 555;
+ coll_t cid(spg_t(pg_t(0, pool), shard_id_t::NO_SHARD));
+ auto ch = store->create_new_collection(cid);
+
+ ghobject_t hoid = make_object("Object 1", pool);
+ ghobject_t hoid_dup = make_object("Object 1(dup)", pool);
+ ghobject_t hoid2 = make_object("Object 2", pool);
+ ghobject_t hoid_cloned = hoid2;
+ hoid_cloned.hobj.snap = 1;
+ ghobject_t hoid3 = make_object("Object 3", pool);
+ ghobject_t hoid3_cloned = hoid3;
+ hoid3_cloned.hobj.snap = 1;
+ bufferlist bl;
+ bl.append("1234512345");
+ int r;
+ const size_t repeats = 16;
+ {
+ auto ch = store->create_new_collection(cid);
+ cerr << "create collection + write" << std::endl;
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ for( auto i = 0ul; i < repeats; ++i ) {
+ t.write(cid, hoid, i * offs_base, bl.length(), bl);
+ t.write(cid, hoid_dup, i * offs_base, bl.length(), bl);
+ }
+ for( auto i = 0ul; i < repeats; ++i ) {
+ t.write(cid, hoid2, i * offs_base, bl.length(), bl);
+ }
+ t.clone(cid, hoid2, hoid_cloned);
+
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+
+ bstore->umount();
+
+  // make missing per-pool stats an fsck error, hence causing fsck to fail
+ cerr << "per-pool statfs" << std::endl;
+ SetVal(g_conf(), "bluestore_fsck_error_on_no_per_pool_stats", "true");
+ g_ceph_context->_conf.apply_changes(nullptr);
+
+ ASSERT_EQ(bstore->fsck(false), 1);
+
+ SetVal(g_conf(), "bluestore_fsck_quick_fix_on_mount", "true");
+ bstore->mount();
+ bstore->umount();
+ ASSERT_EQ(bstore->fsck(false), 0);
+
+ bstore->mount();
+}
+
+TEST_P(StoreTest, BluestoreStatistics) {
+ if (string(GetParam()) != "bluestore")
+ return;
+
+ SetVal(g_conf(), "rocksdb_perf", "true");
+ SetVal(g_conf(), "rocksdb_collect_compaction_stats", "true");
+ SetVal(g_conf(), "rocksdb_collect_extended_stats","true");
+ SetVal(g_conf(), "rocksdb_collect_memory_stats","true");
+
+ // disable cache
+ SetVal(g_conf(), "bluestore_cache_size_ssd", "0");
+ SetVal(g_conf(), "bluestore_cache_size_hdd", "0");
+ SetVal(g_conf(), "bluestore_cache_size", "0");
+ g_ceph_context->_conf.apply_changes(nullptr);
+
+ int r = store->umount();
+ ASSERT_EQ(r, 0);
+ r = store->mount();
+ ASSERT_EQ(r, 0);
+
+ BlueStore* bstore = NULL;
+ EXPECT_NO_THROW(bstore = dynamic_cast<BlueStore*> (store.get()));
+
+ coll_t cid;
+ ghobject_t hoid(hobject_t("test_db_statistics", "", CEPH_NOSNAP, 0, 0, ""));
+ auto ch = bstore->create_new_collection(cid);
+ bufferlist bl;
+ bl.append("0123456789abcdefghi");
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ t.touch(cid, hoid);
+ t.write(cid, hoid, 0, bl.length(), bl);
+ cerr << "Write object" << std::endl;
+ r = queue_transaction(bstore, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+ bufferlist readback;
+ r = store->read(ch, hoid, 0, bl.length(), readback);
+ ASSERT_EQ(static_cast<int>(bl.length()), r);
+ ASSERT_TRUE(bl_eq(bl, readback));
+ }
+ std::unique_ptr<Formatter> f(Formatter::create("store_test", "json-pretty", "json-pretty"));
+ EXPECT_NO_THROW(store->get_db_statistics(f.get()));
+ f->flush(cout);
+ cout << std::endl;
+}
+
+TEST_P(StoreTest, BluestoreStrayOmapDetection)
+{
+ if (string(GetParam()) != "bluestore")
+ return;
+
+ BlueStore* bstore = dynamic_cast<BlueStore*> (store.get());
+ const uint64_t pool = 555;
+ coll_t cid(spg_t(pg_t(0, pool), shard_id_t::NO_SHARD));
+ ghobject_t oid = make_object("Object 1", pool);
+ ghobject_t oid2 = make_object("Object 2", pool);
+ // fill the store with some data
+ auto ch = store->create_new_collection(cid);
+ bufferlist h;
+ h.append("header");
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ t.touch(cid, oid);
+ t.omap_setheader(cid, oid, h);
+ t.touch(cid, oid2);
+ t.omap_setheader(cid, oid2, h);
+ int r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+
+ // inject stray omap
+ bstore->inject_stray_omap(123456, "somename");
+
+ bstore->umount();
+  // check that we detect the injected stray omap
+
+ ASSERT_EQ(bstore->fsck(false), 1);
+ SetVal(g_conf(), "bluestore_fsck_on_mount", "false");
+ bstore->mount();
+}
+
+TEST_P(StoreTest, BluestorePerPoolOmapFixOnMount)
+{
+ if (string(GetParam()) != "bluestore")
+ return;
+
+ BlueStore* bstore = dynamic_cast<BlueStore*> (store.get());
+ const uint64_t pool = 555;
+ coll_t cid(spg_t(pg_t(0, pool), shard_id_t::NO_SHARD));
+ ghobject_t oid = make_object("Object 1", pool);
+ ghobject_t oid2 = make_object("Object 2", pool);
+ // fill the store with some data
+ auto ch = store->create_new_collection(cid);
+ map<string, bufferlist> omap;
+ bufferlist h;
+ h.append("header");
+ {
+ omap["omap_key"].append("omap value");
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ t.touch(cid, oid);
+ t.omap_setheader(cid, oid, h);
+ t.touch(cid, oid2);
+ t.omap_setheader(cid, oid2, h);
+ int r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+
+ // inject legacy omaps
+ bstore->inject_legacy_omap();
+ bstore->inject_legacy_omap(cid, oid);
+ bstore->inject_legacy_omap(cid, oid2);
+
+ bstore->umount();
+
+ // check we injected an issue
+ SetVal(g_conf(), "bluestore_fsck_quick_fix_on_mount", "false");
+ SetVal(g_conf(), "bluestore_fsck_error_on_no_per_pool_omap", "true");
+ g_ceph_context->_conf.apply_changes(nullptr);
+ ASSERT_EQ(bstore->fsck(false), 3);
+
+ // set autofix and mount
+ SetVal(g_conf(), "bluestore_fsck_quick_fix_on_mount", "true");
+ g_ceph_context->_conf.apply_changes(nullptr);
+ bstore->mount();
+ bstore->umount();
+
+ // check we fixed it..
+ ASSERT_EQ(bstore->fsck(false), 0);
+ bstore->mount();
+
+ //
+ // Now repro https://tracker.ceph.com/issues/43824
+ //
+ // inject legacy omaps again
+ bstore->inject_legacy_omap();
+ bstore->inject_legacy_omap(cid, oid);
+ bstore->inject_legacy_omap(cid, oid2);
+ bstore->umount();
+
+ // check we injected an issue
+ SetVal(g_conf(), "bluestore_fsck_quick_fix_on_mount", "true");
+ SetVal(g_conf(), "bluestore_fsck_error_on_no_per_pool_omap", "true");
+ g_ceph_context->_conf.apply_changes(nullptr);
+ bstore->mount();
+ ch = store->open_collection(cid);
+
+ {
+    // write to an onode, which will partially revert the per-pool
+    // omap repair done on mount due to #43824.
+    // Object removal will then leave stray per-pool omap recs.
+ //
+ ObjectStore::Transaction t;
+ bufferlist bl;
+ bl.append("data");
+    // this triggers an onode rec update and hence a legacy omap write
+ t.write(cid, oid, 0, bl.length(), bl);
+ t.remove(cid, oid2); // this will trigger stray per-pool omap
+ int r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ bstore->umount();
+ // check omap's been fixed.
+ ASSERT_EQ(bstore->fsck(false), 0); // this will fail without fix for #43824
+
+ bstore->mount();
+}
+
+class hugepaged_raw;
+
+static bool is_hugepaged(const bufferptr& bp)
+{
+ const auto& ibp =
+ static_cast<const ceph::buffer_instrumentation::instrumented_bptr&>(bp);
+ return ibp.is_raw_marked<BlockDevice::hugepaged_raw_marker_t>();
+}
+
+// disabled by default b/c of the dependency on huge pages, which some test
+// environments might not offer without extra configuration.
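+// On Linux the pool can typically be provisioned with something like
+// `sysctl vm.nr_hugepages=128` before running (illustrative; the exact
+// page count and procedure depend on the environment).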
+TEST_P(StoreTestDeferredSetup, DISABLED_BluestoreHugeReads)
+{
+ if (string(GetParam()) != "bluestore") {
+ return;
+ }
+
+ constexpr static size_t HUGE_BUFFER_SIZE{2_M};
+ cout << "Configuring huge page pools" << std::endl;
+ {
+ SetVal(g_conf(), "bdev_read_preallocated_huge_buffers",
+ fmt::format("{}=2", HUGE_BUFFER_SIZE).c_str());
+ SetVal(g_conf(), "bluestore_max_blob_size",
+ std::to_string(HUGE_BUFFER_SIZE).c_str());
+ // let's verify the per-IOContext no-cache override
+ SetVal(g_conf(), "bluestore_default_buffered_read", "true");
+ g_ceph_context->_conf.apply_changes(nullptr);
+ }
+ DeferredSetup();
+
+ coll_t cid;
+ ghobject_t hoid(hobject_t("test_huge_buffers", "", CEPH_NOSNAP, 0, 0, ""));
+ auto ch = store->create_new_collection(cid);
+
+ bufferlist bl;
+ {
+ bufferptr bp{HUGE_BUFFER_SIZE};
+    // non-zero data! Otherwise deduplication would take place.
+ ::memset(bp.c_str(), 0x42, HUGE_BUFFER_SIZE);
+ bl.push_back(std::move(bp));
+ ASSERT_EQ(bl.get_num_buffers(), 1);
+ ASSERT_EQ(bl.length(), HUGE_BUFFER_SIZE);
+ }
+
+ cout << "Write object" << std::endl;
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ t.touch(cid, hoid);
+ t.write(cid, hoid, 0, bl.length(), bl);
+ const auto r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+
+ // force cache clear
+ {
+ EXPECT_EQ(store->umount(), 0);
+ EXPECT_EQ(store->mount(), 0);
+ ch = store->open_collection(cid);
+ }
+
+  // we want to extend the lifetime of all huge-page-backed
+  // bufferlists to validate the behaviour on pool exhaustion.
+ bufferlist bl_1_huge, bl_2_huge, bl_3_plain;
+
+ cout << "Read object 1st time" << std::endl;
+ {
+ const auto r = store->read(ch, hoid, 0, HUGE_BUFFER_SIZE, bl_1_huge);
+ ASSERT_EQ(static_cast<int>(HUGE_BUFFER_SIZE), r);
+ ASSERT_TRUE(bl_eq(bl, bl_1_huge));
+ ASSERT_EQ(bl_1_huge.get_num_buffers(), 1);
+ ASSERT_TRUE(is_hugepaged(bl_1_huge.front()));
+ }
+
+ cout << "Read object 2nd time" << std::endl;
+ {
+ const auto r = store->read(ch, hoid, 0, HUGE_BUFFER_SIZE, bl_2_huge);
+ ASSERT_EQ(static_cast<int>(HUGE_BUFFER_SIZE), r);
+ ASSERT_TRUE(bl_eq(bl, bl_2_huge));
+ ASSERT_EQ(bl_2_huge.get_num_buffers(), 1);
+ ASSERT_TRUE(is_hugepaged(bl_2_huge.front()));
+ }
+
+ cout << "Read object 3rd time" << std::endl;
+ {
+ const auto r = store->read(ch, hoid, 0, HUGE_BUFFER_SIZE, bl_3_plain);
+ ASSERT_EQ(static_cast<int>(HUGE_BUFFER_SIZE), r);
+ ASSERT_TRUE(bl_eq(bl, bl_3_plain));
+ ASSERT_EQ(bl_3_plain.get_num_buffers(), 1);
+ ASSERT_FALSE(is_hugepaged(bl_3_plain.front()));
+ }
+}
+
+TEST_P(StoreTest, SpuriousReadErrorTest) {
+ if (string(GetParam()) != "bluestore")
+ return;
+
+ int r;
+ auto logger = store->get_perf_counters();
+ coll_t cid;
+ auto ch = store->create_new_collection(cid);
+ ghobject_t hoid(hobject_t(sobject_t("foo", CEPH_NOSNAP)));
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ cerr << "Creating collection " << cid << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ bufferlist test_data;
+ bufferptr ap(0x2000);
+ memset(ap.c_str(), 'a', 0x2000);
+ test_data.append(ap);
+ {
+ ObjectStore::Transaction t;
+ t.write(cid, hoid, 0, 0x2000, test_data);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ // force cache clear
+ EXPECT_EQ(store->umount(), 0);
+ EXPECT_EQ(store->mount(), 0);
+ }
+ ch = store->open_collection(cid);
+
+ cerr << "Injecting CRC error with no retry, expecting EIO" << std::endl;
+ SetVal(g_conf(), "bluestore_retry_disk_reads", "0");
+ SetVal(g_conf(), "bluestore_debug_inject_csum_err_probability", "1");
+ g_ceph_context->_conf.apply_changes(nullptr);
+ {
+ bufferlist in;
+ r = store->read(ch, hoid, 0, 0x2000, in, CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
+ ASSERT_EQ(-EIO, r);
+ ASSERT_EQ(logger->get(l_bluestore_read_eio), 1u);
+ ASSERT_EQ(logger->get(l_bluestore_reads_with_retries), 0u);
+ }
+
+ cerr << "Injecting CRC error with retries, expecting success after several retries" << std::endl;
+ SetVal(g_conf(), "bluestore_retry_disk_reads", "255");
+ SetVal(g_conf(), "bluestore_debug_inject_csum_err_probability", "0.8");
+  /**
+   * Probabilistic test: 25 reads, each with an 80% chance of failing and up to 255 retries.
+   * Probability of at least one retried read: 1 - (0.2 ** 25) = 100% - 3e-18
+   * Probability of a random test failure: 1 - ((1 - (0.8 ** 255)) ** 25) ~= 5e-24
+   */
+ g_ceph_context->_conf.apply_changes(nullptr);
+ {
+ for (int i = 0; i < 25; ++i) {
+ bufferlist in;
+ r = store->read(ch, hoid, 0, 0x2000, in, CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
+ ASSERT_EQ(0x2000, r);
+ ASSERT_TRUE(bl_eq(test_data, in));
+ }
+ ASSERT_GE(logger->get(l_bluestore_reads_with_retries), 1u);
+ }
+}
+
+TEST_P(StoreTest, mergeRegionTest) {
+ if (string(GetParam()) != "bluestore")
+ return;
+
+ SetVal(g_conf(), "bluestore_fsck_on_mount", "true");
+ SetVal(g_conf(), "bluestore_fsck_on_umount", "true");
+ SetVal(g_conf(), "bdev_debug_inflight_ios", "true");
+ g_ceph_context->_conf.apply_changes(nullptr);
+
+ uint32_t chunk_size = g_ceph_context->_conf->bdev_block_size;
+ int r = -1;
+ coll_t cid;
+ ghobject_t hoid(hobject_t(sobject_t("Object", CEPH_NOSNAP)));
+ auto ch = store->create_new_collection(cid);
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+ ObjectStore::Transaction t;
+ t.touch(cid, hoid);
+ cerr << "Creating object " << hoid << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ bufferlist bl5;
+ bl5.append("abcde");
+ uint64_t offset = 0;
+ { // 1. same region
+ ObjectStore::Transaction t;
+ t.write(cid, hoid, offset, 5, bl5);
+ t.write(cid, hoid, 0xa + offset, 5, bl5);
+ t.write(cid, hoid, 0x14 + offset, 5, bl5);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ { // 2. adjacent regions
+ ObjectStore::Transaction t;
+ offset = chunk_size;
+ t.write(cid, hoid, offset, 5, bl5);
+ t.write(cid, hoid, offset + chunk_size + 3, 5, bl5);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ { // 3. front merge
+ ObjectStore::Transaction t;
+ offset = chunk_size * 2;
+ t.write(cid, hoid, offset, 5, bl5);
+ t.write(cid, hoid, offset + chunk_size - 2, 5, bl5);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ { // 4. back merge
+ ObjectStore::Transaction t;
+ bufferlist blc2;
+ blc2.append_zero(chunk_size + 2);
+
+ offset = chunk_size * 3;
+ t.write(cid, hoid, offset, chunk_size + 2, blc2);
+ t.write(cid, hoid, offset + chunk_size + 3, 5, bl5);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ { // 5. overlapping
+ ObjectStore::Transaction t;
+ uint64_t final_len = 0;
+ offset = chunk_size * 10;
+ bufferlist bl2c2;
+ bl2c2.append_zero(chunk_size * 2);
+ t.write(cid, hoid, offset + chunk_size * 3 - 3, chunk_size * 2, bl2c2);
+ bl2c2.append_zero(2);
+ t.write(cid, hoid, offset + chunk_size - 2, chunk_size * 2 + 2, bl2c2);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+
+ final_len = (offset + chunk_size * 3 - 3) + (chunk_size * 2);
+ bufferlist bl;
+ r = store->read(ch, hoid, 0, final_len, bl);
+ ASSERT_EQ(final_len, static_cast<uint64_t>(r));
+ }
+}
+
+TEST_P(StoreTest, FixSMRWritePointer) {
+ if(string(GetParam()) != "bluestore")
+ return;
+ if (!smr)
+ return;
+ int r = store->umount();
+ ASSERT_EQ(0, r);
+
+ // copied from StoreTestFixture
+ std::string path = GetParam() + ".test_temp_dir"s;
+
+ std::string p = path + "/block";
+ BlockDevice* bdev = BlockDevice::create(g_ceph_context, p, nullptr, nullptr, nullptr, nullptr);
+ r = bdev->open(p);
+ ASSERT_EQ(0, r);
+ ASSERT_EQ(true, bdev->is_smr());
+
+ std::vector<uint64_t> wp = bdev->get_zones();
+ uint64_t first_seq_zone = bdev->get_conventional_region_size() / bdev->get_zone_size();
+
+ IOContext ioc(g_ceph_context, NULL, true);
+ bufferlist bl;
+ bl.append(std::string(1024 * 1024, 'x'));
+ r = bdev->aio_write(wp[first_seq_zone], bl, &ioc, false);
+ ASSERT_EQ(0, r);
+ bdev->aio_submit(&ioc);
+ ioc.aio_wait();
+ bdev->close();
+ delete bdev;
+
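+  // the raw write above advanced a sequential zone's write pointer behind
+  // BlueStore's back; a successful mount below implies the mismatch was
+  // detected and fixed (per this test's name)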
+ r = store->mount();
+ ASSERT_EQ(0, r);
+}
+
+
+TEST_P(StoreTestSpecificAUSize, BluestoreEnforceHWSettingsHdd) {
+ if (string(GetParam()) != "bluestore")
+ return;
+
+ SetVal(g_conf(), "bluestore_debug_enforce_settings", "hdd");
+ StartDeferred(0x1000);
+
+ int r;
+ coll_t cid;
+ ghobject_t hoid(hobject_t(sobject_t("Object", CEPH_NOSNAP)));
+ auto ch = store->create_new_collection(cid);
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ cerr << "Creating collection " << cid << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+ ObjectStore::Transaction t;
+ bufferlist bl, orig;
+ string s(g_ceph_context->_conf->bluestore_max_blob_size_hdd, '0');
+ bl.append(s);
+ t.write(cid, hoid, 0, bl.length(), bl);
+ cerr << "write" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+
+ const PerfCounters* logger = store->get_perf_counters();
+ ASSERT_EQ(logger->get(l_bluestore_write_big_blobs), 1u);
+ }
+}
+
+TEST_P(StoreTestSpecificAUSize, BluestoreEnforceHWSettingsSsd) {
+ if (string(GetParam()) != "bluestore")
+ return;
+
+ SetVal(g_conf(), "bluestore_debug_enforce_settings", "ssd");
+ StartDeferred(0x1000);
+
+ int r;
+ coll_t cid;
+ ghobject_t hoid(hobject_t(sobject_t("Object", CEPH_NOSNAP)));
+ auto ch = store->create_new_collection(cid);
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ cerr << "Creating collection " << cid << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+ ObjectStore::Transaction t;
+ bufferlist bl, orig;
+ string s(g_ceph_context->_conf->bluestore_max_blob_size_ssd * 8, '0');
+ bl.append(s);
+ t.write(cid, hoid, 0, bl.length(), bl);
+ cerr << "write" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+
+ const PerfCounters* logger = store->get_perf_counters();
+ ASSERT_EQ(logger->get(l_bluestore_write_big_blobs), 8u);
+ }
+}
+
+TEST_P(StoreTestSpecificAUSize, ReproNoBlobMultiTest) {
+
+ if(string(GetParam()) != "bluestore")
+ return;
+ if (smr) {
+ cout << "SKIP (FIXME): bluestore gc does not seem to do the trick here" << std::endl;
+ return;
+ }
+
+ SetVal(g_conf(), "bluestore_block_db_create", "true");
+ SetVal(g_conf(), "bluestore_block_db_size", "4294967296");
+ SetVal(g_conf(), "bluestore_block_size", "12884901888");
+ SetVal(g_conf(), "bluestore_max_blob_size", "524288");
+
+ g_conf().apply_changes(nullptr);
+
+ StartDeferred(65536);
+
+ int r;
+ coll_t cid;
+ ghobject_t hoid(hobject_t(sobject_t("Object 1", CEPH_NOSNAP)));
+ ghobject_t hoid2 = hoid;
+ hoid2.hobj.snap = 1;
+
+ auto ch = store->create_new_collection(cid);
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ cerr << "Creating collection " << cid << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+ bool exists = store->exists(ch, hoid);
+ ASSERT_TRUE(!exists);
+
+ ObjectStore::Transaction t;
+ t.touch(cid, hoid);
+ cerr << "Creating object " << hoid << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+
+ exists = store->exists(ch, hoid);
+ ASSERT_EQ(true, exists);
+ }
+ {
+ uint64_t offs = 0;
+ bufferlist bl;
+ const int size = 0x100;
+ bufferptr ap(size);
+ memset(ap.c_str(), 'a', size);
+ bl.append(ap);
+ int i = 0;
+ uint64_t blob_size = 524288;
+ uint64_t total = 0;
+ for (i = 0; i <= 512; i++) {
+ offs = 0 + i * size;
+ ObjectStore::Transaction t;
+ ghobject_t hoid2 = hoid;
+ hoid2.hobj.snap = i + 1;
+ while (offs < 128 * 1024 * 1024) {
+
+ t.write(cid, hoid, offs, ap.length(), bl);
+ offs += blob_size;
+ total += ap.length();
+ }
+ t.clone(cid, hoid, hoid2);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ cerr << "Total written = " << total << std::endl;
+ }
+ {
+ cerr << "Finalizing" << std::endl;
+ const PerfCounters* logger = store->get_perf_counters();
+ ASSERT_GE(logger->get(l_bluestore_gc_merged), 1024*1024*1024);
+ }
+}
+
+void doManySetAttr(ObjectStore* store,
+ std::function<void(ObjectStore*)> do_check_fn)
+{
+ MixedGenerator gen(447);
+ gen_type rng(time(NULL));
+ coll_t cid(spg_t(pg_t(0, 447), shard_id_t::NO_SHARD));
+
+ SyntheticWorkloadState test_obj(store, &gen, &rng, cid, 0, 0, 0);
+ test_obj.init();
+ size_t object_count = 256;
+ for (size_t i = 0; i < object_count; ++i) {
+ if (!(i % 10)) cerr << "seeding object " << i << std::endl;
+ test_obj.touch();
+ }
+ for (size_t i = 0; i < object_count; ++i) {
+ if (!(i % 100)) {
+ cerr << "Op " << i << std::endl;
+ test_obj.print_internal_state();
+ }
+    test_obj.set_fixed_attrs(1024, 64, 4096); // 1024 attributes, 64-byte names, 4K values
+ }
+ test_obj.wait_for_done();
+
+ std::cout << "done" << std::endl;
+ do_check_fn(store);
+ AdminSocket* admin_socket = g_ceph_context->get_admin_socket();
+ ceph_assert(admin_socket);
+
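+  // dump BlueFS usage via the admin socket; the output is informational
+  // only and does not gate the checks done by do_check_fn above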
+ ceph::bufferlist in, out;
+ ostringstream err;
+
+ auto r = admin_socket->execute_command(
+ { "{\"prefix\": \"bluefs stats\"}" },
+ in, err, &out);
+ if (r != 0) {
+ cerr << "failure querying: " << cpp_strerror(r) << std::endl;
+ } else {
+ std::cout << std::string(out.c_str(), out.length()) << std::endl;
+ }
+ test_obj.shutdown();
+}
+
+TEST_P(StoreTestSpecificAUSize, SpilloverTest) {
+ if (string(GetParam()) != "bluestore")
+ return;
+ if (smr) {
+ cout << "SKIP: (FIXME?) adjust me for smr at some point?" << std::endl;
+ return;
+ }
+
+ SetVal(g_conf(), "bluestore_block_db_create", "true");
+ SetVal(g_conf(), "bluestore_block_db_size", "3221225472");
+ SetVal(g_conf(), "bluestore_volume_selection_policy", "rocksdb_original");
+  // original RocksDB settings used before https://github.com/ceph/ceph/pull/47221/;
+  // these enable BlueFS spillover.
+ SetVal(g_conf(), "bluestore_rocksdb_options",
+ "compression=kNoCompression,max_write_buffer_number=4,"
+ "min_write_buffer_number_to_merge=1,recycle_log_file_num=4,"
+ "write_buffer_size=268435456,writable_file_max_buffer_size=0,"
+ "compaction_readahead_size=2097152,max_background_compactions=2,"
+ "max_total_wal_size=1073741824");
+
+ g_conf().apply_changes(nullptr);
+
+ StartDeferred(65536);
+ doManySetAttr(store.get(),
+ [&](ObjectStore* _store) {
+
+ BlueStore* bstore = dynamic_cast<BlueStore*> (_store);
+ ceph_assert(bstore);
+ bstore->compact();
+ const PerfCounters* logger = bstore->get_bluefs_perf_counters();
+      // experimentally it was discovered that this case results in 400+MB of
+      // spillover, hence the 300MB bound referenced by SpilloverFixed2Test;
+      // the assertion below uses an even lower threshold just to be safe
+ std::cout << "DB used:" << logger->get(l_bluefs_db_used_bytes) << std::endl;
+ std::cout << "SLOW used:" << logger->get(l_bluefs_slow_used_bytes) << std::endl;
+ ASSERT_GE(logger->get(l_bluefs_slow_used_bytes), 16 * 1024 * 1024);
+
+ struct store_statfs_t statfs;
+ osd_alert_list_t alerts;
+ int r = store->statfs(&statfs, &alerts);
+ ASSERT_EQ(r, 0);
+ ASSERT_EQ(alerts.count("BLUEFS_SPILLOVER"), 1);
+ std::cout << "spillover_alert:" << alerts.find("BLUEFS_SPILLOVER")->second
+ << std::endl;
+ }
+ );
+}
+
+TEST_P(StoreTestSpecificAUSize, SpilloverFixedTest) {
+ if (string(GetParam()) != "bluestore")
+ return;
+ if (smr) {
+ cout << "SKIP: (FIXME?) adjust me for smr at some point?" << std::endl;
+ return;
+ }
+
+ SetVal(g_conf(), "bluestore_block_db_create", "true");
+ SetVal(g_conf(), "bluestore_block_db_size", "3221225472");
+ SetVal(g_conf(), "bluestore_volume_selection_policy", "use_some_extra");
+ SetVal(g_conf(), "bluestore_volume_selection_reserved", "1"); // just use non-zero to enable
+
+ g_conf().apply_changes(nullptr);
+
+ StartDeferred(65536);
+ doManySetAttr(store.get(),
+ [&](ObjectStore* _store) {
+
+ BlueStore* bstore = dynamic_cast<BlueStore*> (_store);
+ ceph_assert(bstore);
+ bstore->compact();
+ const PerfCounters* logger = bstore->get_bluefs_perf_counters();
+ ASSERT_EQ(0, logger->get(l_bluefs_slow_used_bytes));
+ }
+ );
+}
+
+TEST_P(StoreTestSpecificAUSize, SpilloverFixed2Test) {
+ if (string(GetParam()) != "bluestore")
+ return;
+ if (smr) {
+ cout << "SKIP: (FIXME?) adjust me for smr at some point?" << std::endl;
+ return;
+ }
+
+ SetVal(g_conf(), "bluestore_block_db_create", "true");
+ SetVal(g_conf(), "bluestore_block_db_size", "3221225472");
+ SetVal(g_conf(), "bluestore_volume_selection_policy", "use_some_extra");
+  // the default 2.0 factor results in too high a threshold; use a smaller
+  // value that yields less, but still present, spillover.
+ SetVal(g_conf(), "bluestore_volume_selection_reserved_factor", "0.5");
+
+ g_conf().apply_changes(nullptr);
+
+ StartDeferred(65536);
+ doManySetAttr(store.get(),
+ [&](ObjectStore* _store) {
+
+ BlueStore* bstore = dynamic_cast<BlueStore*> (_store);
+ ceph_assert(bstore);
+ bstore->compact();
+ const PerfCounters* logger = bstore->get_bluefs_perf_counters();
+ ASSERT_LE(logger->get(l_bluefs_slow_used_bytes), 300 * 1024 * 1024); // see SpilloverTest for 300MB choice rationale
+ }
+ );
+}
+
+TEST_P(StoreTestSpecificAUSize, SpilloverFixed3Test) {
+ if (string(GetParam()) != "bluestore")
+ return;
+ if (smr) {
+ cout << "SKIP: (FIXME?) adjust me for smr at some point?" << std::endl;
+ return;
+ }
+
+ SetVal(g_conf(), "bluestore_block_db_create", "true");
+ SetVal(g_conf(), "bluestore_block_db_size", "3221225472");
+ SetVal(g_conf(), "bluestore_volume_selection_policy", "fit_to_fast");
+
+ g_conf().apply_changes(nullptr);
+
+ StartDeferred(65536);
+ doManySetAttr(store.get(),
+ [&](ObjectStore* _store) {
+
+ BlueStore* bstore = dynamic_cast<BlueStore*> (_store);
+ ceph_assert(bstore);
+ bstore->compact();
+ const PerfCounters* logger = bstore->get_bluefs_perf_counters();
+      ASSERT_EQ(logger->get(l_bluefs_slow_used_bytes), 0); // referring to SpilloverFixedTest
+ }
+ );
+}
+
+TEST_P(StoreTestSpecificAUSize, Ticket45195Repro) {
+ if (string(GetParam()) != "bluestore")
+ return;
+ if (smr) {
+ return;
+ }
+
+ SetVal(g_conf(), "bluestore_default_buffered_write", "true");
+ SetVal(g_conf(), "bluestore_max_blob_size", "65536");
+ SetVal(g_conf(), "bluestore_debug_enforce_settings", "hdd");
+ SetVal(g_conf(), "bluestore_fsck_on_mount", "false");
+ g_conf().apply_changes(nullptr);
+
+ StartDeferred(0x1000);
+
+ int r;
+ coll_t cid;
+ ghobject_t hoid(hobject_t(sobject_t("Object", CEPH_NOSNAP)));
+ auto ch = store->create_new_collection(cid);
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ cerr << "Creating collection " << cid << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+ size_t large_object_size = 1 * 1024 * 1024;
+ size_t expected_write_size = 0x8000;
+ ObjectStore::Transaction t;
+ t.touch(cid, hoid);
+ t.set_alloc_hint(cid, hoid, large_object_size, expected_write_size,
+ CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_READ |
+ CEPH_OSD_ALLOC_HINT_FLAG_APPEND_ONLY);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+ ObjectStore::Transaction t;
+ bufferlist bl, orig;
+ string s(0xc000, '0');
+ bl.append(s);
+ t.write(cid, hoid, 0xb000, bl.length(), bl);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+ ObjectStore::Transaction t;
+ bufferlist bl, orig;
+ string s(0x10000, '1');
+ bl.append(s);
+ t.write(cid, hoid, 0x16000, bl.length(), bl);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+ ObjectStore::Transaction t;
+ bufferlist bl, orig;
+ string s(0x4000, '1');
+ bl.append(s);
+ t.write(cid, hoid, 0x1b000, bl.length(), bl);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ bufferlist bl;
+ r = store->read(ch, hoid, 0xb000, 0xb000, bl);
+ ASSERT_EQ(r, 0xb000);
+
+ store->umount();
+ store->mount();
+
+ ch = store->open_collection(cid);
+ {
+ ObjectStore::Transaction t;
+ bufferlist bl, orig;
+ string s(0xf000, '3');
+ bl.append(s);
+ t.write(cid, hoid, 0xf000, bl.length(), bl);
+ cerr << "write4" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+
+ r = store->read(ch, hoid, 0xb000, 0x10000, bl);
+ ASSERT_EQ(r, 0x10000);
+}
+
+TEST_P(StoreTestOmapUpgrade, WithOmapHeader) {
+ if (string(GetParam()) != "bluestore")
+ return;
+
+ SetVal(g_conf(), "bluestore_debug_legacy_omap", "true");
+ g_conf().apply_changes(nullptr);
+
+ StartDeferred();
+ int64_t poolid = 11;
+ coll_t cid(spg_t(pg_t(1, poolid), shard_id_t::NO_SHARD));
+ ghobject_t hoid(hobject_t("tesomap", "", CEPH_NOSNAP, 0, poolid, ""));
+ auto ch = store->create_new_collection(cid);
+ int r;
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+
+ map<string, bufferlist> attrs;
+ bufferlist expected_header;
+ expected_header.append("this is a header");
+ {
+ ObjectStore::Transaction t;
+ t.touch(cid, hoid);
+ bufferlist header;
+ header.append(expected_header);
+ t.omap_setheader(cid, hoid, header);
+ map<string, bufferlist> start_set;
+ bufferlist bl;
+ bl.append(string("value"));
+ start_set.emplace(string("key1"), bl);
+ t.omap_setkeys(cid, hoid, start_set);
+ r = queue_transaction(store, ch, std::move(t));
+ }
+ {
+ map<string,bufferlist> res;
+ bufferlist h;
+ r = store->omap_get(ch, hoid, &h, &res);
+ ASSERT_EQ(r, 0);
+ ASSERT_TRUE(bl_eq(h, expected_header));
+ ASSERT_EQ(res.size(), 1);
+ ASSERT_EQ(res.begin()->first, "key1");
+ }
+ store->umount();
+ ASSERT_EQ(store->fsck(false), 0);
+ SetVal(g_conf(), "bluestore_debug_legacy_omap", "false");
+ SetVal(g_conf(), "bluestore_fsck_error_on_no_per_pool_omap", "true");
+ g_conf().apply_changes(nullptr);
+ ASSERT_EQ(store->fsck(false), 2);
+ ASSERT_EQ(store->quick_fix(), 0);
+ store->mount();
+ ch = store->open_collection(cid);
+ {
+ map<string,bufferlist> res;
+ bufferlist h;
+ r = store->omap_get(ch, hoid, &h, &res);
+ ASSERT_EQ(r, 0);
+ ASSERT_EQ(res.size(), 1);
+ ASSERT_EQ(res.begin()->first, "key1");
+ }
+ {
+ ObjectStore::Transaction t;
+ t.remove(cid, hoid);
+ t.remove_collection(cid);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+}
+
+TEST_P(StoreTestSpecificAUSize, BluefsWriteInSingleDiskEnvTest) {
+ if (string(GetParam()) != "bluestore")
+ return;
+
+ g_conf().apply_changes(nullptr);
+
+ StartDeferred(0x1000);
+
+ BlueStore* bstore = dynamic_cast<BlueStore*> (store.get());
+ ceph_assert(bstore);
+ bstore->inject_bluefs_file("db.slow", "store_test_injection_slow", 1 << 20ul);
+ bstore->inject_bluefs_file("db.wal", "store_test_injection_wal", 1 << 20ul);
+ bstore->inject_bluefs_file("db", "store_test_injection_wal", 1 << 20ul);
+
+ AdminSocket* admin_socket = g_ceph_context->get_admin_socket();
+ ceph_assert(admin_socket);
+
+ ceph::bufferlist in, out;
+ ostringstream err;
+ auto r = admin_socket->execute_command(
+ { "{\"prefix\": \"bluefs stats\"}" },
+ in, err, &out);
+ if (r != 0) {
+ cerr << "failure querying: " << cpp_strerror(r) << std::endl;
+ } else {
+ std::cout << std::string(out.c_str(), out.length()) << std::endl;
+ }
+}
+
+TEST_P(StoreTestSpecificAUSize, BluefsWriteInNoWalDiskEnvTest) {
+ if (string(GetParam()) != "bluestore")
+ return;
+
+ SetVal(g_conf(), "bluestore_block_db_path", "db");
+ SetVal(g_conf(), "bluestore_block_db_size", stringify(1ull << 31).c_str());
+ SetVal(g_conf(), "bluestore_block_db_create", "true");
+
+ g_conf().apply_changes(nullptr);
+
+ StartDeferred(0x1000);
+
+ BlueStore* bstore = dynamic_cast<BlueStore*> (store.get());
+ ceph_assert(bstore);
+ bstore->inject_bluefs_file("db.slow", "store_test_injection_slow", 1 << 20ul);
+ bstore->inject_bluefs_file("db.wal", "store_test_injection_wal", 1 << 20ul);
+ bstore->inject_bluefs_file("db", "store_test_injection_wal", 1 << 20ul);
+
+ AdminSocket* admin_socket = g_ceph_context->get_admin_socket();
+ ceph_assert(admin_socket);
+
+ ceph::bufferlist in, out;
+ ostringstream err;
+ auto r = admin_socket->execute_command(
+ { "{\"prefix\": \"bluefs stats\"}" },
+ in, err, &out);
+ if (r != 0) {
+ cerr << "failure querying: " << cpp_strerror(r) << std::endl;
+ }
+ else {
+ std::cout << std::string(out.c_str(), out.length()) << std::endl;
+ }
+}
+
+TEST_P(StoreTestOmapUpgrade, NoOmapHeader) {
+ if (string(GetParam()) != "bluestore")
+ return;
+
+ SetVal(g_conf(), "bluestore_debug_legacy_omap", "true");
+ g_conf().apply_changes(nullptr);
+
+ StartDeferred();
+ int64_t poolid = 11;
+ coll_t cid(spg_t(pg_t(1, poolid), shard_id_t::NO_SHARD));
+ ghobject_t hoid(hobject_t("tesomap", "", CEPH_NOSNAP, 0, poolid, ""));
+ auto ch = store->create_new_collection(cid);
+ int r;
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+
+ map<string, bufferlist> attrs;
+ {
+ ObjectStore::Transaction t;
+ t.touch(cid, hoid);
+ map<string, bufferlist> start_set;
+ bufferlist bl;
+ bl.append(string("value"));
+ start_set.emplace(string("key1"), bl);
+ t.omap_setkeys(cid, hoid, start_set);
+ r = queue_transaction(store, ch, std::move(t));
+ }
+ {
+ map<string,bufferlist> res;
+ bufferlist h;
+ r = store->omap_get(ch, hoid, &h, &res);
+ ASSERT_EQ(r, 0);
+ ASSERT_EQ(h.length(), 0);
+ ASSERT_EQ(res.size(), 1);
+ ASSERT_EQ(res.begin()->first, "key1");
+ }
+ store->umount();
+ ASSERT_EQ(store->fsck(false), 0);
+ SetVal(g_conf(), "bluestore_debug_legacy_omap", "false");
+ SetVal(g_conf(), "bluestore_fsck_error_on_no_per_pool_omap", "true");
+ g_conf().apply_changes(nullptr);
+ ASSERT_EQ(store->fsck(false), 2);
+ ASSERT_EQ(store->quick_fix(), 0);
+ store->mount();
+ ch = store->open_collection(cid);
+ {
+ map<string,bufferlist> res;
+ bufferlist h;
+ r = store->omap_get(ch, hoid, &h, &res);
+ ASSERT_EQ(r, 0);
+ ASSERT_EQ(res.size(), 1);
+ ASSERT_EQ(res.begin()->first, "key1");
+ }
+ {
+ ObjectStore::Transaction t;
+ t.remove(cid, hoid);
+ t.remove_collection(cid);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+}
+
+TEST_P(StoreTestOmapUpgrade, LargeLegacyToPG) {
+ if (string(GetParam()) != "bluestore")
+ return;
+
+ SetVal(g_conf(), "bluestore_debug_legacy_omap", "true");
+ g_conf().apply_changes(nullptr);
+
+ int64_t poolid;
+ coll_t cid;
+ ghobject_t hoid;
+ ObjectStore::CollectionHandle ch;
+ StartDeferred();
+ poolid = 11;
+ cid = coll_t(spg_t(pg_t(1, poolid), shard_id_t::NO_SHARD));
+ ch = store->create_new_collection(cid);
+ int r;
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ //ASSERT_EQ(false, g_conf().get_val<bool>("bluestore_debug_inject_upgrade_bug53062"));
+ map<string, bufferlist> attrs;
+ bufferlist expected_header;
+ expected_header.append("this is a header");
+
+ size_t object_count = 1000;
+ make_omap_data(object_count, poolid, cid);
+  // check the just-written data
+ check_omap_data(object_count, poolid, cid);
+
+ store->umount();
+ ASSERT_EQ(store->fsck(false), 0);
+ SetVal(g_conf(), "bluestore_debug_legacy_omap", "false");
+ SetVal(g_conf(), "bluestore_fsck_error_on_no_per_pool_omap", "true");
+ g_conf().apply_changes(nullptr);
+ ASSERT_EQ(store->fsck(false), 1001);
+ ASSERT_EQ(store->quick_fix(), 0);
+ store->mount();
+ ch = store->open_collection(cid);
+
+  // check the data after quick_fix()
+ check_omap_data(object_count, poolid, cid);
+
+ {
+ ObjectStore::Transaction t;
+ for (size_t o = 0; o < object_count; o++)
+ {
+ std::string oid = generate_monotonic_name(object_count, o, 3.71, 0.5);
+ ghobject_t hoid(hobject_t(oid, "", CEPH_NOSNAP, 0, poolid, ""));
+ t.remove(cid, hoid);
+ }
+ t.remove_collection(cid);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+}
+
+#endif // WITH_BLUESTORE
+
+int main(int argc, char **argv) {
+ auto args = argv_to_vec(argc, argv);
+ auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT,
+ CODE_ENVIRONMENT_UTILITY,
+ CINIT_FLAG_NO_DEFAULT_CONFIG_FILE);
+ common_init_finish(g_ceph_context);
+
+ for (auto& i : args) {
+ if (i == "--smr"s) {
+#if defined(HAVE_LIBZBD)
+ derr << "Adjusting tests for smr mode." << dendl;
+ smr = true;
+#else
+ derr << "smr mode selected, but support not compiled in" << dendl;
+ return 1;
+#endif
+ }
+ }
+
+ // make sure we can adjust any config settings
+ g_ceph_context->_conf._clear_safe_to_start_threads();
+
+ g_ceph_context->_conf.set_val_or_die("osd_journal_size", "400");
+ g_ceph_context->_conf.set_val_or_die("filestore_index_retry_probability", "0.5");
+ g_ceph_context->_conf.set_val_or_die("filestore_op_thread_timeout", "1000");
+ g_ceph_context->_conf.set_val_or_die("filestore_op_thread_suicide_timeout", "10000");
+ //g_ceph_context->_conf.set_val_or_die("filestore_fiemap", "true");
+ g_ceph_context->_conf.set_val_or_die("bluestore_fsck_on_mkfs", "false");
+ g_ceph_context->_conf.set_val_or_die("bluestore_fsck_on_mount", "false");
+ g_ceph_context->_conf.set_val_or_die("bluestore_fsck_on_umount", "false");
+ g_ceph_context->_conf.set_val_or_die("bluestore_debug_small_allocations", "4");
+ g_ceph_context->_conf.set_val_or_die("bluestore_debug_freelist", "true");
+ g_ceph_context->_conf.set_val_or_die("bluestore_clone_cow", "true");
+ g_ceph_context->_conf.set_val_or_die("bluestore_max_alloc_size", "196608");
+ // set small cache sizes so we see trimming during Synthetic tests
+ g_ceph_context->_conf.set_val_or_die("bluestore_cache_size_hdd", "4000000");
+ g_ceph_context->_conf.set_val_or_die("bluestore_cache_size_ssd", "4000000");
+ g_ceph_context->_conf.set_val_or_die(
+ "bluestore_debug_inject_allocation_from_file_failure", "0.66");
+
+ // very short *_max prealloc so that we fall back to async submits
+ g_ceph_context->_conf.set_val_or_die("bluestore_blobid_prealloc", "10");
+ g_ceph_context->_conf.set_val_or_die("bluestore_nid_prealloc", "10");
+ g_ceph_context->_conf.set_val_or_die("bluestore_debug_randomize_serial_transaction",
+ "10");
+
+ g_ceph_context->_conf.set_val_or_die("bdev_debug_aio", "true");
+
+ // specify device size
+ g_ceph_context->_conf.set_val_or_die("bluestore_block_size",
+ stringify(DEF_STORE_TEST_BLOCKDEV_SIZE));
+
+ g_ceph_context->_conf.set_val_or_die(
+ "enable_experimental_unrecoverable_data_corrupting_features", "*");
+ g_ceph_context->_conf.apply_changes(nullptr);
+
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
+
+/*
+ * Local Variables:
+ * compile-command: "cd ../.. ; make ceph_test_objectstore &&
+ * ./ceph_test_objectstore \
+ * --gtest_filter=*.collect_metadata* --log-to-stderr=true --debug-filestore=20
+ * "
+ * End:
+ */
diff --git a/src/test/objectstore/store_test_fixture.cc b/src/test/objectstore/store_test_fixture.cc
new file mode 100644
index 000000000..a3bdc7a36
--- /dev/null
+++ b/src/test/objectstore/store_test_fixture.cc
@@ -0,0 +1,135 @@
+#include <stdlib.h>
+#include <string>
+#include <iostream>
+#include <assert.h>
+#include <gtest/gtest.h>
+
+#include "common/errno.h"
+#include "common/config.h"
+#include "os/ObjectStore.h"
+
+#if defined(WITH_BLUESTORE)
+#include "os/bluestore/BlueStore.h"
+#endif
+#include "store_test_fixture.h"
+
+using namespace std;
+
+static void rm_r(const string& path)
+{
+ string cmd = string("rm -r ") + path;
+ cout << "==> " << cmd << std::endl;
+ int r = ::system(cmd.c_str());
+ if (r) {
+ if (r == -1) {
+ r = errno;
+ cerr << "system() failed to fork() " << cpp_strerror(r)
+ << ", continuing anyway" << std::endl;
+ } else {
+ cerr << "failed with exit code " << r
+ << ", continuing anyway" << std::endl;
+ }
+ }
+}
+
+void StoreTestFixture::SetUp()
+{
+
+ int r = ::mkdir(data_dir.c_str(), 0777);
+ if (r < 0) {
+ r = -errno;
+ cerr << __func__ << ": unable to create " << data_dir << ": " << cpp_strerror(r) << std::endl;
+ }
+ ASSERT_EQ(0, r);
+
+ store = ObjectStore::create(g_ceph_context,
+ type,
+ data_dir,
+ "store_test_temp_journal");
+ if (!store) {
+ cerr << __func__ << ": objectstore type " << type << " doesn't exist yet!" << std::endl;
+ }
+ ASSERT_TRUE(store);
+#if defined(WITH_BLUESTORE)
+ if (type == "bluestore") {
+ BlueStore *s = static_cast<BlueStore*>(store.get());
+ // better test coverage!
+ s->set_cache_shards(5);
+ }
+#endif
+ ASSERT_EQ(0, store->mkfs());
+ ASSERT_EQ(0, store->mount());
+
+  // we keep threads 'unsafe' outside of test case scope to be able to update
+  // ANY config setting. Hence marking them 'safe' here to proceed with the
+  // test case.
+ g_conf().set_safe_to_start_threads();
+}
+
+void StoreTestFixture::TearDown()
+{
+ if (store) {
+ int r = store->umount();
+ EXPECT_EQ(0, r);
+ rm_r(data_dir);
+ }
+  // we keep threads 'unsafe' outside of test case scope to be able to update
+  // ANY config setting. Hence marking them 'unsafe' again as the test case
+  // closes.
+ g_conf()._clear_safe_to_start_threads();
+ PopSettings(0);
+ if (!orig_death_test_style.empty()) {
+ ::testing::FLAGS_gtest_death_test_style = orig_death_test_style;
+ orig_death_test_style.clear();
+ }
+}
+
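+// SetVal() records the previous value of each overridden setting on a stack
+// so that PopSettings() can later restore the configuration: everything
+// pushed after position 'pos' is rolled back in LIFO order.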
+void StoreTestFixture::SetVal(ConfigProxy& _conf, const char* key, const char* val)
+{
+ ceph_assert(!conf || conf == &_conf);
+ conf = &_conf;
+ std::string skey(key);
+ std::string prev_val;
+ conf->get_val(skey, &prev_val);
+ conf->set_val_or_die(key, val);
+ saved_settings.emplace(skey, prev_val);
+}
+
+void StoreTestFixture::PopSettings(size_t pos)
+{
+ if (conf) {
+    ceph_assert(pos <= saved_settings.size()); // for sanity
+    while (pos < saved_settings.size()) {
+ auto& e = saved_settings.top();
+ conf->set_val_or_die(e.first, e.second);
+ saved_settings.pop();
+ }
+ conf->apply_changes(NULL);
+ }
+}
+
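+// Simulates a restart: unmounts and destroys the store instance, then
+// re-creates and re-mounts it on the same data directory.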
+void StoreTestFixture::CloseAndReopen() {
+ ceph_assert(store != nullptr);
+ g_conf()._clear_safe_to_start_threads();
+ int r = store->umount();
+ EXPECT_EQ(0, r);
+ ch.reset(nullptr);
+ store.reset(nullptr);
+ store = ObjectStore::create(g_ceph_context,
+ type,
+ data_dir,
+ "store_test_temp_journal");
+ if (!store) {
+ cerr << __func__ << ": objectstore type " << type << " failed to reopen!" << std::endl;
+ }
+ ASSERT_TRUE(store);
+#if defined(WITH_BLUESTORE)
+ if (type == "bluestore") {
+ BlueStore *s = static_cast<BlueStore*>(store.get());
+ // better test coverage!
+ s->set_cache_shards(5);
+ }
+#endif
+ ASSERT_EQ(0, store->mount());
+ g_conf().set_safe_to_start_threads();
+}
diff --git a/src/test/objectstore/store_test_fixture.h b/src/test/objectstore/store_test_fixture.h
new file mode 100644
index 000000000..3f25fd493
--- /dev/null
+++ b/src/test/objectstore/store_test_fixture.h
@@ -0,0 +1,52 @@
+#pragma once
+
+#include <string>
+#include <stack>
+#include <memory>
+#include <gtest/gtest.h>
+#include "common/config_fwd.h"
+
+#include "os/ObjectStore.h" // full definition needed for ObjectStore::CollectionHandle
+
+class StoreTestFixture : virtual public ::testing::Test {
+ const std::string type;
+ const std::string data_dir;
+
+ std::stack<std::pair<std::string, std::string>> saved_settings;
+ ConfigProxy* conf = nullptr;
+
+ std::string orig_death_test_style;
+
+public:
+ std::unique_ptr<ObjectStore> store;
+ ObjectStore::CollectionHandle ch;
+
+ explicit StoreTestFixture(const std::string& type)
+ : type(type), data_dir(type + ".test_temp_dir")
+ {}
+
+ void SetUp() override;
+ void TearDown() override;
+ void SetDeathTestStyle(const char* new_style) {
+ if (orig_death_test_style.empty()) {
+ orig_death_test_style = ::testing::FLAGS_gtest_death_test_style;
+ }
+ ::testing::FLAGS_gtest_death_test_style = new_style;
+ }
+
+ void SetVal(ConfigProxy& conf, const char* key, const char* val);
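+  // RAII helper: remembers the current depth of the saved-settings stack and
+  // pops back to it on destruction, undoing any SetVal() calls made after
+  // the bookmark was taken.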
+ struct SettingsBookmark {
+ StoreTestFixture& s;
+ size_t pos;
+
+ SettingsBookmark(StoreTestFixture& _s, size_t p) : s(_s), pos(p)
+ {}
+
+ ~SettingsBookmark() {
+ s.PopSettings(pos);
+ }
+ };
+ SettingsBookmark BookmarkSettings() {
+ return SettingsBookmark(*this, saved_settings.size());
+ }
+ void PopSettings(size_t);
+ void CloseAndReopen();
+};
diff --git a/src/test/objectstore/test_bdev.cc b/src/test/objectstore/test_bdev.cc
new file mode 100755
index 000000000..628b586bc
--- /dev/null
+++ b/src/test/objectstore/test_bdev.cc
@@ -0,0 +1,111 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <stdio.h>
+#include <string.h>
+#include <iostream>
+#include <gtest/gtest.h>
+#include "global/global_init.h"
+#include "global/global_context.h"
+#include "common/ceph_context.h"
+#include "common/ceph_argparse.h"
+#include "include/stringify.h"
+#include "common/errno.h"
+
+#include "blk/BlockDevice.h"
+
+using namespace std;
+
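+// Creates a temporary file of the requested size to stand in for a block
+// device; the file is unlinked when the object goes out of scope.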
+class TempBdev {
+public:
+ TempBdev(uint64_t size)
+ : path{get_temp_bdev(size)}
+ {}
+ ~TempBdev() {
+ rm_temp_bdev(path);
+ }
+ const std::string path;
+private:
+ static string get_temp_bdev(uint64_t size)
+ {
+ static int n = 0;
+ string fn = "ceph_test_bluefs.tmp.block." + stringify(getpid())
+ + "." + stringify(++n);
+ int fd = ::open(fn.c_str(), O_CREAT|O_RDWR|O_TRUNC, 0644);
+ ceph_assert(fd >= 0);
+ int r = ::ftruncate(fd, size);
+ ceph_assert(r >= 0);
+ ::close(fd);
+ return fn;
+ }
+ static void rm_temp_bdev(string f)
+ {
+ ::unlink(f.c_str());
+ }
+};
+
+TEST(KernelDevice, Ticket45337) {
+ // Large (>=2 GB) writes are incomplete when bluefs_buffered_io = true
+
+ uint64_t size = 1048576ull * 8192;
+ TempBdev bdev{ size };
+
+ const bool buffered = true;
+
+ std::unique_ptr<BlockDevice> b(
+ BlockDevice::create(g_ceph_context, bdev.path, NULL, NULL,
+ [](void* handle, void* aio) {}, NULL));
+ bufferlist bl;
+ // writing a bit less than 4GB
+ for (auto i = 0; i < 4000; i++) {
+ string s(1048576, 'a' + (i % 28));
+ bl.append(s);
+ }
+ uint64_t magic_offs = bl.length();
+ string s(4086, 'z');
+ s += "0123456789";
+ bl.append(s);
+
+ {
+ int r = b->open(bdev.path);
+ if (r < 0) {
+ std::cerr << "open " << bdev.path << " failed" << std::endl;
+ return;
+ }
+ }
+ std::unique_ptr<IOContext> ioc(new IOContext(g_ceph_context, NULL));
+
+ auto r = b->aio_write(0, bl, ioc.get(), buffered);
+ ASSERT_EQ(r, 0);
+
+ if (ioc->has_pending_aios()) {
+ b->aio_submit(ioc.get());
+ ioc->aio_wait();
+ }
+
+ char outbuf[0x1000];
+ r = b->read_random(magic_offs, sizeof(outbuf), outbuf, buffered);
+ ASSERT_EQ(r, 0);
+ ASSERT_EQ(memcmp(s.c_str(), outbuf, sizeof(outbuf)), 0);
+
+ b->close();
+}
+
+int main(int argc, char **argv) {
+ auto args = argv_to_vec(argc, argv);
+ map<string,string> defaults = {
+ { "debug_bdev", "1/20" }
+ };
+
+ auto cct = global_init(&defaults, args, CEPH_ENTITY_TYPE_CLIENT,
+ CODE_ENVIRONMENT_UTILITY,
+ CINIT_FLAG_NO_DEFAULT_CONFIG_FILE);
+ common_init_finish(g_ceph_context);
+ g_ceph_context->_conf.set_val(
+ "enable_experimental_unrecoverable_data_corrupting_features",
+ "*");
+ g_ceph_context->_conf.apply_changes(nullptr);
+
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/test/objectstore/test_bluefs.cc b/src/test/objectstore/test_bluefs.cc
new file mode 100644
index 000000000..4f77d8597
--- /dev/null
+++ b/src/test/objectstore/test_bluefs.cc
@@ -0,0 +1,1422 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <stdio.h>
+#include <string.h>
+#include <iostream>
+#include <time.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <random>
+#include <thread>
+#include <stack>
+#include <atomic>
+#include <gtest/gtest.h>
+#include "global/global_init.h"
+#include "common/ceph_argparse.h"
+#include "include/stringify.h"
+#include "include/scope_guard.h"
+#include "common/errno.h"
+
+#include "os/bluestore/Allocator.h"
+#include "os/bluestore/BlueFS.h"
+
+using namespace std;
+
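+// Fill a heap buffer with pseudo-random bytes; the engine is
+// default-seeded, so the generated contents are reproducible across runs.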
+std::unique_ptr<char[]> gen_buffer(uint64_t size)
+{
+ std::unique_ptr<char[]> buffer = std::make_unique<char[]>(size);
+ std::independent_bits_engine<std::default_random_engine, CHAR_BIT, unsigned char> e;
+ std::generate(buffer.get(), buffer.get()+size, std::ref(e));
+ return buffer;
+}
+
+class TempBdev {
+public:
+ TempBdev(uint64_t size)
+ : path{get_temp_bdev(size)}
+ {}
+ ~TempBdev() {
+ rm_temp_bdev(path);
+ }
+ const std::string path;
+private:
+ static string get_temp_bdev(uint64_t size)
+ {
+ static int n = 0;
+ string fn = "ceph_test_bluefs.tmp.block." + stringify(getpid())
+ + "." + stringify(++n);
+ int fd = ::open(fn.c_str(), O_CREAT|O_RDWR|O_TRUNC, 0644);
+ ceph_assert(fd >= 0);
+ int r = ::ftruncate(fd, size);
+ ceph_assert(r >= 0);
+ ::close(fd);
+ return fn;
+ }
+ static void rm_temp_bdev(string f)
+ {
+ ::unlink(f.c_str());
+ }
+};
+
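+// Scoped configuration override helper: each SetVal() stashes the previous
+// value on a stack, and the destructor restores all of them in LIFO order
+// before re-enabling thread startup.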
+class ConfSaver {
+ std::stack<std::pair<std::string, std::string>> saved_settings;
+ ConfigProxy& conf;
+public:
+ ConfSaver(ConfigProxy& conf) : conf(conf) {
+ conf._clear_safe_to_start_threads();
+ };
+ ~ConfSaver() {
+ conf._clear_safe_to_start_threads();
+    while (!saved_settings.empty()) {
+ auto& e = saved_settings.top();
+ conf.set_val_or_die(e.first, e.second);
+ saved_settings.pop();
+ }
+ conf.set_safe_to_start_threads();
+ conf.apply_changes(nullptr);
+ }
+ void SetVal(const char* key, const char* val) {
+ std::string skey(key);
+ std::string prev_val;
+ conf.get_val(skey, &prev_val);
+ conf.set_val_or_die(skey, val);
+ saved_settings.emplace(skey, prev_val);
+ }
+ void ApplyChanges() {
+ conf.set_safe_to_start_threads();
+ conf.apply_changes(nullptr);
+ }
+};
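+
+// Typical usage (a minimal sketch; settings revert automatically when the
+// saver leaves scope):
+//
+//   {
+//     ConfSaver conf(g_ceph_context->_conf);
+//     conf.SetVal("bluefs_alloc_size", "4096");
+//     conf.ApplyChanges();
+//     // ... exercise BlueFS ...
+//   } // previous values restored here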
+
+TEST(BlueFS, mkfs) {
+ uint64_t size = 1048576 * 128;
+ TempBdev bdev{size};
+ uuid_d fsid;
+ BlueFS fs(g_ceph_context);
+ ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false, 1048576));
+ ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false }));
+}
+
+TEST(BlueFS, mkfs_mount) {
+ uint64_t size = 1048576 * 128;
+ TempBdev bdev{size};
+ BlueFS fs(g_ceph_context);
+ ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false, 1048576));
+ uuid_d fsid;
+ ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false }));
+ ASSERT_EQ(0, fs.mount());
+ ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, false, false }));
+ ASSERT_EQ(fs.get_total(BlueFS::BDEV_DB), size - 1048576);
+ ASSERT_LT(fs.get_free(BlueFS::BDEV_DB), size - 1048576);
+ fs.umount();
+}
+
+TEST(BlueFS, write_read) {
+ uint64_t size = 1048576 * 128;
+ TempBdev bdev{size};
+ BlueFS fs(g_ceph_context);
+ ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false, 1048576));
+ uuid_d fsid;
+ ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false }));
+ ASSERT_EQ(0, fs.mount());
+ ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, false, false }));
+ {
+ BlueFS::FileWriter *h;
+ ASSERT_EQ(0, fs.mkdir("dir"));
+ ASSERT_EQ(0, fs.open_for_write("dir", "file", &h, false));
+ h->append("foo", 3);
+ h->append("bar", 3);
+ h->append("baz", 3);
+ fs.fsync(h);
+ fs.close_writer(h);
+ }
+ {
+ BlueFS::FileReader *h;
+ ASSERT_EQ(0, fs.open_for_read("dir", "file", &h));
+ bufferlist bl;
+ ASSERT_EQ(9, fs.read(h, 0, 1024, &bl, NULL));
+ ASSERT_EQ(0, strncmp("foobarbaz", bl.c_str(), 9));
+ delete h;
+ }
+ fs.umount();
+}
+
+TEST(BlueFS, small_appends) {
+ uint64_t size = 1048576 * 128;
+ TempBdev bdev{size};
+ BlueFS fs(g_ceph_context);
+ ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false, 1048576));
+ uuid_d fsid;
+ ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false }));
+ ASSERT_EQ(0, fs.mount());
+ ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, false, false }));
+ {
+ BlueFS::FileWriter *h;
+ ASSERT_EQ(0, fs.mkdir("dir"));
+ ASSERT_EQ(0, fs.open_for_write("dir", "file", &h, false));
+ for (unsigned i = 0; i < 10000; ++i) {
+ h->append("abcdeabcdeabcdeabcdeabcdeabc", 23);
+ }
+ fs.fsync(h);
+ fs.close_writer(h);
+ }
+ {
+ BlueFS::FileWriter *h;
+ ASSERT_EQ(0, fs.open_for_write("dir", "file_sync", &h, false));
+ for (unsigned i = 0; i < 1000; ++i) {
+ h->append("abcdeabcdeabcdeabcdeabcdeabc", 23);
+ ASSERT_EQ(0, fs.fsync(h));
+ }
+ fs.close_writer(h);
+ }
+ fs.umount();
+}
+
+TEST(BlueFS, very_large_write) {
+ // we'll write a ~5G file, so allocate more than that for the whole fs
+ uint64_t size = 1048576 * 1024 * 6ull;
+ TempBdev bdev{size};
+ BlueFS fs(g_ceph_context);
+
+ bool old = g_ceph_context->_conf.get_val<bool>("bluefs_buffered_io");
+ g_ceph_context->_conf.set_val("bluefs_buffered_io", "false");
+ uint64_t total_written = 0;
+
+ ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false, 1048576));
+ uuid_d fsid;
+ ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false }));
+ ASSERT_EQ(0, fs.mount());
+ ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, false, false }));
+ char buf[1048571]; // this is biggish, but intentionally not evenly aligned
+ for (unsigned i = 0; i < sizeof(buf); ++i) {
+ buf[i] = i;
+ }
+ {
+ BlueFS::FileWriter *h;
+ ASSERT_EQ(0, fs.mkdir("dir"));
+ ASSERT_EQ(0, fs.open_for_write("dir", "bigfile", &h, false));
+ for (unsigned i = 0; i < 3*1024*1048576ull / sizeof(buf); ++i) {
+ h->append(buf, sizeof(buf));
+ total_written += sizeof(buf);
+ }
+ fs.fsync(h);
+ for (unsigned i = 0; i < 2*1024*1048576ull / sizeof(buf); ++i) {
+ h->append(buf, sizeof(buf));
+ total_written += sizeof(buf);
+ }
+ fs.fsync(h);
+ fs.close_writer(h);
+ }
+ {
+ BlueFS::FileReader *h;
+ ASSERT_EQ(0, fs.open_for_read("dir", "bigfile", &h));
+ bufferlist bl;
+ ASSERT_EQ(h->file->fnode.size, total_written);
+ for (unsigned i = 0; i < 3*1024*1048576ull / sizeof(buf); ++i) {
+ bl.clear();
+ fs.read(h, i * sizeof(buf), sizeof(buf), &bl, NULL);
+ int r = memcmp(buf, bl.c_str(), sizeof(buf));
+ if (r) {
+ cerr << "read got mismatch at offset " << i*sizeof(buf) << " r " << r
+ << std::endl;
+ }
+ ASSERT_EQ(0, r);
+ }
+ for (unsigned i = 0; i < 2*1024*1048576ull / sizeof(buf); ++i) {
+ bl.clear();
+ fs.read(h, i * sizeof(buf), sizeof(buf), &bl, NULL);
+ int r = memcmp(buf, bl.c_str(), sizeof(buf));
+ if (r) {
+ cerr << "read got mismatch at offset " << i*sizeof(buf) << " r " << r
+ << std::endl;
+ }
+ ASSERT_EQ(0, r);
+ }
+ delete h;
+ ASSERT_EQ(0, fs.open_for_read("dir", "bigfile", &h));
+ ASSERT_EQ(h->file->fnode.size, total_written);
+ auto huge_buf = std::make_unique<char[]>(h->file->fnode.size);
+ auto l = h->file->fnode.size;
+ int64_t r = fs.read(h, 0, l, NULL, huge_buf.get());
+ ASSERT_EQ(r, l);
+ delete h;
+ }
+ fs.umount();
+
+ g_ceph_context->_conf.set_val("bluefs_buffered_io", stringify((int)old));
+}
+
+TEST(BlueFS, very_large_write2) {
+ // we'll write a ~5G file, so allocate more than that for the whole fs
+ uint64_t size_full = 1048576 * 1024 * 6ull;
+ uint64_t size = 1048576 * 1024 * 5ull;
+ TempBdev bdev{ size_full };
+ BlueFS fs(g_ceph_context);
+
+ bool old = g_ceph_context->_conf.get_val<bool>("bluefs_buffered_io");
+ g_ceph_context->_conf.set_val("bluefs_buffered_io", "false");
+ uint64_t total_written = 0;
+
+ ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false, 1048576));
+ uuid_d fsid;
+ ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false }));
+ ASSERT_EQ(0, fs.mount());
+ ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, false, false }));
+
+ char fill_arr[1 << 20]; // 1M
+ for (size_t i = 0; i < sizeof(fill_arr); ++i) {
+ fill_arr[i] = (char)i;
+ }
+ std::unique_ptr<char[]> buf;
+ buf.reset(new char[size]);
+ for (size_t i = 0; i < size; i += sizeof(fill_arr)) {
+ memcpy(buf.get() + i, fill_arr, sizeof(fill_arr));
+ }
+ {
+ BlueFS::FileWriter* h;
+ ASSERT_EQ(0, fs.mkdir("dir"));
+ ASSERT_EQ(0, fs.open_for_write("dir", "bigfile", &h, false));
+ fs.append_try_flush(h, buf.get(), size);
+ total_written = size;
+ fs.fsync(h);
+ fs.close_writer(h);
+ }
+ memset(buf.get(), 0, size);
+ {
+ BlueFS::FileReader* h;
+ ASSERT_EQ(0, fs.open_for_read("dir", "bigfile", &h));
+ ASSERT_EQ(h->file->fnode.size, total_written);
+ auto l = h->file->fnode.size;
+ int64_t r = fs.read(h, 0, l, NULL, buf.get());
+ ASSERT_EQ(r, l);
+ for (size_t i = 0; i < size; i += sizeof(fill_arr)) {
+ ceph_assert(memcmp(buf.get() + i, fill_arr, sizeof(fill_arr)) == 0);
+ }
+ delete h;
+ }
+ fs.umount();
+
+ g_ceph_context->_conf.set_val("bluefs_buffered_io", stringify((int)old));
+}
+
+#define ALLOC_SIZE 4096
+
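+// Worker routine: creates a per-thread directory, then keeps writing
+// ALLOC_SIZE-byte files until the rationed byte budget is nearly exhausted
+// or fsync() fails (e.g. because the filesystem ran out of space).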
+void write_data(BlueFS &fs, uint64_t rationed_bytes)
+{
+ int j=0, r=0;
+ uint64_t written_bytes = 0;
+ rationed_bytes -= ALLOC_SIZE;
+ stringstream ss;
+ string dir = "dir.";
+ ss << std::this_thread::get_id();
+ dir.append(ss.str());
+ dir.append(".");
+ dir.append(to_string(j));
+ ASSERT_EQ(0, fs.mkdir(dir));
+ while (1) {
+ string file = "file.";
+ file.append(to_string(j));
+ BlueFS::FileWriter *h;
+ ASSERT_EQ(0, fs.open_for_write(dir, file, &h, false));
+ ASSERT_NE(nullptr, h);
+ auto sg = make_scope_guard([&fs, h] { fs.close_writer(h); });
+ bufferlist bl;
+ std::unique_ptr<char[]> buf = gen_buffer(ALLOC_SIZE);
+ bufferptr bp = buffer::claim_char(ALLOC_SIZE, buf.get());
+ bl.push_back(bp);
+ h->append(bl.c_str(), bl.length());
+ r = fs.fsync(h);
+ if (r < 0) {
+ break;
+ }
+ written_bytes += g_conf()->bluefs_alloc_size;
+ j++;
+ if ((rationed_bytes - written_bytes) <= g_conf()->bluefs_alloc_size) {
+ break;
+ }
+ }
+}
+
+void create_single_file(BlueFS &fs)
+{
+ BlueFS::FileWriter *h;
+ stringstream ss;
+ string dir = "dir.test";
+ ASSERT_EQ(0, fs.mkdir(dir));
+ string file = "testfile";
+ ASSERT_EQ(0, fs.open_for_write(dir, file, &h, false));
+ bufferlist bl;
+ std::unique_ptr<char[]> buf = gen_buffer(ALLOC_SIZE);
+ bufferptr bp = buffer::claim_char(ALLOC_SIZE, buf.get());
+ bl.push_back(bp);
+ h->append(bl.c_str(), bl.length());
+ fs.fsync(h);
+ fs.close_writer(h);
+}
+
+void write_single_file(BlueFS &fs, uint64_t rationed_bytes)
+{
+ stringstream ss;
+ const string dir = "dir.test";
+ const string file = "testfile";
+ uint64_t written_bytes = 0;
+ rationed_bytes -= ALLOC_SIZE;
+ while (1) {
+ BlueFS::FileWriter *h;
+ ASSERT_EQ(0, fs.open_for_write(dir, file, &h, false));
+ ASSERT_NE(nullptr, h);
+ auto sg = make_scope_guard([&fs, h] { fs.close_writer(h); });
+ bufferlist bl;
+ std::unique_ptr<char[]> buf = gen_buffer(ALLOC_SIZE);
+ bufferptr bp = buffer::claim_char(ALLOC_SIZE, buf.get());
+ bl.push_back(bp);
+ h->append(bl.c_str(), bl.length());
+ int r = fs.fsync(h);
+ if (r < 0) {
+ break;
+ }
+ written_bytes += g_conf()->bluefs_alloc_size;
+ if ((rationed_bytes - written_bytes) <= g_conf()->bluefs_alloc_size) {
+ break;
+ }
+ }
+}
+
+// Set by the main thread once all writers have finished; polled by the
+// sync_fs() threads, hence atomic.
+std::atomic<bool> writes_done{false};
+
+void sync_fs(BlueFS &fs)
+{
+ while (1) {
+    if (writes_done)
+      break;
+ fs.sync_metadata(false);
+ sleep(1);
+ }
+}
+
+
+void do_join(std::thread& t)
+{
+ t.join();
+}
+
+void join_all(std::vector<std::thread>& v)
+{
+ std::for_each(v.begin(),v.end(),do_join);
+}
+
+#define NUM_WRITERS 3
+#define NUM_SYNC_THREADS 1
+
+#define NUM_SINGLE_FILE_WRITERS 1
+#define NUM_MULTIPLE_FILE_WRITERS 2
+
+TEST(BlueFS, test_flush_1) {
+ uint64_t size = 1048576 * 128;
+ TempBdev bdev{size};
+ g_ceph_context->_conf.set_val(
+ "bluefs_alloc_size",
+ "65536");
+ g_ceph_context->_conf.apply_changes(nullptr);
+
+ BlueFS fs(g_ceph_context);
+ ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false, 1048576));
+ uuid_d fsid;
+ ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false }));
+ ASSERT_EQ(0, fs.mount());
+ ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, false, false }));
+ {
+ std::vector<std::thread> write_thread_multiple;
+ uint64_t effective_size = size - (32 * 1048576); // leaving the last 32 MB for log compaction
+ uint64_t per_thread_bytes = (effective_size/(NUM_MULTIPLE_FILE_WRITERS + NUM_SINGLE_FILE_WRITERS));
+ for (int i=0; i<NUM_MULTIPLE_FILE_WRITERS ; i++) {
+ write_thread_multiple.push_back(std::thread(write_data, std::ref(fs), per_thread_bytes));
+ }
+
+ create_single_file(fs);
+ std::vector<std::thread> write_thread_single;
+ for (int i=0; i<NUM_SINGLE_FILE_WRITERS; i++) {
+ write_thread_single.push_back(std::thread(write_single_file, std::ref(fs), per_thread_bytes));
+ }
+
+ join_all(write_thread_single);
+ join_all(write_thread_multiple);
+ }
+ fs.umount();
+}
+
+TEST(BlueFS, test_flush_2) {
+ uint64_t size = 1048576 * 256;
+ TempBdev bdev{size};
+ g_ceph_context->_conf.set_val(
+ "bluefs_alloc_size",
+ "65536");
+ g_ceph_context->_conf.apply_changes(nullptr);
+
+ BlueFS fs(g_ceph_context);
+ ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false, 1048576));
+ uuid_d fsid;
+ ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false }));
+ ASSERT_EQ(0, fs.mount());
+ ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, false, false }));
+ {
+    uint64_t effective_size = size - (128 * 1048576); // leaving the last 128 MB for log compaction
+ uint64_t per_thread_bytes = (effective_size/(NUM_WRITERS));
+ std::vector<std::thread> write_thread_multiple;
+ for (int i=0; i<NUM_WRITERS; i++) {
+ write_thread_multiple.push_back(std::thread(write_data, std::ref(fs), per_thread_bytes));
+ }
+
+ join_all(write_thread_multiple);
+ }
+ fs.umount();
+}
+
+TEST(BlueFS, test_flush_3) {
+ uint64_t size = 1048576 * 256;
+ TempBdev bdev{size};
+ g_ceph_context->_conf.set_val(
+ "bluefs_alloc_size",
+ "65536");
+ g_ceph_context->_conf.apply_changes(nullptr);
+
+ BlueFS fs(g_ceph_context);
+ ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false, 1048576));
+ uuid_d fsid;
+ ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false }));
+ ASSERT_EQ(0, fs.mount());
+ ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, false, false }));
+ {
+ std::vector<std::thread> write_threads;
+    uint64_t effective_size = size - (64 * 1048576); // leaving the last 64 MB for log compaction
+ uint64_t per_thread_bytes = (effective_size/(NUM_WRITERS));
+ for (int i=0; i<NUM_WRITERS; i++) {
+ write_threads.push_back(std::thread(write_data, std::ref(fs), per_thread_bytes));
+ }
+
+ std::vector<std::thread> sync_threads;
+ for (int i=0; i<NUM_SYNC_THREADS; i++) {
+ sync_threads.push_back(std::thread(sync_fs, std::ref(fs)));
+ }
+
+ join_all(write_threads);
+ writes_done = true;
+ join_all(sync_threads);
+ }
+ fs.umount();
+}
+
+TEST(BlueFS, test_simple_compaction_sync) {
+ g_ceph_context->_conf.set_val(
+ "bluefs_compact_log_sync",
+ "true");
+ uint64_t size = 1048576 * 128;
+ TempBdev bdev{size};
+
+ BlueFS fs(g_ceph_context);
+ ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false, 1048576));
+ uuid_d fsid;
+ ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false }));
+ ASSERT_EQ(0, fs.mount());
+ ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, false, false }));
+ {
+ for (int i=0; i<10; i++) {
+ string dir = "dir.";
+ dir.append(to_string(i));
+ ASSERT_EQ(0, fs.mkdir(dir));
+ for (int j=0; j<10; j++) {
+ string file = "file.";
+ file.append(to_string(j));
+ BlueFS::FileWriter *h;
+ ASSERT_EQ(0, fs.open_for_write(dir, file, &h, false));
+ ASSERT_NE(nullptr, h);
+ auto sg = make_scope_guard([&fs, h] { fs.close_writer(h); });
+ bufferlist bl;
+ std::unique_ptr<char[]> buf = gen_buffer(4096);
+ bufferptr bp = buffer::claim_char(4096, buf.get());
+ bl.push_back(bp);
+ h->append(bl.c_str(), bl.length());
+ fs.fsync(h);
+ }
+ }
+ }
+ {
+ for (int i=0; i<10; i+=2) {
+ string dir = "dir.";
+ dir.append(to_string(i));
+ for (int j=0; j<10; j++) {
+ string file = "file.";
+ file.append(to_string(j));
+ fs.unlink(dir, file);
+ fs.sync_metadata(false);
+ }
+ ASSERT_EQ(0, fs.rmdir(dir));
+ fs.sync_metadata(false);
+ }
+ }
+ fs.compact_log();
+ fs.umount();
+}
+
+TEST(BlueFS, test_simple_compaction_async) {
+ g_ceph_context->_conf.set_val(
+ "bluefs_compact_log_sync",
+ "false");
+ uint64_t size = 1048576 * 128;
+ TempBdev bdev{size};
+
+ BlueFS fs(g_ceph_context);
+ ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false, 1048576));
+ uuid_d fsid;
+ ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false }));
+ ASSERT_EQ(0, fs.mount());
+ ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, false, false }));
+ {
+ for (int i=0; i<10; i++) {
+ string dir = "dir.";
+ dir.append(to_string(i));
+ ASSERT_EQ(0, fs.mkdir(dir));
+ for (int j=0; j<10; j++) {
+ string file = "file.";
+ file.append(to_string(j));
+ BlueFS::FileWriter *h;
+ ASSERT_EQ(0, fs.open_for_write(dir, file, &h, false));
+ ASSERT_NE(nullptr, h);
+ auto sg = make_scope_guard([&fs, h] { fs.close_writer(h); });
+ bufferlist bl;
+ std::unique_ptr<char[]> buf = gen_buffer(4096);
+ bufferptr bp = buffer::claim_char(4096, buf.get());
+ bl.push_back(bp);
+ h->append(bl.c_str(), bl.length());
+ fs.fsync(h);
+ }
+ }
+ }
+ {
+ for (int i=0; i<10; i+=2) {
+ string dir = "dir.";
+ dir.append(to_string(i));
+ for (int j=0; j<10; j++) {
+ string file = "file.";
+ file.append(to_string(j));
+ fs.unlink(dir, file);
+ fs.sync_metadata(false);
+ }
+ ASSERT_EQ(0, fs.rmdir(dir));
+ fs.sync_metadata(false);
+ }
+ }
+ fs.compact_log();
+ fs.umount();
+}
+
+TEST(BlueFS, test_compaction_sync) {
+ uint64_t size = 1048576 * 128;
+ TempBdev bdev{size};
+ g_ceph_context->_conf.set_val(
+ "bluefs_alloc_size",
+ "65536");
+ g_ceph_context->_conf.set_val(
+ "bluefs_compact_log_sync",
+ "true");
+ const char* canary_dir = "dir.after_compact_test";
+ const char* canary_file = "file.after_compact_test";
+ const char* canary_data = "some random data";
+
+ BlueFS fs(g_ceph_context);
+ ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false, 1048576));
+ uuid_d fsid;
+ ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false }));
+ ASSERT_EQ(0, fs.mount());
+ ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, false, false }));
+ {
+ std::vector<std::thread> write_threads;
+ uint64_t effective_size = size - (32 * 1048576); // leaving the last 32 MB for log compaction
+ uint64_t per_thread_bytes = (effective_size/(NUM_WRITERS));
+ for (int i=0; i<NUM_WRITERS; i++) {
+ write_threads.push_back(std::thread(write_data, std::ref(fs), per_thread_bytes));
+ }
+
+ std::vector<std::thread> sync_threads;
+ for (int i=0; i<NUM_SYNC_THREADS; i++) {
+ sync_threads.push_back(std::thread(sync_fs, std::ref(fs)));
+ }
+
+ join_all(write_threads);
+ writes_done = true;
+ join_all(sync_threads);
+ fs.compact_log();
+
+ {
+ ASSERT_EQ(0, fs.mkdir(canary_dir));
+ BlueFS::FileWriter *h;
+ ASSERT_EQ(0, fs.open_for_write(canary_dir, canary_file, &h, false));
+ ASSERT_NE(nullptr, h);
+ auto sg = make_scope_guard([&fs, h] { fs.close_writer(h); });
+ h->append(canary_data, strlen(canary_data));
+ int r = fs.fsync(h);
+ ASSERT_EQ(r, 0);
+ }
+ }
+ fs.umount();
+
+ fs.mount();
+ {
+ BlueFS::FileReader *h;
+ ASSERT_EQ(0, fs.open_for_read(canary_dir, canary_file, &h));
+ ASSERT_NE(nullptr, h);
+ bufferlist bl;
+ ASSERT_EQ(strlen(canary_data), fs.read(h, 0, 1024, &bl, NULL));
+ std::cout << bl.c_str() << std::endl;
+ ASSERT_EQ(0, strncmp(canary_data, bl.c_str(), strlen(canary_data)));
+ delete h;
+ }
+ fs.umount();
+}
+
+TEST(BlueFS, test_compaction_async) {
+ uint64_t size = 1048576 * 128;
+ TempBdev bdev{size};
+ g_ceph_context->_conf.set_val(
+ "bluefs_alloc_size",
+ "65536");
+ g_ceph_context->_conf.set_val(
+ "bluefs_compact_log_sync",
+ "false");
+ const char* canary_dir = "dir.after_compact_test";
+ const char* canary_file = "file.after_compact_test";
+ const char* canary_data = "some random data";
+
+ BlueFS fs(g_ceph_context);
+ ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false, 1048576));
+ uuid_d fsid;
+ ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false }));
+ ASSERT_EQ(0, fs.mount());
+ ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, false, false }));
+ {
+ std::vector<std::thread> write_threads;
+ uint64_t effective_size = size - (32 * 1048576); // leaving the last 32 MB for log compaction
+ uint64_t per_thread_bytes = (effective_size/(NUM_WRITERS));
+ for (int i=0; i<NUM_WRITERS; i++) {
+ write_threads.push_back(std::thread(write_data, std::ref(fs), per_thread_bytes));
+ }
+
+ std::vector<std::thread> sync_threads;
+ for (int i=0; i<NUM_SYNC_THREADS; i++) {
+ sync_threads.push_back(std::thread(sync_fs, std::ref(fs)));
+ }
+
+ join_all(write_threads);
+ writes_done = true;
+ join_all(sync_threads);
+ fs.compact_log();
+
+ {
+ ASSERT_EQ(0, fs.mkdir(canary_dir));
+ BlueFS::FileWriter *h;
+ ASSERT_EQ(0, fs.open_for_write(canary_dir, canary_file, &h, false));
+ ASSERT_NE(nullptr, h);
+ auto sg = make_scope_guard([&fs, h] { fs.close_writer(h); });
+ h->append(canary_data, strlen(canary_data));
+ int r = fs.fsync(h);
+ ASSERT_EQ(r, 0);
+ }
+ }
+ fs.umount();
+
+ fs.mount();
+ {
+ BlueFS::FileReader *h;
+ ASSERT_EQ(0, fs.open_for_read(canary_dir, canary_file, &h));
+ ASSERT_NE(nullptr, h);
+ bufferlist bl;
+ ASSERT_EQ(strlen(canary_data), fs.read(h, 0, 1024, &bl, NULL));
+ std::cout << bl.c_str() << std::endl;
+ ASSERT_EQ(0, strncmp(canary_data, bl.c_str(), strlen(canary_data)));
+ delete h;
+ }
+ fs.umount();
+}
+
+TEST(BlueFS, test_replay) {
+ uint64_t size = 1048576 * 128;
+ TempBdev bdev{size};
+ g_ceph_context->_conf.set_val(
+ "bluefs_alloc_size",
+ "65536");
+ g_ceph_context->_conf.set_val(
+ "bluefs_compact_log_sync",
+ "false");
+
+ BlueFS fs(g_ceph_context);
+ ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false, 1048576));
+ uuid_d fsid;
+ ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false }));
+ ASSERT_EQ(0, fs.mount());
+ ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, false, false }));
+ {
+ std::vector<std::thread> write_threads;
+ uint64_t effective_size = size - (32 * 1048576); // leaving the last 32 MB for log compaction
+ uint64_t per_thread_bytes = (effective_size/(NUM_WRITERS));
+ for (int i=0; i<NUM_WRITERS; i++) {
+ write_threads.push_back(std::thread(write_data, std::ref(fs), per_thread_bytes));
+ }
+
+ std::vector<std::thread> sync_threads;
+ for (int i=0; i<NUM_SYNC_THREADS; i++) {
+ sync_threads.push_back(std::thread(sync_fs, std::ref(fs)));
+ }
+
+ join_all(write_threads);
+ writes_done = true;
+ join_all(sync_threads);
+ fs.compact_log();
+ }
+ fs.umount();
+  // remount and check that the log replays safely
+ ASSERT_EQ(0, fs.mount());
+ ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, false, false }));
+ fs.umount();
+}
+
+TEST(BlueFS, test_replay_growth) {
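+  // Use a tiny allocation unit and log runway so the BlueFS log is forced to
+  // extend itself many times; the remount at the end verifies that replaying
+  // the grown log succeeds.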
+ uint64_t size = 1048576LL * (2 * 1024 + 128);
+ TempBdev bdev{size};
+
+ ConfSaver conf(g_ceph_context->_conf);
+ conf.SetVal("bluefs_alloc_size", "4096");
+ conf.SetVal("bluefs_shared_alloc_size", "4096");
+ conf.SetVal("bluefs_compact_log_sync", "false");
+ conf.SetVal("bluefs_min_log_runway", "32768");
+ conf.SetVal("bluefs_max_log_runway", "65536");
+ conf.SetVal("bluefs_allocator", "stupid");
+ conf.SetVal("bluefs_sync_write", "true");
+ conf.ApplyChanges();
+
+ BlueFS fs(g_ceph_context);
+ ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false, 1048576));
+ uuid_d fsid;
+ ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false }));
+ ASSERT_EQ(0, fs.mount());
+ ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, false, false }));
+ ASSERT_EQ(0, fs.mkdir("dir"));
+
+ char data[2000];
+ BlueFS::FileWriter *h;
+ ASSERT_EQ(0, fs.open_for_write("dir", "file", &h, false));
+ for (size_t i = 0; i < 10000; i++) {
+ h->append(data, 2000);
+ fs.fsync(h);
+ }
+ fs.close_writer(h);
+  fs.umount(true); // do not compact on exit!
+
+  // remount and check that the log replays safely
+ ASSERT_EQ(0, fs.mount());
+ ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, false, false }));
+ fs.umount();
+}
+
+TEST(BlueFS, test_tracker_50965) {
+ uint64_t size_wal = 1048576 * 64;
+ TempBdev bdev_wal{size_wal};
+ uint64_t size_db = 1048576 * 128;
+ TempBdev bdev_db{size_db};
+ uint64_t size_slow = 1048576 * 256;
+ TempBdev bdev_slow{size_slow};
+
+ ConfSaver conf(g_ceph_context->_conf);
+ conf.SetVal("bluefs_min_flush_size", "65536");
+ conf.ApplyChanges();
+
+ BlueFS fs(g_ceph_context);
+ ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_WAL, bdev_wal.path, false, 0));
+ ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev_db.path, false, 0));
+ ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_SLOW, bdev_slow.path, false, 0));
+ uuid_d fsid;
+ ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, true, true }));
+ ASSERT_EQ(0, fs.mount());
+ ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, true, true }));
+
+ string dir_slow = "dir.slow";
+ ASSERT_EQ(0, fs.mkdir(dir_slow));
+ string dir_db = "dir_db";
+ ASSERT_EQ(0, fs.mkdir(dir_db));
+
+ string file_slow = "file";
+ BlueFS::FileWriter *h_slow;
+ ASSERT_EQ(0, fs.open_for_write(dir_slow, file_slow, &h_slow, false));
+ ASSERT_NE(nullptr, h_slow);
+
+ string file_db = "file";
+ BlueFS::FileWriter *h_db;
+ ASSERT_EQ(0, fs.open_for_write(dir_db, file_db, &h_db, false));
+ ASSERT_NE(nullptr, h_db);
+
+ bufferlist bl1;
+ std::unique_ptr<char[]> buf1 = gen_buffer(70000);
+ bufferptr bp1 = buffer::claim_char(70000, buf1.get());
+ bl1.push_back(bp1);
+ h_slow->append(bl1.c_str(), bl1.length());
+ fs.flush(h_slow);
+
+ uint64_t h_slow_dirty_seq_1 = fs.debug_get_dirty_seq(h_slow);
+
+ bufferlist bl2;
+ std::unique_ptr<char[]> buf2 = gen_buffer(1000);
+ bufferptr bp2 = buffer::claim_char(1000, buf2.get());
+ bl2.push_back(bp2);
+ h_db->append(bl2.c_str(), bl2.length());
+ fs.fsync(h_db);
+
+ uint64_t h_slow_dirty_seq_2 = fs.debug_get_dirty_seq(h_slow);
+ bool h_slow_dev_dirty = fs.debug_get_is_dev_dirty(h_slow, BlueFS::BDEV_SLOW);
+
+  // it is a problem if allocations are stable in the log but the slow
+  // device has not been flushed yet
+ ASSERT_FALSE(h_slow_dirty_seq_1 != 0 &&
+ h_slow_dirty_seq_2 == 0 &&
+ h_slow_dev_dirty == true);
+
+ fs.close_writer(h_slow);
+ fs.close_writer(h_db);
+
+ fs.umount();
+}
+
+TEST(BlueFS, test_truncate_stable_53129) {
+
+ ConfSaver conf(g_ceph_context->_conf);
+ conf.SetVal("bluefs_min_flush_size", "65536");
+ conf.ApplyChanges();
+
+ uint64_t size_wal = 1048576 * 64;
+ TempBdev bdev_wal{size_wal};
+ uint64_t size_db = 1048576 * 128;
+ TempBdev bdev_db{size_db};
+ uint64_t size_slow = 1048576 * 256;
+ TempBdev bdev_slow{size_slow};
+
+ BlueFS fs(g_ceph_context);
+ ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_WAL, bdev_wal.path, false, 0));
+ ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev_db.path, false, 0));
+ ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_SLOW, bdev_slow.path, false, 0));
+ uuid_d fsid;
+ ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, true, true }));
+ ASSERT_EQ(0, fs.mount());
+ ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, true, true }));
+
+ string dir_slow = "dir.slow";
+ ASSERT_EQ(0, fs.mkdir(dir_slow));
+ string dir_db = "dir_db";
+ ASSERT_EQ(0, fs.mkdir(dir_db));
+
+ string file_slow = "file";
+ BlueFS::FileWriter *h_slow;
+ ASSERT_EQ(0, fs.open_for_write(dir_slow, file_slow, &h_slow, false));
+ ASSERT_NE(nullptr, h_slow);
+
+ string file_db = "file";
+ BlueFS::FileWriter *h_db;
+ ASSERT_EQ(0, fs.open_for_write(dir_db, file_db, &h_db, false));
+ ASSERT_NE(nullptr, h_db);
+
+ bufferlist bl1;
+ std::unique_ptr<char[]> buf1 = gen_buffer(70000);
+ bufferptr bp1 = buffer::claim_char(70000, buf1.get());
+ bl1.push_back(bp1);
+ // add 70000 bytes
+ h_slow->append(bl1.c_str(), bl1.length());
+ fs.flush(h_slow);
+ // and truncate to 60000 bytes
+ fs.truncate(h_slow, 60000);
+
+ // write something to file on DB device
+ bufferlist bl2;
+ std::unique_ptr<char[]> buf2 = gen_buffer(1000);
+ bufferptr bp2 = buffer::claim_char(1000, buf2.get());
+ bl2.push_back(bp2);
+ h_db->append(bl2.c_str(), bl2.length());
+ // and force bluefs log to flush
+ fs.fsync(h_db);
+
+ // This is the actual test point.
+ // We completed truncate, and we expect
+ // - size to be 60000
+ // - data to be stable on slow device
+ // OR
+ // - size = 0 or file does not exist
+ // - dev_dirty is irrelevant
+ bool h_slow_dev_dirty = fs.debug_get_is_dev_dirty(h_slow, BlueFS::BDEV_SLOW);
+ // Imagine power goes down here.
+
+ fs.close_writer(h_slow);
+ fs.close_writer(h_db);
+
+ fs.umount();
+
+ ASSERT_EQ(0, fs.mount());
+ ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, true, true }));
+
+ uint64_t size;
+ utime_t mtime;
+ ASSERT_EQ(0, fs.stat("dir.slow", "file", &size, &mtime));
+ // check file size 60000
+ ASSERT_EQ(size, 60000);
+ // check that dev_dirty was false (data stable on media)
+ ASSERT_EQ(h_slow_dev_dirty, false);
+
+ fs.umount();
+}
+
+TEST(BlueFS, test_update_ino1_delta_after_replay) {
+ uint64_t size = 1048576LL * (2 * 1024 + 128);
+ TempBdev bdev{size};
+
+ ConfSaver conf(g_ceph_context->_conf);
+ conf.SetVal("bluefs_alloc_size", "4096");
+ conf.SetVal("bluefs_shared_alloc_size", "4096");
+ conf.SetVal("bluefs_compact_log_sync", "false");
+ conf.SetVal("bluefs_min_log_runway", "32768");
+ conf.SetVal("bluefs_max_log_runway", "65536");
+ conf.SetVal("bluefs_allocator", "stupid");
+ conf.ApplyChanges();
+
+ BlueFS fs(g_ceph_context);
+ ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false, 1048576));
+ uuid_d fsid;
+ ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false }));
+ ASSERT_EQ(0, fs.mount());
+ ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, false, false }));
+ ASSERT_EQ(0, fs.mkdir("dir"));
+
+ char data[2000];
+ BlueFS::FileWriter *h;
+ ASSERT_EQ(0, fs.open_for_write("dir", "file", &h, false));
+ for (size_t i = 0; i < 100; i++) {
+ h->append(data, 2000);
+ fs.fsync(h);
+ }
+ fs.close_writer(h);
+  fs.umount(true); // do not compact on exit!
+
+ ASSERT_EQ(0, fs.mount());
+ ASSERT_EQ(0, fs.open_for_write("dir", "file2", &h, false));
+ for (size_t i = 0; i < 100; i++) {
+ h->append(data, 2000);
+ fs.fsync(h);
+ }
+ fs.close_writer(h);
+ fs.umount();
+
+  // remount and check that the log replays safely
+ ASSERT_EQ(0, fs.mount());
+ ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, false, false }));
+ fs.umount();
+}
+
+TEST(BlueFS, broken_unlink_fsync_seq) {
+ uint64_t size = 1048576 * 128;
+ TempBdev bdev{size};
+ BlueFS fs(g_ceph_context);
+ ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false, 1048576));
+ uuid_d fsid;
+ ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false }));
+ ASSERT_EQ(0, fs.mount());
+ ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, false, false }));
+ {
+ /*
+ * This reproduces a weird file op sequence (unlink+fsync) that Octopus
+ * RocksDB might issue to BlueFS when recycle_log_file_num setting is 0
+ * See https://tracker.ceph.com/issues/55636 for more details
+ *
+ */
+ char buf[1048571]; // this is biggish, but intentionally not evenly aligned
+ for (unsigned i = 0; i < sizeof(buf); ++i) {
+ buf[i] = i;
+ }
+ BlueFS::FileWriter *h;
+ ASSERT_EQ(0, fs.mkdir("dir"));
+ ASSERT_EQ(0, fs.open_for_write("dir", "file", &h, false));
+
+ h->append(buf, sizeof(buf));
+ fs.flush(h);
+ h->append(buf, sizeof(buf));
+ fs.unlink("dir", "file");
+ fs.fsync(h);
+ fs.close_writer(h);
+ }
+ fs.umount();
+
+  // remount and check that the log replays safely
+ ASSERT_EQ(0, fs.mount());
+ ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, false, false }));
+ fs.umount();
+}
+
+TEST(BlueFS, truncate_fsync) {
+ uint64_t bdev_size = 128 * 1048576;
+ uint64_t block_size = 4096;
+ uint64_t reserved = 1048576;
+ TempBdev bdev{bdev_size};
+ uuid_d fsid;
+  const char* DIR_NAME = "dir";
+  const char* FILE_NAME = "file1";
+
+ size_t sizes[] = {3, 1024, 4096, 1024 * 4096};
+ for (size_t i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) {
+ const size_t content_size= sizes[i];
+ const size_t read_size = p2roundup(content_size, size_t(block_size));
+ const std::string content(content_size, 'x');
+ {
+ BlueFS fs(g_ceph_context);
+ ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false, reserved));
+ ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false }));
+ ASSERT_EQ(0, fs.mount());
+ ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, false, false }));
+ {
+ BlueFS::FileWriter *h;
+        ASSERT_EQ(0, fs.mkdir(DIR_NAME));
+ ASSERT_EQ(0, fs.open_for_write(DIR_NAME, FILE_NAME, &h, false));
+ h->append(content.c_str(), content.length());
+ fs.fsync(h);
+ fs.close_writer(h);
+ }
+ {
+ BlueFS::FileReader *h;
+ ASSERT_EQ(0, fs.open_for_read(DIR_NAME, FILE_NAME, &h));
+ bufferlist bl;
+ ASSERT_EQ(content.length(), fs.read(h, 0, read_size, &bl, NULL));
+ ASSERT_EQ(0, strncmp(content.c_str(), bl.c_str(), content.length()));
+ delete h;
+ }
+ {
+ BlueFS::FileWriter *h;
+ ASSERT_EQ(0, fs.open_for_write(DIR_NAME, FILE_NAME, &h, true));
+ fs.truncate(h, 0);
+ fs.fsync(h);
+ fs.close_writer(h);
+ }
+ }
+ {
+      // this was broken due to https://tracker.ceph.com/issues/55307
+ BlueFS fs(g_ceph_context);
+ ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false, reserved));
+ ASSERT_EQ(0, fs.mount());
+ BlueFS::FileReader *h;
+ ASSERT_EQ(0, fs.open_for_read(DIR_NAME, FILE_NAME, &h));
+ bufferlist bl;
+ ASSERT_EQ(0, fs.read(h, 0, read_size, &bl, NULL));
+ delete h;
+ fs.umount();
+ }
+ }
+}
+
+TEST(BlueFS, test_shared_alloc) {
+ uint64_t size = 1048576 * 128;
+ TempBdev bdev_slow{size};
+ uint64_t size_db = 1048576 * 8;
+ TempBdev bdev_db{size_db};
+
+ ConfSaver conf(g_ceph_context->_conf);
+ conf.SetVal("bluefs_shared_alloc_size", "1048576");
+
+ bluefs_shared_alloc_context_t shared_alloc;
+ uint64_t shared_alloc_unit = 4096;
+ shared_alloc.set(
+ Allocator::create(g_ceph_context, g_ceph_context->_conf->bluefs_allocator,
+ size, shared_alloc_unit, 0, 0, "test shared allocator"),
+ shared_alloc_unit);
+ shared_alloc.a->init_add_free(0, size);
+
+ BlueFS fs(g_ceph_context);
+ // DB device is fully utilized
+ ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev_db.path, false, size_db - 0x1000));
+ ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_SLOW, bdev_slow.path, false, 0,
+ &shared_alloc));
+ uuid_d fsid;
+ ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false }));
+ ASSERT_EQ(0, fs.mount());
+ ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, false, false }));
+ {
+ for (int i=0; i<10; i++) {
+ string dir = "dir.";
+ dir.append(to_string(i));
+ ASSERT_EQ(0, fs.mkdir(dir));
+ for (int j=0; j<10; j++) {
+ string file = "file.";
+ file.append(to_string(j));
+ BlueFS::FileWriter *h;
+ ASSERT_EQ(0, fs.open_for_write(dir, file, &h, false));
+ ASSERT_NE(nullptr, h);
+ auto sg = make_scope_guard([&fs, h] { fs.close_writer(h); });
+ bufferlist bl;
+ std::unique_ptr<char[]> buf = gen_buffer(4096);
+ bufferptr bp = buffer::claim_char(4096, buf.get());
+ bl.push_back(bp);
+ h->append(bl.c_str(), bl.length());
+ fs.fsync(h);
+ }
+ }
+ }
+ {
+ for (int i=0; i<10; i+=2) {
+ string dir = "dir.";
+ dir.append(to_string(i));
+ for (int j=0; j<10; j++) {
+ string file = "file.";
+ file.append(to_string(j));
+ fs.unlink(dir, file);
+ fs.sync_metadata(false);
+ }
+ ASSERT_EQ(0, fs.rmdir(dir));
+ fs.sync_metadata(false);
+ }
+ }
+ fs.compact_log();
+ auto *logger = fs.get_perf_counters();
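+  // with the DB device almost fully reserved, BlueFS must have fallen back
+  // to the shared (slow) device at least once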
+ ASSERT_NE(logger->get(l_bluefs_alloc_shared_dev_fallbacks), 0);
+ auto num_files = logger->get(l_bluefs_num_files);
+ fs.umount();
+ fs.mount();
+ ASSERT_EQ(num_files, logger->get(l_bluefs_num_files));
+ fs.umount();
+}
+
+TEST(BlueFS, test_shared_alloc_sparse) {
+ uint64_t size = 1048576 * 128 * 2;
+ uint64_t main_unit = 4096;
+ uint64_t bluefs_alloc_unit = 1048576;
+ TempBdev bdev_slow{size};
+
+ ConfSaver conf(g_ceph_context->_conf);
+ conf.SetVal("bluefs_shared_alloc_size",
+ stringify(bluefs_alloc_unit).c_str());
+
+ bluefs_shared_alloc_context_t shared_alloc;
+ shared_alloc.set(
+ Allocator::create(g_ceph_context, g_ceph_context->_conf->bluefs_allocator,
+ size, main_unit, 0, 0, "test shared allocator"),
+ main_unit);
+  // Prepare sparse free space, but keep a contiguous chunk at the beginning
+  // so the initial log's fnode fits into the superblock; mkfs has no way to
+  // deal with sparse allocations (and hence a long fnode).
+  shared_alloc.a->init_add_free(bluefs_alloc_unit, 4 * bluefs_alloc_unit);
+  for (uint64_t i = 5 * bluefs_alloc_unit; i < size; i += 2 * main_unit) {
+ shared_alloc.a->init_add_free(i, main_unit);
+ }
+
+ BlueFS fs(g_ceph_context);
+ ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev_slow.path, false, 0,
+ &shared_alloc));
+ uuid_d fsid;
+ ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false }));
+ ASSERT_EQ(0, fs.mount());
+ ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, false, false }));
+ {
+ for (int i=0; i<10; i++) {
+ string dir = "dir.";
+ dir.append(to_string(i));
+ ASSERT_EQ(0, fs.mkdir(dir));
+ for (int j=0; j<10; j++) {
+ string file = "file.";
+ file.append(to_string(j));
+ BlueFS::FileWriter *h;
+ ASSERT_EQ(0, fs.open_for_write(dir, file, &h, false));
+ ASSERT_NE(nullptr, h);
+ auto sg = make_scope_guard([&fs, h] { fs.close_writer(h); });
+ bufferlist bl;
+ std::unique_ptr<char[]> buf = gen_buffer(4096);
+ bufferptr bp = buffer::claim_char(4096, buf.get());
+ bl.push_back(bp);
+ h->append(bl.c_str(), bl.length());
+ fs.fsync(h);
+ }
+ }
+ }
+ {
+ for (int i=0; i<10; i+=2) {
+ string dir = "dir.";
+ dir.append(to_string(i));
+ for (int j=0; j<10; j++) {
+ string file = "file.";
+ file.append(to_string(j));
+ fs.unlink(dir, file);
+ fs.sync_metadata(false);
+ }
+ ASSERT_EQ(0, fs.rmdir(dir));
+ fs.sync_metadata(false);
+ }
+ }
+ fs.compact_log();
+ auto *logger = fs.get_perf_counters();
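+  // the sparse 4k free extents cannot satisfy bluefs_shared_alloc_size
+  // requests, so allocation-size fallbacks must have occurred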
+ ASSERT_NE(logger->get(l_bluefs_alloc_shared_size_fallbacks), 0);
+ auto num_files = logger->get(l_bluefs_num_files);
+ fs.umount();
+
+ fs.mount();
+ ASSERT_EQ(num_files, logger->get(l_bluefs_num_files));
+ fs.umount();
+}
+
+TEST(BlueFS, test_4k_shared_alloc) {
+ uint64_t size = 1048576 * 128 * 2;
+ uint64_t main_unit = 4096;
+ uint64_t bluefs_alloc_unit = main_unit;
+ TempBdev bdev_slow{size};
+
+ ConfSaver conf(g_ceph_context->_conf);
+ conf.SetVal("bluefs_shared_alloc_size",
+ stringify(bluefs_alloc_unit).c_str());
+
+ bluefs_shared_alloc_context_t shared_alloc;
+ shared_alloc.set(
+ Allocator::create(g_ceph_context, g_ceph_context->_conf->bluefs_allocator,
+ size, main_unit, 0, 0, "test shared allocator"),
+ main_unit);
+ shared_alloc.a->init_add_free(bluefs_alloc_unit, size - bluefs_alloc_unit);
+
+ BlueFS fs(g_ceph_context);
+ ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev_slow.path, false, 0,
+ &shared_alloc));
+ uuid_d fsid;
+ ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false }));
+ ASSERT_EQ(0, fs.mount());
+ ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, false, false }));
+ {
+ for (int i=0; i<10; i++) {
+ string dir = "dir.";
+ dir.append(to_string(i));
+ ASSERT_EQ(0, fs.mkdir(dir));
+ for (int j=0; j<10; j++) {
+ string file = "file.";
+ file.append(to_string(j));
+ BlueFS::FileWriter *h;
+ ASSERT_EQ(0, fs.open_for_write(dir, file, &h, false));
+ ASSERT_NE(nullptr, h);
+ auto sg = make_scope_guard([&fs, h] { fs.close_writer(h); });
+ bufferlist bl;
+ std::unique_ptr<char[]> buf = gen_buffer(4096);
+ bufferptr bp = buffer::claim_char(4096, buf.get());
+ bl.push_back(bp);
+ h->append(bl.c_str(), bl.length());
+ fs.fsync(h);
+ }
+ }
+ }
+ {
+ for (int i=0; i<10; i+=2) {
+ string dir = "dir.";
+ dir.append(to_string(i));
+ for (int j=0; j<10; j++) {
+ string file = "file.";
+ file.append(to_string(j));
+ fs.unlink(dir, file);
+ fs.sync_metadata(false);
+ }
+ ASSERT_EQ(0, fs.rmdir(dir));
+ fs.sync_metadata(false);
+ }
+ }
+ fs.compact_log();
+ auto *logger = fs.get_perf_counters();
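+  // with the allocation unit equal to the shared alloc size, no device or
+  // size fallbacks should be needed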
+ ASSERT_EQ(logger->get(l_bluefs_alloc_shared_dev_fallbacks), 0);
+ ASSERT_EQ(logger->get(l_bluefs_alloc_shared_size_fallbacks), 0);
+ auto num_files = logger->get(l_bluefs_num_files);
+ fs.umount();
+
+ fs.mount();
+ ASSERT_EQ(num_files, logger->get(l_bluefs_num_files));
+ fs.umount();
+}
+
+void create_files(BlueFS &fs,
+ atomic_bool& stop_creating,
+ atomic_bool& started_creating)
+{
+ uint32_t i = 0;
+ stringstream ss;
+ string dir = "dir.";
+ ss << std::this_thread::get_id();
+ dir.append(ss.str());
+ dir.append(".");
+ dir.append(to_string(i));
+ ASSERT_EQ(0, fs.mkdir(dir));
+ while (!stop_creating.load()) {
+ string file = "file.";
+ file.append(to_string(i));
+ BlueFS::FileWriter *h;
+ ASSERT_EQ(0, fs.open_for_write(dir, file, &h, false));
+ ASSERT_NE(nullptr, h);
+ fs.close_writer(h);
+ i++;
+ started_creating = true;
+ }
+}
+
+
+TEST(BlueFS, test_concurrent_dir_link_and_compact_log_56210) {
+ uint64_t size = 1048576 * 128;
+ TempBdev bdev{size};
+ ConfSaver conf(g_ceph_context->_conf);
+
+ conf.SetVal("bluefs_alloc_size", "65536");
+ conf.SetVal("bluefs_compact_log_sync", "false");
+  // make sure fsync always triggers log compaction
+ conf.SetVal("bluefs_log_compact_min_ratio", "0");
+ conf.SetVal("bluefs_log_compact_min_size", "0");
+ conf.ApplyChanges();
+
+ for (int i=0; i<10; ++i) {
+ BlueFS fs(g_ceph_context);
+ ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false, 1048576));
+ uuid_d fsid;
+ ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false }));
+ ASSERT_EQ(0, fs.mount());
+ ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, false, false }));
+ {
+ atomic_bool stop_creating{false};
+ atomic_bool started_creating{false};
+ std::thread create_thread;
+ create_thread = std::thread(create_files,
+ std::ref(fs),
+ std::ref(stop_creating),
+ std::ref(started_creating));
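+      // busy-wait until the creator thread has produced at least one file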
+ while (!started_creating.load()) {
+ }
+ BlueFS::FileWriter *h;
+ ASSERT_EQ(0, fs.mkdir("foo"));
+ ASSERT_EQ(0, fs.open_for_write("foo", "bar", &h, false));
+ fs.fsync(h);
+ fs.close_writer(h);
+
+ stop_creating = true;
+ do_join(create_thread);
+
+ fs.umount(true); //do not compact on exit!
+ ASSERT_EQ(0, fs.mount());
+ fs.umount();
+ }
+ }
+}
+
+int main(int argc, char **argv) {
+ auto args = argv_to_vec(argc, argv);
+ map<string,string> defaults = {
+ { "debug_bluefs", "1/20" },
+ { "debug_bdev", "1/20" }
+ };
+
+ auto cct = global_init(&defaults, args, CEPH_ENTITY_TYPE_CLIENT,
+ CODE_ENVIRONMENT_UTILITY,
+ CINIT_FLAG_NO_DEFAULT_CONFIG_FILE);
+ common_init_finish(g_ceph_context);
+ g_ceph_context->_conf.set_val(
+ "enable_experimental_unrecoverable_data_corrupting_features",
+ "*");
+ g_ceph_context->_conf.apply_changes(nullptr);
+
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/test/objectstore/test_bluestore_types.cc b/src/test/objectstore/test_bluestore_types.cc
new file mode 100644
index 000000000..18ccaff91
--- /dev/null
+++ b/src/test/objectstore/test_bluestore_types.cc
@@ -0,0 +1,2346 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "include/types.h"
+#include "os/bluestore/bluestore_types.h"
+#include "gtest/gtest.h"
+#include "include/stringify.h"
+#include "common/ceph_time.h"
+#include "os/bluestore/BlueStore.h"
+#include "os/bluestore/simple_bitmap.h"
+#include "os/bluestore/AvlAllocator.h"
+#include "common/ceph_argparse.h"
+#include "global/global_init.h"
+#include "global/global_context.h"
+#include "perfglue/heap_profiler.h"
+
+#include <sstream>
+
+#define _STR(x) #x
+#define STRINGIFY(x) _STR(x)
+
+using namespace std;
+
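+// Print the in-memory sizes of key BlueStore types; handy for spotting
+// memory-footprint regressions when the structures change.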
+TEST(bluestore, sizeof) {
+#define P(t) cout << STRINGIFY(t) << "\t" << sizeof(t) << std::endl
+ P(BlueStore::Onode);
+ P(BlueStore::Extent);
+ P(BlueStore::Blob);
+ P(BlueStore::SharedBlob);
+ P(BlueStore::ExtentMap);
+ P(BlueStore::extent_map_t);
+ P(BlueStore::blob_map_t);
+ P(BlueStore::BufferSpace);
+ P(BlueStore::Buffer);
+ P(bluestore_onode_t);
+ P(bluestore_blob_t);
+ P(PExtentVector);
+ P(ghobject_t);
+ P(bluestore_shared_blob_t);
+ P(bluestore_extent_ref_map_t);
+ P(bluestore_extent_ref_map_t::record_t);
+ P(bluestore_blob_use_tracker_t);
+ P(std::atomic_int);
+ P(BlueStore::SharedBlobRef);
+ P(boost::intrusive::set_base_hook<>);
+ P(boost::intrusive::unordered_set_base_hook<>);
+ P(bufferlist);
+ P(bufferptr);
+ P(range_seg_t);
+ P(sb_info_t);
+ P(SimpleBitmap);
+ cout << "map<uint64_t,uint64_t>\t" << sizeof(map<uint64_t,uint64_t>) << std::endl;
+ cout << "map<char,char>\t" << sizeof(map<char,char>) << std::endl;
+}
+
+void dump_mempools()
+{
+ ostringstream ostr;
+ auto f = Formatter::create_unique("json-pretty", "json-pretty", "json-pretty");
+ ostr << "Mempools: ";
+ f->open_object_section("mempools");
+ mempool::dump(f.get());
+ f->close_section();
+ f->flush(ostr);
+ cout << ostr.str() << std::endl;
+}
+/*void get_mempool_stats(uint64_t* total_bytes, uint64_t* total_items)
+{
+ uint64_t meta_allocated = mempool::bluestore_cache_meta::allocated_bytes();
+ uint64_t onode_allocated = mempool::bluestore_cache_onode::allocated_bytes();
+ uint64_t other_allocated = mempool::bluestore_cache_other::allocated_bytes();
+
+ uint64_t meta_items = mempool::bluestore_cache_meta::allocated_items();
+ uint64_t onode_items = mempool::bluestore_cache_onode::allocated_items();
+ uint64_t other_items = mempool::bluestore_cache_other::allocated_items();
+ cout << "meta(" << meta_allocated << "/" << meta_items
+ << ") onode(" << onode_allocated << "/" << onode_items
+ << ") other(" << other_allocated << "/" << other_items
+ << ")" << std::endl;
+ *total_bytes = meta_allocated + onode_allocated + other_allocated;
+ *total_items = onode_items;
+}*/
+
+TEST(sb_info_space_efficient_map_t, basic) {
+ sb_info_space_efficient_map_t sb_info;
+ const size_t num_shared = 1000;
+ for (size_t i = 0; i < num_shared; i += 2) {
+ auto& sbi = sb_info.add_maybe_stray(i);
+ sbi.pool_id = i;
+ }
+ ASSERT_TRUE(sb_info.find(0) != sb_info.end());
+ ASSERT_TRUE(sb_info.find(1) == sb_info.end());
+ ASSERT_TRUE(sb_info.find(2) != sb_info.end());
+ ASSERT_TRUE(sb_info.find(4)->pool_id == 4);
+ ASSERT_TRUE(sb_info.find(num_shared) == sb_info.end());
+
+ // ordered insertion
+ sb_info.add_or_adopt(num_shared).pool_id = num_shared;
+ ASSERT_TRUE(sb_info.find(num_shared) != sb_info.end());
+ ASSERT_TRUE(sb_info.find(num_shared)->pool_id == num_shared);
+
+ // out of order insertion
+ sb_info.add_or_adopt(1).pool_id = 1;
+ ASSERT_TRUE(sb_info.find(1) != sb_info.end());
+ ASSERT_TRUE(sb_info.find(1)->pool_id == 1);
+
+ // ordered insertion
+ sb_info.add_maybe_stray(num_shared + 1).pool_id = num_shared + 1;
+ ASSERT_TRUE(sb_info.find(num_shared + 1) != sb_info.end());
+ ASSERT_TRUE(sb_info.find(num_shared + 1)->pool_id == num_shared + 1);
+
+ // out of order insertion
+ sb_info.add_maybe_stray(105).pool_id = 105;
+ ASSERT_TRUE(sb_info.find(105) != sb_info.end());
+ ASSERT_TRUE(sb_info.find(105)->pool_id == 105);
+}
+
+TEST(sb_info_space_efficient_map_t, size) {
+ const size_t num_shared = 10000000;
+ sb_info_space_efficient_map_t sb_info;
+
+ BlueStore store(g_ceph_context, "", 4096);
+ BlueStore::OnodeCacheShard* oc = BlueStore::OnodeCacheShard::create(
+ g_ceph_context, "lru", NULL);
+ BlueStore::BufferCacheShard* bc = BlueStore::BufferCacheShard::create(
+ g_ceph_context, "lru", NULL);
+
+ auto coll = ceph::make_ref<BlueStore::Collection>(&store, oc, bc, coll_t());
+
+ for (size_t i = 0; i < num_shared; i++) {
+ auto& sbi = sb_info.add_or_adopt(i);
+    // primarily to silence the 'unused' warning
+ ceph_assert(sbi.pool_id == sb_info_t::INVALID_POOL_ID);
+ }
+ dump_mempools();
+}
+
+TEST(bluestore_extent_ref_map_t, add)
+{
+ bluestore_extent_ref_map_t m;
+ m.get(10, 10);
+ ASSERT_EQ(1u, m.ref_map.size());
+ cout << m << std::endl;
+ m.get(20, 10);
+ cout << m << std::endl;
+ ASSERT_EQ(1u, m.ref_map.size());
+ ASSERT_EQ(20u, m.ref_map[10].length);
+ ASSERT_EQ(1u, m.ref_map[10].refs);
+ m.get(40, 10);
+ cout << m << std::endl;
+ ASSERT_EQ(2u, m.ref_map.size());
+ m.get(30, 10);
+ cout << m << std::endl;
+ ASSERT_EQ(1u, m.ref_map.size());
+ m.get(50, 10);
+ cout << m << std::endl;
+ ASSERT_EQ(1u, m.ref_map.size());
+ m.get(5, 5);
+ cout << m << std::endl;
+ ASSERT_EQ(1u, m.ref_map.size());
+}
+
+TEST(bluestore_extent_ref_map_t, get)
+{
+ bluestore_extent_ref_map_t m;
+ m.get(00, 30);
+ cout << m << std::endl;
+ m.get(10, 10);
+ cout << m << std::endl;
+ ASSERT_EQ(3u, m.ref_map.size());
+ ASSERT_EQ(10u, m.ref_map[0].length);
+ ASSERT_EQ(1u, m.ref_map[0].refs);
+ ASSERT_EQ(10u, m.ref_map[10].length);
+ ASSERT_EQ(2u, m.ref_map[10].refs);
+ ASSERT_EQ(10u, m.ref_map[20].length);
+ ASSERT_EQ(1u, m.ref_map[20].refs);
+ m.get(20, 5);
+ cout << m << std::endl;
+ ASSERT_EQ(3u, m.ref_map.size());
+ ASSERT_EQ(15u, m.ref_map[10].length);
+ ASSERT_EQ(2u, m.ref_map[10].refs);
+ ASSERT_EQ(5u, m.ref_map[25].length);
+ ASSERT_EQ(1u, m.ref_map[25].refs);
+ m.get(5, 20);
+ cout << m << std::endl;
+ ASSERT_EQ(4u, m.ref_map.size());
+ ASSERT_EQ(5u, m.ref_map[0].length);
+ ASSERT_EQ(1u, m.ref_map[0].refs);
+ ASSERT_EQ(5u, m.ref_map[5].length);
+ ASSERT_EQ(2u, m.ref_map[5].refs);
+ ASSERT_EQ(15u, m.ref_map[10].length);
+ ASSERT_EQ(3u, m.ref_map[10].refs);
+ ASSERT_EQ(5u, m.ref_map[25].length);
+ ASSERT_EQ(1u, m.ref_map[25].refs);
+ m.get(25, 3);
+ cout << m << std::endl;
+ ASSERT_EQ(5u, m.ref_map.size());
+ ASSERT_EQ(5u, m.ref_map[0].length);
+ ASSERT_EQ(1u, m.ref_map[0].refs);
+ ASSERT_EQ(5u, m.ref_map[5].length);
+ ASSERT_EQ(2u, m.ref_map[5].refs);
+ ASSERT_EQ(15u, m.ref_map[10].length);
+ ASSERT_EQ(3u, m.ref_map[10].refs);
+ ASSERT_EQ(3u, m.ref_map[25].length);
+ ASSERT_EQ(2u, m.ref_map[25].refs);
+ ASSERT_EQ(2u, m.ref_map[28].length);
+ ASSERT_EQ(1u, m.ref_map[28].refs);
+}
+
+TEST(bluestore_extent_ref_map_t, put)
+{
+ bluestore_extent_ref_map_t m;
+ PExtentVector r;
+ bool maybe_unshared = false;
+ m.get(10, 30);
+ maybe_unshared = true;
+ m.put(10, 30, &r, &maybe_unshared);
+ cout << m << " " << r << " " << (int)maybe_unshared << std::endl;
+ ASSERT_EQ(0u, m.ref_map.size());
+ ASSERT_EQ(1u, r.size());
+ ASSERT_EQ(10u, r[0].offset);
+ ASSERT_EQ(30u, r[0].length);
+ ASSERT_TRUE(maybe_unshared);
+ r.clear();
+ m.get(10, 30);
+ m.get(20, 10);
+ maybe_unshared = true;
+ m.put(10, 30, &r, &maybe_unshared);
+ cout << m << " " << r << " " << (int)maybe_unshared << std::endl;
+ ASSERT_EQ(1u, m.ref_map.size());
+ ASSERT_EQ(10u, m.ref_map[20].length);
+ ASSERT_EQ(1u, m.ref_map[20].refs);
+ ASSERT_EQ(2u, r.size());
+ ASSERT_EQ(10u, r[0].offset);
+ ASSERT_EQ(10u, r[0].length);
+ ASSERT_EQ(30u, r[1].offset);
+ ASSERT_EQ(10u, r[1].length);
+ ASSERT_TRUE(maybe_unshared);
+ r.clear();
+ m.get(30, 10);
+ m.get(30, 10);
+ maybe_unshared = true;
+ m.put(20, 15, &r, &maybe_unshared);
+ cout << m << " " << r << " " << (int)maybe_unshared << std::endl;
+ ASSERT_EQ(2u, m.ref_map.size());
+ ASSERT_EQ(5u, m.ref_map[30].length);
+ ASSERT_EQ(1u, m.ref_map[30].refs);
+ ASSERT_EQ(5u, m.ref_map[35].length);
+ ASSERT_EQ(2u, m.ref_map[35].refs);
+ ASSERT_EQ(1u, r.size());
+ ASSERT_EQ(20u, r[0].offset);
+ ASSERT_EQ(10u, r[0].length);
+ ASSERT_FALSE(maybe_unshared);
+ r.clear();
+ maybe_unshared = true;
+ m.put(33, 5, &r, &maybe_unshared);
+ cout << m << " " << r << " " << (int)maybe_unshared << std::endl;
+ ASSERT_EQ(3u, m.ref_map.size());
+ ASSERT_EQ(3u, m.ref_map[30].length);
+ ASSERT_EQ(1u, m.ref_map[30].refs);
+ ASSERT_EQ(3u, m.ref_map[35].length);
+ ASSERT_EQ(1u, m.ref_map[35].refs);
+ ASSERT_EQ(2u, m.ref_map[38].length);
+ ASSERT_EQ(2u, m.ref_map[38].refs);
+ ASSERT_EQ(1u, r.size());
+ ASSERT_EQ(33u, r[0].offset);
+ ASSERT_EQ(2u, r[0].length);
+ ASSERT_FALSE(maybe_unshared);
+ r.clear();
+ maybe_unshared = true;
+ m.put(38, 2, &r, &maybe_unshared);
+ cout << m << " " << r << " " << (int)maybe_unshared << std::endl;
+ ASSERT_TRUE(maybe_unshared);
+}
+
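+// contains() demands that the whole probed range be referenced; any gap,
+// like the unreferenced hole at [50,60) created below, makes it false.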
+TEST(bluestore_extent_ref_map_t, contains)
+{
+ bluestore_extent_ref_map_t m;
+ m.get(10, 30);
+ ASSERT_TRUE(m.contains(10, 30));
+ ASSERT_TRUE(m.contains(10, 10));
+ ASSERT_TRUE(m.contains(30, 10));
+ ASSERT_FALSE(m.contains(0, 10));
+ ASSERT_FALSE(m.contains(0, 20));
+ ASSERT_FALSE(m.contains(0, 100));
+ ASSERT_FALSE(m.contains(40, 10));
+ ASSERT_FALSE(m.contains(30, 11));
+ m.get(40, 10);
+ m.get(40, 10);
+ ASSERT_TRUE(m.contains(30, 11));
+ ASSERT_TRUE(m.contains(30, 20));
+ ASSERT_TRUE(m.contains(10, 40));
+ ASSERT_FALSE(m.contains(0, 50));
+ ASSERT_FALSE(m.contains(40, 20));
+ m.get(60, 100);
+ ASSERT_TRUE(m.contains(60, 10));
+ ASSERT_TRUE(m.contains(40, 10));
+ ASSERT_FALSE(m.contains(40, 11));
+ ASSERT_FALSE(m.contains(40, 20));
+ ASSERT_FALSE(m.contains(40, 30));
+ ASSERT_FALSE(m.contains(40, 3000));
+ ASSERT_FALSE(m.contains(4000, 30));
+}
+
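+// intersects() is the weaker test: true as soon as any byte of the
+// probed range is referenced.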
+TEST(bluestore_extent_ref_map_t, intersects)
+{
+ bluestore_extent_ref_map_t m;
+ m.get(10, 30);
+ ASSERT_TRUE(m.intersects(10, 30));
+ ASSERT_TRUE(m.intersects(0, 11));
+ ASSERT_TRUE(m.intersects(10, 40));
+ ASSERT_TRUE(m.intersects(15, 40));
+ ASSERT_FALSE(m.intersects(0, 10));
+ ASSERT_FALSE(m.intersects(0, 5));
+ ASSERT_FALSE(m.intersects(40, 20));
+ ASSERT_FALSE(m.intersects(41, 20));
+ m.get(40, 10);
+ m.get(40, 10);
+ ASSERT_TRUE(m.intersects(0, 100));
+ ASSERT_TRUE(m.intersects(10, 35));
+ ASSERT_TRUE(m.intersects(45, 10));
+ ASSERT_FALSE(m.intersects(50, 5));
+ m.get(60, 100);
+ ASSERT_TRUE(m.intersects(45, 10));
+ ASSERT_TRUE(m.intersects(55, 10));
+ ASSERT_TRUE(m.intersects(50, 11));
+ ASSERT_FALSE(m.intersects(50, 10));
+ ASSERT_FALSE(m.intersects(51, 9));
+ ASSERT_FALSE(m.intersects(55, 1));
+}
+
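+// init_csum(type, order, len) keeps one checksum per 2^order-byte chunk;
+// order 3 over 24 bytes below means three 8-byte chunks, so the slices
+// f, m and e each cover one chunk. verify_csum() reports the offset of
+// the first mismatching chunk through bad_off (-1 when all match).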
+TEST(bluestore_blob_t, calc_csum)
+{
+ bufferlist bl;
+ bl.append("asdfghjkqwertyuizxcvbnm,");
+ bufferlist bl2;
+ bl2.append("xxxxXXXXyyyyYYYYzzzzZZZZ");
+ bufferlist f;
+ f.substr_of(bl, 0, 8);
+ bufferlist m;
+ m.substr_of(bl, 8, 8);
+ bufferlist e;
+ e.substr_of(bl, 16, 8);
+ bufferlist n;
+ n.append("12345678");
+
+ for (unsigned csum_type = Checksummer::CSUM_NONE + 1;
+ csum_type < Checksummer::CSUM_MAX;
+ ++csum_type) {
+ cout << "csum_type " << Checksummer::get_csum_type_string(csum_type)
+ << std::endl;
+
+ bluestore_blob_t b;
+ int bad_off;
+ uint64_t bad_csum;
+ ASSERT_EQ(0, b.verify_csum(0, bl, &bad_off, &bad_csum));
+ ASSERT_EQ(-1, bad_off);
+
+ b.init_csum(csum_type, 3, 24);
+ cout << " value size " << b.get_csum_value_size() << std::endl;
+ b.calc_csum(0, bl);
+ ASSERT_EQ(0, b.verify_csum(0, bl, &bad_off, &bad_csum));
+ ASSERT_EQ(-1, bad_off);
+ ASSERT_EQ(-1, b.verify_csum(0, bl2, &bad_off, &bad_csum));
+ ASSERT_EQ(0, bad_off);
+
+ ASSERT_EQ(0, b.verify_csum(0, f, &bad_off, &bad_csum));
+ ASSERT_EQ(-1, bad_off);
+ ASSERT_EQ(-1, b.verify_csum(8, f, &bad_off, &bad_csum));
+ ASSERT_EQ(8, bad_off);
+ ASSERT_EQ(-1, b.verify_csum(16, f, &bad_off, &bad_csum));
+ ASSERT_EQ(16, bad_off);
+
+ ASSERT_EQ(-1, b.verify_csum(0, m, &bad_off, &bad_csum));
+ ASSERT_EQ(0, bad_off);
+ ASSERT_EQ(0, b.verify_csum(8, m, &bad_off, &bad_csum));
+ ASSERT_EQ(-1, bad_off);
+ ASSERT_EQ(-1, b.verify_csum(16, m, &bad_off, &bad_csum));
+ ASSERT_EQ(16, bad_off);
+
+ ASSERT_EQ(-1, b.verify_csum(0, e, &bad_off, &bad_csum));
+ ASSERT_EQ(0, bad_off);
+ ASSERT_EQ(-1, b.verify_csum(8, e, &bad_off, &bad_csum));
+ ASSERT_EQ(8, bad_off);
+ ASSERT_EQ(0, b.verify_csum(16, e, &bad_off, &bad_csum));
+ ASSERT_EQ(-1, bad_off);
+
+ b.calc_csum(8, n);
+ ASSERT_EQ(0, b.verify_csum(0, f, &bad_off, &bad_csum));
+ ASSERT_EQ(-1, bad_off);
+ ASSERT_EQ(0, b.verify_csum(8, n, &bad_off, &bad_csum));
+ ASSERT_EQ(-1, bad_off);
+ ASSERT_EQ(0, b.verify_csum(16, e, &bad_off, &bad_csum));
+ ASSERT_EQ(-1, bad_off);
+ ASSERT_EQ(-1, b.verify_csum(0, bl, &bad_off, &bad_csum));
+ ASSERT_EQ(8, bad_off);
+ }
+}
+
+TEST(bluestore_blob_t, csum_bench)
+{
+ bufferlist bl;
+ bufferptr bp(10485760);
+ for (char *a = bp.c_str(); a < bp.c_str() + bp.length(); ++a)
+ *a = (unsigned long)a & 0xff;
+ bl.append(bp);
+ int count = 256;
+ for (unsigned csum_type = Checksummer::CSUM_NONE + 1;
+ csum_type < Checksummer::CSUM_MAX;
+ ++csum_type) {
+ bluestore_blob_t b;
+ b.init_csum(csum_type, 12, bl.length());
+ ceph::mono_clock::time_point start = ceph::mono_clock::now();
+ for (int i = 0; i<count; ++i) {
+ b.calc_csum(0, bl);
+ }
+ ceph::mono_clock::time_point end = ceph::mono_clock::now();
+ auto dur = std::chrono::duration_cast<ceph::timespan>(end - start);
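+ // dur counts nanoseconds (ceph::timespan), hence bytes/ns * 1e9 for
+ // bytes/sec and the 1e6 divisor for MB/sec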
+ double mbsec = (double)count * (double)bl.length() / 1000000.0 / (double)dur.count() * 1000000000.0;
+ cout << "csum_type " << Checksummer::get_csum_type_string(csum_type)
+ << ", " << dur << " seconds, "
+ << mbsec << " MB/sec" << std::endl;
+ }
+}
+
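+// Blob::put_ref() only releases physical space once a whole
+// min_alloc_size-aligned chunk (mas below) becomes unreferenced: freed
+// pextents land in r, the matching blob extents turn invalid, and the
+// return value reports whether the blob is now completely unreferenced.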
+TEST(Blob, put_ref)
+{
+ {
+ BlueStore store(g_ceph_context, "", 4096);
+ BlueStore::OnodeCacheShard *oc = BlueStore::OnodeCacheShard::create(
+ g_ceph_context, "lru", NULL);
+ BlueStore::BufferCacheShard *bc = BlueStore::BufferCacheShard::create(
+ g_ceph_context, "lru", NULL);
+
+ auto coll = ceph::make_ref<BlueStore::Collection>(&store, oc, bc, coll_t());
+ BlueStore::Blob b;
+ b.shared_blob = new BlueStore::SharedBlob(coll.get());
+ b.dirty_blob().allocated_test(bluestore_pextent_t(0x40715000, 0x2000));
+ b.dirty_blob().allocated_test(
+ bluestore_pextent_t(bluestore_pextent_t::INVALID_OFFSET, 0x8000));
+ b.dirty_blob().allocated_test(bluestore_pextent_t(0x4071f000, 0x5000));
+ b.get_ref(coll.get(), 0, 0x1200);
+ b.get_ref(coll.get(), 0xae00, 0x4200);
+ ASSERT_EQ(0x5400u, b.get_referenced_bytes());
+ cout << b << std::endl;
+ PExtentVector r;
+
+ ASSERT_FALSE(b.put_ref(coll.get(), 0, 0x1200, &r));
+ ASSERT_EQ(0x4200u, b.get_referenced_bytes());
+ cout << " r " << r << std::endl;
+ cout << b << std::endl;
+
+ r.clear();
+ ASSERT_TRUE(b.put_ref(coll.get(), 0xae00, 0x4200, &r));
+ ASSERT_EQ(0u, b.get_referenced_bytes());
+ cout << " r " << r << std::endl;
+ cout << b << std::endl;
+ }
+
+ unsigned mas = 4096;
+ BlueStore store(g_ceph_context, "", 8192);
+ BlueStore::OnodeCacheShard *oc = BlueStore::OnodeCacheShard::create(
+ g_ceph_context, "lru", NULL);
+ BlueStore::BufferCacheShard *bc = BlueStore::BufferCacheShard::create(
+ g_ceph_context, "lru", NULL);
+ auto coll = ceph::make_ref<BlueStore::Collection>(&store, oc, bc, coll_t());
+
+ {
+ BlueStore::Blob B;
+ B.shared_blob = new BlueStore::SharedBlob(coll.get());
+ bluestore_blob_t& b = B.dirty_blob();
+ PExtentVector r;
+ b.allocated_test(bluestore_pextent_t(0, mas * 2));
+ B.get_ref(coll.get(), 0, mas*2);
+ ASSERT_EQ(mas * 2, B.get_referenced_bytes());
+ ASSERT_TRUE(b.is_allocated(0, mas*2));
+ ASSERT_TRUE(B.put_ref(coll.get(), 0, mas*2, &r));
+ ASSERT_EQ(0u, B.get_referenced_bytes());
+ cout << "r " << r << " " << b << std::endl;
+ ASSERT_EQ(1u, r.size());
+ ASSERT_EQ(0u, r[0].offset);
+ ASSERT_EQ(mas*2, r[0].length);
+ ASSERT_FALSE(b.is_allocated(0, mas*2));
+ ASSERT_FALSE(b.is_allocated(0, mas));
+ ASSERT_FALSE(b.is_allocated(mas, 0));
+ ASSERT_FALSE(b.get_extents()[0].is_valid());
+ ASSERT_EQ(mas*2, b.get_extents()[0].length);
+ }
+ {
+ BlueStore::Blob B;
+ B.shared_blob = new BlueStore::SharedBlob(coll.get());
+ bluestore_blob_t& b = B.dirty_blob();
+ PExtentVector r;
+ b.allocated_test(bluestore_pextent_t(123, mas * 2));
+ B.get_ref(coll.get(), 0, mas*2);
+ ASSERT_EQ(mas * 2, B.get_referenced_bytes());
+ ASSERT_FALSE(B.put_ref(coll.get(), 0, mas, &r));
+ ASSERT_EQ(mas, B.get_referenced_bytes());
+ cout << "r " << r << " " << b << std::endl;
+ ASSERT_EQ(0u, r.size());
+ ASSERT_TRUE(b.is_allocated(0, mas*2));
+ ASSERT_TRUE(B.put_ref(coll.get(), mas, mas, &r));
+ ASSERT_EQ(0u, B.get_referenced_bytes());
+ cout << "r " << r << " " << b << std::endl;
+ ASSERT_EQ(1u, r.size());
+ ASSERT_EQ(123u, r[0].offset);
+ ASSERT_EQ(mas*2, r[0].length);
+ ASSERT_FALSE(b.is_allocated(0, mas*2));
+ ASSERT_FALSE(b.get_extents()[0].is_valid());
+ ASSERT_EQ(mas*2, b.get_extents()[0].length);
+ }
+ {
+ BlueStore::Blob B;
+ B.shared_blob = new BlueStore::SharedBlob(coll.get());
+ bluestore_blob_t& b = B.dirty_blob();
+ PExtentVector r;
+ b.allocated_test(bluestore_pextent_t(1, mas));
+ b.allocated_test(bluestore_pextent_t(2, mas));
+ b.allocated_test(bluestore_pextent_t(3, mas));
+ b.allocated_test(bluestore_pextent_t(4, mas));
+ B.get_ref(coll.get(), 0, mas*4);
+ ASSERT_EQ(mas * 4, B.get_referenced_bytes());
+ ASSERT_FALSE(B.put_ref(coll.get(), mas, mas, &r));
+ ASSERT_EQ(mas * 3, B.get_referenced_bytes());
+ cout << "r " << r << " " << b << std::endl;
+ ASSERT_EQ(0u, r.size());
+ ASSERT_TRUE(b.is_allocated(0, mas*4));
+ ASSERT_TRUE(b.is_allocated(mas, mas));
+ ASSERT_FALSE(B.put_ref(coll.get(), mas*2, mas, &r));
+ ASSERT_EQ(mas * 2, B.get_referenced_bytes());
+ cout << "r " << r << " " << b << std::endl;
+ ASSERT_EQ(0u, r.size());
+ ASSERT_TRUE(b.is_allocated(mas*2, mas));
+ ASSERT_TRUE(b.is_allocated(0, mas*4));
+ ASSERT_FALSE(B.put_ref(coll.get(), mas*3, mas, &r));
+ ASSERT_EQ(mas, B.get_referenced_bytes());
+ cout << "r " << r << " " << b << std::endl;
+ ASSERT_EQ(2u, r.size());
+ ASSERT_EQ(3u, r[0].offset);
+ ASSERT_EQ(mas, r[0].length);
+ ASSERT_EQ(4u, r[1].offset);
+ ASSERT_EQ(mas, r[1].length);
+ ASSERT_TRUE(b.is_allocated(0, mas*2));
+ ASSERT_FALSE(b.is_allocated(mas*2, mas*2));
+ ASSERT_TRUE(b.get_extents()[0].is_valid());
+ ASSERT_TRUE(b.get_extents()[1].is_valid());
+ ASSERT_FALSE(b.get_extents()[2].is_valid());
+ ASSERT_EQ(3u, b.get_extents().size());
+ }
+ {
+ BlueStore::Blob B;
+ B.shared_blob = new BlueStore::SharedBlob(coll.get());
+ bluestore_blob_t& b = B.dirty_blob();
+ PExtentVector r;
+ b.allocated_test(bluestore_pextent_t(1, mas));
+ b.allocated_test(bluestore_pextent_t(2, mas));
+ b.allocated_test(bluestore_pextent_t(3, mas));
+ b.allocated_test(bluestore_pextent_t(4, mas));
+ b.allocated_test(bluestore_pextent_t(5, mas));
+ b.allocated_test(bluestore_pextent_t(6, mas));
+ B.get_ref(coll.get(), 0, mas*6);
+ ASSERT_EQ(mas * 6, B.get_referenced_bytes());
+ ASSERT_FALSE(B.put_ref(coll.get(), mas, mas, &r));
+ ASSERT_EQ(mas * 5, B.get_referenced_bytes());
+ cout << "r " << r << " " << b << std::endl;
+ ASSERT_EQ(0u, r.size());
+ ASSERT_TRUE(b.is_allocated(0, mas*6));
+ ASSERT_FALSE(B.put_ref(coll.get(), mas*2, mas, &r));
+ ASSERT_EQ(mas * 4, B.get_referenced_bytes());
+ cout << "r " << r << " " << b << std::endl;
+ ASSERT_EQ(0u, r.size());
+ ASSERT_TRUE(b.is_allocated(0, mas*6));
+ ASSERT_FALSE(B.put_ref(coll.get(), mas*3, mas, &r));
+ ASSERT_EQ(mas * 3, B.get_referenced_bytes());
+ cout << "r " << r << " " << b << std::endl;
+ ASSERT_EQ(2u, r.size());
+ ASSERT_EQ(3u, r[0].offset);
+ ASSERT_EQ(mas, r[0].length);
+ ASSERT_EQ(4u, r[1].offset);
+ ASSERT_EQ(mas, r[1].length);
+ ASSERT_TRUE(b.is_allocated(0, mas*2));
+ ASSERT_FALSE(b.is_allocated(mas*2, mas*2));
+ ASSERT_TRUE(b.is_allocated(mas*4, mas*2));
+ ASSERT_EQ(5u, b.get_extents().size());
+ ASSERT_TRUE(b.get_extents()[0].is_valid());
+ ASSERT_TRUE(b.get_extents()[1].is_valid());
+ ASSERT_FALSE(b.get_extents()[2].is_valid());
+ ASSERT_TRUE(b.get_extents()[3].is_valid());
+ ASSERT_TRUE(b.get_extents()[4].is_valid());
+ }
+ {
+ BlueStore::Blob B;
+ B.shared_blob = new BlueStore::SharedBlob(coll.get());
+ bluestore_blob_t& b = B.dirty_blob();
+ PExtentVector r;
+ b.allocated_test(bluestore_pextent_t(1, mas * 6));
+ B.get_ref(coll.get(), 0, mas*6);
+ ASSERT_EQ(mas * 6, B.get_referenced_bytes());
+ ASSERT_FALSE(B.put_ref(coll.get(), mas, mas, &r));
+ ASSERT_EQ(mas * 5, B.get_referenced_bytes());
+ cout << "r " << r << " " << b << std::endl;
+ ASSERT_EQ(0u, r.size());
+ ASSERT_TRUE(b.is_allocated(0, mas*6));
+ ASSERT_FALSE(B.put_ref(coll.get(), mas*2, mas, &r));
+ ASSERT_EQ(mas * 4, B.get_referenced_bytes());
+ cout << "r " << r << " " << b << std::endl;
+ ASSERT_EQ(0u, r.size());
+ ASSERT_TRUE(b.is_allocated(0, mas*6));
+ ASSERT_FALSE(B.put_ref(coll.get(), mas*3, mas, &r));
+ ASSERT_EQ(mas * 3, B.get_referenced_bytes());
+ cout << "r " << r << " " << b << std::endl;
+ ASSERT_EQ(1u, r.size());
+ ASSERT_EQ(0x2001u, r[0].offset);
+ ASSERT_EQ(mas*2, r[0].length);
+ ASSERT_TRUE(b.is_allocated(0, mas*2));
+ ASSERT_FALSE(b.is_allocated(mas*2, mas*2));
+ ASSERT_TRUE(b.is_allocated(mas*4, mas*2));
+ ASSERT_EQ(3u, b.get_extents().size());
+ ASSERT_TRUE(b.get_extents()[0].is_valid());
+ ASSERT_FALSE(b.get_extents()[1].is_valid());
+ ASSERT_TRUE(b.get_extents()[2].is_valid());
+ }
+ {
+ BlueStore::Blob B;
+ B.shared_blob = new BlueStore::SharedBlob(coll.get());
+ bluestore_blob_t& b = B.dirty_blob();
+ PExtentVector r;
+ b.allocated_test(bluestore_pextent_t(1, mas * 4));
+ b.allocated_test(bluestore_pextent_t(2, mas * 4));
+ b.allocated_test(bluestore_pextent_t(3, mas * 4));
+ B.get_ref(coll.get(), 0, mas*12);
+ ASSERT_EQ(mas * 12, B.get_referenced_bytes());
+ ASSERT_FALSE(B.put_ref(coll.get(), mas, mas, &r));
+ ASSERT_EQ(mas * 11, B.get_referenced_bytes());
+ cout << "r " << r << " " << b << std::endl;
+ ASSERT_EQ(0u, r.size());
+ ASSERT_TRUE(b.is_allocated(0, mas*12));
+ ASSERT_FALSE(B.put_ref(coll.get(), mas*9, mas, &r));
+ ASSERT_EQ(mas * 10, B.get_referenced_bytes());
+ cout << "r " << r << " " << b << std::endl;
+ ASSERT_EQ(0u, r.size());
+ ASSERT_TRUE(b.is_allocated(0, mas*12));
+ ASSERT_FALSE(B.put_ref(coll.get(), mas*2, mas*7, &r));
+ ASSERT_EQ(mas * 3, B.get_referenced_bytes());
+ cout << "r " << r << " " << b << std::endl;
+ ASSERT_EQ(3u, r.size());
+ ASSERT_EQ(0x2001u, r[0].offset);
+ ASSERT_EQ(mas*2, r[0].length);
+ ASSERT_EQ(0x2u, r[1].offset);
+ ASSERT_EQ(mas*4, r[1].length);
+ ASSERT_EQ(0x3u, r[2].offset);
+ ASSERT_EQ(mas*2, r[2].length);
+ ASSERT_TRUE(b.is_allocated(0, mas*2));
+ ASSERT_FALSE(b.is_allocated(mas*2, mas*8));
+ ASSERT_TRUE(b.is_allocated(mas*10, mas*2));
+ ASSERT_EQ(3u, b.get_extents().size());
+ ASSERT_TRUE(b.get_extents()[0].is_valid());
+ ASSERT_FALSE(b.get_extents()[1].is_valid());
+ ASSERT_TRUE(b.get_extents()[2].is_valid());
+ }
+ {
+ BlueStore::Blob B;
+ B.shared_blob = new BlueStore::SharedBlob(coll.get());
+ bluestore_blob_t& b = B.dirty_blob();
+ PExtentVector r;
+ b.allocated_test(bluestore_pextent_t(1, mas * 4));
+ b.allocated_test(bluestore_pextent_t(2, mas * 4));
+ b.allocated_test(bluestore_pextent_t(3, mas * 4));
+ B.get_ref(coll.get(), 0, mas*12);
+ ASSERT_EQ(mas * 12, B.get_referenced_bytes());
+ ASSERT_FALSE(B.put_ref(coll.get(), mas, mas, &r));
+ ASSERT_EQ(mas * 11, B.get_referenced_bytes());
+ cout << "r " << r << " " << b << std::endl;
+ ASSERT_EQ(0u, r.size());
+ ASSERT_TRUE(b.is_allocated(0, mas*12));
+ ASSERT_FALSE(B.put_ref(coll.get(), mas*9, mas, &r));
+ ASSERT_EQ(mas * 10, B.get_referenced_bytes());
+ cout << "r " << r << " " << b << std::endl;
+ ASSERT_EQ(0u, r.size());
+ ASSERT_TRUE(b.is_allocated(0, mas*12));
+ ASSERT_FALSE(B.put_ref(coll.get(), mas*2, mas*7, &r));
+ ASSERT_EQ(mas * 3, B.get_referenced_bytes());
+ cout << "r " << r << " " << b << std::endl;
+ ASSERT_EQ(3u, r.size());
+ ASSERT_EQ(0x2001u, r[0].offset);
+ ASSERT_EQ(mas*2, r[0].length);
+ ASSERT_EQ(0x2u, r[1].offset);
+ ASSERT_EQ(mas*4, r[1].length);
+ ASSERT_EQ(0x3u, r[2].offset);
+ ASSERT_EQ(mas*2, r[2].length);
+ ASSERT_TRUE(b.is_allocated(0, mas*2));
+ ASSERT_FALSE(b.is_allocated(mas*2, mas*8));
+ ASSERT_TRUE(b.is_allocated(mas*10, mas*2));
+ ASSERT_EQ(3u, b.get_extents().size());
+ ASSERT_TRUE(b.get_extents()[0].is_valid());
+ ASSERT_FALSE(b.get_extents()[1].is_valid());
+ ASSERT_TRUE(b.get_extents()[2].is_valid());
+ ASSERT_FALSE(B.put_ref(coll.get(), 0, mas, &r));
+ ASSERT_EQ(mas * 2, B.get_referenced_bytes());
+ cout << "r " << r << " " << b << std::endl;
+ ASSERT_EQ(1u, r.size());
+ ASSERT_EQ(0x1u, r[0].offset);
+ ASSERT_EQ(mas*2, r[0].length);
+ ASSERT_EQ(2u, b.get_extents().size());
+ ASSERT_FALSE(b.get_extents()[0].is_valid());
+ ASSERT_TRUE(b.get_extents()[1].is_valid());
+ ASSERT_TRUE(B.put_ref(coll.get(), mas*10, mas*2, &r));
+ ASSERT_EQ(mas * 0, B.get_referenced_bytes());
+ cout << "r " << r << " " << b << std::endl;
+ ASSERT_EQ(1u, r.size());
+ ASSERT_EQ(0x2003u, r[0].offset);
+ ASSERT_EQ(mas*2, r[0].length);
+ ASSERT_EQ(1u, b.get_extents().size());
+ ASSERT_FALSE(b.get_extents()[0].is_valid());
+ }
+ {
+ BlueStore::Blob B;
+ B.shared_blob = new BlueStore::SharedBlob(coll.get());
+ bluestore_blob_t& b = B.dirty_blob();
+ PExtentVector r;
+ b.allocated_test(bluestore_pextent_t(1, mas * 4));
+ b.allocated_test(bluestore_pextent_t(2, mas * 4));
+ b.allocated_test(bluestore_pextent_t(3, mas * 4));
+ B.get_ref(coll.get(), 0, mas*12);
+ ASSERT_EQ(mas * 12, B.get_referenced_bytes());
+ ASSERT_FALSE(B.put_ref(coll.get(), mas, mas, &r));
+ ASSERT_EQ(mas * 11, B.get_referenced_bytes());
+ cout << "r " << r << " " << b << std::endl;
+ ASSERT_EQ(0u, r.size());
+ ASSERT_TRUE(b.is_allocated(0, mas*12));
+ ASSERT_FALSE(B.put_ref(coll.get(), mas*9, mas, &r));
+ ASSERT_EQ(mas * 10, B.get_referenced_bytes());
+ cout << "r " << r << " " << b << std::endl;
+ ASSERT_EQ(0u, r.size());
+ ASSERT_TRUE(b.is_allocated(0, mas*12));
+ ASSERT_FALSE(B.put_ref(coll.get(), mas*2, mas*7, &r));
+ ASSERT_EQ(mas * 3, B.get_referenced_bytes());
+ cout << "r " << r << " " << b << std::endl;
+ ASSERT_EQ(3u, r.size());
+ ASSERT_EQ(0x2001u, r[0].offset);
+ ASSERT_EQ(mas*2, r[0].length);
+ ASSERT_EQ(0x2u, r[1].offset);
+ ASSERT_EQ(mas*4, r[1].length);
+ ASSERT_EQ(0x3u, r[2].offset);
+ ASSERT_EQ(mas*2, r[2].length);
+ ASSERT_TRUE(b.is_allocated(0, mas*2));
+ ASSERT_FALSE(b.is_allocated(mas*2, mas*8));
+ ASSERT_TRUE(b.is_allocated(mas*10, mas*2));
+ ASSERT_EQ(3u, b.get_extents().size());
+ ASSERT_TRUE(b.get_extents()[0].is_valid());
+ ASSERT_FALSE(b.get_extents()[1].is_valid());
+ ASSERT_TRUE(b.get_extents()[2].is_valid());
+ ASSERT_FALSE(B.put_ref(coll.get(), mas*10, mas*2, &r));
+ ASSERT_EQ(mas * 1, B.get_referenced_bytes());
+ cout << "r " << r << " " << b << std::endl;
+ ASSERT_EQ(1u, r.size());
+ ASSERT_EQ(0x2003u, r[0].offset);
+ ASSERT_EQ(mas*2, r[0].length);
+ ASSERT_EQ(2u, b.get_extents().size());
+ ASSERT_TRUE(b.get_extents()[0].is_valid());
+ ASSERT_FALSE(b.get_extents()[1].is_valid());
+ ASSERT_TRUE(B.put_ref(coll.get(), 0, mas, &r));
+ ASSERT_EQ(mas * 0, B.get_referenced_bytes());
+ cout << "r " << r << " " << b << std::endl;
+ ASSERT_EQ(1u, r.size());
+ ASSERT_EQ(0x1u, r[0].offset);
+ ASSERT_EQ(mas*2, r[0].length);
+ ASSERT_EQ(1u, b.get_extents().size());
+ ASSERT_FALSE(b.get_extents()[0].is_valid());
+ }
+ {
+ BlueStore::Blob B;
+ B.shared_blob = new BlueStore::SharedBlob(coll.get());
+ bluestore_blob_t& b = B.dirty_blob();
+ PExtentVector r;
+ b.allocated_test(bluestore_pextent_t(1, mas * 8));
+ B.get_ref(coll.get(), 0, mas*8);
+ ASSERT_EQ(mas * 8, B.get_referenced_bytes());
+ ASSERT_FALSE(B.put_ref(coll.get(), 0, mas, &r));
+ ASSERT_EQ(mas * 7, B.get_referenced_bytes());
+ cout << "r " << r << " " << b << std::endl;
+ ASSERT_EQ(0u, r.size());
+ ASSERT_TRUE(b.is_allocated(0, mas*8));
+ ASSERT_FALSE(B.put_ref(coll.get(), mas*7, mas, &r));
+ ASSERT_EQ(mas * 6, B.get_referenced_bytes());
+ cout << "r " << r << " " << b << std::endl;
+ ASSERT_EQ(0u, r.size());
+ ASSERT_TRUE(b.is_allocated(0, mas*8));
+ ASSERT_FALSE(B.put_ref(coll.get(), mas*2, mas, &r));
+ ASSERT_EQ(mas * 5, B.get_referenced_bytes());
+ cout << "r " << r << " " << b << std::endl;
+ ASSERT_EQ(0u, r.size());
+ ASSERT_TRUE(b.is_allocated(0, mas*8));
+ ASSERT_FALSE(B.put_ref(coll.get(), mas*3, mas*4, &r));
+ ASSERT_EQ(mas * 1, B.get_referenced_bytes());
+ ASSERT_EQ(1u, r.size());
+ ASSERT_EQ(0x2001u, r[0].offset);
+ ASSERT_EQ(mas*6, r[0].length);
+ ASSERT_TRUE(b.is_allocated(0, mas*2));
+ ASSERT_FALSE(b.is_allocated(mas*2, mas*6));
+ ASSERT_EQ(2u, b.get_extents().size());
+ ASSERT_TRUE(b.get_extents()[0].is_valid());
+ ASSERT_FALSE(b.get_extents()[1].is_valid());
+ ASSERT_TRUE(B.put_ref(coll.get(), mas, mas, &r));
+ ASSERT_EQ(mas * 0, B.get_referenced_bytes());
+ cout << "r " << r << " " << b << std::endl;
+ ASSERT_EQ(1u, r.size());
+ ASSERT_EQ(0x1u, r[0].offset);
+ ASSERT_EQ(mas*2, r[0].length);
+ ASSERT_EQ(1u, b.get_extents().size());
+ ASSERT_FALSE(b.get_extents()[0].is_valid());
+ }
+ // verify csum chunk size is factored in properly
+ {
+ BlueStore::Blob B;
+ B.shared_blob = new BlueStore::SharedBlob(coll.get());
+ bluestore_blob_t& b = B.dirty_blob();
+ PExtentVector r;
+ b.allocated_test(bluestore_pextent_t(0, mas*4));
+ b.init_csum(Checksummer::CSUM_CRC32C, 14, mas * 4);
+ B.get_ref(coll.get(), 0, mas*4);
+ ASSERT_EQ(mas * 4, B.get_referenced_bytes());
+ ASSERT_TRUE(b.is_allocated(0, mas*4));
+ ASSERT_FALSE(B.put_ref(coll.get(), 0, mas*3, &r));
+ ASSERT_EQ(mas * 1, B.get_referenced_bytes());
+ cout << "r " << r << " " << b << std::endl;
+ ASSERT_EQ(0u, r.size());
+ ASSERT_TRUE(b.is_allocated(0, mas*4));
+ ASSERT_TRUE(b.get_extents()[0].is_valid());
+ ASSERT_EQ(mas*4, b.get_extents()[0].length);
+ }
+ {
+ BlueStore::Blob B;
+ B.shared_blob = new BlueStore::SharedBlob(coll.get());
+ bluestore_blob_t& b = B.dirty_blob();
+ b.allocated_test(bluestore_pextent_t(0x40101000, 0x4000));
+ b.allocated_test(bluestore_pextent_t(bluestore_pextent_t::INVALID_OFFSET,
+ 0x13000));
+
+ b.allocated_test(bluestore_pextent_t(0x40118000, 0x7000));
+ B.get_ref(coll.get(), 0x0, 0x3800);
+ B.get_ref(coll.get(), 0x17c00, 0x6400);
+ ASSERT_EQ(0x3800u + 0x6400u, B.get_referenced_bytes());
+ b.set_flag(bluestore_blob_t::FLAG_SHARED);
+ b.init_csum(Checksummer::CSUM_CRC32C, 12, 0x1e000);
+
+ cout << "before: " << B << std::endl;
+ PExtentVector r;
+ ASSERT_FALSE(B.put_ref(coll.get(), 0x1800, 0x2000, &r));
+ ASSERT_EQ(0x3800u + 0x6400u - 0x2000u, B.get_referenced_bytes());
+ cout << "after: " << B << std::endl;
+ cout << "r " << r << std::endl;
+ }
+ {
+ BlueStore::Blob B;
+ B.shared_blob = new BlueStore::SharedBlob(coll.get());
+ bluestore_blob_t& b = B.dirty_blob();
+ b.allocated_test(bluestore_pextent_t(1, 0x5000));
+ b.allocated_test(bluestore_pextent_t(2, 0x5000));
+ B.get_ref(coll.get(), 0x0, 0xa000);
+ ASSERT_EQ(0xa000u, B.get_referenced_bytes());
+ cout << "before: " << B << std::endl;
+ PExtentVector r;
+ ASSERT_FALSE(B.put_ref(coll.get(), 0x8000, 0x2000, &r));
+ cout << "after: " << B << std::endl;
+ cout << "r " << r << std::endl;
+ ASSERT_EQ(0x8000u, B.get_referenced_bytes());
+ ASSERT_EQ(1u, r.size());
+ ASSERT_EQ(0x3002u, r[0].offset);
+ ASSERT_EQ(0x2000u, r[0].length);
+ }
+ {
+ BlueStore::Blob B;
+ B.shared_blob = new BlueStore::SharedBlob(coll.get());
+ bluestore_blob_t& b = B.dirty_blob();
+ b.allocated_test(bluestore_pextent_t(1, 0x7000));
+ b.allocated_test(bluestore_pextent_t(2, 0x7000));
+ B.get_ref(coll.get(), 0x0, 0xe000);
+ ASSERT_EQ(0xe000u, B.get_referenced_bytes());
+ cout << "before: " << B << std::endl;
+ PExtentVector r;
+ ASSERT_FALSE(B.put_ref(coll.get(), 0, 0xb000, &r));
+ ASSERT_EQ(0x3000u, B.get_referenced_bytes());
+ cout << "after: " << B << std::endl;
+ cout << "r " << r << std::endl;
+ ASSERT_EQ(0x3000u, B.get_referenced_bytes());
+ ASSERT_EQ(2u, r.size());
+ ASSERT_EQ(1u, r[0].offset);
+ ASSERT_EQ(0x7000u, r[0].length);
+ ASSERT_EQ(2u, r[1].offset);
+ ASSERT_EQ(0x3000u, r[1].length); // we have 0x1000 bytes less due to
+ // alignment caused by min_alloc_size = 0x2000
+ }
+ {
+ BlueStore store(g_ceph_context, "", 0x4000);
+ BlueStore::OnodeCacheShard *oc = BlueStore::OnodeCacheShard::create(
+ g_ceph_context, "lru", NULL);
+ BlueStore::BufferCacheShard *bc = BlueStore::BufferCacheShard::create(
+ g_ceph_context, "lru", NULL);
+
+ auto coll = ceph::make_ref<BlueStore::Collection>(&store, oc, bc, coll_t());
+ BlueStore::Blob B;
+ B.shared_blob = new BlueStore::SharedBlob(coll.get());
+ bluestore_blob_t& b = B.dirty_blob();
+ b.allocated_test(bluestore_pextent_t(1, 0x5000));
+ b.allocated_test(bluestore_pextent_t(2, 0x7000));
+ B.get_ref(coll.get(), 0x0, 0xc000);
+ ASSERT_EQ(0xc000u, B.get_referenced_bytes());
+ cout << "before: " << B << std::endl;
+ PExtentVector r;
+ ASSERT_FALSE(B.put_ref(coll.get(), 0x2000, 0xa000, &r));
+ cout << "after: " << B << std::endl;
+ cout << "r " << r << std::endl;
+ ASSERT_EQ(0x2000u, B.get_referenced_bytes());
+ ASSERT_EQ(2u, r.size());
+ ASSERT_EQ(0x4001u, r[0].offset);
+ ASSERT_EQ(0x1000u, r[0].length);
+ ASSERT_EQ(2u, r[1].offset);
+ ASSERT_EQ(0x7000u, r[1].length);
+ ASSERT_EQ(1u, b.get_extents()[0].offset);
+ ASSERT_EQ(0x4000u, b.get_extents()[0].length);
+ }
+}
+
+TEST(bluestore_blob_t, can_split)
+{
+ bluestore_blob_t a;
+ ASSERT_TRUE(a.can_split());
+ a.flags = bluestore_blob_t::FLAG_SHARED;
+ ASSERT_FALSE(a.can_split());
+ a.flags = bluestore_blob_t::FLAG_COMPRESSED;
+ ASSERT_FALSE(a.can_split());
+ a.flags = bluestore_blob_t::FLAG_HAS_UNUSED;
+ ASSERT_FALSE(a.can_split());
+}
+
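+// Once checksums are initialized, a blob may only be split on csum chunk
+// boundaries: with csum order 12 (0x1000-byte chunks) below, 0x2800
+// stops being a legal split point while 0x1000/0x2000/0x3000 remain so.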
+TEST(bluestore_blob_t, can_split_at)
+{
+ bluestore_blob_t a;
+ a.allocated_test(bluestore_pextent_t(0x10000, 0x2000));
+ a.allocated_test(bluestore_pextent_t(0x20000, 0x2000));
+ ASSERT_TRUE(a.can_split_at(0x1000));
+ ASSERT_TRUE(a.can_split_at(0x1800));
+ a.init_csum(Checksummer::CSUM_CRC32C, 12, 0x4000);
+ ASSERT_TRUE(a.can_split_at(0x1000));
+ ASSERT_TRUE(a.can_split_at(0x2000));
+ ASSERT_TRUE(a.can_split_at(0x3000));
+ ASSERT_FALSE(a.can_split_at(0x2800));
+}
+
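+// prune_tail() drops a trailing invalid (unallocated) pextent, shrinking
+// the logical length and trimming csum_data to match, as the length and
+// csum checks below verify.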
+TEST(bluestore_blob_t, prune_tail)
+{
+ bluestore_blob_t a;
+ a.allocated_test(bluestore_pextent_t(0x10000, 0x2000));
+ a.allocated_test(bluestore_pextent_t(0x20000, 0x2000));
+ ASSERT_FALSE(a.can_prune_tail());
+ a.allocated_test(
+ bluestore_pextent_t(bluestore_pextent_t::INVALID_OFFSET, 0x2000));
+ ASSERT_TRUE(a.can_prune_tail());
+ a.prune_tail();
+ ASSERT_FALSE(a.can_prune_tail());
+ ASSERT_EQ(2u, a.get_extents().size());
+ ASSERT_EQ(0x4000u, a.get_logical_length());
+
+ a.allocated_test(
+ bluestore_pextent_t(bluestore_pextent_t::INVALID_OFFSET, 0x2000));
+ a.init_csum(Checksummer::CSUM_CRC32C_8, 12, 0x6000);
+ ASSERT_EQ(6u, a.csum_data.length());
+ ASSERT_TRUE(a.can_prune_tail());
+ a.prune_tail();
+ ASSERT_FALSE(a.can_prune_tail());
+ ASSERT_EQ(2u, a.get_extents().size());
+ ASSERT_EQ(0x4000u, a.get_logical_length());
+ ASSERT_EQ(4u, a.csum_data.length());
+
+ bluestore_blob_t b;
+ b.allocated_test(
+ bluestore_pextent_t(bluestore_pextent_t::INVALID_OFFSET, 0x2000));
+ ASSERT_FALSE(b.can_prune_tail());
+}
+
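+// Blob::split() cuts at a logical offset: the right part moves into R
+// together with its share of the pextents, csum data and use-tracker
+// refs, covering both the single-pextent and the two-pextent layouts.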
+TEST(Blob, split)
+{
+ BlueStore store(g_ceph_context, "", 4096);
+ BlueStore::OnodeCacheShard *oc = BlueStore::OnodeCacheShard::create(
+ g_ceph_context, "lru", NULL);
+ BlueStore::BufferCacheShard *bc = BlueStore::BufferCacheShard::create(
+ g_ceph_context, "lru", NULL);
+ auto coll = ceph::make_ref<BlueStore::Collection>(&store, oc, bc, coll_t());
+ {
+ BlueStore::Blob L, R;
+ L.shared_blob = new BlueStore::SharedBlob(coll.get());
+ R.shared_blob = new BlueStore::SharedBlob(coll.get());
+ L.dirty_blob().allocated_test(bluestore_pextent_t(0x2000, 0x2000));
+ L.dirty_blob().init_csum(Checksummer::CSUM_CRC32C, 12, 0x2000);
+ L.get_ref(coll.get(), 0, 0x2000);
+ L.split(coll.get(), 0x1000, &R);
+ ASSERT_EQ(0x1000u, L.get_blob().get_logical_length());
+ ASSERT_EQ(4u, L.get_blob().csum_data.length());
+ ASSERT_EQ(1u, L.get_blob().get_extents().size());
+ ASSERT_EQ(0x2000u, L.get_blob().get_extents().front().offset);
+ ASSERT_EQ(0x1000u, L.get_blob().get_extents().front().length);
+ ASSERT_EQ(0x1000u, L.get_referenced_bytes());
+ ASSERT_EQ(0x1000u, R.get_blob().get_logical_length());
+ ASSERT_EQ(4u, R.get_blob().csum_data.length());
+ ASSERT_EQ(1u, R.get_blob().get_extents().size());
+ ASSERT_EQ(0x3000u, R.get_blob().get_extents().front().offset);
+ ASSERT_EQ(0x1000u, R.get_blob().get_extents().front().length);
+ ASSERT_EQ(0x1000u, R.get_referenced_bytes());
+ }
+ {
+ BlueStore::Blob L, R;
+ L.shared_blob = new BlueStore::SharedBlob(coll.get());
+ R.shared_blob = new BlueStore::SharedBlob(coll.get());
+ L.dirty_blob().allocated_test(bluestore_pextent_t(0x2000, 0x1000));
+ L.dirty_blob().allocated_test(bluestore_pextent_t(0x12000, 0x1000));
+ L.dirty_blob().init_csum(Checksummer::CSUM_CRC32C, 12, 0x2000);
+ L.get_ref(coll.get(), 0, 0x1000);
+ L.get_ref(coll.get(), 0x1000, 0x1000);
+ L.split(coll.get(), 0x1000, &R);
+ ASSERT_EQ(0x1000u, L.get_blob().get_logical_length());
+ ASSERT_EQ(4u, L.get_blob().csum_data.length());
+ ASSERT_EQ(1u, L.get_blob().get_extents().size());
+ ASSERT_EQ(0x2000u, L.get_blob().get_extents().front().offset);
+ ASSERT_EQ(0x1000u, L.get_blob().get_extents().front().length);
+ ASSERT_EQ(0x1000u, L.get_referenced_bytes());
+ ASSERT_EQ(0x1000u, R.get_blob().get_logical_length());
+ ASSERT_EQ(4u, R.get_blob().csum_data.length());
+ ASSERT_EQ(1u, R.get_blob().get_extents().size());
+ ASSERT_EQ(0x12000u, R.get_blob().get_extents().front().offset);
+ ASSERT_EQ(0x1000u, R.get_blob().get_extents().front().length);
+ ASSERT_EQ(0x1000u, R.get_referenced_bytes());
+ }
+}
+
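+// struct_v 1 encoded the extent ref map separately from the blob, while
+// struct_v 2 embeds the use tracker; the test encodes the same blob both
+// ways and checks that decoding yields equal referenced-bytes tracking.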
+TEST(Blob, legacy_decode)
+{
+ BlueStore store(g_ceph_context, "", 4096);
+ BlueStore::OnodeCacheShard *oc = BlueStore::OnodeCacheShard::create(
+ g_ceph_context, "lru", NULL);
+ BlueStore::BufferCacheShard *bc = BlueStore::BufferCacheShard::create(
+ g_ceph_context, "lru", NULL);
+ auto coll = ceph::make_ref<BlueStore::Collection>(&store, oc, bc, coll_t());
+ bufferlist bl, bl2;
+ {
+ BlueStore::Blob B;
+
+ B.shared_blob = new BlueStore::SharedBlob(coll.get());
+ B.dirty_blob().allocated_test(bluestore_pextent_t(0x1, 0x2000));
+ B.dirty_blob().init_csum(Checksummer::CSUM_CRC32C, 12, 0x2000);
+ B.get_ref(coll.get(), 0, 0xff0);
+ B.get_ref(coll.get(), 0x1fff, 1);
+
+ bluestore_extent_ref_map_t fake_ref_map;
+ fake_ref_map.get(0, 0xff0);
+ fake_ref_map.get(0x1fff, 1);
+
+ size_t bound = 0, bound2 = 0;
+
+ B.bound_encode(
+ bound,
+ 1, /*struct_v*/
+ 0, /*sbid*/
+ false);
+ fake_ref_map.bound_encode(bound);
+
+ B.bound_encode(
+ bound2,
+ 2, /*struct_v*/
+ 0, /*sbid*/
+ true);
+
+ {
+ auto app = bl.get_contiguous_appender(bound);
+ auto app2 = bl2.get_contiguous_appender(bound2);
+ B.encode(
+ app,
+ 1, /*struct_v*/
+ 0, /*sbid*/
+ false);
+ fake_ref_map.encode(app);
+
+ B.encode(
+ app2,
+ 2, /*struct_v*/
+ 0, /*sbid*/
+ true);
+ }
+
+ auto p = bl.front().begin_deep();
+ auto p2 = bl2.front().begin_deep();
+ BlueStore::Blob Bres, Bres2;
+ Bres.shared_blob = new BlueStore::SharedBlob(coll.get());
+ Bres2.shared_blob = new BlueStore::SharedBlob(coll.get());
+
+ uint64_t sbid, sbid2;
+ Bres.decode(
+ p,
+ 1, /*struct_v*/
+ &sbid,
+ true,
+ coll.get());
+ Bres2.decode(
+ p2,
+ 2, /*struct_v*/
+ &sbid2,
+ true,
+ coll.get());
+
+ ASSERT_EQ(0xff0u + 1u, Bres.get_blob_use_tracker().get_referenced_bytes());
+ ASSERT_EQ(0xff0u + 1u, Bres2.get_blob_use_tracker().get_referenced_bytes());
+ ASSERT_TRUE(Bres.get_blob_use_tracker().equal(Bres2.get_blob_use_tracker()));
+ }
+}
+
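+// seek_lextent(offset) returns the first extent whose end lies beyond
+// offset, i.e. the extent containing offset or the next one past a hole,
+// and end() when offset is beyond the last extent.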
+TEST(ExtentMap, seek_lextent)
+{
+ BlueStore store(g_ceph_context, "", 4096);
+ BlueStore::OnodeCacheShard *oc = BlueStore::OnodeCacheShard::create(
+ g_ceph_context, "lru", NULL);
+ BlueStore::BufferCacheShard *bc = BlueStore::BufferCacheShard::create(
+ g_ceph_context, "lru", NULL);
+
+ auto coll = ceph::make_ref<BlueStore::Collection>(&store, oc, bc, coll_t());
+ BlueStore::Onode onode(coll.get(), ghobject_t(), "");
+ BlueStore::ExtentMap em(&onode,
+ g_ceph_context->_conf->bluestore_extent_map_inline_shard_prealloc_size);
+ BlueStore::BlobRef br(new BlueStore::Blob);
+ br->shared_blob = new BlueStore::SharedBlob(coll.get());
+
+ ASSERT_EQ(em.extent_map.end(), em.seek_lextent(0));
+ ASSERT_EQ(em.extent_map.end(), em.seek_lextent(100));
+
+ em.extent_map.insert(*new BlueStore::Extent(100, 0, 100, br));
+ auto a = em.find(100);
+ ASSERT_EQ(a, em.seek_lextent(0));
+ ASSERT_EQ(a, em.seek_lextent(99));
+ ASSERT_EQ(a, em.seek_lextent(100));
+ ASSERT_EQ(a, em.seek_lextent(101));
+ ASSERT_EQ(a, em.seek_lextent(199));
+ ASSERT_EQ(em.extent_map.end(), em.seek_lextent(200));
+
+ em.extent_map.insert(*new BlueStore::Extent(200, 0, 100, br));
+ auto b = em.find(200);
+ ASSERT_EQ(a, em.seek_lextent(0));
+ ASSERT_EQ(a, em.seek_lextent(99));
+ ASSERT_EQ(a, em.seek_lextent(100));
+ ASSERT_EQ(a, em.seek_lextent(101));
+ ASSERT_EQ(a, em.seek_lextent(199));
+ ASSERT_EQ(b, em.seek_lextent(200));
+ ASSERT_EQ(b, em.seek_lextent(299));
+ ASSERT_EQ(em.extent_map.end(), em.seek_lextent(300));
+
+ em.extent_map.insert(*new BlueStore::Extent(400, 0, 100, br));
+ auto d = em.find(400);
+ ASSERT_EQ(a, em.seek_lextent(0));
+ ASSERT_EQ(a, em.seek_lextent(99));
+ ASSERT_EQ(a, em.seek_lextent(100));
+ ASSERT_EQ(a, em.seek_lextent(101));
+ ASSERT_EQ(a, em.seek_lextent(199));
+ ASSERT_EQ(b, em.seek_lextent(200));
+ ASSERT_EQ(b, em.seek_lextent(299));
+ ASSERT_EQ(d, em.seek_lextent(300));
+ ASSERT_EQ(d, em.seek_lextent(399));
+ ASSERT_EQ(d, em.seek_lextent(400));
+ ASSERT_EQ(d, em.seek_lextent(499));
+ ASSERT_EQ(em.extent_map.end(), em.seek_lextent(500));
+}
+
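+// has_any_lextents(offset, length) is a pure overlap test: true iff some
+// logical extent intersects [offset, offset+length).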
+TEST(ExtentMap, has_any_lextents)
+{
+ BlueStore store(g_ceph_context, "", 4096);
+ BlueStore::OnodeCacheShard *oc = BlueStore::OnodeCacheShard::create(
+ g_ceph_context, "lru", NULL);
+ BlueStore::BufferCacheShard *bc = BlueStore::BufferCacheShard::create(
+ g_ceph_context, "lru", NULL);
+ auto coll = ceph::make_ref<BlueStore::Collection>(&store, oc, bc, coll_t());
+ BlueStore::Onode onode(coll.get(), ghobject_t(), "");
+ BlueStore::ExtentMap em(&onode,
+ g_ceph_context->_conf->bluestore_extent_map_inline_shard_prealloc_size);
+ BlueStore::BlobRef b(new BlueStore::Blob);
+ b->shared_blob = new BlueStore::SharedBlob(coll.get());
+
+ ASSERT_FALSE(em.has_any_lextents(0, 0));
+ ASSERT_FALSE(em.has_any_lextents(0, 1000));
+ ASSERT_FALSE(em.has_any_lextents(1000, 1000));
+
+ em.extent_map.insert(*new BlueStore::Extent(100, 0, 100, b));
+ ASSERT_FALSE(em.has_any_lextents(0, 50));
+ ASSERT_FALSE(em.has_any_lextents(0, 100));
+ ASSERT_FALSE(em.has_any_lextents(50, 50));
+ ASSERT_TRUE(em.has_any_lextents(50, 51));
+ ASSERT_TRUE(em.has_any_lextents(50, 100051));
+ ASSERT_TRUE(em.has_any_lextents(100, 100));
+ ASSERT_TRUE(em.has_any_lextents(100, 1));
+ ASSERT_TRUE(em.has_any_lextents(199, 1));
+ ASSERT_TRUE(em.has_any_lextents(199, 2));
+ ASSERT_FALSE(em.has_any_lextents(200, 2));
+
+ em.extent_map.insert(*new BlueStore::Extent(200, 0, 100, b));
+ ASSERT_TRUE(em.has_any_lextents(199, 1));
+ ASSERT_TRUE(em.has_any_lextents(199, 2));
+ ASSERT_TRUE(em.has_any_lextents(200, 2));
+ ASSERT_TRUE(em.has_any_lextents(200, 200));
+ ASSERT_TRUE(em.has_any_lextents(299, 1));
+ ASSERT_FALSE(em.has_any_lextents(300, 1));
+
+ em.extent_map.insert(*new BlueStore::Extent(400, 0, 100, b));
+ ASSERT_TRUE(em.has_any_lextents(0, 10000));
+ ASSERT_TRUE(em.has_any_lextents(199, 1));
+ ASSERT_FALSE(em.has_any_lextents(300, 1));
+ ASSERT_FALSE(em.has_any_lextents(300, 100));
+ ASSERT_FALSE(em.has_any_lextents(399, 1));
+ ASSERT_TRUE(em.has_any_lextents(400, 1));
+ ASSERT_TRUE(em.has_any_lextents(400, 100));
+ ASSERT_TRUE(em.has_any_lextents(400, 1000));
+ ASSERT_TRUE(em.has_any_lextents(499, 1000));
+ ASSERT_FALSE(em.has_any_lextents(500, 1000));
+}
+
+void erase_and_delete(BlueStore::ExtentMap& em, size_t v)
+{
+ auto d = em.find(v);
+ ASSERT_NE(d, em.extent_map.end());
+ em.extent_map.erase(d);
+ delete &*d;
+}
+
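+// compress_extent_map() merges adjacent extents that reference
+// contiguous ranges of the same blob within the probed window and
+// returns how many extents were removed; different blobs or a gap in
+// the blob offsets keep extents separate.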
+TEST(ExtentMap, compress_extent_map)
+{
+ BlueStore store(g_ceph_context, "", 4096);
+ BlueStore::OnodeCacheShard *oc = BlueStore::OnodeCacheShard::create(
+ g_ceph_context, "lru", NULL);
+ BlueStore::BufferCacheShard *bc = BlueStore::BufferCacheShard::create(
+ g_ceph_context, "lru", NULL);
+
+ auto coll = ceph::make_ref<BlueStore::Collection>(&store, oc, bc, coll_t());
+ BlueStore::Onode onode(coll.get(), ghobject_t(), "");
+ BlueStore::ExtentMap em(&onode,
+ g_ceph_context->_conf->bluestore_extent_map_inline_shard_prealloc_size);
+ BlueStore::BlobRef b1(new BlueStore::Blob);
+ BlueStore::BlobRef b2(new BlueStore::Blob);
+ BlueStore::BlobRef b3(new BlueStore::Blob);
+ b1->shared_blob = new BlueStore::SharedBlob(coll.get());
+ b2->shared_blob = new BlueStore::SharedBlob(coll.get());
+ b3->shared_blob = new BlueStore::SharedBlob(coll.get());
+
+ em.extent_map.insert(*new BlueStore::Extent(0, 0, 100, b1));
+ em.extent_map.insert(*new BlueStore::Extent(100, 0, 100, b2));
+ ASSERT_EQ(0, em.compress_extent_map(0, 10000));
+ ASSERT_EQ(2u, em.extent_map.size());
+
+ em.extent_map.insert(*new BlueStore::Extent(200, 100, 100, b2));
+ em.extent_map.insert(*new BlueStore::Extent(300, 200, 100, b2));
+ ASSERT_EQ(0, em.compress_extent_map(0, 0));
+ ASSERT_EQ(0, em.compress_extent_map(100000, 1000));
+ ASSERT_EQ(2, em.compress_extent_map(0, 100000));
+ ASSERT_EQ(2u, em.extent_map.size());
+ erase_and_delete(em, 100);
+ em.extent_map.insert(*new BlueStore::Extent(100, 0, 100, b2));
+ em.extent_map.insert(*new BlueStore::Extent(200, 100, 100, b3));
+ em.extent_map.insert(*new BlueStore::Extent(300, 200, 100, b2));
+ ASSERT_EQ(0, em.compress_extent_map(0, 1));
+ ASSERT_EQ(0, em.compress_extent_map(0, 100000));
+ ASSERT_EQ(4u, em.extent_map.size());
+
+ em.extent_map.insert(*new BlueStore::Extent(400, 300, 100, b2));
+ em.extent_map.insert(*new BlueStore::Extent(500, 500, 100, b2));
+ em.extent_map.insert(*new BlueStore::Extent(600, 600, 100, b2));
+ em.extent_map.insert(*new BlueStore::Extent(700, 0, 100, b1));
+ em.extent_map.insert(*new BlueStore::Extent(800, 0, 100, b3));
+ ASSERT_EQ(0, em.compress_extent_map(0, 99));
+ ASSERT_EQ(0, em.compress_extent_map(800, 1000));
+ ASSERT_EQ(2, em.compress_extent_map(100, 500));
+ ASSERT_EQ(7u, em.extent_map.size());
+ erase_and_delete(em, 300);
+ erase_and_delete(em, 500);
+ erase_and_delete(em, 700);
+ em.extent_map.insert(*new BlueStore::Extent(400, 300, 100, b2));
+ em.extent_map.insert(*new BlueStore::Extent(500, 400, 100, b2));
+ em.extent_map.insert(*new BlueStore::Extent(700, 500, 100, b2));
+ ASSERT_EQ(1, em.compress_extent_map(0, 1000));
+ ASSERT_EQ(6u, em.extent_map.size());
+}
+
+
+void clear_and_dispose(BlueStore::old_extent_map_t& old_em)
+{
+ auto oep = old_em.begin();
+ while (oep != old_em.end()) {
+ auto &lo = *oep;
+ oep = old_em.erase(oep);
+ delete &lo;
+ }
+}
+
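+// GarbageCollector::estimate() inspects the post-write extent map plus
+// the extents displaced by the write (old_extents) and returns the
+// expected benefit of rewriting data that still pins partially-dead
+// compressed blobs; the asserts below treat it as a count of allocation
+// units freed, and get_extents_to_collect() lists the logical ranges to
+// rewrite.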
+TEST(GarbageCollector, BasicTest)
+{
+ BlueStore::OnodeCacheShard *oc = BlueStore::OnodeCacheShard::create(
+ g_ceph_context, "lru", NULL);
+ BlueStore::BufferCacheShard *bc = BlueStore::BufferCacheShard::create(
+ g_ceph_context, "lru", NULL);
+
+ BlueStore store(g_ceph_context, "", 4096);
+ auto coll = ceph::make_ref<BlueStore::Collection>(&store, oc, bc, coll_t());
+ BlueStore::Onode onode(coll.get(), ghobject_t(), "");
+ BlueStore::ExtentMap em(&onode,
+ g_ceph_context->_conf->bluestore_extent_map_inline_shard_prealloc_size);
+
+ BlueStore::old_extent_map_t old_extents;
+
+
+ /*
+ min_alloc_size = 4096
+ original disposition
+ extent1 <loffs = 100, boffs = 100, len = 10>
+ -> blob1<compressed, len_on_disk=4096, logical_len=8192>
+ extent2 <loffs = 200, boffs = 200, len = 10>
+ -> blob2<raw, len_on_disk=4096, llen=4096>
+ extent3 <loffs = 300, boffs = 300, len = 10>
+ -> blob1<compressed, len_on_disk=4096, llen=8192>
+ extent4 <loffs = 4096, boffs = 0, len = 10>
+ -> blob3<raw, len_on_disk=4096, llen=4096>
+ a write(300~100) resulted in
+ extent1 <loffs = 100, boffs = 100, len = 10>
+ -> blob1<compressed, len_on_disk=4096, logical_len=8192>
+ extent2 <loffs = 200, boffs = 200, len = 10>
+ -> blob2<raw, len_on_disk=4096, llen=4096>
+ extent3 <loffs = 300, boffs = 300, len = 100>
+ -> blob4<raw, len_on_disk=4096, llen=4096>
+ extent4 <loffs = 4096, boffs = 0, len = 10>
+ -> blob3<raw, len_on_disk=4096, llen=4096>
+ */
+ {
+ BlueStore::GarbageCollector gc(g_ceph_context);
+ int64_t saving;
+ BlueStore::BlobRef b1(new BlueStore::Blob);
+ BlueStore::BlobRef b2(new BlueStore::Blob);
+ BlueStore::BlobRef b3(new BlueStore::Blob);
+ BlueStore::BlobRef b4(new BlueStore::Blob);
+ b1->shared_blob = new BlueStore::SharedBlob(coll.get());
+ b2->shared_blob = new BlueStore::SharedBlob(coll.get());
+ b3->shared_blob = new BlueStore::SharedBlob(coll.get());
+ b4->shared_blob = new BlueStore::SharedBlob(coll.get());
+ b1->dirty_blob().set_compressed(0x2000, 0x1000);
+ b1->dirty_blob().allocated_test(bluestore_pextent_t(0, 0x1000));
+ b2->dirty_blob().allocated_test(bluestore_pextent_t(1, 0x1000));
+ b3->dirty_blob().allocated_test(bluestore_pextent_t(2, 0x1000));
+ b4->dirty_blob().allocated_test(bluestore_pextent_t(3, 0x1000));
+ em.extent_map.insert(*new BlueStore::Extent(100, 100, 10, b1));
+ b1->get_ref(coll.get(), 100, 10);
+ em.extent_map.insert(*new BlueStore::Extent(200, 200, 10, b2));
+ b2->get_ref(coll.get(), 200, 10);
+ em.extent_map.insert(*new BlueStore::Extent(300, 300, 100, b4));
+ b4->get_ref(coll.get(), 300, 100);
+ em.extent_map.insert(*new BlueStore::Extent(4096, 0, 10, b3));
+ b3->get_ref(coll.get(), 0, 10);
+
+ old_extents.push_back(*new BlueStore::OldExtent(300, 300, 10, b1));
+
+ saving = gc.estimate(300, 100, em, old_extents, 4096);
+ ASSERT_EQ(saving, 1);
+ auto& to_collect = gc.get_extents_to_collect();
+ ASSERT_EQ(to_collect.num_intervals(), 1u);
+ {
+ auto it = to_collect.begin();
+ using p = decltype(*it);
+ auto v = p{100ul, 10ul};
+ ASSERT_EQ(*it, v);
+ }
+ em.clear();
+ clear_and_dispose(old_extents);
+ }
+ /*
+ original disposition
+ min_alloc_size = 0x10000
+ extent1 <loffs = 0, boffs = 0, len = 0x40000>
+ -> blob1<compressed, len_on_disk=0x20000, logical_len=0x40000>
+ Write 0x8000~37000 resulted in the following extent map prior to GC
+ for the last write_small(0x30000~0xf000):
+
+ extent1 <loffs = 0, boffs = 0, len = 0x8000>
+ -> blob1<compressed, len_on_disk=0x20000, logical_len=0x40000>
+ extent2 <loffs = 0x8000, boffs = 0x8000, len = 0x8000>
+ -> blob2<raw, len_on_disk=0x10000, llen=0x10000>
+ extent3 <loffs = 0x10000, boffs = 0, len = 0x20000>
+ -> blob3<raw, len_on_disk=0x20000, llen=0x20000>
+ extent4 <loffs = 0x30000, boffs = 0, len = 0xf000>
+ -> blob4<raw, len_on_disk=0x10000, llen=0x10000>
+ extent5 <loffs = 0x3f000, boffs = 0x3f000, len = 0x1000>
+ -> blob1<compressed, len_on_disk=0x20000, llen=0x40000>
+ */
+ {
+ BlueStore store(g_ceph_context, "", 0x10000);
+ auto coll = ceph::make_ref<BlueStore::Collection>(&store, oc, bc, coll_t());
+ BlueStore::Onode onode(coll.get(), ghobject_t(), "");
+ BlueStore::ExtentMap em(&onode,
+ g_ceph_context->_conf->bluestore_extent_map_inline_shard_prealloc_size);
+
+ BlueStore::old_extent_map_t old_extents;
+ BlueStore::GarbageCollector gc(g_ceph_context);
+ int64_t saving;
+ BlueStore::BlobRef b1(new BlueStore::Blob);
+ BlueStore::BlobRef b2(new BlueStore::Blob);
+ BlueStore::BlobRef b3(new BlueStore::Blob);
+ BlueStore::BlobRef b4(new BlueStore::Blob);
+ b1->shared_blob = new BlueStore::SharedBlob(coll.get());
+ b2->shared_blob = new BlueStore::SharedBlob(coll.get());
+ b3->shared_blob = new BlueStore::SharedBlob(coll.get());
+ b4->shared_blob = new BlueStore::SharedBlob(coll.get());
+ b1->dirty_blob().set_compressed(0x40000, 0x20000);
+ b1->dirty_blob().allocated_test(bluestore_pextent_t(0, 0x20000));
+ b2->dirty_blob().allocated_test(bluestore_pextent_t(1, 0x10000));
+ b3->dirty_blob().allocated_test(bluestore_pextent_t(2, 0x20000));
+ b4->dirty_blob().allocated_test(bluestore_pextent_t(3, 0x10000));
+
+ em.extent_map.insert(*new BlueStore::Extent(0, 0, 0x8000, b1));
+ b1->get_ref(coll.get(), 0, 0x8000);
+ em.extent_map.insert(
+ *new BlueStore::Extent(0x8000, 0x8000, 0x8000, b2)); // new extent
+ b2->get_ref(coll.get(), 0x8000, 0x8000);
+ em.extent_map.insert(
+ *new BlueStore::Extent(0x10000, 0, 0x20000, b3)); // new extent
+ b3->get_ref(coll.get(), 0, 0x20000);
+ em.extent_map.insert(
+ *new BlueStore::Extent(0x30000, 0, 0xf000, b4)); // new extent
+ b4->get_ref(coll.get(), 0, 0xf000);
+ em.extent_map.insert(*new BlueStore::Extent(0x3f000, 0x3f000, 0x1000, b1));
+ b1->get_ref(coll.get(), 0x3f000, 0x1000);
+
+ old_extents.push_back(*new BlueStore::OldExtent(0x8000, 0x8000, 0x8000, b1));
+ old_extents.push_back(
+ *new BlueStore::OldExtent(0x10000, 0x10000, 0x20000, b1));
+ old_extents.push_back(*new BlueStore::OldExtent(0x30000, 0x30000, 0xf000, b1));
+
+ saving = gc.estimate(0x30000, 0xf000, em, old_extents, 0x10000);
+ ASSERT_EQ(saving, 2);
+ auto& to_collect = gc.get_extents_to_collect();
+ ASSERT_EQ(to_collect.num_intervals(), 2u);
+ {
+ auto it1 = to_collect.begin();
+ auto it2 = ++to_collect.begin();
+ using p = decltype(*it1);
+ {
+ auto v = p{0x0ul, 0x8000ul};
+ ASSERT_TRUE(*it1 == v || *it2 == v);
+ }
+ {
+ auto v = p{0x3f000ul, 0x1000ul};
+ ASSERT_TRUE(*it1 == v || *it2 == v);
+ }
+ }
+
+ em.clear();
+ clear_and_dispose(old_extents);
+ }
+ /*
+ original disposition
+ min_alloc_size = 0x1000
+ extent1 <loffs = 0, boffs = 0, len = 0x4000>
+ -> blob1<compressed, len_on_disk=0x2000, logical_len=0x4000>
+ write 0x3000~4000 resulted in the following extent map
+ (future feature - suppose we can compress the incoming write prior
+ to GC invocation)
+
+ extent1 <loffs = 0, boffs = 0, len = 0x4000>
+ -> blob1<compressed, len_on_disk=0x2000, logical_len=0x4000>
+ extent2 <loffs = 0x3000, boffs = 0, len = 0x4000>
+ -> blob2<compressed, len_on_disk=0x2000, llen=0x4000>
+ */
+ {
+ BlueStore::GarbageCollector gc(g_ceph_context);
+ int64_t saving;
+ BlueStore::BlobRef b1(new BlueStore::Blob);
+ BlueStore::BlobRef b2(new BlueStore::Blob);
+ b1->shared_blob = new BlueStore::SharedBlob(coll.get());
+ b2->shared_blob = new BlueStore::SharedBlob(coll.get());
+ b1->dirty_blob().set_compressed(0x4000, 0x2000);
+ b1->dirty_blob().allocated_test(bluestore_pextent_t(0, 0x2000));
+ b2->dirty_blob().set_compressed(0x4000, 0x2000);
+ b2->dirty_blob().allocated_test(bluestore_pextent_t(0, 0x2000));
+
+ em.extent_map.insert(*new BlueStore::Extent(0, 0, 0x3000, b1));
+ b1->get_ref(coll.get(), 0, 0x3000);
+ em.extent_map.insert(
+ *new BlueStore::Extent(0x3000, 0, 0x4000, b2)); // new extent
+ b2->get_ref(coll.get(), 0, 0x4000);
+
+ old_extents.push_back(*new BlueStore::OldExtent(0x3000, 0x3000, 0x1000, b1));
+
+ saving = gc.estimate(0x3000, 0x4000, em, old_extents, 0x1000);
+ ASSERT_EQ(saving, 0);
+ auto& to_collect = gc.get_extents_to_collect();
+ ASSERT_EQ(to_collect.num_intervals(), 0u);
+ em.clear();
+ clear_and_dispose(old_extents);
+ }
+ /*
+ original disposition
+ min_alloc_size = 0x10000
+ extent0 <loffs = 0, boffs = 0, len = 0x20000>
+ -> blob0<compressed, len_on_disk=0x10000, logical_len=0x20000>
+ extent1 <loffs = 0x20000, boffs = 0, len = 0x20000>
+ -> blob1<compressed, len_on_disk=0x10000, logical_len=0x20000>
+ write 0x8000~37000 resulted in the following extent map prior
+ to GC for the last write_small(0x30000~0xf000)
+
+ extent0 <loffs = 0, boffs = 0, len = 0x8000>
+ -> blob0<compressed, len_on_disk=0x10000, logical_len=0x20000>
+ extent2 <loffs = 0x8000, boffs = 0x8000, len = 0x8000>
+ -> blob2<raw, len_on_disk=0x10000, llen=0x10000>
+ extent3 <loffs = 0x10000, boffs = 0, len = 0x20000>
+ -> blob3<raw, len_on_disk=0x20000, llen=0x20000>
+ extent4 <loffs = 0x30000, boffs = 0, len = 0xf000>
+ -> blob4<raw, len_on_disk=0x1000, llen=0x1000>
+ extent5 <loffs = 0x3f000, boffs = 0x1f000, len = 0x1000>
+ -> blob1<compressed, len_on_disk=0x10000, llen=0x20000>
+ */
+ {
+ BlueStore store(g_ceph_context, "", 0x10000);
+ auto coll = ceph::make_ref<BlueStore::Collection>(&store, oc, bc, coll_t());
+ BlueStore::Onode onode(coll.get(), ghobject_t(), "");
+ BlueStore::ExtentMap em(&onode,
+ g_ceph_context->_conf->bluestore_extent_map_inline_shard_prealloc_size);
+
+ BlueStore::old_extent_map_t old_extents;
+ BlueStore::GarbageCollector gc(g_ceph_context);
+ int64_t saving;
+ BlueStore::BlobRef b0(new BlueStore::Blob);
+ BlueStore::BlobRef b1(new BlueStore::Blob);
+ BlueStore::BlobRef b2(new BlueStore::Blob);
+ BlueStore::BlobRef b3(new BlueStore::Blob);
+ BlueStore::BlobRef b4(new BlueStore::Blob);
+ b0->shared_blob = new BlueStore::SharedBlob(coll.get());
+ b1->shared_blob = new BlueStore::SharedBlob(coll.get());
+ b2->shared_blob = new BlueStore::SharedBlob(coll.get());
+ b3->shared_blob = new BlueStore::SharedBlob(coll.get());
+ b4->shared_blob = new BlueStore::SharedBlob(coll.get());
+ b0->dirty_blob().set_compressed(0x2000, 0x1000);
+ b0->dirty_blob().allocated_test(bluestore_pextent_t(0, 0x10000));
+ b1->dirty_blob().set_compressed(0x20000, 0x10000);
+ b1->dirty_blob().allocated_test(bluestore_pextent_t(0, 0x10000));
+ b2->dirty_blob().allocated_test(bluestore_pextent_t(1, 0x10000));
+ b3->dirty_blob().allocated_test(bluestore_pextent_t(2, 0x20000));
+ b4->dirty_blob().allocated_test(bluestore_pextent_t(3, 0x1000));
+
+ em.extent_map.insert(*new BlueStore::Extent(0, 0, 0x8000, b0));
+ b0->get_ref(coll.get(), 0, 0x8000);
+ em.extent_map.insert(
+ *new BlueStore::Extent(0x8000, 0x8000, 0x8000, b2)); // new extent
+ b2->get_ref(coll.get(), 0x8000, 0x8000);
+ em.extent_map.insert(
+ *new BlueStore::Extent(0x10000, 0, 0x20000, b3)); // new extent
+ b3->get_ref(coll.get(), 0, 0x20000);
+ em.extent_map.insert(
+ *new BlueStore::Extent(0x30000, 0, 0xf000, b4)); // new extent
+ b4->get_ref(coll.get(), 0, 0xf000);
+ em.extent_map.insert(*new BlueStore::Extent(0x3f000, 0x1f000, 0x1000, b1));
+ b1->get_ref(coll.get(), 0x1f000, 0x1000);
+
+ old_extents.push_back(*new BlueStore::OldExtent(0x8000, 0x8000, 0x8000, b0));
+ old_extents.push_back(
+ *new BlueStore::OldExtent(0x10000, 0x10000, 0x10000, b0));
+ old_extents.push_back(
+ *new BlueStore::OldExtent(0x20000, 0x00000, 0x1f000, b1));
+
+ saving = gc.estimate(0x30000, 0xf000, em, old_extents, 0x10000);
+ ASSERT_EQ(saving, 2);
+ auto& to_collect = gc.get_extents_to_collect();
+ ASSERT_EQ(to_collect.num_intervals(), 2u);
+ {
+ auto it1 = to_collect.begin();
+ auto it2 = ++to_collect.begin();
+ using p = decltype(*it1);
+ {
+ auto v = p{0x0ul, 0x8000ul};
+ ASSERT_TRUE(*it1 == v || *it2 == v);
+ }
+ {
+ auto v = p{0x3f000ul, 0x1000ul};
+ ASSERT_TRUE(*it1 == v || *it2 == v);
+ }
+ }
+
+ em.clear();
+ clear_and_dispose(old_extents);
+ }
+}
+
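+// StoreSpaceTracker splits the device into granularity-sized slots with
+// a per-slot filter for collections and another for objects (the *_bfs
+// vectors); init() enlarges the granularity (here to 2 MiB for a 4 TiB
+// device) so the vectors stay within the given memory cap.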
+TEST(BlueStoreRepairer, StoreSpaceTracker)
+{
+ BlueStoreRepairer::StoreSpaceTracker bmap0;
+ bmap0.init((uint64_t)4096 * 1024 * 1024 * 1024, 0x1000);
+ ASSERT_EQ(bmap0.granularity, 2 * 1024 * 1024U);
+ ASSERT_EQ(bmap0.collections_bfs.size(), 2048u * 1024u);
+ ASSERT_EQ(bmap0.objects_bfs.size(), 2048u * 1024u);
+
+ BlueStoreRepairer::StoreSpaceTracker bmap;
+ bmap.init(0x2000 * 0x1000 - 1, 0x1000, 512 * 1024);
+ ASSERT_EQ(bmap.granularity, 0x1000u);
+ ASSERT_EQ(bmap.collections_bfs.size(), 0x2000u);
+ ASSERT_EQ(bmap.objects_bfs.size(), 0x2000u);
+
+ coll_t cid;
+ ghobject_t hoid;
+
+ ASSERT_FALSE(bmap.is_used(cid, 0));
+ ASSERT_FALSE(bmap.is_used(hoid, 0));
+ bmap.set_used(0, 1, cid, hoid);
+ ASSERT_TRUE(bmap.is_used(cid, 0));
+ ASSERT_TRUE(bmap.is_used(hoid, 0));
+
+ ASSERT_FALSE(bmap.is_used(cid, 0x1023));
+ ASSERT_FALSE(bmap.is_used(hoid, 0x1023));
+ ASSERT_FALSE(bmap.is_used(cid, 0x2023));
+ ASSERT_FALSE(bmap.is_used(hoid, 0x2023));
+ ASSERT_FALSE(bmap.is_used(cid, 0x3023));
+ ASSERT_FALSE(bmap.is_used(hoid, 0x3023));
+ bmap.set_used(0x1023, 0x3000, cid, hoid);
+ ASSERT_TRUE(bmap.is_used(cid, 0x1023));
+ ASSERT_TRUE(bmap.is_used(hoid, 0x1023));
+ ASSERT_TRUE(bmap.is_used(cid, 0x2023));
+ ASSERT_TRUE(bmap.is_used(hoid, 0x2023));
+ ASSERT_TRUE(bmap.is_used(cid, 0x3023));
+ ASSERT_TRUE(bmap.is_used(hoid, 0x3023));
+
+ ASSERT_FALSE(bmap.is_used(cid, 0x9001));
+ ASSERT_FALSE(bmap.is_used(hoid, 0x9001));
+ ASSERT_FALSE(bmap.is_used(cid, 0xa001));
+ ASSERT_FALSE(bmap.is_used(hoid, 0xa001));
+ ASSERT_FALSE(bmap.is_used(cid, 0xb000));
+ ASSERT_FALSE(bmap.is_used(hoid, 0xb000));
+ ASSERT_FALSE(bmap.is_used(cid, 0xc000));
+ ASSERT_FALSE(bmap.is_used(hoid, 0xc000));
+ bmap.set_used(0x9001, 0x2fff, cid, hoid);
+ ASSERT_TRUE(bmap.is_used(cid, 0x9001));
+ ASSERT_TRUE(bmap.is_used(hoid, 0x9001));
+ ASSERT_TRUE(bmap.is_used(cid, 0xa001));
+ ASSERT_TRUE(bmap.is_used(hoid, 0xa001));
+ ASSERT_TRUE(bmap.is_used(cid, 0xb001));
+ ASSERT_TRUE(bmap.is_used(hoid, 0xb001));
+ ASSERT_FALSE(bmap.is_used(cid, 0xc000));
+ ASSERT_FALSE(bmap.is_used(hoid, 0xc000));
+
+ bmap.set_used(0xa001, 0x2, cid, hoid);
+ ASSERT_TRUE(bmap.is_used(cid, 0x9001));
+ ASSERT_TRUE(bmap.is_used(hoid, 0x9001));
+ ASSERT_TRUE(bmap.is_used(cid, 0xa001));
+ ASSERT_TRUE(bmap.is_used(hoid, 0xa001));
+ ASSERT_TRUE(bmap.is_used(cid, 0xb001));
+ ASSERT_TRUE(bmap.is_used(hoid, 0xb001));
+ ASSERT_FALSE(bmap.is_used(cid, 0xc000));
+ ASSERT_FALSE(bmap.is_used(hoid, 0xc000));
+
+ ASSERT_FALSE(bmap.is_used(cid, 0xc0000));
+ ASSERT_FALSE(bmap.is_used(hoid, 0xc0000));
+ ASSERT_FALSE(bmap.is_used(cid, 0xc1000));
+ ASSERT_FALSE(bmap.is_used(hoid, 0xc1000));
+
+ bmap.set_used(0xc0000, 0x2000, cid, hoid);
+ ASSERT_TRUE(bmap.is_used(cid, 0xc0000));
+ ASSERT_TRUE(bmap.is_used(hoid, 0xc0000));
+ ASSERT_TRUE(bmap.is_used(cid, 0xc1000));
+ ASSERT_TRUE(bmap.is_used(hoid, 0xc1000));
+
+ interval_set<uint64_t> extents;
+ extents.insert(0,0x500);
+ extents.insert(0x800,0x100);
+ extents.insert(0x1000,0x1000);
+ extents.insert(0xa001,1);
+ extents.insert(0xa0000,0xff8);
+
+ ASSERT_EQ(3u, bmap.filter_out(extents));
+ ASSERT_TRUE(bmap.is_used(cid));
+ ASSERT_TRUE(bmap.is_used(hoid));
+
+ BlueStoreRepairer::StoreSpaceTracker bmap2;
+ bmap2.init((uint64_t)0x3223b1d1000, 0x10000);
+ ASSERT_EQ(0x1a0000u, bmap2.granularity);
+ ASSERT_EQ(0x1edae4u, bmap2.collections_bfs.size());
+ ASSERT_EQ(0x1edae4u, bmap2.objects_bfs.size());
+ bmap2.set_used(0x3223b190000, 0x10000, cid, hoid);
+ ASSERT_TRUE(bmap2.is_used(cid, 0x3223b190000));
+ ASSERT_TRUE(bmap2.is_used(hoid, 0x3223b190000));
+ ASSERT_TRUE(bmap2.is_used(cid, 0x3223b19f000));
+ ASSERT_TRUE(bmap2.is_used(hoid, 0x3223b19ffff));
+}
+
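+// The blob's unused bitmap tracks allocated-but-never-written ranges:
+// add_unused() marks them, mark_used() clears them, and is_unused() is
+// what the _do_write_small paths sketched in the comments below consult.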
+TEST(bluestore_blob_t, unused)
+{
+ {
+ bluestore_blob_t b;
+ uint64_t min_alloc_size = 64 << 10; // 64 kB
+
+ // _do_write_small 0x0~1000
+ uint64_t offset = 0x0;
+ uint64_t length = 0x1000; // 4kB
+ uint64_t suggested_boff = 0;
+ PExtentVector extents;
+ extents.emplace_back(0x1a560000, min_alloc_size);
+ b.allocated(p2align(suggested_boff, min_alloc_size), 0 /*no matter*/, extents);
+ b.mark_used(offset, length);
+ ASSERT_FALSE(b.is_unused(offset, length));
+
+ // _do_write_small 0x2000~1000
+ offset = 0x2000;
+ length = 0x1000;
+ b.add_unused(0, 0x10000);
+ ASSERT_TRUE(b.is_unused(offset, length));
+ b.mark_used(offset, length);
+ ASSERT_FALSE(b.is_unused(offset, length));
+
+ // _do_write_small 0xc000~2000
+ offset = 0xc000;
+ length = 0x2000;
+ ASSERT_TRUE(b.is_unused(offset, length));
+ b.mark_used(offset, length);
+ ASSERT_FALSE(b.is_unused(offset, length));
+ }
+
+ {
+ bluestore_blob_t b;
+ uint64_t min_alloc_size = 64 << 10; // 64 kB
+
+ // _do_write_small 0x11000~1000
+ uint64_t offset = 0x11000;
+ uint64_t length = 0x1000; // 4kB
+ uint64_t suggested_boff = 0x11000;
+ PExtentVector extents;
+ extents.emplace_back(0x1a560000, min_alloc_size);
+ b.allocated(p2align(suggested_boff, min_alloc_size), 0 /*no matter*/, extents);
+ b.add_unused(0, offset);
+ b.add_unused(offset + length, min_alloc_size * 2 - offset - length);
+ b.mark_used(offset, length);
+ ASSERT_FALSE(b.is_unused(offset, length));
+
+ // _do_write_small 0x15000~3000
+ offset = 0x15000;
+ length = 0x3000;
+ ASSERT_TRUE(b.is_unused(offset, length));
+ b.mark_used(offset, length);
+ ASSERT_FALSE(b.is_unused(offset, length));
+ }
+
+ {
+ // reuse blob
+ bluestore_blob_t b;
+ uint64_t min_alloc_size = 64 << 10; // 64 kB
+
+ // _do_write_small 0x2a000~1000
+ // and 0x1d000~1000
+ uint64_t unused_granularity = 0x3000;
+ // offsets and lengths below are selected to
+ // be aligned with unused_granularity
+ uint64_t offset0 = 0x2a000;
+ uint64_t offset = 0x1d000;
+ uint64_t length = 0x1000; // 4kB
+ PExtentVector extents;
+ extents.emplace_back(0x410000, min_alloc_size);
+ b.allocated(p2align(offset0, min_alloc_size), min_alloc_size, extents);
+ b.add_unused(0, min_alloc_size * 3);
+ b.mark_used(offset0, length);
+ ASSERT_FALSE(b.is_unused(offset0, length));
+ ASSERT_TRUE(b.is_unused(offset, length));
+
+ extents.clear();
+ extents.emplace_back(0x430000, min_alloc_size);
+ b.allocated(p2align(offset, min_alloc_size), min_alloc_size, extents);
+ b.mark_used(offset, length);
+ ASSERT_FALSE(b.is_unused(offset0, length));
+ ASSERT_FALSE(b.is_unused(offset, length));
+ ASSERT_FALSE(b.is_unused(offset, unused_granularity));
+
+ ASSERT_TRUE(b.is_unused(0, offset / unused_granularity * unused_granularity));
+ ASSERT_TRUE(b.is_unused(offset + length, offset0 - offset - length));
+ auto end0_aligned = round_up_to(offset0 + length, unused_granularity);
+ ASSERT_TRUE(b.is_unused(end0_aligned, min_alloc_size * 3 - end0_aligned));
+ }
+}
+// This UT is primarily intended to show how the repair procedure can
+// cause an erroneous write to INVALID_OFFSET, as reported in
+// https://tracker.ceph.com/issues/51682.
+// Basic map_any functionality is tested as well, though.
+//
+TEST(bluestore_blob_t, wrong_map_bl_in_51682)
+{
+ {
+ bluestore_blob_t b;
+    uint64_t min_alloc_size = 4 << 10; // 4 KB
+
+ b.allocated_test(bluestore_pextent_t(0x17ba000, 4 * min_alloc_size));
+ b.allocated_test(bluestore_pextent_t(0x17bf000, 4 * min_alloc_size));
+ b.allocated_test(
+ bluestore_pextent_t(
+ bluestore_pextent_t::INVALID_OFFSET,
+ 1 * min_alloc_size));
+ b.allocated_test(bluestore_pextent_t(0x153c44d000, 7 * min_alloc_size));
+
+ b.mark_used(0, 0x8000);
+ b.mark_used(0x9000, 0x7000);
+
+ string s(0x7000, 'a');
+ bufferlist bl;
+ bl.append(s);
+ const size_t num_expected_entries = 5;
+ uint64_t expected[num_expected_entries][2] = {
+ {0x17ba000, 0x4000},
+ {0x17bf000, 0x3000},
+ {0x17c0000, 0x3000},
+ {0xffffffffffffffff, 0x1000},
+ {0x153c44d000, 0x3000}};
+ size_t expected_pos = 0;
+ b.map_bl(0, bl,
+ [&](uint64_t o, bufferlist& bl) {
+ ASSERT_EQ(o, expected[expected_pos][0]);
+ ASSERT_EQ(bl.length(), expected[expected_pos][1]);
+ ++expected_pos;
+ });
+ // 0x5000 is an improper offset presumably provided when doing a repair
+ b.map_bl(0x5000, bl,
+ [&](uint64_t o, bufferlist& bl) {
+ ASSERT_EQ(o, expected[expected_pos][0]);
+ ASSERT_EQ(bl.length(), expected[expected_pos][1]);
+ ++expected_pos;
+ });
+ ASSERT_EQ(expected_pos, num_expected_entries);
+ }
+}
+
+//---------------------------------------------------------------------------------
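+// Compare 'ext' against the reference extent at 'idx'; on mismatch, dump the
+// offending extent together with a few neighbouring reference extents for
+// context and return -1.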
+static int verify_extent(const extent_t & ext, const extent_t *ext_arr, uint64_t ext_arr_size, uint64_t idx)
+{
+ const extent_t & ext_ref = ext_arr[idx];
+ if (ext.offset == ext_ref.offset && ext.length == ext_ref.length) {
+ return 0;
+ } else {
+ std::cerr << "mismatch was found at index " << idx << std::endl;
+ if (ext.length == 0) {
+ std::cerr << "Null extent was returned at idx = " << idx << std::endl;
+ }
+ unsigned start = std::max(((int32_t)(idx)-3), 0);
+ unsigned end = std::min(idx+3, ext_arr_size);
+ for (unsigned j = start; j < end; j++) {
+ const extent_t & ext_ref = ext_arr[j];
+ std::cerr << j << ") ref_ext = [" << ext_ref.offset << ", " << ext_ref.length << "]" << std::endl;
+ }
+ std::cerr << idx << ") ext = [" << ext.offset << ", " << ext.length << "]" << std::endl;
+ return -1;
+ }
+}
+
+//---------------------------------------------------------------------------------
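+// Randomly set (or clear) extents with randomized jumps and lengths, recording
+// each run in ext_arr (merging adjacent runs), then verify that
+// get_next_set_extent()/get_next_clr_extent() replays exactly the same
+// sequence and terminates with a null extent.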
+static int test_extents(uint64_t index, extent_t *ext_arr, uint64_t ext_arr_size, SimpleBitmap& sbmap, bool set)
+{
+ const uint64_t MAX_JUMP_BIG = 1523;
+ const uint64_t MAX_JUMP_SMALL = 19;
+ const uint64_t MAX_LEN_BIG = 523;
+ const uint64_t MAX_LEN_SMALL = 23;
+
+ uint64_t n = sbmap.get_size();
+ uint64_t offset = 0;
+ unsigned length, jump, i;
+ for (i = 0; i < ext_arr_size; i++) {
+ if (i & 3) {
+ jump = std::rand() % MAX_JUMP_BIG;
+ } else {
+ jump = std::rand() % MAX_JUMP_SMALL;
+ }
+ offset += jump;
+ if (i & 1) {
+ length = std::rand() % MAX_LEN_BIG;
+ } else {
+ length = std::rand() % MAX_LEN_SMALL;
+ }
+    // make sure a zero length is never used
+ length++;
+ if (offset + length >= n) {
+ break;
+ }
+
+ bool success;
+ if (set) {
+ success = sbmap.set(offset, length);
+ } else {
+ success = sbmap.clr(offset, length);
+ }
+ if (!success) {
+ std::cerr << "Failed sbmap." << (set ? "set(" : "clr(") << offset << ", " << length << ")"<< std::endl;
+ return -1;
+ }
+
+    // start a new extent on the first entry or after a jump;
+    // otherwise merge into the previous extent
+ if ( (i==0) || (jump > 0) ) {
+ ext_arr[i] = {offset, length};
+ } else {
+ // merge 2 extents
+ i --;
+ ext_arr[i].length += length;
+ }
+ offset += length;
+ }
+ unsigned arr_size = std::min((uint64_t)i, ext_arr_size);
+ std::cout << std::hex << std::right;
+ std::cout << "[" << index << "] " << (set ? "Set::" : "Clr::") << " extents count = 0x" << arr_size;
+ std::cout << std::dec << std::endl;
+
+ offset = 0;
+ extent_t ext;
+ for(unsigned i = 0; i < arr_size; i++) {
+ if (set) {
+ ext = sbmap.get_next_set_extent(offset);
+ } else {
+ ext = sbmap.get_next_clr_extent(offset);
+ }
+
+ if (verify_extent(ext, ext_arr, ext_arr_size, i) != 0) {
+ return -1;
+ }
+ offset = ext.offset + ext.length;
+ }
+
+ if (set) {
+ ext = sbmap.get_next_set_extent(offset);
+ } else {
+ ext = sbmap.get_next_clr_extent(offset);
+ }
+ if (ext.length == 0) {
+ return 0;
+ } else {
+ std::cerr << "sbmap.get_next_" << (set ? "set" : "clr") << "_extent(" << offset << ") return length = " << ext.length << std::endl;
+ return -1;
+ }
+}
+
+//---------------------------------------------------------------------------------
+TEST(SimpleBitmap, basic)
+{
+ const uint64_t MAX_EXTENTS_COUNT = 7131177;
+ std::unique_ptr<extent_t[]> ext_arr = std::make_unique<extent_t[]>(MAX_EXTENTS_COUNT);
+ ASSERT_TRUE(ext_arr != nullptr);
+  const uint64_t BIT_COUNT = 4ULL << 30; // 4G bits = 512 MB of bitmap
+ SimpleBitmap sbmap(g_ceph_context, BIT_COUNT);
+
+ // use current time as seed for random generator
+ std::srand(std::time(nullptr));
+ for (unsigned i = 0; i < 3; i++ ) {
+ memset(ext_arr.get(), 0, sizeof(extent_t)*MAX_EXTENTS_COUNT);
+ sbmap.clear_all();
+ ASSERT_TRUE(test_extents(i, ext_arr.get(), MAX_EXTENTS_COUNT, sbmap, true) == 0);
+
+ memset(ext_arr.get(), 0, sizeof(extent_t)*MAX_EXTENTS_COUNT);
+ sbmap.set_all();
+ ASSERT_TRUE(test_extents(i, ext_arr.get(), MAX_EXTENTS_COUNT, sbmap, false) == 0);
+ }
+}
+
+//---------------------------------------------------------------------------------
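+// Apply random, possibly overlapping set/clr operations (at a 2:1 ratio) to
+// both the SimpleBitmap and a byte-per-bit shadow map, then cross-check every
+// bit and every extent reported by the bitmap against the shadow map.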
+static int test_intersections(unsigned test_idx, SimpleBitmap &sbmap, uint8_t map[], uint64_t map_size)
+{
+ const uint64_t MAX_LEN_BIG = 523;
+ const uint64_t MAX_LEN_SMALL = 23;
+
+ bool success;
+ uint64_t set_op_count = 0, clr_op_count = 0;
+ unsigned length, i;
+ for (i = 0; i < map_size / (MAX_LEN_BIG*2); i++) {
+ uint64_t offset = (std::rand() % (map_size - 1));
+ if (i & 1) {
+ length = std::rand() % MAX_LEN_BIG;
+ } else {
+ length = std::rand() % MAX_LEN_SMALL;
+ }
+    // make sure a zero length is never used
+ length++;
+ if (offset + length >= map_size) {
+ continue;
+ }
+ // 2:1 set/clr
+ bool set = (std::rand() % 3);
+ if (set) {
+ success = sbmap.set(offset, length);
+ memset(map+offset, 0xFF, length);
+ set_op_count++;
+ } else {
+ success = sbmap.clr(offset, length);
+ memset(map+offset, 0x0, length);
+ clr_op_count++;
+ }
+ if (!success) {
+ std::cerr << "Failed sbmap." << (set ? "set(" : "clr(") << offset << ", " << length << ")"<< std::endl;
+ return -1;
+ }
+ }
+
+ uint64_t set_bit_count = 0;
+ uint64_t clr_bit_count = 0;
+ for(uint64_t idx = 0; idx < map_size; idx++) {
+ if (map[idx]) {
+ set_bit_count++;
+ success = sbmap.bit_is_set(idx);
+ } else {
+ clr_bit_count++;
+ success = sbmap.bit_is_clr(idx);
+ }
+ if (!success) {
+ std::cerr << "expected: sbmap.bit_is_" << (map[idx] ? "set(" : "clr(") << idx << ")"<< std::endl;
+ return -1;
+ }
+
+ }
+ std::cout << std::hex << std::right << __func__ ;
+ std::cout << " [" << test_idx << "] set_bit_count = 0x" << std::setfill('0') << std::setw(8) << set_bit_count
+ << ", clr_bit_count = 0x" << std::setfill('0') << std::setw(8) << clr_bit_count
+ << ", sum = 0x" << set_bit_count + clr_bit_count << std::endl;
+ std::cout << std::dec;
+ uint64_t offset = 0;
+ for(uint64_t i = 0; i < (set_op_count + clr_op_count); i++) {
+ extent_t ext = sbmap.get_next_set_extent(offset);
+ //std::cout << "set_ext:: " << i << ") [" << ext.offset << ", " << ext.length << "]" << std::endl;
+ for (uint64_t idx = ext.offset; idx < ext.offset + ext.length; idx++) {
+ if (map[idx] != 0xFF) {
+ std::cerr << "map[" << idx << "] is clear, but extent [" << ext.offset << ", " << ext.length << "] is set" << std::endl;
+ return -1;
+ }
+ }
+ offset = ext.offset + ext.length;
+ }
+
+ offset = 0;
+ for(uint64_t i = 0; i < (set_op_count + clr_op_count); i++) {
+ extent_t ext = sbmap.get_next_clr_extent(offset);
+ //std::cout << "clr_ext:: " << i << ") [" << ext.offset << ", " << ext.length << "]" << std::endl;
+ for (uint64_t idx = ext.offset; idx < ext.offset + ext.length; idx++) {
+ if (map[idx] ) {
+ std::cerr << "map[" << idx << "] is set, but extent [" << ext.offset << ", " << ext.length << "] is free" << std::endl;
+ return -1;
+ }
+ }
+ offset = ext.offset + ext.length;
+ }
+
+ return 0;
+}
+
+//---------------------------------------------------------------------------------
+TEST(SimpleBitmap, intersection)
+{
+ const uint64_t MAP_SIZE = 1ULL << 30; // 1G
+ SimpleBitmap sbmap(g_ceph_context, MAP_SIZE);
+
+ // use current time as seed for random generator
+ std::srand(std::time(nullptr));
+
+ std::unique_ptr<uint8_t[]> map = std::make_unique<uint8_t[]> (MAP_SIZE);
+ ASSERT_TRUE(map != nullptr);
+
+ for (unsigned i = 0; i < 1; i++ ) {
+ sbmap.clear_all();
+ memset(map.get(), 0, MAP_SIZE);
+ ASSERT_TRUE(test_intersections(i, sbmap, map.get(), MAP_SIZE) == 0);
+
+ sbmap.set_all();
+ memset(map.get(), 0xFF, MAP_SIZE);
+ ASSERT_TRUE(test_intersections(i, sbmap, map.get(), MAP_SIZE) == 0);
+ }
+}
+
+
+//---------------------------------------------------------------------------------
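+// Deterministically place extents with offsets stepped by 0..63 and lengths
+// 1..128, so that runs start and end at every alignment around a 64-bit word
+// boundary, then verify extent iteration as in test_extents() above.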
+static int test_extents_boundaries(uint64_t index, extent_t *ext_arr, uint64_t ext_arr_size, SimpleBitmap& sbmap, bool set)
+{
+ uint64_t n = sbmap.get_size();
+ uint64_t offset = 0, k = 0;
+ for(unsigned i = 0; i < 64; i++) {
+ offset += i;
+ if (offset >= n) {
+ break;
+ }
+
+ for(unsigned length = 1; length <= 128; length++) {
+ if (offset + length >= n) {
+ break;
+ }
+
+ if (k >= ext_arr_size) {
+ break;
+ }
+ bool success;
+ if (set) {
+ success = sbmap.set(offset, length);
+ } else {
+ success = sbmap.clr(offset, length);
+ }
+ if (!success) {
+ std::cerr << "Failed sbmap." << (set ? "set(" : "clr(") << offset << ", " << length << ")"<< std::endl;
+ return -1;
+ }
+ ext_arr[k++] = {offset, length};
+ if (length < 64) {
+ offset += 64;
+ } else {
+ offset += 128;
+ }
+ }
+ if (k >= ext_arr_size) {
+ break;
+ }
+ }
+
+ unsigned arr_size = std::min((uint64_t)k, ext_arr_size);
+ std::cout << std::hex << std::right << __func__ ;
+ std::cout << " [" << index << "] " << (set ? "Set::" : "Clr::") << " extents count = 0x" << arr_size;
+ std::cout << std::dec << std::endl;
+
+ offset = 0;
+ extent_t ext;
+ for(unsigned i = 0; i < arr_size; i++) {
+ if (set) {
+ ext = sbmap.get_next_set_extent(offset);
+ } else {
+ ext = sbmap.get_next_clr_extent(offset);
+ }
+
+ if (verify_extent(ext, ext_arr, ext_arr_size, i) != 0) {
+ return -1;
+ }
+ offset = ext.offset + ext.length;
+ }
+
+ if (set) {
+ ext = sbmap.get_next_set_extent(offset);
+ } else {
+ ext = sbmap.get_next_clr_extent(offset);
+ }
+ if (ext.length == 0) {
+ return 0;
+ } else {
+ std::cerr << "sbmap.get_next_" << (set ? "set" : "clr") << "_extent(" << offset << ") return length = " << ext.length << std::endl;
+ return -1;
+ }
+
+}
+
+//---------------------------------------------------------------------------------
+TEST(SimpleBitmap, boundaries)
+{
+ const uint64_t MAX_EXTENTS_COUNT = 64 << 10;
+ std::unique_ptr<extent_t[]> ext_arr = std::make_unique<extent_t[]>(MAX_EXTENTS_COUNT);
+ ASSERT_TRUE(ext_arr != nullptr);
+
+ // use current time as seed for random generator
+ std::srand(std::time(nullptr));
+
+  uint64_t bit_count = 32 << 20; // 32M bits = 4 MB
+ unsigned count = 0;
+ for (unsigned i = 0; i < 64; i++) {
+ SimpleBitmap sbmap(g_ceph_context, bit_count+i);
+ memset(ext_arr.get(), 0, sizeof(extent_t)*MAX_EXTENTS_COUNT);
+ sbmap.clear_all();
+ ASSERT_TRUE(test_extents_boundaries(count, ext_arr.get(), MAX_EXTENTS_COUNT, sbmap, true) == 0);
+
+ memset(ext_arr.get(), 0, sizeof(extent_t)*MAX_EXTENTS_COUNT);
+ sbmap.set_all();
+ ASSERT_TRUE(test_extents_boundaries(count++, ext_arr.get(), MAX_EXTENTS_COUNT, sbmap, false) == 0);
+ }
+}
+
+//---------------------------------------------------------------------------------
+TEST(SimpleBitmap, boundaries2)
+{
+  const uint64_t bit_count_base = 64 << 10; // 64K bits = 8 KB
+ const extent_t null_extent = {0, 0};
+
+ for (unsigned i = 0; i < 64; i++) {
+ uint64_t bit_count = bit_count_base + i;
+ extent_t full_extent = {0, bit_count};
+ SimpleBitmap sbmap(g_ceph_context, bit_count);
+
+ sbmap.set(0, bit_count);
+ ASSERT_TRUE(sbmap.get_next_set_extent(0) == full_extent);
+ ASSERT_TRUE(sbmap.get_next_clr_extent(0) == null_extent);
+
+ for (uint64_t bit = 0; bit < bit_count; bit++) {
+ sbmap.clr(bit, 1);
+ }
+ ASSERT_TRUE(sbmap.get_next_set_extent(0) == null_extent);
+ ASSERT_TRUE(sbmap.get_next_clr_extent(0) == full_extent);
+
+ for (uint64_t bit = 0; bit < bit_count; bit++) {
+ sbmap.set(bit, 1);
+ }
+ ASSERT_TRUE(sbmap.get_next_set_extent(0) == full_extent);
+ ASSERT_TRUE(sbmap.get_next_clr_extent(0) == null_extent);
+
+ sbmap.clr(0, bit_count);
+ ASSERT_TRUE(sbmap.get_next_set_extent(0) == null_extent);
+ ASSERT_TRUE(sbmap.get_next_clr_extent(0) == full_extent);
+ }
+}
+
+TEST(shared_blob_2hash_tracker_t, basic_test)
+{
+ shared_blob_2hash_tracker_t t1(1024 * 1024, 4096);
+
+ ASSERT_TRUE(t1.count_non_zero() == 0);
+
+ t1.inc(0, 0, 1);
+ ASSERT_TRUE(t1.count_non_zero() != 0);
+ t1.inc(0, 0, -1);
+ ASSERT_TRUE(t1.count_non_zero() == 0);
+
+ t1.inc(3, 0x1000, 2);
+ ASSERT_TRUE(t1.count_non_zero() != 0);
+ t1.inc(3, 0x1000, -1);
+ ASSERT_TRUE(t1.count_non_zero() != 0);
+ t1.inc(3, 0x1000, -1);
+ ASSERT_TRUE(t1.count_non_zero() == 0);
+
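+  // The two independent hashes keep updates against different sbids (2 vs 18
+  // here) from cancelling each other; the tracker reads zero only once each
+  // sbid's own increments and decrements net out.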
+ t1.inc(2, 0x2000, 5);
+ ASSERT_TRUE(t1.count_non_zero() != 0);
+ t1.inc(18, 0x2000, -5);
+ ASSERT_TRUE(t1.count_non_zero() != 0);
+ t1.inc(18, 0x2000, 1);
+ ASSERT_TRUE(t1.count_non_zero() != 0);
+ t1.inc(2, 0x2000, -1);
+ ASSERT_TRUE(t1.count_non_zero() != 0);
+ t1.inc(18, 0x2000, 4);
+ ASSERT_TRUE(t1.count_non_zero() != 0);
+ t1.inc(2, 0x2000, -4);
+ ASSERT_TRUE(t1.count_non_zero() == 0);
+
+ t1.inc(3, 0x3000, 2);
+ ASSERT_TRUE(t1.count_non_zero() != 0);
+ t1.inc(4, 0x3000, -1);
+ ASSERT_TRUE(t1.count_non_zero() != 0);
+ t1.inc(4, 0x3000, -1);
+ ASSERT_TRUE(t1.count_non_zero() != 0);
+ t1.inc(3, 0x3000, -2);
+ ASSERT_TRUE(t1.count_non_zero() != 0);
+ t1.inc(4, 0x3000, 1);
+ ASSERT_TRUE(t1.count_non_zero() != 0);
+ t1.inc(4, 0x3000, 1);
+ ASSERT_TRUE(t1.count_non_zero() == 0);
+
+ t1.inc(5, 0x1000, 1);
+ t1.inc(5, 0x2000, 3);
+ t1.inc(5, 0x3000, 2);
+ t1.inc(5, 0x8000, 1);
+
+ ASSERT_TRUE(t1.count_non_zero() != 0);
+
+ ASSERT_TRUE(!t1.test_all_zero(5,0x1000));
+ ASSERT_TRUE(!t1.test_all_zero(5, 0x2000));
+ ASSERT_TRUE(!t1.test_all_zero(5, 0x3000));
+ ASSERT_TRUE(t1.test_all_zero(5, 0x4000));
+ ASSERT_TRUE(!t1.test_all_zero(5, 0x8000));
+
+ ASSERT_TRUE(t1.test_all_zero_range(5, 0, 0x1000));
+ ASSERT_TRUE(t1.test_all_zero_range(5, 0x500, 0x500));
+ ASSERT_TRUE(!t1.test_all_zero_range(5, 0x500, 0x1500));
+ ASSERT_TRUE(!t1.test_all_zero_range(5, 0x1500, 0x3200));
+ ASSERT_TRUE(t1.test_all_zero_range(5, 0x4500, 0x1500));
+ ASSERT_TRUE(t1.test_all_zero_range(5, 0x4500, 0x3b00));
+ ASSERT_TRUE(!t1.test_all_zero_range(5, 0, 0x9000));
+}
+
+TEST(bluestore_blob_use_tracker_t, mempool_stats_test)
+{
+ using mempool::bluestore_cache_other::allocated_items;
+ using mempool::bluestore_cache_other::allocated_bytes;
+ uint64_t other_items0 = allocated_items();
+ uint64_t other_bytes0 = allocated_bytes();
+ {
+ bluestore_blob_use_tracker_t* t1 = new bluestore_blob_use_tracker_t;
+
+ t1->init(1024 * 1024, 4096);
+ ASSERT_EQ(256, allocated_items() - other_items0); // = 1M / 4K
+ ASSERT_EQ(1024, allocated_bytes() - other_bytes0); // = 1M / 4K * 4
+
+ delete t1;
+ ASSERT_EQ(allocated_items(), other_items0);
+ ASSERT_EQ(allocated_bytes(), other_bytes0);
+ }
+ {
+ bluestore_blob_use_tracker_t* t1 = new bluestore_blob_use_tracker_t;
+
+ t1->init(1024 * 1024, 4096);
+ t1->add_tail(2048 * 1024, 4096);
+ // proper stats update after tail add
+ ASSERT_EQ(512, allocated_items() - other_items0); // = 2M / 4K
+ ASSERT_EQ(2048, allocated_bytes() - other_bytes0); // = 2M / 4K * 4
+
+ delete t1;
+ ASSERT_EQ(allocated_items(), other_items0);
+ ASSERT_EQ(allocated_bytes(), other_bytes0);
+ }
+ {
+ bluestore_blob_use_tracker_t* t1 = new bluestore_blob_use_tracker_t;
+
+ t1->init(1024 * 1024, 4096);
+ t1->prune_tail(512 * 1024);
+ // no changes in stats after pruning
+ ASSERT_EQ(256, allocated_items() - other_items0); // = 1M / 4K
+ ASSERT_EQ(1024, allocated_bytes() - other_bytes0); // = 1M / 4K * 4
+
+ delete t1;
+ ASSERT_EQ(allocated_items(), other_items0);
+ ASSERT_EQ(allocated_bytes(), other_bytes0);
+ }
+ {
+ bluestore_blob_use_tracker_t* t1 = new bluestore_blob_use_tracker_t;
+ bluestore_blob_use_tracker_t* t2 = new bluestore_blob_use_tracker_t;
+
+ t1->init(1024 * 1024, 4096);
+
+    // t1 keeps the same number of entries, and t2 gets half as many
+ t1->split(512 * 1024, t2);
+ ASSERT_EQ(256 + 128, allocated_items() - other_items0); //= 1M / 4K*1.5
+ ASSERT_EQ(1024 + 512, allocated_bytes() - other_bytes0); //= 1M / 4K*4*1.5
+
+    // t1 & t2 release everything, then t2 gets one less entry than it had
+    // before
+ t1->split(4096, t2);
+ ASSERT_EQ(127, allocated_items() - other_items0); // = 512K / 4K - 1
+    ASSERT_EQ(127 * 4, allocated_bytes() - other_bytes0); // = 512K / 4K * 4 - 4
+ delete t1;
+ delete t2;
+ ASSERT_EQ(allocated_items(), other_items0);
+ ASSERT_EQ(allocated_bytes(), other_bytes0);
+ }
+}
+
+int main(int argc, char **argv) {
+ auto args = argv_to_vec(argc, argv);
+ auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT,
+ CODE_ENVIRONMENT_UTILITY,
+ CINIT_FLAG_NO_DEFAULT_CONFIG_FILE);
+ common_init_finish(g_ceph_context);
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/test/objectstore/test_deferred.cc b/src/test/objectstore/test_deferred.cc
new file mode 100644
index 000000000..1b5608101
--- /dev/null
+++ b/src/test/objectstore/test_deferred.cc
@@ -0,0 +1,146 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <stdio.h>
+#include <string.h>
+#include <iostream>
+#include <memory>
+#include <time.h>
+
+#include "os/ObjectStore.h"
+#include "os/bluestore/BlueStore.h"
+#include "include/Context.h"
+#include "common/ceph_argparse.h"
+#include "global/global_init.h"
+#include "common/ceph_mutex.h"
+#include "common/Cond.h"
+#include "common/errno.h"
+#include "common/options.h" // for the size literals
+#include <semaphore.h>
+
+
+
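+// Trivial Context that invokes an arbitrary closure on completion.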
+class C_do_action : public Context {
+public:
+ std::function<void()> action;
+ C_do_action(std::function<void()> action)
+ : action(action) {}
+
+ void finish(int r) override {
+ action();
+ }
+};
+
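+// Prefill a handful of 64K objects, then queue small (deferred) writes plus
+// full overwrites and call exit(0) from the last on_commit callback, i.e.
+// terminate the process while deferred I/O may still be pending, so that
+// recovery on the next mount can be exercised externally.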
+void create_deferred_and_terminate() {
+ std::unique_ptr<ObjectStore> store;
+
+ g_ceph_context->_conf._clear_safe_to_start_threads();
+ g_ceph_context->_conf.set_val_or_die("bluestore_prefer_deferred_size", "4096");
+ g_ceph_context->_conf.set_val_or_die("bluestore_allocator", "bitmap");
+ g_ceph_context->_conf.set_val_or_die("bluestore_block_size", "10240000000");
+ g_ceph_context->_conf.apply_changes(nullptr);
+
+ int64_t poolid;
+ coll_t cid;
+ ghobject_t hoid;
+ ObjectStore::CollectionHandle ch;
+ ceph_assert(::mkdir("bluestore.test_temp_dir", 0777) == 0);
+ store = ObjectStore::create(g_ceph_context,
+ "bluestore",
+ "bluestore.test_temp_dir",
+ "store_test_temp_journal");
+ ceph_assert(store->mkfs() == 0);
+ ceph_assert(store->mount() == 0);
+
+ poolid = 11;
+ cid = coll_t(spg_t(pg_t(1, poolid), shard_id_t::NO_SHARD));
+ ch = store->create_new_collection(cid);
+ int r;
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ r = store->queue_transaction(ch, std::move(t));
+ ceph_assert(r == 0);
+ }
+
+ {
+ ObjectStore::Transaction t;
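+    // "zapchajdziura" is Polish for "stopgap"; the object merely pads the
+    // store ahead of the actual test objects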
+ std::string oid = "zapchajdziura";
+ ghobject_t hoid(hobject_t(oid, "", CEPH_NOSNAP, 1, poolid, ""));
+ bufferlist bl;
+ bl.append(std::string(0xe000, '-'));
+ t.write(cid, hoid, 0, 0xe000, bl);
+ r = store->queue_transaction(ch, std::move(t));
+ ceph_assert(r == 0);
+ }
+
+ size_t object_count = 10;
+
+ // initial fill
+ bufferlist bl_64K;
+ bl_64K.append(std::string(64 * 1024, '-'));
+
+ std::atomic<size_t> prefill_counter{0};
+ sem_t prefill_mutex;
+ sem_init(&prefill_mutex, 0, 0);
+
+ for (size_t o = 0; o < object_count; o++) {
+ ObjectStore::Transaction t;
+ std::string oid = "object-" + std::to_string(o);
+ ghobject_t hoid(hobject_t(oid, "", CEPH_NOSNAP, 1, poolid, ""));
+
+ t.write(cid, hoid, 0, bl_64K.length(), bl_64K);
+ t.register_on_commit(new C_do_action([&] {
+ if (++prefill_counter == object_count) {
+ sem_post(&prefill_mutex);
+ }
+ }));
+
+ r = store->queue_transaction(ch, std::move(t));
+ ceph_assert(r == 0);
+ }
+ sem_wait(&prefill_mutex);
+
+  // small deferred writes over each object,
+  // plus a complete overwrite of the previous one
+ bufferlist bl_8_bytes;
+ bl_8_bytes.append("abcdefgh");
+ std::atomic<size_t> deferred_counter{0};
+ for (size_t o = 0; o < object_count - 1; o++) {
+ ObjectStore::Transaction t;
+
+ // sprinkle deferred writes
+ std::string oid_d = "object-" + std::to_string(o + 1);
+ ghobject_t hoid_d(hobject_t(oid_d, "", CEPH_NOSNAP, 1, poolid, ""));
+
+ for(int i = 0; i < 16; i++) {
+ t.write(cid, hoid_d, 4096 * i, bl_8_bytes.length(), bl_8_bytes);
+ }
+
+ // overwrite previous object
+ std::string oid_m = "object-" + std::to_string(o);
+ ghobject_t hoid_m(hobject_t(oid_m, "", CEPH_NOSNAP, 1, poolid, ""));
+ t.write(cid, hoid_m, 0, bl_64K.length(), bl_64K);
+
+ t.register_on_commit(new C_do_action([&] {
+ if (++deferred_counter == object_count - 1) {
+ exit(0);
+ }
+ }));
+ r = store->queue_transaction(ch, std::move(t));
+ ceph_assert(r == 0);
+ }
+ sleep(10);
+ ceph_assert(0 && "should not reach here");
+}
+
+int main(int argc, char **argv) {
+ auto args = argv_to_vec(argc, argv);
+ auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT,
+ CODE_ENVIRONMENT_UTILITY,
+ CINIT_FLAG_NO_DEFAULT_CONFIG_FILE);
+ common_init_finish(g_ceph_context);
+
+ create_deferred_and_terminate();
+ return 0;
+}
diff --git a/src/test/objectstore/test_kv.cc b/src/test/objectstore/test_kv.cc
new file mode 100644
index 000000000..33ffd6ab3
--- /dev/null
+++ b/src/test/objectstore/test_kv.cc
@@ -0,0 +1,1304 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include <iostream>
+#include <time.h>
+#include <sys/mount.h>
+#include "kv/KeyValueDB.h"
+#include "kv/RocksDBStore.h"
+#include "include/Context.h"
+#include "common/ceph_argparse.h"
+#include "global/global_init.h"
+#include "common/Cond.h"
+#include "common/errno.h"
+#include "include/stringify.h"
+#include <gtest/gtest.h>
+
+using namespace std;
+
+class KVTest : public ::testing::TestWithParam<const char*> {
+public:
+ boost::scoped_ptr<KeyValueDB> db;
+
+ KVTest() : db(0) {}
+
+ string _bl_to_str(bufferlist val) {
+ string str(val.c_str(), val.length());
+ return str;
+ }
+
+ void rm_r(string path) {
+ string cmd = string("rm -r ") + path;
+ cout << "==> " << cmd << std::endl;
+ int r = ::system(cmd.c_str());
+ if (r) {
+ cerr << "failed with exit code " << r
+ << ", continuing anyway" << std::endl;
+ }
+ }
+
+ void init() {
+ cout << "Creating " << string(GetParam()) << "\n";
+ db.reset(KeyValueDB::create(g_ceph_context, string(GetParam()),
+ "kv_test_temp_dir"));
+ }
+ void fini() {
+ db.reset(NULL);
+ }
+
+ void SetUp() override {
+ int r = ::mkdir("kv_test_temp_dir", 0777);
+ if (r < 0 && errno != EEXIST) {
+ r = -errno;
+ cerr << __func__ << ": unable to create kv_test_temp_dir: "
+ << cpp_strerror(r) << std::endl;
+ return;
+ }
+ init();
+ }
+ void TearDown() override {
+ fini();
+ rm_r("kv_test_temp_dir");
+ }
+};
+
+TEST_P(KVTest, OpenClose) {
+ ASSERT_EQ(0, db->create_and_open(cout));
+ db->close();
+ db->open(cout);
+ fini();
+}
+
+TEST_P(KVTest, OpenCloseReopenClose) {
+ ASSERT_EQ(0, db->create_and_open(cout));
+ fini();
+ init();
+ ASSERT_EQ(0, db->open(cout));
+ fini();
+}
+
+/*
+ * Basic write and read test case in same database session.
+ */
+TEST_P(KVTest, OpenWriteRead) {
+ ASSERT_EQ(0, db->create_and_open(cout));
+ {
+ KeyValueDB::Transaction t = db->get_transaction();
+ bufferlist value;
+ value.append("value");
+ t->set("prefix", "key", value);
+ value.clear();
+ value.append("value2");
+ t->set("prefix", "key2", value);
+ value.clear();
+ value.append("value3");
+ t->set("prefix", "key3", value);
+ db->submit_transaction_sync(t);
+
+ bufferlist v1, v2;
+ ASSERT_EQ(0, db->get("prefix", "key", &v1));
+ ASSERT_EQ(v1.length(), 5u);
+    // construct strings with explicit lengths instead of writing a NUL
+    // terminator one byte past the end of the buffer
+    ASSERT_EQ(std::string(v1.c_str(), v1.length()), std::string("value"));
+    ASSERT_EQ(0, db->get("prefix", "key2", &v2));
+    ASSERT_EQ(v2.length(), 6u);
+    ASSERT_EQ(std::string(v2.c_str(), v2.length()), std::string("value2"));
+ }
+ fini();
+}
+
+TEST_P(KVTest, PutReopen) {
+ ASSERT_EQ(0, db->create_and_open(cout));
+ {
+ KeyValueDB::Transaction t = db->get_transaction();
+ bufferlist value;
+ value.append("value");
+ t->set("prefix", "key", value);
+ t->set("prefix", "key2", value);
+ t->set("prefix", "key3", value);
+ db->submit_transaction_sync(t);
+ }
+ fini();
+
+ init();
+ ASSERT_EQ(0, db->open(cout));
+ {
+ bufferlist v1, v2;
+ ASSERT_EQ(0, db->get("prefix", "key", &v1));
+ ASSERT_EQ(v1.length(), 5u);
+ ASSERT_EQ(0, db->get("prefix", "key2", &v2));
+ ASSERT_EQ(v2.length(), 5u);
+ }
+ {
+ KeyValueDB::Transaction t = db->get_transaction();
+ t->rmkey("prefix", "key");
+ t->rmkey("prefix", "key3");
+ db->submit_transaction_sync(t);
+ }
+ fini();
+
+ init();
+ ASSERT_EQ(0, db->open(cout));
+ {
+ bufferlist v1, v2, v3;
+ ASSERT_EQ(-ENOENT, db->get("prefix", "key", &v1));
+ ASSERT_EQ(0, db->get("prefix", "key2", &v2));
+ ASSERT_EQ(v2.length(), 5u);
+ ASSERT_EQ(-ENOENT, db->get("prefix", "key3", &v3));
+ }
+ fini();
+}
+
+TEST_P(KVTest, BenchCommit) {
+ int n = 1024;
+ ASSERT_EQ(0, db->create_and_open(cout));
+ utime_t start = ceph_clock_now();
+ {
+ cout << "priming" << std::endl;
+ // prime
+ bufferlist big;
+ bufferptr bp(1048576);
+ bp.zero();
+ big.append(bp);
+ for (int i=0; i<30; ++i) {
+ KeyValueDB::Transaction t = db->get_transaction();
+ t->set("prefix", "big" + stringify(i), big);
+ db->submit_transaction_sync(t);
+ }
+ }
+ cout << "now doing small writes" << std::endl;
+ bufferlist data;
+ bufferptr bp(1024);
+ bp.zero();
+ data.append(bp);
+ for (int i=0; i<n; ++i) {
+ KeyValueDB::Transaction t = db->get_transaction();
+ t->set("prefix", "key" + stringify(i), data);
+ db->submit_transaction_sync(t);
+ }
+ utime_t end = ceph_clock_now();
+ utime_t dur = end - start;
+ cout << n << " commits in " << dur << ", avg latency " << (dur / (double)n)
+ << std::endl;
+ fini();
+}
+
+struct AppendMOP : public KeyValueDB::MergeOperator {
+ void merge_nonexistent(
+ const char *rdata, size_t rlen, std::string *new_value) override {
+ *new_value = "?" + std::string(rdata, rlen);
+ }
+ void merge(
+ const char *ldata, size_t llen,
+ const char *rdata, size_t rlen,
+ std::string *new_value) override {
+ *new_value = std::string(ldata, llen) + std::string(rdata, rlen);
+ }
+  // Each operator name and each prefix is combined into the overall RocksDB
+  // operator name, which is verified for consistency at open time.
+ const char *name() const override {
+ return "Append";
+ }
+};
+
+string tostr(bufferlist& b) {
+ return string(b.c_str(),b.length());
+}
+
+TEST_P(KVTest, Merge) {
+ shared_ptr<KeyValueDB::MergeOperator> p(new AppendMOP);
+ int r = db->set_merge_operator("A",p);
+ if (r < 0)
+ return; // No merge operators for this database type
+ ASSERT_EQ(0, db->create_and_open(cout));
+ {
+ KeyValueDB::Transaction t = db->get_transaction();
+ bufferlist v1, v2, v3;
+ v1.append(string("1"));
+ v2.append(string("2"));
+ v3.append(string("3"));
+ t->set("P", "K1", v1);
+ t->set("A", "A1", v2);
+ t->rmkey("A", "A2");
+ t->merge("A", "A2", v3);
+ db->submit_transaction_sync(t);
+ }
+ {
+ bufferlist v1, v2, v3;
+ ASSERT_EQ(0, db->get("P", "K1", &v1));
+ ASSERT_EQ(tostr(v1), "1");
+ ASSERT_EQ(0, db->get("A", "A1", &v2));
+ ASSERT_EQ(tostr(v2), "2");
+ ASSERT_EQ(0, db->get("A", "A2", &v3));
+ ASSERT_EQ(tostr(v3), "?3");
+ }
+ {
+ KeyValueDB::Transaction t = db->get_transaction();
+ bufferlist v1;
+ v1.append(string("1"));
+ t->merge("A", "A2", v1);
+ db->submit_transaction_sync(t);
+ }
+ {
+ bufferlist v;
+ ASSERT_EQ(0, db->get("A", "A2", &v));
+ ASSERT_EQ(tostr(v), "?31");
+ }
+ fini();
+}
+
+TEST_P(KVTest, RMRange) {
+ ASSERT_EQ(0, db->create_and_open(cout));
+ bufferlist value;
+ value.append("value");
+ {
+ KeyValueDB::Transaction t = db->get_transaction();
+ t->set("prefix", "key1", value);
+ t->set("prefix", "key2", value);
+ t->set("prefix", "key3", value);
+ t->set("prefix", "key4", value);
+ t->set("prefix", "key45", value);
+ t->set("prefix", "key5", value);
+ t->set("prefix", "key6", value);
+ db->submit_transaction_sync(t);
+ }
+
+ {
+ KeyValueDB::Transaction t = db->get_transaction();
+ t->set("prefix", "key7", value);
+ t->set("prefix", "key8", value);
+ t->rm_range_keys("prefix", "key2", "key7");
+ db->submit_transaction_sync(t);
+ bufferlist v1, v2;
+ ASSERT_EQ(0, db->get("prefix", "key1", &v1));
+ v1.clear();
+ ASSERT_EQ(-ENOENT, db->get("prefix", "key45", &v1));
+ ASSERT_EQ(0, db->get("prefix", "key8", &v1));
+ v1.clear();
+ ASSERT_EQ(-ENOENT, db->get("prefix", "key2", &v1));
+ ASSERT_EQ(0, db->get("prefix", "key7", &v2));
+ }
+
+ {
+ KeyValueDB::Transaction t = db->get_transaction();
+ t->rm_range_keys("prefix", "key", "key");
+ db->submit_transaction_sync(t);
+ bufferlist v1, v2;
+ ASSERT_EQ(0, db->get("prefix", "key1", &v1));
+ ASSERT_EQ(0, db->get("prefix", "key8", &v2));
+ }
+
+ {
+ KeyValueDB::Transaction t = db->get_transaction();
+ t->rm_range_keys("prefix", "key-", "key~");
+ db->submit_transaction_sync(t);
+ bufferlist v1, v2;
+ ASSERT_EQ(-ENOENT, db->get("prefix", "key1", &v1));
+ ASSERT_EQ(-ENOENT, db->get("prefix", "key8", &v2));
+ }
+
+ fini();
+}
+
+TEST_P(KVTest, ShardingRMRange) {
+ if(string(GetParam()) != "rocksdb")
+ return;
+ std::string cfs("O(7)=");
+ ASSERT_EQ(0, db->create_and_open(cout, cfs));
+ {
+ KeyValueDB::Transaction t = db->get_transaction();
+ for (size_t i = 0; i < 1000; i++) {
+ bufferlist value;
+ char* a;
+      ASSERT_EQ(asprintf(&a, "key%3.3zu", i), 6);
+ value.append(a);
+ t->set("O", a, value);
+ free(a);
+ }
+ db->submit_transaction_sync(t);
+ }
+
+ {
+ KeyValueDB::Transaction t = db->get_transaction();
+ t->rm_range_keys("O", "key277", "key467");
+ db->submit_transaction_sync(t);
+ }
+
+ for (size_t i = 0; i < 1000; i++) {
+ char* key;
+    ASSERT_EQ(asprintf(&key, "key%3.3zu", i), 6);
+ bufferlist value;
+ int r = db->get("O", key, &value);
+ ASSERT_EQ(r, (i >= 277 && i < 467 ? -ENOENT : 0));
+ free(key);
+ }
+
+ fini();
+}
+
+
+TEST_P(KVTest, RocksDBColumnFamilyTest) {
+ if(string(GetParam()) != "rocksdb")
+ return;
+
+ std::string cfs("cf1 cf2");
+ ASSERT_EQ(0, db->init(g_conf()->bluestore_rocksdb_options));
+ cout << "creating two column families and opening them" << std::endl;
+ ASSERT_EQ(0, db->create_and_open(cout, cfs));
+ {
+ KeyValueDB::Transaction t = db->get_transaction();
+ bufferlist value;
+ value.append("value");
+ cout << "write a transaction includes three keys in different CFs" << std::endl;
+ t->set("prefix", "key", value);
+ t->set("cf1", "key", value);
+ t->set("cf2", "key2", value);
+ ASSERT_EQ(0, db->submit_transaction_sync(t));
+ }
+ fini();
+
+ init();
+ ASSERT_EQ(0, db->open(cout, cfs));
+ {
+ bufferlist v1, v2, v3;
+ cout << "reopen db and read those keys" << std::endl;
+ ASSERT_EQ(0, db->get("prefix", "key", &v1));
+    ASSERT_EQ("value", _bl_to_str(v1));
+    ASSERT_EQ(0, db->get("cf1", "key", &v2));
+    ASSERT_EQ("value", _bl_to_str(v2));
+    ASSERT_EQ(0, db->get("cf2", "key2", &v3));
+    ASSERT_EQ("value", _bl_to_str(v3));
+ }
+ {
+ cout << "delete two keys in CFs" << std::endl;
+ KeyValueDB::Transaction t = db->get_transaction();
+ t->rmkey("prefix", "key");
+ t->rmkey("cf2", "key2");
+ ASSERT_EQ(0, db->submit_transaction_sync(t));
+ }
+ fini();
+
+ init();
+ ASSERT_EQ(0, db->open(cout, cfs));
+ {
+ cout << "reopen db and read keys again." << std::endl;
+ bufferlist v1, v2, v3;
+ ASSERT_EQ(-ENOENT, db->get("prefix", "key", &v1));
+ ASSERT_EQ(0, db->get("cf1", "key", &v2));
+    ASSERT_EQ("value", _bl_to_str(v2));
+ ASSERT_EQ(-ENOENT, db->get("cf2", "key2", &v3));
+ }
+ fini();
+}
+
+TEST_P(KVTest, RocksDBIteratorTest) {
+ if(string(GetParam()) != "rocksdb")
+ return;
+
+ std::string cfs("cf1");
+ ASSERT_EQ(0, db->init(g_conf()->bluestore_rocksdb_options));
+ cout << "creating one column family and opening it" << std::endl;
+ ASSERT_EQ(0, db->create_and_open(cout, cfs));
+ {
+ KeyValueDB::Transaction t = db->get_transaction();
+ bufferlist bl1;
+ bl1.append("hello");
+ bufferlist bl2;
+ bl2.append("world");
+ cout << "write some kv pairs into default and new CFs" << std::endl;
+ t->set("prefix", "key1", bl1);
+ t->set("prefix", "key2", bl2);
+ t->set("cf1", "key1", bl1);
+ t->set("cf1", "key2", bl2);
+ ASSERT_EQ(0, db->submit_transaction_sync(t));
+ }
+ {
+ cout << "iterating the default CF" << std::endl;
+ KeyValueDB::Iterator iter = db->get_iterator("prefix");
+ iter->seek_to_first();
+ ASSERT_EQ(1, iter->valid());
+ ASSERT_EQ("key1", iter->key());
+ ASSERT_EQ("hello", _bl_to_str(iter->value()));
+ ASSERT_EQ(0, iter->next());
+ ASSERT_EQ(1, iter->valid());
+ ASSERT_EQ("key2", iter->key());
+ ASSERT_EQ("world", _bl_to_str(iter->value()));
+ }
+ {
+ cout << "iterating the new CF" << std::endl;
+ KeyValueDB::Iterator iter = db->get_iterator("cf1");
+ iter->seek_to_first();
+ ASSERT_EQ(1, iter->valid());
+ ASSERT_EQ("key1", iter->key());
+ ASSERT_EQ("hello", _bl_to_str(iter->value()));
+ ASSERT_EQ(0, iter->next());
+ ASSERT_EQ(1, iter->valid());
+ ASSERT_EQ("key2", iter->key());
+ ASSERT_EQ("world", _bl_to_str(iter->value()));
+ }
+ fini();
+}
+
+TEST_P(KVTest, RocksDBShardingIteratorTest) {
+ if(string(GetParam()) != "rocksdb")
+ return;
+
+ std::string cfs("A(6)");
+ ASSERT_EQ(0, db->init(g_conf()->bluestore_rocksdb_options));
+ cout << "creating one column family and opening it" << std::endl;
+ ASSERT_EQ(0, db->create_and_open(cout, cfs));
+ {
+ KeyValueDB::Transaction t = db->get_transaction();
+ for (int v = 100; v <= 999; v++) {
+ std::string str = to_string(v);
+ bufferlist val;
+ val.append(str);
+ t->set("A", str, val);
+ }
+ ASSERT_EQ(0, db->submit_transaction_sync(t));
+ }
+ {
+ KeyValueDB::Iterator it = db->get_iterator("A");
+ int pos = 0;
+ ASSERT_EQ(it->lower_bound(to_string(pos)), 0);
+ for (pos = 100; pos <= 999; pos++) {
+ ASSERT_EQ(it->valid(), true);
+ ASSERT_EQ(it->key(), to_string(pos));
+ ASSERT_EQ(it->value().to_str(), to_string(pos));
+ it->next();
+ }
+ ASSERT_EQ(it->valid(), false);
+ pos = 999;
+ ASSERT_EQ(it->lower_bound(to_string(pos)), 0);
+ for (pos = 999; pos >= 100; pos--) {
+ ASSERT_EQ(it->valid(), true);
+ ASSERT_EQ(it->key(), to_string(pos));
+ ASSERT_EQ(it->value().to_str(), to_string(pos));
+ it->prev();
+ }
+ ASSERT_EQ(it->valid(), false);
+ }
+ fini();
+}
+
+TEST_P(KVTest, RocksDBCFMerge) {
+ if(string(GetParam()) != "rocksdb")
+ return;
+
+ shared_ptr<KeyValueDB::MergeOperator> p(new AppendMOP);
+ int r = db->set_merge_operator("cf1",p);
+ if (r < 0)
+ return; // No merge operators for this database type
+ std::string cfs("cf1");
+ ASSERT_EQ(0, db->init(g_conf()->bluestore_rocksdb_options));
+ cout << "creating one column family and opening it" << std::endl;
+ ASSERT_EQ(0, db->create_and_open(cout, cfs));
+
+ {
+ KeyValueDB::Transaction t = db->get_transaction();
+ bufferlist v1, v2, v3;
+ v1.append(string("1"));
+ v2.append(string("2"));
+ v3.append(string("3"));
+ t->set("P", "K1", v1);
+ t->set("cf1", "A1", v2);
+ t->rmkey("cf1", "A2");
+ t->merge("cf1", "A2", v3);
+ db->submit_transaction_sync(t);
+ }
+ {
+ bufferlist v1, v2, v3;
+ ASSERT_EQ(0, db->get("P", "K1", &v1));
+ ASSERT_EQ(tostr(v1), "1");
+ ASSERT_EQ(0, db->get("cf1", "A1", &v2));
+ ASSERT_EQ(tostr(v2), "2");
+ ASSERT_EQ(0, db->get("cf1", "A2", &v3));
+ ASSERT_EQ(tostr(v3), "?3");
+ }
+ {
+ KeyValueDB::Transaction t = db->get_transaction();
+ bufferlist v1;
+ v1.append(string("1"));
+ t->merge("cf1", "A2", v1);
+ db->submit_transaction_sync(t);
+ }
+ {
+ bufferlist v;
+ ASSERT_EQ(0, db->get("cf1", "A2", &v));
+ ASSERT_EQ(tostr(v), "?31");
+ }
+ fini();
+}
+
+TEST_P(KVTest, RocksDB_estimate_size) {
+ if(string(GetParam()) != "rocksdb")
+ GTEST_SKIP();
+
+ std::string cfs("cf1");
+ ASSERT_EQ(0, db->init(g_conf()->bluestore_rocksdb_options));
+ cout << "creating one column family and opening it" << std::endl;
+ ASSERT_EQ(0, db->create_and_open(cout));
+
+ for(int test = 0; test < 20; test++)
+ {
+ KeyValueDB::Transaction t = db->get_transaction();
+ bufferlist v1;
+ v1.append(string(1000, '1'));
+ for (int i = 0; i < 100; i++)
+ t->set("A", to_string(rand()%100000), v1);
+ db->submit_transaction_sync(t);
+ db->compact();
+
+ int64_t size_a = db->estimate_prefix_size("A","");
+ ASSERT_GT(size_a, (test + 1) * 1000 * 100 * 0.5);
+ ASSERT_LT(size_a, (test + 1) * 1000 * 100 * 1.5);
+ int64_t size_a1 = db->estimate_prefix_size("A","1");
+ ASSERT_GT(size_a1, (test + 1) * 1000 * 100 * 0.1 * 0.5);
+ ASSERT_LT(size_a1, (test + 1) * 1000 * 100 * 0.1 * 1.5);
+ int64_t size_b = db->estimate_prefix_size("B","");
+ ASSERT_EQ(size_b, 0);
+ }
+
+ fini();
+}
+
+TEST_P(KVTest, RocksDB_estimate_size_column_family) {
+ if(string(GetParam()) != "rocksdb")
+ GTEST_SKIP();
+
+ std::string cfs("cf1");
+ ASSERT_EQ(0, db->init(g_conf()->bluestore_rocksdb_options));
+ cout << "creating one column family and opening it" << std::endl;
+ ASSERT_EQ(0, db->create_and_open(cout, cfs));
+
+ for(int test = 0; test < 20; test++)
+ {
+ KeyValueDB::Transaction t = db->get_transaction();
+ bufferlist v1;
+ v1.append(string(1000, '1'));
+ for (int i = 0; i < 100; i++)
+ t->set("cf1", to_string(rand()%100000), v1);
+ db->submit_transaction_sync(t);
+ db->compact();
+
+ int64_t size_a = db->estimate_prefix_size("cf1","");
+ ASSERT_GT(size_a, (test + 1) * 1000 * 100 * 0.5);
+ ASSERT_LT(size_a, (test + 1) * 1000 * 100 * 1.5);
+ int64_t size_a1 = db->estimate_prefix_size("cf1","1");
+ ASSERT_GT(size_a1, (test + 1) * 1000 * 100 * 0.1 * 0.5);
+ ASSERT_LT(size_a1, (test + 1) * 1000 * 100 * 0.1 * 1.5);
+ int64_t size_b = db->estimate_prefix_size("B","");
+ ASSERT_EQ(size_b, 0);
+ }
+
+ fini();
+}
+
+TEST_P(KVTest, RocksDB_parse_sharding_def) {
+ if(string(GetParam()) != "rocksdb")
+ GTEST_SKIP();
+
+ bool result;
+ std::vector<RocksDBStore::ColumnFamily> sharding_def;
+ char const* error_position = nullptr;
+ std::string error_msg;
+
+ std::string_view text_def = "A(10,0-30) B(6)=option1,option2=aaaa C";
+ result = RocksDBStore::parse_sharding_def(text_def,
+ sharding_def,
+ &error_position,
+ &error_msg);
+
+ ASSERT_EQ(result, true);
+ ASSERT_EQ(error_position, nullptr);
+ ASSERT_EQ(error_msg, "");
+ std::cout << text_def << std::endl;
+ if (error_position) std::cout << std::string(error_position - text_def.begin(), ' ') << "^" << error_msg << std::endl;
+
+ ASSERT_EQ(sharding_def.size(), 3);
+ ASSERT_EQ(sharding_def[0].name, "A");
+ ASSERT_EQ(sharding_def[0].shard_cnt, 10);
+ ASSERT_EQ(sharding_def[0].hash_l, 0);
+ ASSERT_EQ(sharding_def[0].hash_h, 30);
+
+ ASSERT_EQ(sharding_def[1].name, "B");
+ ASSERT_EQ(sharding_def[1].shard_cnt, 6);
+ ASSERT_EQ(sharding_def[1].options, "option1,option2=aaaa");
+ ASSERT_EQ(sharding_def[2].name, "C");
+ ASSERT_EQ(sharding_def[2].shard_cnt, 1);
+
+
+ text_def = "A(10 B(6)=option C";
+ result = RocksDBStore::parse_sharding_def(text_def,
+ sharding_def,
+ &error_position,
+ &error_msg);
+ std::cout << text_def << std::endl;
+ if (error_position)
+ std::cout << std::string(error_position - text_def.begin(), ' ') << "^" << error_msg << std::endl;
+ ASSERT_EQ(result, false);
+ ASSERT_NE(error_position, nullptr);
+ ASSERT_NE(error_msg, "");
+
+ text_def = "A(10,1) B(6)=option C";
+ result = RocksDBStore::parse_sharding_def(text_def,
+ sharding_def,
+ &error_position,
+ &error_msg);
+  std::cout << text_def << std::endl;
+  if (error_position)
+    std::cout << std::string(error_position - text_def.begin(), ' ') << "^" << error_msg << std::endl;
+ ASSERT_EQ(result, false);
+ ASSERT_NE(error_position, nullptr);
+ ASSERT_NE(error_msg, "");
+}
+
+
+
+class RocksDBShardingTest : public ::testing::TestWithParam<const char*> {
+public:
+ boost::scoped_ptr<KeyValueDB> db;
+
+ RocksDBShardingTest() : db(0) {}
+
+ string _bl_to_str(bufferlist val) {
+ string str(val.c_str(), val.length());
+ return str;
+ }
+
+ void rm_r(string path) {
+ string cmd = string("rm -r ") + path;
+ if (verbose)
+ cout << "==> " << cmd << std::endl;
+ int r = ::system(cmd.c_str());
+ if (r) {
+ cerr << "failed with exit code " << r
+ << ", continuing anyway" << std::endl;
+ }
+ }
+
+ void SetUp() override {
+ verbose = getenv("VERBOSE") && strcmp(getenv("VERBOSE"), "1") == 0;
+
+ int r = ::mkdir("kv_test_temp_dir", 0777);
+ if (r < 0 && errno != EEXIST) {
+ r = -errno;
+ cerr << __func__ << ": unable to create kv_test_temp_dir: "
+ << cpp_strerror(r) << std::endl;
+ return;
+ }
+ db.reset(KeyValueDB::create(g_ceph_context, "rocksdb",
+ "kv_test_temp_dir"));
+ ASSERT_EQ(0, db->init(g_conf()->bluestore_rocksdb_options));
+ if (verbose)
+ cout << "Creating database with sharding: " << GetParam() << std::endl;
+ ASSERT_EQ(0, db->create_and_open(cout, GetParam()));
+ }
+ void TearDown() override {
+ db.reset(nullptr);
+ rm_r("kv_test_temp_dir");
+ }
+
+ /*
+ A - main 0/1/20
+ B - shard 1/3 x 0/1/20
+ C - main 0/1/20
+ D - shard 1/3 x 0/1/20
+ E - main 0/1/20
+ */
+ bool verbose;
+ std::vector<std::string> sharding_defs = {
+ "Betelgeuse D",
+ "Betelgeuse(3) D",
+ "Betelgeuse D(3)",
+ "Betelgeuse(3) D(3)"};
+ std::vector<std::string> prefixes = {"Ad", "Betelgeuse", "C", "D", "Evade"};
+ std::vector<std::string> randoms = {"0", "1", "2", "3", "4", "5",
+ "found", "brain", "fully", "pen", "worth", "race",
+ "stand", "nodded", "whenever", "surrounded", "industrial", "skin",
+ "this", "direction", "family", "beginning", "whenever", "held",
+ "metal", "year", "like", "valuable", "softly", "whistle",
+ "perfectly", "broken", "idea", "also", "coffee", "branch",
+ "tongue", "immediately", "bent", "partly", "burn", "include",
+ "certain", "burst", "final", "smoke", "positive", "perfectly"
+ };
+ int R = randoms.size();
+
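+  // test_id is a 6-digit base-3 counter driving generate_data(): digits 0..4
+  // choose whether each prefix gets zero, one or a batch of keys, and digit 5
+  // acts as the overflow sentinel that ends the iteration.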
+ typedef int test_id[6];
+ void zero(test_id& x) {
+ k = 0;
+ v = 0;
+ for (auto& i:x)
+ i = 0;
+ }
+ bool end(const test_id& x) {
+ return x[5] != 0;
+ }
+ void next(test_id& x) {
+ x[0]++;
+ for (int i = 0; i < 5; i++) {
+ if (x[i] == 3) {
+ x[i] = 0;
+ ++x[i + 1];
+ }
+ }
+ }
+
+ std::map<std::string, std::string> data;
+ int k = 0;
+ int v = 0;
+
+ void generate_data(const test_id& x) {
+ data.clear();
+ for (int i = 0; i < 5; i++) {
+ if (verbose)
+ std::cout << x[i] << "-";
+ switch (x[i]) {
+ case 0:
+ break;
+ case 1:
+ data[RocksDBStore::combine_strings(prefixes[i], randoms[k++ % R])] = randoms[v++ % R];
+ break;
+ case 2:
+ std::string base = randoms[k++ % R];
+ for (int j = 0; j < 10; j++) {
+ data[RocksDBStore::combine_strings(prefixes[i], base + "." + randoms[k++ % R])] = randoms[v++ % R];
+ }
+ break;
+ }
+ }
+ }
+
+ void data_to_db() {
+ KeyValueDB::Transaction t = db->get_transaction();
+ for (auto &d : data) {
+ bufferlist v1;
+ v1.append(d.second);
+ string prefix;
+ string key;
+ RocksDBStore::split_key(d.first, &prefix, &key);
+ t->set(prefix, key, v1);
+ if (verbose)
+ std::cout << "SET " << prefix << " " << key << std::endl;
+ }
+ ASSERT_EQ(db->submit_transaction_sync(t), 0);
+ }
+
+ void clear_db() {
+ KeyValueDB::Transaction t = db->get_transaction();
+ for (auto &d : data) {
+ string prefix;
+ string key;
+ RocksDBStore::split_key(d.first, &prefix, &key);
+ t->rmkey(prefix, key);
+ }
+ ASSERT_EQ(db->submit_transaction_sync(t), 0);
+ //paranoid, check if db empty
+ KeyValueDB::WholeSpaceIterator it = db->get_wholespace_iterator();
+ ASSERT_EQ(it->seek_to_first(), 0);
+ ASSERT_EQ(it->valid(), false);
+ }
+};
+
+TEST_P(RocksDBShardingTest, wholespace_next) {
+ test_id X;
+ zero(X);
+ do {
+ generate_data(X);
+ data_to_db();
+
+ KeyValueDB::WholeSpaceIterator it = db->get_wholespace_iterator();
+ //move forward
+ auto dit = data.begin();
+ int r = it->seek_to_first();
+ ASSERT_EQ(r, 0);
+ ASSERT_EQ(it->valid(), (dit != data.end()));
+
+ while (dit != data.end()) {
+ ASSERT_EQ(it->valid(), true);
+ string prefix;
+ string key;
+ RocksDBStore::split_key(dit->first, &prefix, &key);
+ auto raw_key = it->raw_key();
+ ASSERT_EQ(raw_key.first, prefix);
+ ASSERT_EQ(raw_key.second, key);
+ ASSERT_EQ(it->value().to_str(), dit->second);
+ if (verbose)
+ std::cout << "next " << prefix << " " << key << std::endl;
+ ASSERT_EQ(it->next(), 0);
+ ++dit;
+ }
+ ASSERT_EQ(it->valid(), false);
+
+ clear_db();
+ next(X);
+ } while (!end(X));
+}
+
+TEST_P(RocksDBShardingTest, wholespace_prev) {
+ test_id X;
+ zero(X);
+ do {
+ generate_data(X);
+ data_to_db();
+
+ KeyValueDB::WholeSpaceIterator it = db->get_wholespace_iterator();
+ auto dit = data.rbegin();
+ int r = it->seek_to_last();
+ ASSERT_EQ(r, 0);
+ ASSERT_EQ(it->valid(), (dit != data.rend()));
+
+ while (dit != data.rend()) {
+ ASSERT_EQ(it->valid(), true);
+ string prefix;
+ string key;
+ RocksDBStore::split_key(dit->first, &prefix, &key);
+ auto raw_key = it->raw_key();
+ ASSERT_EQ(raw_key.first, prefix);
+ ASSERT_EQ(raw_key.second, key);
+ ASSERT_EQ(it->value().to_str(), dit->second);
+ if (verbose)
+ std::cout << "prev " << prefix << " " << key << std::endl;
+ ASSERT_EQ(it->prev(), 0);
+ ++dit;
+ }
+ ASSERT_EQ(it->valid(), false);
+
+ clear_db();
+ next(X);
+ } while (!end(X));
+}
+
+TEST_P(RocksDBShardingTest, wholespace_lower_bound) {
+ test_id X;
+ zero(X);
+ do {
+ generate_data(X);
+ data_to_db();
+
+ KeyValueDB::WholeSpaceIterator it = db->get_wholespace_iterator();
+ auto dit = data.begin();
+ int r = it->seek_to_first();
+ ASSERT_EQ(r, 0);
+ ASSERT_EQ(it->valid(), (dit != data.end()));
+
+ while (dit != data.end()) {
+ ASSERT_EQ(it->valid(), true);
+ string prefix;
+ string key;
+ RocksDBStore::split_key(dit->first, &prefix, &key);
+ KeyValueDB::WholeSpaceIterator it1 = db->get_wholespace_iterator();
+ ASSERT_EQ(it1->lower_bound(prefix, key), 0);
+ ASSERT_EQ(it1->valid(), true);
+ auto raw_key = it1->raw_key();
+ ASSERT_EQ(raw_key.first, prefix);
+ ASSERT_EQ(raw_key.second, key);
+ if (verbose)
+ std::cout << "lower_bound " << prefix << " " << key << std::endl;
+ ASSERT_EQ(it->next(), 0);
+ ++dit;
+ }
+ ASSERT_EQ(it->valid(), false);
+
+ clear_db();
+ next(X);
+ } while (!end(X));
+}
+
+TEST_P(RocksDBShardingTest, wholespace_upper_bound) {
+ test_id X;
+ zero(X);
+ do {
+ generate_data(X);
+ data_to_db();
+
+ KeyValueDB::WholeSpaceIterator it = db->get_wholespace_iterator();
+ auto dit = data.begin();
+ int r = it->seek_to_first();
+ ASSERT_EQ(r, 0);
+ ASSERT_EQ(it->valid(), (dit != data.end()));
+
+ while (dit != data.end()) {
+ ASSERT_EQ(it->valid(), true);
+ string prefix;
+ string key;
+ string key_minus_1;
+ RocksDBStore::split_key(dit->first, &prefix, &key);
+ //decrement key minimally
+ key_minus_1 = key.substr(0, key.length() - 1) + std::string(1, key[key.length() - 1] - 1);
+ KeyValueDB::WholeSpaceIterator it1 = db->get_wholespace_iterator();
+ ASSERT_EQ(it1->upper_bound(prefix, key_minus_1), 0);
+ ASSERT_EQ(it1->valid(), true);
+ auto raw_key = it1->raw_key();
+ ASSERT_EQ(raw_key.first, prefix);
+ ASSERT_EQ(raw_key.second, key);
+ if (verbose)
+ std::cout << "upper_bound " << prefix << " " << key_minus_1 << std::endl;
+ ASSERT_EQ(it->next(), 0);
+ ++dit;
+ }
+ ASSERT_EQ(it->valid(), false);
+
+ clear_db();
+ next(X);
+ } while (!end(X));
+}
+
+TEST_P(RocksDBShardingTest, wholespace_lookup_limits) {
+ test_id X;
+ zero(X);
+ do {
+ generate_data(X);
+ data_to_db();
+
+ //lookup before first
+ if (data.size() > 0) {
+ auto dit = data.begin();
+ string prefix;
+ string key;
+ RocksDBStore::split_key(dit->first, &prefix, &key);
+ KeyValueDB::WholeSpaceIterator it1 = db->get_wholespace_iterator();
+ ASSERT_EQ(it1->lower_bound(" ", " "), 0);
+ ASSERT_EQ(it1->valid(), true);
+ auto raw_key = it1->raw_key();
+ ASSERT_EQ(raw_key.first, prefix);
+ ASSERT_EQ(raw_key.second, key);
+ }
+ //lookup after last
+ KeyValueDB::WholeSpaceIterator it1 = db->get_wholespace_iterator();
+ ASSERT_EQ(it1->lower_bound("~", "~"), 0);
+ ASSERT_EQ(it1->valid(), false);
+
+ clear_db();
+ next(X);
+ } while (!end(X));
+}
+
+
+
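+// Fixture for RocksDBStore::reshard(): populates the DB with a deterministic
+// key set across several prefixes, reshards the column families while the DB
+// is closed, and verifies afterwards that the data survived intact.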
+class RocksDBResharding : public ::testing::Test {
+public:
+ boost::scoped_ptr<RocksDBStore> db;
+
+ RocksDBResharding() : db(0) {}
+
+ string _bl_to_str(bufferlist val) {
+ string str(val.c_str(), val.length());
+ return str;
+ }
+
+ void rm_r(string path) {
+ string cmd = string("rm -r ") + path;
+ if (verbose)
+ cout << "==> " << cmd << std::endl;
+ int r = ::system(cmd.c_str());
+ if (r) {
+ cerr << "failed with exit code " << r
+ << ", continuing anyway" << std::endl;
+ }
+ }
+
+ void SetUp() override {
+ verbose = getenv("VERBOSE") && strcmp(getenv("VERBOSE"), "1") == 0;
+
+ int r = ::mkdir("kv_test_temp_dir", 0777);
+ if (r < 0 && errno != EEXIST) {
+ r = -errno;
+ cerr << __func__ << ": unable to create kv_test_temp_dir: "
+ << cpp_strerror(r) << std::endl;
+ return;
+ }
+
+ KeyValueDB* db_kv = KeyValueDB::create(g_ceph_context, "rocksdb",
+ "kv_test_temp_dir");
+ RocksDBStore* db_rocks = dynamic_cast<RocksDBStore*>(db_kv);
+ ceph_assert(db_rocks);
+ db.reset(db_rocks);
+ ASSERT_EQ(0, db->init(g_conf()->bluestore_rocksdb_options));
+ }
+ void TearDown() override {
+ db.reset(nullptr);
+ rm_r("kv_test_temp_dir");
+ }
+
+ bool verbose;
+ std::vector<std::string> prefixes = {"Ad", "Betelgeuse", "C", "D", "Evade"};
+ std::vector<std::string> randoms = {"0", "1", "2", "3", "4", "5",
+ "found", "brain", "fully", "pen", "worth", "race",
+ "stand", "nodded", "whenever", "surrounded", "industrial", "skin",
+ "this", "direction", "family", "beginning", "whenever", "held",
+ "metal", "year", "like", "valuable", "softly", "whistle",
+ "perfectly", "broken", "idea", "also", "coffee", "branch",
+ "tongue", "immediately", "bent", "partly", "burn", "include",
+ "certain", "burst", "final", "smoke", "positive", "perfectly"
+ };
+ int R = randoms.size();
+ int k = 0;
+ std::map<std::string, std::string> data;
+
+ void generate_data() {
+ data.clear();
+ for (size_t p = 0; p < prefixes.size(); p++) {
+ size_t elem_count = 1 << (( p * 3 ) + 3);
+ for (size_t i = 0; i < elem_count; i++) {
+ std::string key;
+ for (int x = 0; x < 5; x++) {
+ key = key + randoms[rand() % R];
+ }
+ std::string value;
+ for (int x = 0; x < 3; x++) {
+ value = value + randoms[rand() % R];
+ }
+ data[RocksDBStore::combine_strings(prefixes[p], key)] = value;
+ }
+ }
+ }
+
+ void data_to_db() {
+ KeyValueDB::Transaction t = db->get_transaction();
+ size_t i = 0;
+ for (auto& d: data) {
+ bufferlist v1;
+ v1.append(d.second);
+ string prefix;
+ string key;
+ RocksDBStore::split_key(d.first, &prefix, &key);
+ t->set(prefix, key, v1);
+ if (verbose)
+ std::cout << "SET " << prefix << " " << key << std::endl;
+ i++;
+ if ((i % 1000) == 0) {
+ ASSERT_EQ(db->submit_transaction_sync(t), 0);
+ t.reset();
+ if (verbose)
+ std::cout << "writing key to DB" << std::endl;
+ t = db->get_transaction();
+ }
+ }
+ if (verbose)
+ std::cout << "writing keys to DB" << std::endl;
+ ASSERT_EQ(db->submit_transaction_sync(t), 0);
+ }
+
+ void clear_db() {
+ KeyValueDB::Transaction t = db->get_transaction();
+ for (auto &d : data) {
+ string prefix;
+ string key;
+ RocksDBStore::split_key(d.first, &prefix, &key);
+ t->rmkey(prefix, key);
+ }
+ ASSERT_EQ(db->submit_transaction_sync(t), 0);
+ //paranoid, check if db empty
+ KeyValueDB::WholeSpaceIterator it = db->get_wholespace_iterator();
+ ASSERT_EQ(it->seek_to_first(), 0);
+ ASSERT_EQ(it->valid(), false);
+ }
+
+ void check_db() {
+ KeyValueDB::WholeSpaceIterator it = db->get_wholespace_iterator();
+ //move forward
+ auto dit = data.begin();
+ int r = it->seek_to_first();
+ ASSERT_EQ(r, 0);
+ ASSERT_EQ(it->valid(), (dit != data.end()));
+
+ while (dit != data.end()) {
+ ASSERT_EQ(it->valid(), true);
+ string prefix;
+ string key;
+ RocksDBStore::split_key(dit->first, &prefix, &key);
+ auto raw_key = it->raw_key();
+ ASSERT_EQ(raw_key.first, prefix);
+ ASSERT_EQ(raw_key.second, key);
+ ASSERT_EQ(it->value().to_str(), dit->second);
+ if (verbose)
+ std::cout << "next " << prefix << " " << key << std::endl;
+ ASSERT_EQ(it->next(), 0);
+ ++dit;
+ }
+ ASSERT_EQ(it->valid(), false);
+ }
+};
+
+TEST_F(RocksDBResharding, basic) {
+ ASSERT_EQ(0, db->create_and_open(cout, ""));
+ generate_data();
+ data_to_db();
+ check_db();
+ db->close();
+ ASSERT_EQ(db->reshard("Evade(4)"), 0);
+ ASSERT_EQ(db->open(cout), 0);
+ check_db();
+ db->close();
+}
+
+TEST_F(RocksDBResharding, all_to_shards) {
+ ASSERT_EQ(0, db->create_and_open(cout, ""));
+ generate_data();
+ data_to_db();
+ check_db();
+ db->close();
+ ASSERT_EQ(db->reshard("Ad(1) Betelgeuse(1) C(1) D(1) Evade(1)"), 0);
+ ASSERT_EQ(db->open(cout), 0);
+ check_db();
+ db->close();
+}
+
+TEST_F(RocksDBResharding, all_to_shards_and_back_again) {
+ ASSERT_EQ(0, db->create_and_open(cout, ""));
+ generate_data();
+ data_to_db();
+ check_db();
+ db->close();
+ ASSERT_EQ(db->reshard("Ad(1) Betelgeuse(1) C(1) D(1) Evade(1)"), 0);
+ ASSERT_EQ(db->open(cout), 0);
+ check_db();
+ db->close();
+ ASSERT_EQ(db->reshard(""), 0);
+ ASSERT_EQ(db->open(cout), 0);
+ check_db();
+ db->close();
+}
+
+TEST_F(RocksDBResharding, resume_interrupted_at_batch) {
+ ASSERT_EQ(0, db->create_and_open(cout, ""));
+ generate_data();
+ data_to_db();
+ check_db();
+ db->close();
+ RocksDBStore::resharding_ctrl ctrl;
+ ctrl.unittest_fail_after_first_batch = true;
+ ASSERT_EQ(db->reshard("Evade(4)", &ctrl), -1000);
+ ASSERT_NE(db->open(cout), 0);
+ ASSERT_EQ(db->reshard("Evade(4)"), 0);
+ ASSERT_EQ(db->open(cout), 0);
+ check_db();
+ db->close();
+}
+
+TEST_F(RocksDBResharding, resume_interrupted_at_column) {
+ ASSERT_EQ(0, db->create_and_open(cout, ""));
+ generate_data();
+ data_to_db();
+ check_db();
+ db->close();
+ RocksDBStore::resharding_ctrl ctrl;
+ ctrl.unittest_fail_after_processing_column = true;
+ ASSERT_EQ(db->reshard("Evade(4)", &ctrl), -1001);
+ ASSERT_NE(db->open(cout), 0);
+ ASSERT_EQ(db->reshard("Evade(4)"), 0);
+ ASSERT_EQ(db->open(cout), 0);
+ check_db();
+ db->close();
+}
+
+TEST_F(RocksDBResharding, resume_interrupted_before_commit) {
+ ASSERT_EQ(0, db->create_and_open(cout, ""));
+ generate_data();
+ data_to_db();
+ check_db();
+ db->close();
+ RocksDBStore::resharding_ctrl ctrl;
+ ctrl.unittest_fail_after_successful_processing = true;
+ ASSERT_EQ(db->reshard("Evade(4)", &ctrl), -1002);
+ ASSERT_NE(db->open(cout), 0);
+ ASSERT_EQ(db->reshard("Evade(4)"), 0);
+ ASSERT_EQ(db->open(cout), 0);
+ check_db();
+ db->close();
+}
+
+TEST_F(RocksDBResharding, prevent_incomplete_hash_change) {
+ ASSERT_EQ(0, db->create_and_open(cout, "Evade(4,0-3)"));
+ generate_data();
+ data_to_db();
+ check_db();
+ db->close();
+ RocksDBStore::resharding_ctrl ctrl;
+ ctrl.unittest_fail_after_successful_processing = true;
+ ASSERT_EQ(db->reshard("Evade(4,0-8)", &ctrl), -1002);
+ ASSERT_NE(db->open(cout), 0);
+ ASSERT_EQ(db->reshard("Evade(4,0-8)"), 0);
+ ASSERT_EQ(db->open(cout), 0);
+ check_db();
+ db->close();
+}
+
+TEST_F(RocksDBResharding, change_reshard) {
+ ASSERT_EQ(0, db->create_and_open(cout, "Ad(4)"));
+ generate_data();
+ data_to_db();
+ check_db();
+ db->close();
+ {
+ RocksDBStore::resharding_ctrl ctrl;
+ ctrl.unittest_fail_after_first_batch = true;
+ ASSERT_EQ(db->reshard("C(5) D(3)", &ctrl), -1000);
+ }
+ {
+ RocksDBStore::resharding_ctrl ctrl;
+ ASSERT_NE(db->open(cout), 0);
+ ctrl.unittest_fail_after_first_batch = false;
+ ctrl.unittest_fail_after_processing_column = true;
+ ASSERT_EQ(db->reshard("C(5) Evade(2)", &ctrl), -1001);
+ }
+ {
+ RocksDBStore::resharding_ctrl ctrl;
+ ASSERT_NE(db->open(cout), 0);
+ ctrl.unittest_fail_after_processing_column = false;
+ ctrl.unittest_fail_after_successful_processing = true;
+ ASSERT_EQ(db->reshard("Evade(2) D(3)", &ctrl), -1002);
+ }
+ {
+ ASSERT_NE(db->open(cout), 0);
+ ASSERT_EQ(db->reshard("Ad(1) Evade(5)"), 0);
+ }
+ {
+ ASSERT_EQ(db->open(cout), 0);
+ check_db();
+ db->close();
+ }
+}
+
+
+INSTANTIATE_TEST_SUITE_P(
+ KeyValueDB,
+ KVTest,
+ ::testing::Values("rocksdb"));
+
+INSTANTIATE_TEST_SUITE_P(
+ KeyValueDB,
+ RocksDBShardingTest,
+ ::testing::Values("Betelgeuse D",
+ "Betelgeuse(3) D",
+ "Betelgeuse D(3)",
+ "Betelgeuse(3) D(3)"));
+
+int main(int argc, char **argv) {
+ auto args = argv_to_vec(argc, argv);
+ auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT,
+ CODE_ENVIRONMENT_UTILITY,
+ CINIT_FLAG_NO_DEFAULT_CONFIG_FILE);
+ common_init_finish(g_ceph_context);
+ g_ceph_context->_conf.set_val(
+ "enable_experimental_unrecoverable_data_corrupting_features",
+ "rocksdb");
+ g_ceph_context->_conf.apply_changes(nullptr);
+
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/test/objectstore/test_memstore_clone.cc b/src/test/objectstore/test_memstore_clone.cc
new file mode 100644
index 000000000..507f74d22
--- /dev/null
+++ b/src/test/objectstore/test_memstore_clone.cc
@@ -0,0 +1,202 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+#include <boost/intrusive_ptr.hpp>
+#include "global/global_init.h"
+#include "common/ceph_argparse.h"
+#include "os/ObjectStore.h"
+#include <gtest/gtest.h>
+#include "include/ceph_assert.h"
+#include "common/errno.h"
+#include "store_test_fixture.h"
+
+#define dout_context g_ceph_context
+
+using namespace std;
+
+namespace {
+
+const coll_t cid;
+
+ghobject_t make_ghobject(const char *oid)
+{
+ return ghobject_t{hobject_t{oid, "", CEPH_NOSNAP, 0, 0, ""}};
+}
+
+} // anonymous namespace
+
+class MemStoreClone : public StoreTestFixture {
+public:
+ MemStoreClone()
+ : StoreTestFixture("memstore")
+ {}
+ void SetUp() override {
+ StoreTestFixture::SetUp();
+ if (HasFailure()) {
+ return;
+ }
+ ObjectStore::Transaction t;
+ ch = store->create_new_collection(cid);
+ t.create_collection(cid, 4);
+ int r = store->queue_transaction(ch, std::move(t));
+ if (r != 0) {
+ derr << "failed to create collection with " << cpp_strerror(r) << dendl;
+ }
+ ASSERT_EQ(0, r);
+ }
+ void TearDown() override {
+ ch.reset();
+ StoreTestFixture::TearDown();
+ }
+};
+
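+// Diagram key for the tests below: each two-character cell is two bytes,
+// the brackets mark the cloned source range, and "00" denotes bytes that
+// read back as zeros because the source had a hole there. Every test
+// issues the same call, clone_range(cid, src, dst, srcoff=2, len=8,
+// dstoff=2): dst[2..9] is replaced byte-for-byte by src[2..9], while
+// dst[0..1] and dst[10..11] keep their previous contents.
+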
+// src 11[11 11 11 11]11
+// dst 22 22 22 22 22 22
+// res 22 11 11 11 11 22
+TEST_F(MemStoreClone, CloneRangeAllocated)
+{
+ ASSERT_TRUE(store);
+
+ const auto src = make_ghobject("src1");
+ const auto dst = make_ghobject("dst1");
+
+ bufferlist srcbl, dstbl, result, expected;
+ srcbl.append("111111111111");
+ dstbl.append("222222222222");
+ expected.append("221111111122");
+
+ ObjectStore::Transaction t;
+ t.write(cid, src, 0, 12, srcbl);
+ t.write(cid, dst, 0, 12, dstbl);
+ t.clone_range(cid, src, dst, 2, 8, 2);
+ ASSERT_EQ(0, store->queue_transaction(ch, std::move(t)));
+ ASSERT_EQ(12, store->read(ch, dst, 0, 12, result));
+ ASSERT_EQ(expected, result);
+}
+
+// src __[__ __ __ __]__ 11 11
+// dst 22 22 22 22 22 22
+// res 22 00 00 00 00 22
+TEST_F(MemStoreClone, CloneRangeHole)
+{
+ ASSERT_TRUE(store);
+
+ const auto src = make_ghobject("src2");
+ const auto dst = make_ghobject("dst2");
+
+ bufferlist srcbl, dstbl, result, expected;
+ srcbl.append("1111");
+ dstbl.append("222222222222");
+ expected.append("22\000\000\000\000\000\000\000\00022", 12);
+
+ ObjectStore::Transaction t;
+ t.write(cid, src, 12, 4, srcbl);
+ t.write(cid, dst, 0, 12, dstbl);
+ t.clone_range(cid, src, dst, 2, 8, 2);
+ ASSERT_EQ(0, store->queue_transaction(ch, std::move(t)));
+ ASSERT_EQ(12, store->read(ch, dst, 0, 12, result));
+ ASSERT_EQ(expected, result);
+}
+
+// src __[__ __ __ 11]11
+// dst 22 22 22 22 22 22
+// res 22 00 00 00 11 22
+TEST_F(MemStoreClone, CloneRangeHoleStart)
+{
+ ASSERT_TRUE(store);
+
+ const auto src = make_ghobject("src3");
+ const auto dst = make_ghobject("dst3");
+
+ bufferlist srcbl, dstbl, result, expected;
+ srcbl.append("1111");
+ dstbl.append("222222222222");
+ expected.append("22\000\000\000\000\000\0001122", 12);
+
+ ObjectStore::Transaction t;
+ t.write(cid, src, 8, 4, srcbl);
+ t.write(cid, dst, 0, 12, dstbl);
+ t.clone_range(cid, src, dst, 2, 8, 2);
+ ASSERT_EQ(0, store->queue_transaction(ch, std::move(t)));
+ ASSERT_EQ(12, store->read(ch, dst, 0, 12, result));
+ ASSERT_EQ(expected, result);
+}
+
+// src 11[11 __ __ 11]11
+// dst 22 22 22 22 22 22
+// res 22 11 00 00 11 22
+TEST_F(MemStoreClone, CloneRangeHoleMiddle)
+{
+ ASSERT_TRUE(store);
+
+ const auto src = make_ghobject("src4");
+ const auto dst = make_ghobject("dst4");
+
+ bufferlist srcbl, dstbl, result, expected;
+ srcbl.append("1111");
+ dstbl.append("222222222222");
+ expected.append("2211\000\000\000\0001122", 12);
+
+ ObjectStore::Transaction t;
+ t.write(cid, src, 0, 4, srcbl);
+ t.write(cid, src, 8, 4, srcbl);
+ t.write(cid, dst, 0, 12, dstbl);
+ t.clone_range(cid, src, dst, 2, 8, 2);
+ ASSERT_EQ(0, store->queue_transaction(ch, std::move(t)));
+ ASSERT_EQ(12, store->read(ch, dst, 0, 12, result));
+ ASSERT_EQ(expected, result);
+}
+
+// src 11[11 __ __ __]__ 11 11
+// dst 22 22 22 22 22 22
+// res 22 11 00 00 00 22
+TEST_F(MemStoreClone, CloneRangeHoleEnd)
+{
+ ASSERT_TRUE(store);
+
+ const auto src = make_ghobject("src5");
+ const auto dst = make_ghobject("dst5");
+
+ bufferlist srcbl, dstbl, result, expected;
+ srcbl.append("1111");
+ dstbl.append("222222222222");
+ expected.append("2211\000\000\000\000\000\00022", 12);
+
+ ObjectStore::Transaction t;
+ t.write(cid, src, 0, 4, srcbl);
+ t.write(cid, src, 12, 4, srcbl);
+ t.write(cid, dst, 0, 12, dstbl);
+ t.clone_range(cid, src, dst, 2, 8, 2);
+ ASSERT_EQ(0, store->queue_transaction(ch, std::move(t)));
+ ASSERT_EQ(12, store->read(ch, dst, 0, 12, result));
+ ASSERT_EQ(expected, result);
+}
+
+int main(int argc, char** argv)
+{
+ // default to memstore
+ map<string,string> defaults = {
+ { "osd_objectstore", "memstore" },
+ { "osd_data", "msc.test_temp_dir" },
+ { "memstore_page_size", "4" }
+ };
+
+ auto args = argv_to_vec(argc, argv);
+ auto cct = global_init(&defaults, args, CEPH_ENTITY_TYPE_CLIENT,
+ CODE_ENVIRONMENT_UTILITY,
+ CINIT_FLAG_NO_DEFAULT_CONFIG_FILE);
+ common_init_finish(g_ceph_context);
+
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/test/objectstore/test_transaction.cc b/src/test/objectstore/test_transaction.cc
new file mode 100644
index 000000000..381b9df7d
--- /dev/null
+++ b/src/test/objectstore/test_transaction.cc
@@ -0,0 +1,215 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 Casey Bodley <cbodley@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "os/ObjectStore.h"
+#include <gtest/gtest.h>
+#include "common/Clock.h"
+#include "include/utime.h"
+#include <boost/tuple/tuple.hpp>
+
+using namespace std;
+
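+// The next five tests pin down value semantics: move leaves the source
+// empty, copy leaves both transactions non-empty, and swap exchanges
+// their contents.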
+TEST(Transaction, MoveConstruct)
+{
+ auto a = ObjectStore::Transaction{};
+ a.nop();
+ ASSERT_FALSE(a.empty());
+
+ // move-construct in b
+ auto b = std::move(a);
+ ASSERT_TRUE(a.empty());
+ ASSERT_FALSE(b.empty());
+}
+
+TEST(Transaction, MoveAssign)
+{
+ auto a = ObjectStore::Transaction{};
+ a.nop();
+ ASSERT_FALSE(a.empty());
+
+ auto b = ObjectStore::Transaction{};
+ b = std::move(a); // move-assign to b
+ ASSERT_TRUE(a.empty());
+ ASSERT_FALSE(b.empty());
+}
+
+TEST(Transaction, CopyConstruct)
+{
+ auto a = ObjectStore::Transaction{};
+ a.nop();
+ ASSERT_FALSE(a.empty());
+
+ auto b = a; // copy-construct in b
+ ASSERT_FALSE(a.empty());
+ ASSERT_FALSE(b.empty());
+}
+
+TEST(Transaction, CopyAssign)
+{
+ auto a = ObjectStore::Transaction{};
+ a.nop();
+ ASSERT_FALSE(a.empty());
+
+ auto b = ObjectStore::Transaction{};
+ b = a; // copy-assign to b
+ ASSERT_FALSE(a.empty());
+ ASSERT_FALSE(b.empty());
+}
+
+TEST(Transaction, Swap)
+{
+ auto a = ObjectStore::Transaction{};
+ a.nop();
+ ASSERT_FALSE(a.empty());
+
+ auto b = ObjectStore::Transaction{};
+ std::swap(a, b); // swap a and b
+ ASSERT_TRUE(a.empty());
+ ASSERT_FALSE(b.empty());
+}
+
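+// Builds a transaction with a representative mix of ops (a 4 KiB write,
+// omap_setkeys, omap_rmkeys, touch) for the encoded-size benchmarks below.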
+ObjectStore::Transaction generate_transaction()
+{
+ auto a = ObjectStore::Transaction{};
+ a.nop();
+
+ coll_t cid;
+ object_t obj("test_name");
+ snapid_t snap(0);
+ hobject_t hoid(obj, "key", snap, 0, 0, "nspace");
+ ghobject_t oid(hoid);
+
+ coll_t acid;
+ object_t aobj("another_test_name");
+ snapid_t asnap(0);
+ hobject_t ahoid(aobj, "another_key", asnap, 0, 0, "another_nspace");
+ ghobject_t aoid(ahoid);
+ std::set<string> keys;
+ keys.insert("any_1");
+ keys.insert("any_2");
+ keys.insert("any_3");
+
+ bufferlist bl;
+ bl.append_zero(4096);
+
+ a.write(cid, oid, 1, 4096, bl, 0);
+
+ a.omap_setkeys(acid, aoid, bl);
+
+ a.omap_rmkeys(cid, aoid, keys);
+
+ a.touch(acid, oid);
+
+ return a;
+}
+
+TEST(Transaction, MoveRangesDelSrcObj)
+{
+ auto t = ObjectStore::Transaction{};
+ t.nop();
+
+ coll_t c(spg_t(pg_t(1,2), shard_id_t::NO_SHARD));
+
+ ghobject_t o1(hobject_t("obj", "", 123, 456, -1, ""));
+ ghobject_t o2(hobject_t("obj2", "", 123, 456, -1, ""));
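+ // (offset, length) pairs at the same offsets as the two writes below;
+ // nothing in this transaction consumes move_info, so it serves only as
+ // documentation of the ranges involved.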
+ vector<std::pair<uint64_t, uint64_t>> move_info = {
+ make_pair(1, 5),
+ make_pair(10, 5)
+ };
+
+ t.touch(c, o1);
+ bufferlist bl;
+ bl.append("some data");
+ t.write(c, o1, 1, bl.length(), bl);
+ t.write(c, o1, 10, bl.length(), bl);
+
+ t.clone(c, o1, o2);
+ bl.append("some other data");
+ t.write(c, o2, 1, bl.length(), bl);
+}
+
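+// Checks that the two size queries stay equal after every op; judging by
+// the benchmark at the end, get_encoded_bytes() is the cheap incremental
+// path and get_encoded_bytes_test() recomputes the size by encoding from
+// scratch.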
+TEST(Transaction, GetNumBytes)
+{
+ auto a = ObjectStore::Transaction{};
+ a.nop();
+ ASSERT_EQ(a.get_encoded_bytes(), a.get_encoded_bytes_test());
+
+ coll_t cid;
+ object_t obj("test_name");
+ snapid_t snap(0);
+ hobject_t hoid(obj, "key", snap, 0, 0, "nspace");
+ ghobject_t oid(hoid);
+
+ coll_t acid;
+ object_t aobj("another_test_name");
+ snapid_t asnap(0);
+ hobject_t ahoid(aobj, "another_key", asnap, 0, 0, "another_nspace");
+ ghobject_t aoid(ahoid);
+ std::set<string> keys;
+ keys.insert("any_1");
+ keys.insert("any_2");
+ keys.insert("any_3");
+
+ bufferlist bl;
+ bl.append_zero(4096);
+
+ a.write(cid, oid, 1, 4096, bl, 0);
+ ASSERT_EQ(a.get_encoded_bytes(), a.get_encoded_bytes_test());
+
+ a.omap_setkeys(acid, aoid, bl);
+ ASSERT_EQ(a.get_encoded_bytes(), a.get_encoded_bytes_test());
+
+ a.omap_rmkeys(cid, aoid, keys);
+ ASSERT_EQ(a.get_encoded_bytes(), a.get_encoded_bytes_test());
+
+ a.touch(acid, oid);
+ ASSERT_EQ(a.get_encoded_bytes(), a.get_encoded_bytes_test());
+}
+
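+// Times 2,500,000 size queries against whichever implementation is
+// selected; invoked by the two GetNumBytesBench* tests at the bottom.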
+void bench_num_bytes(bool legacy)
+{
+ const int max = 2500000;
+ auto a = generate_transaction();
+
+ if (legacy) {
+ cout << "get_encoded_bytes_test: ";
+ } else {
+ cout << "get_encoded_bytes: ";
+ }
+
+ utime_t start = ceph_clock_now();
+ if (legacy) {
+ for (int i = 0; i < max; ++i) {
+ a.get_encoded_bytes_test();
+ }
+ } else {
+ for (int i = 0; i < max; ++i) {
+ a.get_encoded_bytes();
+ }
+ }
+
+ utime_t end = ceph_clock_now();
+ cout << max << " encodes in " << (end - start) << std::endl;
+}
+
+TEST(Transaction, GetNumBytesBenchLegacy)
+{
+ bench_num_bytes(true);
+}
+
+TEST(Transaction, GetNumBytesBenchCurrent)
+{
+ bench_num_bytes(false);
+}