diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-21 11:54:28 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-21 11:54:28 +0000 |
commit | e6918187568dbd01842d8d1d2c808ce16a894239 (patch) | |
tree | 64f88b554b444a49f656b6c656111a145cbbaa28 /src/test/objectstore | |
parent | Initial commit. (diff) | |
download | ceph-e6918187568dbd01842d8d1d2c808ce16a894239.tar.xz ceph-e6918187568dbd01842d8d1d2c808ce16a894239.zip |
Adding upstream version 18.2.2. (tag: upstream/18.2.2)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/test/objectstore')
25 files changed, 21690 insertions, 0 deletions
/// One aging-benchmark configuration; the tests iterate over `scenarios`.
struct Scenario {
  uint64_t capacity;    ///< printed with a "G" suffix; the tests multiply it by _1G
  uint64_t alloc_unit;  ///< allocation unit handed to doAgingTest
  double high_mark;     ///< fill target, as a fraction of capacity
  double low_mark;      ///< free-down target, as a fraction of capacity
  double leakness;      ///< forwarded to doAgingTest as leak_factor
  uint32_t repeats;     ///< forwarded to doAgingTest as iterations
};

/// Scenario table: the same 11 capacity/mark/leak combinations, first with a
/// 64K allocation unit and then with a 4K (65536/16) one.
std::vector<Scenario> scenarios{
  // alloc_unit = 65536
  {512,  65536, 0.8, 0.6, 0.1, 3},
  {512,  65536, 0.9, 0.7, 0.0, 3},
  {512,  65536, 0.9, 0.7, 0.1, 3},
  {512,  65536, 0.8, 0.6, 0.5, 3},
  {512,  65536, 0.9, 0.7, 0.5, 3},
  {1024, 65536, 0.8, 0.6, 0.1, 3},
  {1024, 65536, 0.9, 0.7, 0.0, 3},
  {1024, 65536, 0.9, 0.7, 0.1, 3},
  {2048, 65536, 0.8, 0.6, 0.3, 3},
  {2048, 65536, 0.9, 0.7, 0.0, 3},
  {2048, 65536, 0.9, 0.7, 0.3, 3},
  // alloc_unit = 65536 / 16
  {512,  4096, 0.8, 0.6, 0.1, 3},
  {512,  4096, 0.9, 0.7, 0.0, 3},
  {512,  4096, 0.9, 0.7, 0.1, 3},
  {512,  4096, 0.8, 0.6, 0.5, 3},
  {512,  4096, 0.9, 0.7, 0.5, 3},
  {1024, 4096, 0.8, 0.6, 0.1, 3},
  {1024, 4096, 0.9, 0.7, 0.0, 3},
  {1024, 4096, 0.9, 0.7, 0.1, 3},
  {2048, 4096, 0.8, 0.6, 0.3, 3},
  {2048, 4096, 0.9, 0.7, 0.0, 3},
  {2048, 4096, 0.9, 0.7, 0.3, 3},
};

/// Streams a one-line, human-readable description of a scenario to *os.
void PrintTo(const Scenario& s, ::std::ostream* os)
{
  ::std::ostream& out = *os;
  out << "(capacity=" << s.capacity
      << "G, alloc_unit=" << s.alloc_unit
      << ", high_mark=" << s.high_mark
      << ", low_mark=" << s.low_mark
      << ", leakness=" << s.leakness
      << ", repeats=" << s.repeats
      << ")";
}
uint64_t size = 0; + +public: + bool push(uint64_t offs, uint32_t len) + { + assert(len != 0); + if (size + 1 > allocations.size()) + allocations.resize(size + 100); + allocations[size++] = bluestore_pextent_t(offs, len); + return true; + } + + bool pop_random(gen_type& rng, uint64_t* offs, uint32_t* len, + uint32_t max_len = 0) + { + if (size == 0) + return false; + uint64_t pos = rng() % size; + *len = allocations[pos].length; + *offs = allocations[pos].offset; + + if (max_len && *len > max_len) { + allocations[pos].length = *len - max_len; + allocations[pos].offset = *offs + max_len; + *len = max_len; + } else { + allocations[pos] = allocations[size-1]; + --size; + } + return true; + } +}; + +boost::intrusive_ptr<CephContext> AllocTest::cct; + +void AllocTest::init_alloc(const std::string& allocator_name, int64_t size, uint64_t min_alloc_size) { + this->capacity = size; + this->alloc_unit = min_alloc_size; + rng.seed(0); + alloc.reset(Allocator::create(cct.get(), allocator_name, size, + min_alloc_size)); + at.reset(new AllocTracker()); +} + +void AllocTest::init_close() { + alloc.reset(0); + at.reset(nullptr); +} + +uint32_t AllocTest::free_random() { + uint64_t o = 0; + uint32_t l = 0; + interval_set<uint64_t> release_set; + if (!at->pop_random(rng, &o, &l)) { + //empty? 
+ return 0; + } + release_set.insert(o, l); + alloc->release(release_set); + level -= l; + return l; +} + + +void AllocTest::do_fill(uint64_t high_mark, std::function<uint32_t()> size_generator, double leak_factor) { + assert (leak_factor >= 0); + assert (leak_factor < 1); + uint32_t leak_level = leak_factor * std::numeric_limits<uint32_t>::max(); + PExtentVector tmp; + while (level < high_mark) + { + uint32_t want = size_generator(); + tmp.clear(); + auto r = alloc->allocate(want, alloc_unit, 0, 0, &tmp); + if (r < want) { + break; + } + level += r; + for(auto a : tmp) { + bool full = !at->push(a.offset, a.length); + EXPECT_EQ(full, false); + } + allocs++; + if (tmp.size() > 1) { + fragmented ++; + total_fragments += r; + fragments += tmp.size(); + } + if (leak_level > 0) { + for (size_t i=0; i<tmp.size(); i++) { + if (uint32_t(rng()) < leak_level) { + free_random(); + } + } + } + } +} + +void AllocTest::do_free(uint64_t low_mark) { + while (level > low_mark) + { + if (free_random() == 0) + break; + } +} + +void AllocTest::doAgingTest( + std::function<uint32_t()> size_generator, + const std::string& allocator_name, + uint64_t capacity, uint32_t alloc_unit, + uint64_t high_mark, uint64_t low_mark, uint32_t iterations, double leak_factor) +{ + assert(std::has_single_bit(alloc_unit)); + cct->_conf->bdev_block_size = alloc_unit; + PExtentVector allocated, tmp; + init_alloc(allocator_name, capacity, alloc_unit); + alloc->init_add_free(0, capacity); + + utime_t start = ceph_clock_now(); + level = 0; + allocs = 0; + fragmented = 0; + fragments = 0; + total_fragments = 0; + if (verbose) std::cout << "INITIAL FILL" << std::endl; + do_fill(high_mark, size_generator, leak_factor); //initial fill with data + if (verbose) std::cout << " fragmented allocs=" << 100.0 * fragmented / allocs << "%" << + " #frags=" << ( fragmented != 0 ? 
double(fragments) / fragmented : 0 )<< + " time=" << (ceph_clock_now() - start) * 1000 << "ms" << std::endl; + + for (uint32_t i=0; i < iterations; i++) + { + allocs = 0; + fragmented = 0; + fragments = 0; + total_fragments = 0; + + uint64_t level_previous = level; + start = ceph_clock_now(); + if (verbose) std::cout << "ADDING CAPACITY " << i + 1 << std::endl; + do_free(low_mark); //simulates adding new capacity to cluster + if (verbose) std::cout << " level change: " << + double(level_previous) / capacity * 100 << "% -> " << + double(level) / capacity * 100 << "% time=" << + (ceph_clock_now() - start) * 1000 << "ms" << std::endl; + + start = ceph_clock_now(); + if (verbose) std::cout << "APPENDING " << i + 1 << std::endl; + do_fill(high_mark, size_generator, leak_factor); //only creating elements + if (verbose) std::cout << " fragmented allocs=" << 100.0 * fragmented / allocs << "%" << + " #frags=" << ( fragmented != 0 ? double(fragments) / fragmented : 0 ) << + " time=" << (ceph_clock_now() - start) * 1000 << "ms" << std::endl; + } + double frag_score = alloc->get_fragmentation_score(); + do_free(0); + double free_frag_score = alloc->get_fragmentation_score(); + ASSERT_EQ(alloc->get_free(), capacity); + + std::cout << " fragmented allocs=" << 100.0 * fragmented / allocs << "%" << + " #frags=" << ( fragmented != 0 ? 
double(fragments) / fragmented : 0 ) << + " time=" << (ceph_clock_now() - start) * 1000 << "ms" << + " frag.score=" << frag_score << " after free frag.score=" << free_frag_score << std::endl; + + uint64_t sum = 0; + uint64_t cnt = 0; + auto list_free = [&](size_t off, size_t len) { + cnt++; + sum+=len; + }; + alloc->dump(list_free); + ASSERT_EQ(sum, capacity); + if (verbose) + std::cout << "free chunks sum=" << sum << " free chunks count=" << cnt << std::endl; + + //adding to totals + test_result &r = results_per_allocator[allocator_name]; + r.tests_cnt ++; + r.fragmented_percent += 100.0 * fragmented / allocs; + r.fragments_count += ( fragmented != 0 ? double(fragments) / fragmented : 2 ); + r.time += ceph_clock_now() - start; + r.frag_score += frag_score; +} + +void AllocTest::SetUpTestSuite() +{ + vector<const char*> args; + cct = global_init(NULL, args, + CEPH_ENTITY_TYPE_CLIENT, + CODE_ENVIRONMENT_UTILITY, + CINIT_FLAG_NO_DEFAULT_CONFIG_FILE); + common_init_finish(cct.get()); +} + +void AllocTest::TearDown() +{ + at.reset(); + alloc.reset(); +} + +void AllocTest::TearDownTestSuite() +{ + cct.reset(); + + std::cout << "Summary: " << std::endl; + for (auto& r: results_per_allocator) { + std::cout << r.first << + " fragmented allocs=" << r.second.fragmented_percent / r.second.tests_cnt << "%" << + " #frags=" << r.second.fragments_count / r.second.tests_cnt << + " free_score=" << r.second.frag_score / r.second.tests_cnt << + " time=" << r.second.time * 1000 << "ms" << std::endl; + } +} + + +TEST_P(AllocTest, test_alloc_triangle_0_8M_16M) +{ + std::string allocator_name = GetParam(); + boost::triangle_distribution<double> D(1, (8 * 1024 * 1024) , (16 * 1024 * 1024) ); + for (auto& s:scenarios) { + std::cout << "Allocator: " << allocator_name << ", "; + PrintTo(s, &std::cout); + std::cout << std::endl; + + auto size_generator = [&]() -> uint32_t { + return (uint32_t(D(rng)) + s.alloc_unit) & ~(s.alloc_unit - 1); + }; + + doAgingTest(size_generator, allocator_name, 
s.capacity * _1G, s.alloc_unit, + s.high_mark * s.capacity * _1G, + s.low_mark * s.capacity * _1G, + s.repeats, s.leakness); + } +} + +TEST_P(AllocTest, test_alloc_8M_and_64K) +{ + std::string allocator_name = GetParam(); + constexpr uint32_t max_chunk_size = 8*1024*1024; + constexpr uint32_t min_chunk_size = 64*1024; + for (auto& s:scenarios) { + std::cout << "Allocator: " << allocator_name << ", "; + PrintTo(s, &std::cout); + std::cout << std::endl; + boost::uniform_int<> D(0, 1); + + auto size_generator = [&]() -> uint32_t { + if (D(rng) == 0) + return max_chunk_size; + else + return min_chunk_size; + }; + + doAgingTest(size_generator, allocator_name, s.capacity * _1G, s.alloc_unit, + s.high_mark * s.capacity * _1G, + s.low_mark * s.capacity * _1G, + s.repeats, s.leakness); + } +} + +TEST_P(AllocTest, test_alloc_fragmentation_max_chunk_8M) +{ + std::string allocator_name = GetParam(); + constexpr uint32_t max_object_size = 150*1000*1000; + constexpr uint32_t max_chunk_size = 8*1024*1024; + for (auto& s:scenarios) { + std::cout << "Allocator: " << allocator_name << ", "; + PrintTo(s, &std::cout); + std::cout << std::endl; + boost::uniform_int<> D(1, max_object_size / s.alloc_unit); + + uint32_t object_size = 0; + + auto size_generator = [&]() -> uint32_t { + uint32_t c; + if (object_size == 0) + object_size = (uint32_t(D(rng))* s.alloc_unit); + if (object_size > max_chunk_size) + c = max_chunk_size; + else + c = object_size; + object_size -= c; + return c; + }; + + doAgingTest(size_generator, allocator_name, s.capacity * _1G, s.alloc_unit, + s.high_mark * s.capacity * _1G, + s.low_mark * s.capacity * _1G, + s.repeats, s.leakness); + } +} + +TEST_P(AllocTest, test_bonus_empty_fragmented) +{ + uint64_t capacity = uint64_t(512) * 1024 * 1024 * 1024; //512 G + uint64_t alloc_unit = 64 * 1024; + std::string allocator_name = GetParam(); + std::cout << "Allocator: " << allocator_name << std::endl; + init_alloc(allocator_name, capacity, alloc_unit); + 
alloc->init_add_free(0, capacity); + PExtentVector tmp; + for (size_t i = 0; i < capacity / (1024 * 1024); i++) { + tmp.clear(); + uint32_t want = 1024 * 1024; + int r = alloc->allocate(want, alloc_unit, 0, 0, &tmp); + ASSERT_EQ(r, want); + if (tmp.size() > 1) { + interval_set<uint64_t> release_set; + for (auto& t: tmp) { + release_set.insert(t.offset, t.length); + } + alloc->release(release_set); + } else { + interval_set<uint64_t> release_set; + uint64_t offset = tmp[0].offset; + uint64_t length = tmp[0].length; + + release_set.insert(offset + alloc_unit, length - 3 * alloc_unit); + alloc->release(release_set); + release_set.clear(); + + release_set.insert(offset , alloc_unit); + alloc->release(release_set); + release_set.clear(); + + release_set.insert(offset + length - 2 * alloc_unit, 2 * alloc_unit); + alloc->release(release_set); + release_set.clear(); + } + } + double frag_score = alloc->get_fragmentation_score(); + ASSERT_EQ(alloc->get_free(), capacity); + std::cout << " empty storage frag.score=" << frag_score << std::endl; +} + +INSTANTIATE_TEST_SUITE_P( + Allocator, + AllocTest, + ::testing::Values("stupid", "bitmap", "avl", "btree")); diff --git a/src/test/objectstore/Allocator_bench.cc b/src/test/objectstore/Allocator_bench.cc new file mode 100644 index 000000000..0d04a854e --- /dev/null +++ b/src/test/objectstore/Allocator_bench.cc @@ -0,0 +1,368 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * In memory space allocator benchmarks. 
+ * Author: Igor Fedotov, ifedotov@suse.com + */ +#include <iostream> +#include <boost/scoped_ptr.hpp> +#include <gtest/gtest.h> + +#include "common/Cond.h" +#include "common/errno.h" +#include "include/stringify.h" +#include "include/Context.h" +#include "os/bluestore/Allocator.h" + +#include <boost/random/uniform_int.hpp> +typedef boost::mt11213b gen_type; + +#include "common/debug.h" +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_ + +using namespace std; + +class AllocTest : public ::testing::TestWithParam<const char*> { + +public: + boost::scoped_ptr<Allocator> alloc; + AllocTest(): alloc(0) { } + void init_alloc(int64_t size, uint64_t min_alloc_size) { + std::cout << "Creating alloc type " << string(GetParam()) << " \n"; + alloc.reset(Allocator::create(g_ceph_context, GetParam(), size, + min_alloc_size)); + } + + void init_close() { + alloc.reset(0); + } + void doOverwriteTest(uint64_t capacity, uint64_t prefill, + uint64_t overwrite); +}; + +const uint64_t _1m = 1024 * 1024; + +void dump_mempools() +{ + ostringstream ostr; + Formatter* f = Formatter::create("json-pretty", "json-pretty", "json-pretty"); + ostr << "Mempools: "; + f->open_object_section("mempools"); + mempool::dump(f); + f->close_section(); + f->flush(ostr); + delete f; + ldout(g_ceph_context, 0) << ostr.str() << dendl; +} + +class AllocTracker +{ + std::vector<uint64_t> allocations; + uint64_t head = 0; + uint64_t tail = 0; + uint64_t size = 0; + boost::uniform_int<> u1; + +public: + AllocTracker(uint64_t capacity, uint64_t alloc_unit) + : u1(0, capacity) + { + ceph_assert(alloc_unit >= 0x100); + ceph_assert(capacity <= (uint64_t(1) << 48)); // we use 5 octets (bytes 1 - 5) to store + // offset to save the required space. 
+ // This supports capacity up to 281 TB + + allocations.resize(capacity / alloc_unit); + } + inline uint64_t get_head() const + { + return head; + } + + inline uint64_t get_tail() const + { + return tail; + } + + bool push(uint64_t offs, uint32_t len) + { + ceph_assert((len & 0xff) == 0); + ceph_assert((offs & 0xff) == 0); + ceph_assert((offs & 0xffff000000000000) == 0); + + if (head + 1 == tail) + return false; + uint64_t val = (offs << 16) | (len >> 8); + allocations[head++] = val; + head %= allocations.size(); + ++size; + return true; + } + bool pop(uint64_t* offs, uint32_t* len) + { + if (size == 0) + return false; + uint64_t val = allocations[tail++]; + *len = uint64_t((val & 0xffffff) << 8); + *offs = (val >> 16) & ~uint64_t(0xff); + tail %= allocations.size(); + --size; + return true; + } + bool pop_random(gen_type& rng, uint64_t* offs, uint32_t* len, + uint32_t max_len = 0) + { + if (size == 0) + return false; + + uint64_t pos = (u1(rng) % size) + tail; + pos %= allocations.size(); + uint64_t val = allocations[pos]; + *len = uint64_t((val & 0xffffff) << 8); + *offs = (val >> 16) & ~uint64_t(0xff); + if (max_len && *len > max_len) { + val = ((*offs + max_len) << 16) | ((*len - max_len) >> 8); + allocations[pos] = val; + *len = max_len; + } else { + allocations[pos] = allocations[tail++]; + tail %= allocations.size(); + --size; + } + return true; + } +}; + +TEST_P(AllocTest, test_alloc_bench_seq) +{ + uint64_t capacity = uint64_t(1024) * 1024 * 1024 * 1024; + uint64_t alloc_unit = 4096; + uint64_t want_size = alloc_unit; + PExtentVector allocated, tmp; + + init_alloc(capacity, alloc_unit); + alloc->init_add_free(0, capacity); + + utime_t start = ceph_clock_now(); + for (uint64_t i = 0; i < capacity; i += want_size) + { + tmp.clear(); + EXPECT_EQ(static_cast<int64_t>(want_size), + alloc->allocate(want_size, alloc_unit, 0, 0, &tmp)); + if (0 == (i % (1 * 1024 * _1m))) { + std::cout << "alloc " << i / 1024 / 1024 << " mb of " + << capacity / 1024 / 1024 << 
std::endl; + } + } + + std::cout << "releasing..." << std::endl; + for (size_t i = 0; i < capacity; i += want_size) + { + interval_set<uint64_t> release_set; + release_set.insert(i, want_size); + alloc->release(release_set); + if (0 == (i % (1 * 1024 * _1m))) { + std::cout << "release " << i / 1024 / 1024 << " mb of " + << capacity / 1024 / 1024 << std::endl; + } + } + std::cout<<"Executed in "<< ceph_clock_now() - start << std::endl; + dump_mempools(); +} + +TEST_P(AllocTest, test_alloc_bench) +{ + uint64_t capacity = uint64_t(1024) * 1024 * 1024 * 1024; + uint64_t alloc_unit = 4096; + PExtentVector allocated, tmp; + AllocTracker at(capacity, alloc_unit); + + init_alloc(capacity, alloc_unit); + alloc->init_add_free(0, capacity); + + gen_type rng(time(NULL)); + boost::uniform_int<> u1(0, 9); // 4K-2M + boost::uniform_int<> u2(0, 7); // 4K-512K + + utime_t start = ceph_clock_now(); + for (uint64_t i = 0; i < capacity * 2; ) + { + uint32_t want = alloc_unit << u1(rng); + + tmp.clear(); + auto r = alloc->allocate(want, alloc_unit, 0, 0, &tmp); + if (r < want) { + break; + } + i += r; + + for(auto a : tmp) { + bool full = !at.push(a.offset, a.length); + EXPECT_EQ(full, false); + } + uint64_t want_release = alloc_unit << u2(rng); + uint64_t released = 0; + do { + uint64_t o = 0; + uint32_t l = 0; + interval_set<uint64_t> release_set; + if (!at.pop_random(rng, &o, &l, want_release - released)) { + break; + } + release_set.insert(o, l); + alloc->release(release_set); + released += l; + } while (released < want_release); + + if (0 == (i % (1 * 1024 * _1m))) { + std::cout << "alloc " << i / 1024 / 1024 << " mb of " + << capacity / 1024 / 1024 << std::endl; + } + } + std::cout<<"Executed in "<< ceph_clock_now() - start << std::endl; + std::cout<<"Avail "<< alloc->get_free() / _1m << " MB" << std::endl; + dump_mempools(); +} + +void AllocTest::doOverwriteTest(uint64_t capacity, uint64_t prefill, + uint64_t overwrite) +{ + uint64_t alloc_unit = 4096; + PExtentVector allocated, 
tmp; + AllocTracker at(capacity, alloc_unit); + + init_alloc(capacity, alloc_unit); + alloc->init_add_free(0, capacity); + + gen_type rng(time(NULL)); + boost::uniform_int<> u1(0, 9); // 4K-2M + boost::uniform_int<> u2(0, 9); // 4K-512K + + utime_t start = ceph_clock_now(); + // allocate 90% of the capacity + auto cap = prefill; + for (uint64_t i = 0; i < cap; ) + { + uint32_t want = alloc_unit << u1(rng); + tmp.clear(); + auto r = alloc->allocate(want, alloc_unit, 0, 0, &tmp); + if (r < want) { + break; + } + i += r; + + for(auto a : tmp) { + bool full = !at.push(a.offset, a.length); + EXPECT_EQ(full, false); + } + if (0 == (i % (1 * 1024 * _1m))) { + std::cout << "alloc " << i / 1024 / 1024 << " mb of " + << cap / 1024 / 1024 << std::endl; + } + } + + cap = overwrite; + for (uint64_t i = 0; i < cap; ) + { + uint64_t want_release = alloc_unit << u2(rng); + uint64_t released = 0; + do { + uint64_t o = 0; + uint32_t l = 0; + interval_set<uint64_t> release_set; + if (!at.pop_random(rng, &o, &l, want_release - released)) { + break; + } + release_set.insert(o, l); + alloc->release(release_set); + released += l; + } while (released < want_release); + + uint32_t want = alloc_unit << u1(rng); + tmp.clear(); + auto r = alloc->allocate(want, alloc_unit, 0, 0, &tmp); + if (r != want) { + std::cout<<"Can't allocate more space, stopping."<< std::endl; + break; + } + i += r; + + for(auto a : tmp) { + bool full = !at.push(a.offset, a.length); + EXPECT_EQ(full, false); + } + + if (0 == (i % (1 * 1024 * _1m))) { + std::cout << "reuse " << i / 1024 / 1024 << " mb of " + << cap / 1024 / 1024 << std::endl; + } + } + std::cout<<"Executed in "<< ceph_clock_now() - start << std::endl; + std::cout<<"Avail "<< alloc->get_free() / _1m << " MB" << std::endl; + + dump_mempools(); +} + +TEST_P(AllocTest, test_alloc_bench_90_300) +{ + uint64_t capacity = uint64_t(1024) * 1024 * 1024 * 1024; + auto prefill = capacity - capacity / 10; + auto overwrite = capacity * 3; + doOverwriteTest(capacity, 
prefill, overwrite); +} + +TEST_P(AllocTest, test_alloc_bench_50_300) +{ + uint64_t capacity = uint64_t(1024) * 1024 * 1024 * 1024; + auto prefill = capacity / 2; + auto overwrite = capacity * 3; + doOverwriteTest(capacity, prefill, overwrite); +} + +TEST_P(AllocTest, test_alloc_bench_10_300) +{ + uint64_t capacity = uint64_t(1024) * 1024 * 1024 * 1024; + auto prefill = capacity / 10; + auto overwrite = capacity * 3; + doOverwriteTest(capacity, prefill, overwrite); +} + +TEST_P(AllocTest, mempoolAccounting) +{ + uint64_t bytes = mempool::bluestore_alloc::allocated_bytes(); + uint64_t items = mempool::bluestore_alloc::allocated_items(); + + uint64_t alloc_size = 4 * 1024; + uint64_t capacity = 512ll * 1024 * 1024 * 1024; + Allocator* alloc = Allocator::create(g_ceph_context, GetParam(), + capacity, alloc_size); + ASSERT_NE(alloc, nullptr); + alloc->init_add_free(0, capacity); + + std::map<uint32_t, PExtentVector> all_allocs; + for (size_t i = 0; i < 10000; i++) { + PExtentVector tmp; + alloc->allocate(alloc_size, alloc_size, 0, 0, &tmp); + all_allocs[rand()] = tmp; + tmp.clear(); + alloc->allocate(alloc_size, alloc_size, 0, 0, &tmp); + all_allocs[rand()] = tmp; + tmp.clear(); + + auto it = all_allocs.upper_bound(rand()); + if (it != all_allocs.end()) { + alloc->release(it->second); + all_allocs.erase(it); + } + } + + delete(alloc); + ASSERT_EQ(mempool::bluestore_alloc::allocated_bytes(), bytes); + ASSERT_EQ(mempool::bluestore_alloc::allocated_items(), items); +} + +INSTANTIATE_TEST_SUITE_P( + Allocator, + AllocTest, + ::testing::Values("stupid", "bitmap", "avl", "btree", "hybrid")); diff --git a/src/test/objectstore/Allocator_test.cc b/src/test/objectstore/Allocator_test.cc new file mode 100644 index 000000000..b00650015 --- /dev/null +++ b/src/test/objectstore/Allocator_test.cc @@ -0,0 +1,566 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * In memory space allocator test cases. 
+ * Author: Ramesh Chander, Ramesh.Chander@sandisk.com + */ +#include <iostream> +#include <boost/scoped_ptr.hpp> +#include <gtest/gtest.h> + +#include "common/Cond.h" +#include "common/errno.h" +#include "include/stringify.h" +#include "include/Context.h" +#include "os/bluestore/Allocator.h" + +using namespace std; + +typedef boost::mt11213b gen_type; + +class AllocTest : public ::testing::TestWithParam<const char*> { + +public: + boost::scoped_ptr<Allocator> alloc; + AllocTest(): alloc(0) { } + void init_alloc(int64_t size, uint64_t min_alloc_size) { + std::cout << "Creating alloc type " << string(GetParam()) << " \n"; + alloc.reset(Allocator::create(g_ceph_context, GetParam(), size, + min_alloc_size, + 256*1048576, 100*256*1048576ull)); + } + + void init_close() { + alloc.reset(0); + } +}; + +TEST_P(AllocTest, test_alloc_init) +{ + int64_t blocks = 64; + init_alloc(blocks, 1); + ASSERT_EQ(0U, alloc->get_free()); + alloc->shutdown(); + blocks = 1024 * 2 + 16; + init_alloc(blocks, 1); + ASSERT_EQ(0U, alloc->get_free()); + alloc->shutdown(); + blocks = 1024 * 2; + init_alloc(blocks, 1); + ASSERT_EQ(alloc->get_free(), (uint64_t) 0); +} + +TEST_P(AllocTest, test_init_add_free) +{ + int64_t block_size = 1024; + int64_t capacity = 4 * 1024 * block_size; + + { + init_alloc(capacity, block_size); + + auto free = alloc->get_free(); + alloc->init_add_free(block_size, 0); + ASSERT_EQ(free, alloc->get_free()); + + alloc->init_rm_free(block_size, 0); + ASSERT_EQ(free, alloc->get_free()); + } +} + +TEST_P(AllocTest, test_alloc_min_alloc) +{ + int64_t block_size = 1024; + int64_t capacity = 4 * 1024 * block_size; + + { + init_alloc(capacity, block_size); + + alloc->init_add_free(block_size, block_size); + PExtentVector extents; + EXPECT_EQ(block_size, alloc->allocate(block_size, block_size, + 0, (int64_t) 0, &extents)); + } + + /* + * Allocate extent and make sure all comes in single extent. 
+ */ + { + init_alloc(capacity, block_size); + alloc->init_add_free(0, block_size * 4); + PExtentVector extents; + EXPECT_EQ(4*block_size, + alloc->allocate(4 * (uint64_t)block_size, (uint64_t) block_size, + 0, (int64_t) 0, &extents)); + EXPECT_EQ(1u, extents.size()); + EXPECT_EQ(extents[0].length, 4 * block_size); + } + + /* + * Allocate extent and make sure we get two different extents. + */ + { + init_alloc(capacity, block_size); + alloc->init_add_free(0, block_size * 2); + alloc->init_add_free(3 * block_size, block_size * 2); + PExtentVector extents; + + EXPECT_EQ(4*block_size, + alloc->allocate(4 * (uint64_t)block_size, (uint64_t) block_size, + 0, (int64_t) 0, &extents)); + EXPECT_EQ(2u, extents.size()); + EXPECT_EQ(extents[0].length, 2 * block_size); + EXPECT_EQ(extents[1].length, 2 * block_size); + } + alloc->shutdown(); +} + +TEST_P(AllocTest, test_alloc_min_max_alloc) +{ + int64_t block_size = 1024; + + int64_t capacity = 4 * 1024 * block_size; + init_alloc(capacity, block_size); + + /* + * Make sure we get all extents different when + * min_alloc_size == max_alloc_size + */ + { + init_alloc(capacity, block_size); + alloc->init_add_free(0, block_size * 4); + PExtentVector extents; + EXPECT_EQ(4*block_size, + alloc->allocate(4 * (uint64_t)block_size, (uint64_t) block_size, + block_size, (int64_t) 0, &extents)); + for (auto e : extents) { + EXPECT_EQ(e.length, block_size); + } + EXPECT_EQ(4u, extents.size()); + } + + + /* + * Make sure we get extents of length max_alloc size + * when max alloc size > min_alloc size + */ + { + init_alloc(capacity, block_size); + alloc->init_add_free(0, block_size * 4); + PExtentVector extents; + EXPECT_EQ(4*block_size, + alloc->allocate(4 * (uint64_t)block_size, (uint64_t) block_size, + 2 * block_size, (int64_t) 0, &extents)); + EXPECT_EQ(2u, extents.size()); + for (auto& e : extents) { + EXPECT_EQ(e.length, block_size * 2); + } + } + + /* + * Make sure allocations are of min_alloc_size when min_alloc_size > block_size. 
+ */ + { + init_alloc(capacity, block_size); + alloc->init_add_free(0, block_size * 1024); + PExtentVector extents; + EXPECT_EQ(1024 * block_size, + alloc->allocate(1024 * (uint64_t)block_size, + (uint64_t) block_size * 4, + block_size * 4, (int64_t) 0, &extents)); + for (auto& e : extents) { + EXPECT_EQ(e.length, block_size * 4); + } + EXPECT_EQ(1024u/4, extents.size()); + } + + /* + * Allocate and free. + */ + { + init_alloc(capacity, block_size); + alloc->init_add_free(0, block_size * 16); + PExtentVector extents; + EXPECT_EQ(16 * block_size, + alloc->allocate(16 * (uint64_t)block_size, (uint64_t) block_size, + 2 * block_size, (int64_t) 0, &extents)); + + EXPECT_EQ(extents.size(), 8u); + for (auto& e : extents) { + EXPECT_EQ(e.length, 2 * block_size); + } + } +} + +TEST_P(AllocTest, test_alloc_failure) +{ + int64_t block_size = 1024; + int64_t capacity = 4 * 1024 * block_size; + + { + init_alloc(capacity, block_size); + alloc->init_add_free(0, block_size * 256); + alloc->init_add_free(block_size * 512, block_size * 256); + + PExtentVector extents; + EXPECT_EQ(512 * block_size, + alloc->allocate(512 * (uint64_t)block_size, + (uint64_t) block_size * 256, + block_size * 256, (int64_t) 0, &extents)); + alloc->init_add_free(0, block_size * 256); + alloc->init_add_free(block_size * 512, block_size * 256); + extents.clear(); + EXPECT_EQ(-ENOSPC, + alloc->allocate(512 * (uint64_t)block_size, + (uint64_t) block_size * 512, + block_size * 512, (int64_t) 0, &extents)); + } +} + +TEST_P(AllocTest, test_alloc_big) +{ + int64_t block_size = 4096; + int64_t blocks = 104857600; + int64_t mas = 4096; + init_alloc(blocks*block_size, block_size); + alloc->init_add_free(2*block_size, (blocks-2)*block_size); + for (int64_t big = mas; big < 1048576*128; big*=2) { + cout << big << std::endl; + PExtentVector extents; + EXPECT_EQ(big, + alloc->allocate(big, mas, 0, &extents)); + } +} + +TEST_P(AllocTest, test_alloc_non_aligned_len) +{ + int64_t block_size = 1 << 12; + int64_t blocks = 
(1 << 20) * 100; + int64_t want_size = 1 << 22; + int64_t alloc_unit = 1 << 20; + + init_alloc(blocks*block_size, block_size); + alloc->init_add_free(0, 2097152); + alloc->init_add_free(2097152, 1064960); + alloc->init_add_free(3670016, 2097152); + + PExtentVector extents; + EXPECT_EQ(want_size, alloc->allocate(want_size, alloc_unit, 0, &extents)); +} + +TEST_P(AllocTest, test_alloc_39334) +{ + uint64_t block = 0x4000; + uint64_t size = 0x5d00000000; + + init_alloc(size, block); + alloc->init_add_free(0x4000, 0x5cffffc000); + EXPECT_EQ(size - block, alloc->get_free()); +} + +TEST_P(AllocTest, test_alloc_fragmentation) +{ + uint64_t capacity = 4 * 1024 * 1024; + uint64_t alloc_unit = 4096; + uint64_t want_size = alloc_unit; + PExtentVector allocated, tmp; + + init_alloc(capacity, alloc_unit); + alloc->init_add_free(0, capacity); + bool bitmap_alloc = GetParam() == std::string("bitmap"); + + EXPECT_EQ(0.0, alloc->get_fragmentation()); + + for (size_t i = 0; i < capacity / alloc_unit; ++i) + { + tmp.clear(); + EXPECT_EQ(static_cast<int64_t>(want_size), + alloc->allocate(want_size, alloc_unit, 0, 0, &tmp)); + allocated.insert(allocated.end(), tmp.begin(), tmp.end()); + + // bitmap fragmentation calculation doesn't provide such constant + // estimate + if (!bitmap_alloc) { + EXPECT_EQ(0.0, alloc->get_fragmentation()); + } + } + tmp.clear(); + EXPECT_EQ(-ENOSPC, alloc->allocate(want_size, alloc_unit, 0, 0, &tmp)); + + if (GetParam() == string("avl")) { + // AVL allocator uses a different allocating strategy + GTEST_SKIP() << "skipping for AVL allocator"; + } else if (GetParam() == string("hybrid")) { + // AVL allocator uses a different allocating strategy + GTEST_SKIP() << "skipping for Hybrid allocator"; + } + + for (size_t i = 0; i < allocated.size(); i += 2) + { + interval_set<uint64_t> release_set; + release_set.insert(allocated[i].offset, allocated[i].length); + alloc->release(release_set); + } + EXPECT_EQ(1.0, alloc->get_fragmentation()); + EXPECT_EQ(66u, 
uint64_t(alloc->get_fragmentation_score() * 100)); + + for (size_t i = 1; i < allocated.size() / 2; i += 2) + { + interval_set<uint64_t> release_set; + release_set.insert(allocated[i].offset, allocated[i].length); + alloc->release(release_set); + } + if (bitmap_alloc) { + // fragmentation = one l1 slot is free + one l1 slot is partial + EXPECT_EQ(50U, uint64_t(alloc->get_fragmentation() * 100)); + } else { + // fragmentation approx = 257 intervals / 768 max intervals + EXPECT_EQ(33u, uint64_t(alloc->get_fragmentation() * 100)); + } + EXPECT_EQ(27u, uint64_t(alloc->get_fragmentation_score() * 100)); + + for (size_t i = allocated.size() / 2 + 1; i < allocated.size(); i += 2) + { + interval_set<uint64_t> release_set; + release_set.insert(allocated[i].offset, allocated[i].length); + alloc->release(release_set); + } + // doing some rounding trick as stupid allocator doesn't merge all the + // extents that causes some minor fragmentation (minor bug or by-design behavior?). + // Hence leaving just two + // digits after decimal point due to this. + EXPECT_EQ(0u, uint64_t(alloc->get_fragmentation() * 100)); + if (bitmap_alloc) { + EXPECT_EQ(0u, uint64_t(alloc->get_fragmentation_score() * 100)); + } else { + EXPECT_EQ(11u, uint64_t(alloc->get_fragmentation_score() * 100)); + } +} + +TEST_P(AllocTest, test_dump_fragmentation_score) +{ + uint64_t capacity = 1024 * 1024 * 1024; + uint64_t one_alloc_max = 2 * 1024 * 1024; + uint64_t alloc_unit = 4096; + uint64_t want_size = alloc_unit; + uint64_t rounds = 10; + uint64_t actions_per_round = 1000; + PExtentVector allocated, tmp; + gen_type rng; + + init_alloc(capacity, alloc_unit); + alloc->init_add_free(0, capacity); + + EXPECT_EQ(0.0, alloc->get_fragmentation()); + EXPECT_EQ(0.0, alloc->get_fragmentation_score()); + + uint64_t allocated_cnt = 0; + for (size_t round = 0; round < rounds ; round++) { + for (size_t j = 0; j < actions_per_round ; j++) { + //free or allocate ? 
+ if ( rng() % capacity >= allocated_cnt ) { + //allocate + want_size = ( rng() % one_alloc_max ) / alloc_unit * alloc_unit + alloc_unit; + tmp.clear(); + int64_t r = alloc->allocate(want_size, alloc_unit, 0, 0, &tmp); + if (r > 0) { + for (auto& t: tmp) { + if (t.length > 0) + allocated.push_back(t); + } + allocated_cnt += r; + } + } else { + //free + ceph_assert(allocated.size() > 0); + size_t item = rng() % allocated.size(); + ceph_assert(allocated[item].length > 0); + allocated_cnt -= allocated[item].length; + interval_set<uint64_t> release_set; + release_set.insert(allocated[item].offset, allocated[item].length); + alloc->release(release_set); + std::swap(allocated[item], allocated[allocated.size() - 1]); + allocated.resize(allocated.size() - 1); + } + } + + size_t free_sum = 0; + auto iterated_allocation = [&](size_t off, size_t len) { + ceph_assert(len > 0); + free_sum += len; + }; + alloc->foreach(iterated_allocation); + EXPECT_GT(1, alloc->get_fragmentation_score()); + EXPECT_EQ(capacity, free_sum + allocated_cnt); + } + + for (size_t i = 0; i < allocated.size(); i ++) + { + interval_set<uint64_t> release_set; + release_set.insert(allocated[i].offset, allocated[i].length); + alloc->release(release_set); + } +} + +TEST_P(AllocTest, test_alloc_bug_24598) +{ + if (string(GetParam()) != "bitmap") + return; + + uint64_t capacity = 0x2625a0000ull; + uint64_t alloc_unit = 0x4000; + uint64_t want_size = 0x200000; + PExtentVector allocated, tmp; + + init_alloc(capacity, alloc_unit); + + alloc->init_add_free(0x4800000, 0x100000); + alloc->init_add_free(0x4a00000, 0x100000); + + alloc->init_rm_free(0x4800000, 0x100000); + alloc->init_rm_free(0x4a00000, 0x100000); + + alloc->init_add_free(0x3f00000, 0x500000); + alloc->init_add_free(0x4500000, 0x100000); + alloc->init_add_free(0x4700000, 0x100000); + alloc->init_add_free(0x4900000, 0x100000); + alloc->init_add_free(0x4b00000, 0x200000); + + EXPECT_EQ(static_cast<int64_t>(want_size), + alloc->allocate(want_size, 
0x100000, 0, 0, &tmp)); + EXPECT_EQ(1u, tmp.size()); + EXPECT_EQ(0x4b00000u, tmp[0].offset); + EXPECT_EQ(0x200000u, tmp[0].length); +} + +//Verifies issue from +//http://tracker.ceph.com/issues/40703 +// +TEST_P(AllocTest, test_alloc_big2) +{ + int64_t block_size = 4096; + int64_t blocks = 1048576 * 2; + int64_t mas = 1024*1024; + init_alloc(blocks*block_size, block_size); + alloc->init_add_free(0, blocks * block_size); + + PExtentVector extents; + uint64_t need = block_size * blocks / 4; // 2GB + EXPECT_EQ(need, + alloc->allocate(need, mas, 0, &extents)); + need = block_size * blocks / 4; // 2GB + extents.clear(); + EXPECT_EQ(need, + alloc->allocate(need, mas, 0, &extents)); + EXPECT_TRUE(extents[0].length > 0); +} + +//Verifies stuck 4GB chunk allocation +//in StupidAllocator +// +TEST_P(AllocTest, test_alloc_big3) +{ + int64_t block_size = 4096; + int64_t blocks = 1048576 * 2; + int64_t mas = 1024*1024; + init_alloc(blocks*block_size, block_size); + alloc->init_add_free(0, blocks * block_size); + + PExtentVector extents; + uint64_t need = block_size * blocks / 2; // 4GB + EXPECT_EQ(need, + alloc->allocate(need, mas, 0, &extents)); + EXPECT_TRUE(extents[0].length > 0); +} + +TEST_P(AllocTest, test_alloc_contiguous) +{ + int64_t block_size = 0x1000; + int64_t capacity = block_size * 1024 * 1024; + + { + init_alloc(capacity, block_size); + + alloc->init_add_free(0, capacity); + PExtentVector extents; + uint64_t need = 4 * block_size; + EXPECT_EQ(need, + alloc->allocate(need, need, + 0, (int64_t)0, &extents)); + EXPECT_EQ(1u, extents.size()); + EXPECT_EQ(extents[0].offset, 0); + EXPECT_EQ(extents[0].length, 4 * block_size); + + extents.clear(); + EXPECT_EQ(need, + alloc->allocate(need, need, + 0, (int64_t)0, &extents)); + EXPECT_EQ(1u, extents.size()); + EXPECT_EQ(extents[0].offset, 4 * block_size); + EXPECT_EQ(extents[0].length, 4 * block_size); + } + + alloc->shutdown(); +} + +TEST_P(AllocTest, test_alloc_47883) +{ + uint64_t block = 0x1000; + uint64_t size = 
1599858540544ul; + + init_alloc(size, block); + + alloc->init_add_free(0x1b970000, 0x26000); + alloc->init_add_free(0x1747e9d5000, 0x493000); + alloc->init_add_free(0x1747ee6a000, 0x196000); + + PExtentVector extents; + auto need = 0x3f980000; + auto got = alloc->allocate(need, 0x10000, 0, (int64_t)0, &extents); + EXPECT_GE(got, 0x630000); +} + +TEST_P(AllocTest, test_alloc_50656_best_fit) +{ + uint64_t block = 0x1000; + uint64_t size = 0x3b9e400000; + + init_alloc(size, block); + + // too few free extents - causes best fit mode for avls + for (size_t i = 0; i < 0x10; i++) { + alloc->init_add_free(i * 2 * 0x100000, 0x100000); + } + + alloc->init_add_free(0x1e1bd13000, 0x404000); + + PExtentVector extents; + auto need = 0x400000; + auto got = alloc->allocate(need, 0x10000, 0, (int64_t)0, &extents); + EXPECT_GT(got, 0); + EXPECT_EQ(got, 0x400000); +} + +TEST_P(AllocTest, test_alloc_50656_first_fit) +{ + uint64_t block = 0x1000; + uint64_t size = 0x3b9e400000; + + init_alloc(size, block); + + for (size_t i = 0; i < 0x10000; i += 2) { + alloc->init_add_free(i * 0x100000, 0x100000); + } + + alloc->init_add_free(0x1e1bd13000, 0x404000); + + PExtentVector extents; + auto need = 0x400000; + auto got = alloc->allocate(need, 0x10000, 0, (int64_t)0, &extents); + EXPECT_GT(got, 0); + EXPECT_EQ(got, 0x400000); +} + +INSTANTIATE_TEST_SUITE_P( + Allocator, + AllocTest, + ::testing::Values("stupid", "bitmap", "avl", "hybrid")); diff --git a/src/test/objectstore/CMakeLists.txt b/src/test/objectstore/CMakeLists.txt new file mode 100644 index 000000000..a012264e8 --- /dev/null +++ b/src/test/objectstore/CMakeLists.txt @@ -0,0 +1,140 @@ +add_executable(ceph_perf_objectstore + ObjectStoreTransactionBenchmark.cc) +target_link_libraries(ceph_perf_objectstore os osdc global ${UNITTEST_LIBS}) +install(TARGETS ceph_perf_objectstore + DESTINATION bin) + +add_library(store_test_fixture OBJECT store_test_fixture.cc) +target_include_directories(store_test_fixture PRIVATE + 
$<TARGET_PROPERTY:GTest::GTest,INTERFACE_INCLUDE_DIRECTORIES>) + +add_executable(ceph_test_objectstore + store_test.cc + $<TARGET_OBJECTS:store_test_fixture>) +target_link_libraries(ceph_test_objectstore + os + ceph-common + ${UNITTEST_LIBS} + global + ${EXTRALIBS} + ${BLKID_LIBRARIES} + ${CMAKE_DL_LIBS} + ) +install(TARGETS ceph_test_objectstore + DESTINATION ${CMAKE_INSTALL_BINDIR}) + +add_executable(ceph_test_keyvaluedb + test_kv.cc) +target_link_libraries(ceph_test_keyvaluedb + os + ceph-common + ${UNITTEST_LIBS} + global + ${EXTRALIBS} + ${BLKID_LIBRARIES} + ${CMAKE_DL_LIBS} + ) +install(TARGETS ceph_test_keyvaluedb + DESTINATION ${CMAKE_INSTALL_BINDIR}) + +# unittest_rocksdb_option +add_executable(unittest_rocksdb_option + TestRocksdbOptionParse.cc + $<TARGET_OBJECTS:unit-main> + ) +add_ceph_unittest(unittest_rocksdb_option) +target_link_libraries(unittest_rocksdb_option global os ${BLKID_LIBRARIES}) + +if(WITH_EVENTTRACE) + add_dependencies(os eventtrace_tp) +endif() + +if(WITH_BLUESTORE) + + add_executable(unittest_alloc + Allocator_test.cc + $<TARGET_OBJECTS:unit-main> + ) + add_ceph_unittest(unittest_alloc) + target_link_libraries(unittest_alloc os global) + + add_executable(unittest_alloc_bench + Allocator_bench.cc + $<TARGET_OBJECTS:unit-main> + ) + target_link_libraries(unittest_alloc_bench ${UNITTEST_LIBS} os global) + + add_executable(unittest_fastbmap_allocator + fastbmap_allocator_test.cc + $<TARGET_OBJECTS:unit-main> + ) + add_ceph_unittest(unittest_fastbmap_allocator) + target_link_libraries(unittest_fastbmap_allocator os global) + + set_target_properties(unittest_fastbmap_allocator PROPERTIES COMPILE_FLAGS + "${UNITTEST_CXX_FLAGS}") + + add_executable(unittest_hybrid_allocator + hybrid_allocator_test.cc + $<TARGET_OBJECTS:unit-main> + ) + add_ceph_unittest(unittest_hybrid_allocator) + target_link_libraries(unittest_hybrid_allocator os global) + + set_target_properties(unittest_hybrid_allocator PROPERTIES COMPILE_FLAGS + "${UNITTEST_CXX_FLAGS}") 
+ + add_executable(unittest_alloc_aging EXCLUDE_FROM_ALL + Allocator_aging_fragmentation.cc) + target_link_libraries(unittest_alloc_aging os global GTest::Main) + + # unittest_bluefs + add_executable(unittest_bluefs + test_bluefs.cc + ) + add_ceph_unittest(unittest_bluefs) + target_link_libraries(unittest_bluefs os global) + + # unittest_bluestore_types + add_executable(unittest_bluestore_types + test_bluestore_types.cc + ) + add_ceph_unittest(unittest_bluestore_types) + target_link_libraries(unittest_bluestore_types os global) + + # unittest_bdev + add_executable(unittest_bdev + test_bdev.cc + ) + add_ceph_unittest(unittest_bdev) + target_link_libraries(unittest_bdev os global) + + # unittest_deferred + add_executable(unittest_deferred + test_deferred.cc + ) + add_ceph_unittest(unittest_deferred) + target_link_libraries(unittest_deferred os global) + +endif(WITH_BLUESTORE) + +# unittest_transaction +add_executable(unittest_transaction + test_transaction.cc) +add_ceph_unittest(unittest_transaction) +target_link_libraries(unittest_transaction os ceph-common) + +# unittest_memstore_clone +add_executable(unittest_memstore_clone + test_memstore_clone.cc + $<TARGET_OBJECTS:store_test_fixture>) +add_ceph_unittest(unittest_memstore_clone) +target_link_libraries(unittest_memstore_clone os global) + +if(WITH_BLUESTORE) + add_executable(ceph_test_alloc_replay + allocator_replay_test.cc) + target_link_libraries(ceph_test_alloc_replay os global ${UNITTEST_LIBS}) + install(TARGETS ceph_test_alloc_replay + DESTINATION bin) +endif() diff --git a/src/test/objectstore/ObjectStoreTransactionBenchmark.cc b/src/test/objectstore/ObjectStoreTransactionBenchmark.cc new file mode 100644 index 000000000..e2ce3b2ef --- /dev/null +++ b/src/test/objectstore/ObjectStoreTransactionBenchmark.cc @@ -0,0 +1,266 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2014 UnitedStack 
<haomai@unitedstack.com> + * + * Author: Haomai Wang <haomaiwang@gmail.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include <stdlib.h> +#include <stdint.h> +#include <string> +#include <iostream> + +using namespace std; + +#include "common/ceph_argparse.h" +#include "common/debug.h" +#include "common/Cycles.h" +#include "global/global_init.h" +#include "os/ObjectStore.h" + +class Transaction { + private: + ObjectStore::Transaction t; + + public: + struct Tick { + uint64_t ticks; + uint64_t count; + Tick(): ticks(0), count(0) {} + void add(uint64_t a) { + ticks += a; + count++; + } + }; + static Tick write_ticks, setattr_ticks, omap_setkeys_ticks, omap_rmkey_ticks; + static Tick encode_ticks, decode_ticks, iterate_ticks; + + void write(coll_t cid, const ghobject_t& oid, uint64_t off, uint64_t len, + const bufferlist& data) { + uint64_t start_time = Cycles::rdtsc(); + t.write(cid, oid, off, len, data); + write_ticks.add(Cycles::rdtsc() - start_time); + } + void setattr(coll_t cid, const ghobject_t& oid, const string &name, + bufferlist& val) { + uint64_t start_time = Cycles::rdtsc(); + t.setattr(cid, oid, name, val); + setattr_ticks.add(Cycles::rdtsc() - start_time); + } + void omap_setkeys(coll_t cid, const ghobject_t &oid, + const map<string, bufferlist> &attrset) { + + uint64_t start_time = Cycles::rdtsc(); + t.omap_setkeys(cid, oid, attrset); + omap_setkeys_ticks.add(Cycles::rdtsc() - start_time); + } + void omap_rmkey(coll_t cid, const ghobject_t &oid, + const string &key) { + uint64_t start_time = Cycles::rdtsc(); + t.omap_rmkey(cid, oid, key); + omap_rmkey_ticks.add(Cycles::rdtsc() - start_time); + } + + void apply_encode_decode() { + bufferlist bl; + ObjectStore::Transaction d; + uint64_t start_time = Cycles::rdtsc(); + t.encode(bl); + encode_ticks.add(Cycles::rdtsc() - 
start_time); + + auto bliter = bl.cbegin(); + start_time = Cycles::rdtsc(); + d.decode(bliter); + decode_ticks.add(Cycles::rdtsc() - start_time); + } + + void apply_iterate() { + uint64_t start_time = Cycles::rdtsc(); + ObjectStore::Transaction::iterator i = t.begin(); + while (i.have_op()) { + ObjectStore::Transaction::Op *op = i.decode_op(); + + switch (op->op) { + case ObjectStore::Transaction::OP_WRITE: + { + ghobject_t oid = i.get_oid(op->oid); + bufferlist bl; + i.decode_bl(bl); + } + break; + case ObjectStore::Transaction::OP_SETATTR: + { + ghobject_t oid = i.get_oid(op->oid); + string name = i.decode_string(); + bufferlist bl; + i.decode_bl(bl); + map<string, bufferptr> to_set; + to_set[name] = bufferptr(bl.c_str(), bl.length()); + } + break; + case ObjectStore::Transaction::OP_OMAP_SETKEYS: + { + ghobject_t oid = i.get_oid(op->oid); + map<string, bufferptr> aset; + i.decode_attrset(aset); + } + break; + case ObjectStore::Transaction::OP_OMAP_RMKEYS: + { + ghobject_t oid = i.get_oid(op->oid); + set<string> keys; + i.decode_keyset(keys); + } + break; + } + } + iterate_ticks.add(Cycles::rdtsc() - start_time); + } + + static void dump_stat() { + cerr << " write op: " << Cycles::to_microseconds(write_ticks.ticks) << "us count: " << write_ticks.count << std::endl; + cerr << " setattr op: " << Cycles::to_microseconds(setattr_ticks.ticks) << "us count: " << setattr_ticks.count << std::endl; + cerr << " omap_setkeys op: " << Cycles::to_microseconds(Transaction::omap_setkeys_ticks.ticks) << "us count: " << Transaction::omap_setkeys_ticks.count << std::endl; + cerr << " omap_rmkey op: " << Cycles::to_microseconds(Transaction::omap_rmkey_ticks.ticks) << "us count: " << Transaction::omap_rmkey_ticks.count << std::endl; + cerr << " encode op: " << Cycles::to_microseconds(Transaction::encode_ticks.ticks) << "us count: " << Transaction::encode_ticks.count << std::endl; + cerr << " decode op: " << Cycles::to_microseconds(Transaction::decode_ticks.ticks) << "us count: " << 
Transaction::decode_ticks.count << std::endl; + cerr << " iterate op: " << Cycles::to_microseconds(Transaction::iterate_ticks.ticks) << "us count: " << Transaction::iterate_ticks.count << std::endl; + } +}; + +class PerfCase { + static const uint64_t Kib = 1024; + static const uint64_t Mib = 1024 * 1024; + static const string info_epoch_attr; + static const string info_info_attr; + static const string attr; + static const string snapset_attr; + static const string pglog_attr; + static const coll_t meta_cid; + static const coll_t cid; + static const ghobject_t pglog_oid; + static const ghobject_t info_oid; + map<string, bufferlist> data; + + ghobject_t create_object() { + bufferlist bl = generate_random(100, 1); + return ghobject_t(hobject_t(string("obj_")+string(bl.c_str()), string(), rand() & 2 ? CEPH_NOSNAP : rand(), rand() & 0xFF, 0, "")); + } + + + bufferlist generate_random(uint64_t len, int frag) { + static const char alphanum[] = "0123456789" + "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "abcdefghijklmnopqrstuvwxyz"; + uint64_t per_frag = len / frag; + bufferlist bl; + for (int i = 0; i < frag; i++ ) { + bufferptr bp(per_frag); + for (unsigned int j = 0; j < len; j++) { + bp[j] = alphanum[rand() % (sizeof(alphanum) - 1)]; + } + bl.append(bp); + } + return bl; + } + public: + PerfCase() { + uint64_t four_kb = Kib * 4; + uint64_t one_mb = Mib * 1; + uint64_t four_mb = Mib * 4; + data["4k"] = generate_random(four_kb, 1); + data["1m"] = generate_random(one_mb, 1); + data["4m"] = generate_random(four_mb, 1); + data[attr] = generate_random(256, 1); + data[snapset_attr] = generate_random(32, 1); + data[pglog_attr] = generate_random(128, 1); + data[info_epoch_attr] = generate_random(4, 1); + data[info_info_attr] = generate_random(560, 1); + } + + uint64_t rados_write_4k(int times) { + uint64_t ticks = 0; + uint64_t len = Kib *4; + for (int i = 0; i < times; i++) { + uint64_t start_time = 0; + { + Transaction t; + ghobject_t oid = create_object(); + start_time = Cycles::rdtsc(); 
+ t.write(cid, oid, 0, len, data["4k"]); + t.setattr(cid, oid, attr, data[attr]); + t.setattr(cid, oid, snapset_attr, data[snapset_attr]); + t.apply_encode_decode(); + t.apply_iterate(); + ticks += Cycles::rdtsc() - start_time; + } + { + Transaction t; + map<string, bufferlist> pglog_attrset; + map<string, bufferlist> info_attrset; + pglog_attrset[pglog_attr] = data[pglog_attr]; + info_attrset[info_epoch_attr] = data[info_epoch_attr]; + info_attrset[info_info_attr] = data[info_info_attr]; + start_time = Cycles::rdtsc(); + t.omap_setkeys(meta_cid, pglog_oid, pglog_attrset); + t.omap_setkeys(meta_cid, info_oid, info_attrset); + t.omap_rmkey(meta_cid, pglog_oid, pglog_attr); + t.apply_encode_decode(); + t.apply_iterate(); + ticks += Cycles::rdtsc() - start_time; + } + } + return ticks; + } +}; +const string PerfCase::info_epoch_attr("11.40_epoch"); +const string PerfCase::info_info_attr("11.40_info"); +const string PerfCase::attr("_"); +const string PerfCase::snapset_attr("snapset"); +const string PerfCase::pglog_attr("pglog_attr"); +const coll_t PerfCase::meta_cid; +const coll_t PerfCase::cid; +const ghobject_t PerfCase::pglog_oid(hobject_t(sobject_t(object_t("cid_pglog"), 0))); +const ghobject_t PerfCase::info_oid(hobject_t(sobject_t(object_t("infos"), 0))); +Transaction::Tick Transaction::write_ticks, Transaction::setattr_ticks, Transaction::omap_setkeys_ticks, Transaction::omap_rmkey_ticks; +Transaction::Tick Transaction::encode_ticks, Transaction::decode_ticks, Transaction::iterate_ticks; + +void usage(const string &name) { + cerr << "Usage: " << name << " [times] " + << std::endl; +} + +int main(int argc, char **argv) +{ + auto args = argv_to_vec(argc, argv); + + auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, + CODE_ENVIRONMENT_UTILITY, + CINIT_FLAG_NO_DEFAULT_CONFIG_FILE); + common_init_finish(g_ceph_context); + g_ceph_context->_conf.apply_changes(nullptr); + Cycles::init(); + + cerr << "args: " << args << std::endl; + if (args.size() < 1) { + 
usage(argv[0]); + return 1; + } + + uint64_t times = atoi(args[0]); + PerfCase c; + uint64_t ticks = c.rados_write_4k(times); + Transaction::dump_stat(); + cerr << " Total rados op " << times << " run time " << Cycles::to_microseconds(ticks) << "us." << std::endl; + + return 0; +} diff --git a/src/test/objectstore/TestObjectStoreState.cc b/src/test/objectstore/TestObjectStoreState.cc new file mode 100644 index 000000000..f4ccef4f0 --- /dev/null +++ b/src/test/objectstore/TestObjectStoreState.cc @@ -0,0 +1,299 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* +* Ceph - scalable distributed file system +* +* Copyright (C) 2012 New Dream Network +* +* This is free software; you can redistribute it and/or +* modify it under the terms of the GNU Lesser General Public +* License version 2.1, as published by the Free Software +* Foundation. See file COPYING. +*/ +#include <stdio.h> +#include <string.h> +#include <iostream> +#include <time.h> +#include <stdlib.h> +#include <signal.h> +#include "os/ObjectStore.h" +#include "common/ceph_argparse.h" +#include "global/global_init.h" +#include "common/debug.h" +#include <boost/scoped_ptr.hpp> +#include <boost/lexical_cast.hpp> +#include "TestObjectStoreState.h" +#include "include/ceph_assert.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_filestore +#undef dout_prefix +#define dout_prefix *_dout << "ceph_test_objectstore_state " + +using namespace std; + +void TestObjectStoreState::init(int colls, int objs) +{ + dout(5) << "init " << colls << " colls " << objs << " objs" << dendl; + + ObjectStore::Transaction t; + auto meta_ch = m_store->create_new_collection(coll_t::meta()); + t.create_collection(coll_t::meta(), 0); + m_store->queue_transaction(meta_ch, std::move(t)); + + wait_for_ready(); + + int baseid = 0; + for (int i = 0; i < colls; i++) { + spg_t pgid(pg_t(i, 1), shard_id_t::NO_SHARD); + coll_t cid(pgid); + auto ch = 
m_store->create_new_collection(cid); + coll_entry_t *entry = coll_create(pgid, ch); + dout(5) << "init create collection " << entry->m_cid + << " meta " << entry->m_meta_obj << dendl; + + ObjectStore::Transaction *t = new ObjectStore::Transaction; + t->create_collection(entry->m_cid, 32); + bufferlist hint; + uint32_t pg_num = colls; + uint64_t num_objs = uint64_t(objs / colls); + encode(pg_num, hint); + encode(num_objs, hint); + t->collection_hint(entry->m_cid, ObjectStore::Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS, hint); + dout(5) << "give collection hint, number of objects per collection: " << num_objs << dendl; + t->touch(cid, entry->m_meta_obj); + + for (int i = 0; i < objs; i++) { + hobject_t *obj = entry->touch_obj(i + baseid); + t->touch(entry->m_cid, ghobject_t(*obj)); + ceph_assert(i + baseid == m_num_objects); + m_num_objects++; + } + baseid += objs; + + t->register_on_commit(new C_OnFinished(this)); + m_store->queue_transaction(entry->m_ch, std::move(*t), nullptr); + + delete t; + inc_in_flight(); + + m_collections.insert(make_pair(cid, entry)); + rebuild_id_vec(); + m_next_coll_nr++; + } + dout(5) << "init has " << m_in_flight.load() << "in-flight transactions" << dendl; + wait_for_done(); + dout(5) << "init finished" << dendl; +} + +TestObjectStoreState::coll_entry_t *TestObjectStoreState::coll_create( + spg_t pgid, ObjectStore::CollectionHandle ch) +{ + char meta_buf[100]; + memset(meta_buf, 0, 100); + snprintf(meta_buf, 100, "pglog_0_head"); + return (new coll_entry_t(pgid, ch, meta_buf)); +} + +TestObjectStoreState::coll_entry_t* +TestObjectStoreState::get_coll(coll_t cid, bool erase) +{ + dout(5) << "get_coll id " << cid << dendl; + + coll_entry_t *entry = NULL; + auto it = m_collections.find(cid); + if (it != m_collections.end()) { + entry = it->second; + if (erase) { + m_collections.erase(it); + rebuild_id_vec(); + } + } + + dout(5) << "get_coll id " << cid; + if (!entry) + *_dout << " non-existent"; + else + *_dout << " name " << 
entry->m_cid; + *_dout << dendl; + return entry; +} + +TestObjectStoreState::coll_entry_t* +TestObjectStoreState::get_coll_at(int pos, bool erase) +{ + dout(5) << "get_coll_at pos " << pos << dendl; + + if (m_collections.empty()) + return NULL; + + ceph_assert((size_t) pos < m_collections_ids.size()); + + coll_t cid = m_collections_ids[pos]; + coll_entry_t *entry = m_collections[cid]; + + if (entry == NULL) { + dout(5) << "get_coll_at pos " << pos << " non-existent" << dendl; + return NULL; + } + + if (erase) { + m_collections.erase(cid); + rebuild_id_vec(); + } + + dout(5) << "get_coll_at pos " << pos << ": " + << entry->m_cid << "(removed: " << erase << ")" << dendl; + + return entry; +} + +TestObjectStoreState::coll_entry_t::~coll_entry_t() +{ + if (m_objects.size() > 0) { + map<int, hobject_t*>::iterator it = m_objects.begin(); + for (; it != m_objects.end(); ++it) { + hobject_t *obj = it->second; + if (obj) { + delete obj; + } + } + m_objects.clear(); + } +} + +bool TestObjectStoreState::coll_entry_t::check_for_obj(int id) +{ + if (m_objects.count(id)) + return true; + return false; +} + +hobject_t *TestObjectStoreState::coll_entry_t::touch_obj(int id) +{ + map<int, hobject_t*>::iterator it = m_objects.find(id); + if (it != m_objects.end()) { + dout(5) << "touch_obj coll id " << m_cid + << " name " << it->second->oid.name << dendl; + return it->second; + } + + char buf[100]; + memset(buf, 0, 100); + snprintf(buf, 100, "obj%d", id); + + hobject_t *obj = new hobject_t(sobject_t(object_t(buf), CEPH_NOSNAP)); + obj->set_hash(m_pgid.ps()); + obj->pool = m_pgid.pool(); + m_objects.insert(make_pair(id, obj)); + + dout(5) << "touch_obj coll id " << m_cid << " name " << buf << dendl; + return obj; +} + +hobject_t *TestObjectStoreState::coll_entry_t::get_obj(int id) +{ + return get_obj(id, false); +} + +/** + * remove_obj - Removes object without freeing it. + * @param id Object's id in the map. + * @return The object or NULL in case of error. 
+ */ +hobject_t *TestObjectStoreState::coll_entry_t::remove_obj(int id) +{ + return get_obj(id, true); +} + +hobject_t *TestObjectStoreState::coll_entry_t::get_obj(int id, bool remove) +{ + map<int, hobject_t*>::iterator it = m_objects.find(id); + if (it == m_objects.end()) { + dout(5) << "get_obj coll " << m_cid + << " obj #" << id << " non-existent" << dendl; + return NULL; + } + + hobject_t *obj = it->second; + if (remove) + m_objects.erase(it); + + dout(5) << "get_obj coll " << m_cid << " id " << id + << ": " << obj->oid.name << "(removed: " << remove << ")" << dendl; + + return obj; +} + +hobject_t *TestObjectStoreState::coll_entry_t::get_obj_at(int pos, int *key) +{ + return get_obj_at(pos, false, key); +} + +/** + * remove_obj_at - Removes object without freeing it. + * @param pos The map's position in which the object lies. + * @return The object or NULL in case of error. + */ +hobject_t *TestObjectStoreState::coll_entry_t::remove_obj_at(int pos, int *key) +{ + return get_obj_at(pos, true, key); +} + +hobject_t *TestObjectStoreState::coll_entry_t::get_obj_at(int pos, + bool remove, int *key) +{ + if (m_objects.empty()) { + dout(5) << "get_obj_at coll " << m_cid << " pos " << pos + << " in an empty collection" << dendl; + return NULL; + } + + hobject_t *ret = NULL; + map<int, hobject_t*>::iterator it = m_objects.begin(); + for (int i = 0; it != m_objects.end(); ++it, i++) { + if (i == pos) { + ret = it->second; + break; + } + } + + if (ret == NULL) { + dout(5) << "get_obj_at coll " << m_cid << " pos " << pos + << " non-existent" << dendl; + return NULL; + } + + if (key != NULL) + *key = it->first; + + if (remove) + m_objects.erase(it); + + dout(5) << "get_obj_at coll id " << m_cid << " pos " << pos + << ": " << ret->oid.name << "(removed: " << remove << ")" << dendl; + + return ret; +} + +hobject_t* +TestObjectStoreState::coll_entry_t::replace_obj(int id, hobject_t *obj) { + hobject_t *old_obj = remove_obj(id); + m_objects.insert(make_pair(id, obj)); + 
return old_obj; +} + +int TestObjectStoreState::coll_entry_t::get_random_obj_id(rngen_t& gen) +{ + ceph_assert(!m_objects.empty()); + + boost::uniform_int<> orig_obj_rng(0, m_objects.size()-1); + int pos = orig_obj_rng(gen); + map<int, hobject_t*>::iterator it = m_objects.begin(); + for (int i = 0; it != m_objects.end(); ++it, i++) { + if (i == pos) { + return it->first; + } + } + ceph_abort_msg("INTERNAL ERROR"); +} diff --git a/src/test/objectstore/TestObjectStoreState.h b/src/test/objectstore/TestObjectStoreState.h new file mode 100644 index 000000000..d1e31bd8a --- /dev/null +++ b/src/test/objectstore/TestObjectStoreState.h @@ -0,0 +1,158 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* +* Ceph - scalable distributed file system +* +* Copyright (C) 2012 New Dream Network +* +* This is free software; you can redistribute it and/or +* modify it under the terms of the GNU Lesser General Public +* License version 2.1, as published by the Free Software +* Foundation. See file COPYING. 
+*/ +#ifndef TEST_OBJECTSTORE_STATE_H_ +#define TEST_OBJECTSTORE_STATE_H_ + +#include <boost/scoped_ptr.hpp> +#include <boost/random/mersenne_twister.hpp> +#include <boost/random/uniform_int.hpp> +#include <map> +#include <vector> + +#include "os/ObjectStore.h" +#include "common/Cond.h" + +typedef boost::mt11213b rngen_t; + +class TestObjectStoreState { +public: + struct coll_entry_t { + spg_t m_pgid; + coll_t m_cid; + ghobject_t m_meta_obj; + ObjectStore::CollectionHandle m_ch; + std::map<int, hobject_t*> m_objects; + int m_next_object_id; + + coll_entry_t(spg_t pgid, ObjectStore::CollectionHandle& ch, + char *meta_obj_buf) + : m_pgid(pgid), + m_cid(m_pgid), + m_meta_obj(hobject_t(sobject_t(object_t(meta_obj_buf), CEPH_NOSNAP))), + m_ch(ch), + m_next_object_id(0) { + m_meta_obj.hobj.pool = m_pgid.pool(); + m_meta_obj.hobj.set_hash(m_pgid.ps()); + } + ~coll_entry_t(); + + hobject_t *touch_obj(int id); + bool check_for_obj(int id); + hobject_t *get_obj(int id); + hobject_t *remove_obj(int id); + hobject_t *get_obj_at(int pos, int *key = NULL); + hobject_t *remove_obj_at(int pos, int *key = NULL); + hobject_t *replace_obj(int id, hobject_t *obj); + int get_random_obj_id(rngen_t& gen); + + private: + hobject_t *get_obj(int id, bool remove); + hobject_t *get_obj_at(int pos, bool remove, int *key = NULL); + }; + + protected: + boost::shared_ptr<ObjectStore> m_store; + std::map<coll_t, coll_entry_t*> m_collections; + std::vector<coll_t> m_collections_ids; + int m_next_coll_nr; + int m_num_objs_per_coll; + int m_num_objects; + + int m_max_in_flight; + std::atomic<int> m_in_flight = { 0 }; + ceph::mutex m_finished_lock = ceph::make_mutex("Finished Lock"); + ceph::condition_variable m_finished_cond; + + void rebuild_id_vec() { + m_collections_ids.clear(); + m_collections_ids.reserve(m_collections.size()); + for (auto& i : m_collections) { + m_collections_ids.push_back(i.first); + } + } + + void wait_for_ready() { + std::unique_lock locker{m_finished_lock}; + 
m_finished_cond.wait(locker, [this] { + return m_max_in_flight <= 0 || m_in_flight < m_max_in_flight; + }); + } + + void wait_for_done() { + std::unique_lock locker{m_finished_lock}; + m_finished_cond.wait(locker, [this] { return m_in_flight == 0; }); + } + + void set_max_in_flight(int max) { + m_max_in_flight = max; + } + void set_num_objs_per_coll(int val) { + m_num_objs_per_coll = val; + } + + coll_entry_t *get_coll(coll_t cid, bool erase = false); + coll_entry_t *get_coll_at(int pos, bool erase = false); + int get_next_pool_id() { return m_next_pool++; } + + private: + static const int m_default_num_colls = 30; + // The pool ID used for collection creation, ID 0 is preserve for other tests + int m_next_pool; + + public: + explicit TestObjectStoreState(ObjectStore *store) : + m_next_coll_nr(0), m_num_objs_per_coll(10), m_num_objects(0), + m_max_in_flight(0), m_next_pool(2) { + m_store.reset(store); + } + ~TestObjectStoreState() { + auto it = m_collections.begin(); + while (it != m_collections.end()) { + if (it->second) + delete it->second; + m_collections.erase(it++); + } + } + + void init(int colls, int objs); + void init() { + init(m_default_num_colls, 0); + } + + int inc_in_flight() { + return ++m_in_flight; + } + + int dec_in_flight() { + return --m_in_flight; + } + + coll_entry_t *coll_create(spg_t pgid, ObjectStore::CollectionHandle ch); + + class C_OnFinished: public Context { + protected: + TestObjectStoreState *m_state; + + public: + explicit C_OnFinished(TestObjectStoreState *state) : m_state(state) { } + + void finish(int r) override { + std::lock_guard locker{m_state->m_finished_lock}; + m_state->dec_in_flight(); + m_state->m_finished_cond.notify_all(); + + } + }; +}; + +#endif /* TEST_OBJECTSTORE_STATE_H_ */ diff --git a/src/test/objectstore/TestRocksdbOptionParse.cc b/src/test/objectstore/TestRocksdbOptionParse.cc new file mode 100644 index 000000000..c34ea6bc2 --- /dev/null +++ b/src/test/objectstore/TestRocksdbOptionParse.cc @@ -0,0 +1,78 @@ 
+#include <gtest/gtest.h> +#include "include/Context.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/thread_status.h" +#include "kv/RocksDBStore.h" +#include <iostream> + +using namespace std; + +const string dir("rocksdb.test_temp_dir"); + +TEST(RocksDBOption, simple) { + rocksdb::Options options; + rocksdb::Status status; + map<string,string> kvoptions; + RocksDBStore *db = new RocksDBStore(g_ceph_context, dir, kvoptions, NULL); + string options_string = "" + "write_buffer_size=536870912;" + "create_if_missing=true;" + "max_write_buffer_number=4;" + "max_background_compactions=4;" + "stats_dump_period_sec = 5;" + "min_write_buffer_number_to_merge = 2;" + "level0_file_num_compaction_trigger = 4;" + "max_bytes_for_level_base = 104857600;" + "target_file_size_base = 10485760;" + "num_levels = 3;" + "compression = kNoCompression;" + "compaction_options_universal = {min_merge_width=4;size_ratio=2;max_size_amplification_percent=500}"; + int r = db->ParseOptionsFromString(options_string, options); + ASSERT_EQ(0, r); + ASSERT_EQ(536870912u, options.write_buffer_size); + ASSERT_EQ(4, options.max_write_buffer_number); + ASSERT_EQ(4, options.max_background_compactions); + ASSERT_EQ(5u, options.stats_dump_period_sec); + ASSERT_EQ(2, options.min_write_buffer_number_to_merge); + ASSERT_EQ(4, options.level0_file_num_compaction_trigger); + ASSERT_EQ(104857600u, options.max_bytes_for_level_base); + ASSERT_EQ(10485760u, options.target_file_size_base); + ASSERT_EQ(3, options.num_levels); + ASSERT_EQ(rocksdb::kNoCompression, options.compression); + ASSERT_EQ(2, options.compaction_options_universal.size_ratio); + ASSERT_EQ(4, options.compaction_options_universal.min_merge_width); + ASSERT_EQ(500, options.compaction_options_universal.max_size_amplification_percent); +} +TEST(RocksDBOption, interpret) { + rocksdb::Options options; + rocksdb::Status status; + map<string,string> kvoptions; + RocksDBStore *db = new RocksDBStore(g_ceph_context, dir, kvoptions, 
NULL); + string options_string = "compact_on_mount = true; compaction_threads=10;flusher_threads=5;"; + + int r = db->ParseOptionsFromString(options_string, options); + ASSERT_EQ(0, r); + ASSERT_TRUE(db->compact_on_mount); + //check thread pool setting + options.env->SleepForMicroseconds(100000); + std::vector<rocksdb::ThreadStatus> thread_list; + status = options.env->GetThreadList(&thread_list); + ASSERT_TRUE(status.ok()); + + int num_high_pri_threads = 0; + int num_low_pri_threads = 0; + for (vector<rocksdb::ThreadStatus>::iterator it = thread_list.begin(); + it!= thread_list.end(); + ++it) { + if (it->thread_type == rocksdb::ThreadStatus::HIGH_PRIORITY) + num_high_pri_threads++; + if (it->thread_type == rocksdb::ThreadStatus::LOW_PRIORITY) + num_low_pri_threads++; + } + ASSERT_EQ(15u, thread_list.size()); + //low pri threads is compaction_threads + ASSERT_EQ(10, num_low_pri_threads); + //high pri threads is flusher_threads + ASSERT_EQ(5, num_high_pri_threads); +} diff --git a/src/test/objectstore/allocator_replay_test.cc b/src/test/objectstore/allocator_replay_test.cc new file mode 100644 index 000000000..18959a56c --- /dev/null +++ b/src/test/objectstore/allocator_replay_test.cc @@ -0,0 +1,694 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Allocator replay tool. 
+ * Author: Igor Fedotov, ifedotov@suse.com + */ +#include <iostream> +#include <vector> + +#include "common/ceph_argparse.h" +#include "common/debug.h" +#include "common/Cycles.h" +#include "common/errno.h" +#include "common/ceph_json.h" +#include "common/admin_socket.h" +#include "include/denc.h" +#include "global/global_init.h" +#include "os/bluestore/Allocator.h" + +using namespace std; + +void usage(const string &name) { + cerr << "Usage: " << name << " <log_to_replay> <raw_duplicates|duplicates|free_dump|try_alloc count want alloc_unit|replay_alloc alloc_list_file|export_binary out_file>" << std::endl; +} + +void usage_replay_alloc(const string &name) { + cerr << "Detailed replay_alloc usage: " << name << " <allocator_dump_JSON> replay_alloc <alloc_list_file> [number of replays]" << std::endl; + cerr << "The number of replays defaults to 1." << std::endl; + cerr << "The \"alloc_list_file\" parameter should be a file with allocation requests, one per line." << std::endl; + cerr << "Allocation request format (space separated, optional parameters are 0 if not given): want unit [max] [hint]" << std::endl; +} + +struct binary_alloc_map_t { + std::vector<std::pair<uint64_t, uint64_t>> free_extents; + + DENC(binary_alloc_map_t, v, p) { + DENC_START(1, 1, p); + denc(v.free_extents, p); + DENC_FINISH(p); + } +}; +WRITE_CLASS_DENC(binary_alloc_map_t) + +int replay_and_check_for_duplicate(char* fname) +{ + unique_ptr<Allocator> alloc; + + FILE* f = fopen(fname, "r"); + if (!f) { + std::cerr << "error: unable to open " << fname << std::endl; + return -1; + } + + PExtentVector tmp; + bool init_done = false; + char s[4096]; + char* sp, *token; + interval_set<uint64_t> owned_by_app; + while (true) { + if (fgets(s, sizeof(s), f) == nullptr) { + break; + } + sp = strstr(s, "init_add_free"); + if (!sp) { + sp = strstr(s, "release"); + } + if (sp) { + //2019-05-30 03:23:46.780 7f889a5edf00 10 fbmap_alloc 0x5642ed370600 init_add_free 0x100000~680000000 + // or + //2019-05-30 
03:23:46.780 7f889a5edf00 10 fbmap_alloc 0x5642ed370600 init_add_free done + // or + // 2019 - 10 - 08T16:19 : 32.257 + 0300 7f5679f3fe80 10 fbmap_alloc 0x564fab96f100 release 0x450000~10000 + // or + // 2019 - 10 - 08T16 : 19 : 32.257 + 0300 7f5679f3fe80 10 fbmap_alloc 0x564fab96f100 release done + if (strstr(sp, "done") != nullptr) { + continue; + } + std::cout << s << std::endl; + if (!init_done) { + std::cerr << "error: no allocator init before: " << s << std::endl; + return -1; + } + uint64_t offs, len; + strtok(sp, " ~"); + token = strtok(nullptr, " ~"); + ceph_assert(token); + offs = strtoul(token, nullptr, 16); + token = strtok(nullptr, " ~"); + ceph_assert(token); + len = strtoul(token, nullptr, 16); + if (len == 0) { + std::cerr << "error: " << sp <<": " << s << std::endl; + return -1; + } + if (!owned_by_app.contains(offs, len)) { + std::cerr << "error: unexpected return to allocator, not owned by app: " + << s << std::endl; + return -1; + } + owned_by_app.erase(offs, len); + if (strstr(sp, "init_add_free") != nullptr) { + alloc->init_add_free(offs, len); + } else { + PExtentVector release_set; + release_set.emplace_back(offs, len); + alloc->release(release_set); + } + continue; + } + sp = strstr(s, "init_rm_free"); + if (sp) { + //2019-05-30 03:23:46.912 7f889a5edf00 10 fbmap_alloc 0x5642ed370600 init_rm_free 0x100000~680000000 + // or + // 2019-05-30 03:23:46.916 7f889a5edf00 10 fbmap_alloc 0x5642ed370600 init_rm_free done + + if (strstr(sp, "done") != nullptr) { + continue; + } + std::cout << s << std::endl; + if (!init_done) { + std::cerr << "error: no allocator init before: " << s << std::endl; + return -1; + } + uint64_t offs, len; + strtok(sp, " ~"); + token = strtok(nullptr, " ~"); + ceph_assert(token); + offs = strtoul(token, nullptr, 16); + token = strtok(nullptr, " ~"); + ceph_assert(token); + len = strtoul(token, nullptr, 16); + if (len == 0) { + std::cerr << "error: " << sp <<": " << s << std::endl; + return -1; + } + 
alloc->init_rm_free(offs, len); + + if (owned_by_app.intersects(offs, len)) { + std::cerr + << "error: unexpected takeover from allocator, already owned by app: " + << s << std::endl; + return -1; + } else { + owned_by_app.insert(offs, len); + } + + continue; + } + sp = strstr(s, "allocate"); + if (sp) { + //2019-05-30 03:23:48.780 7f889a5edf00 10 fbmap_alloc 0x5642ed370600 allocate 0x80000000/100000,0,0 + // and need to bypass + // 2019-05-30 03:23:48.780 7f889a5edf00 10 fbmap_alloc 0x5642ed370600 allocate 0x69d400000~200000/100000,0,0 + + // Very simple and stupid check to bypass actual allocations + if (strstr(sp, "~") != nullptr) { + continue; + } + + std::cout << s << std::endl; + if (!init_done) { + std::cerr << "error: no allocator init before: " << s << std::endl; + return -1; + } + uint64_t want, alloc_unit; + strtok(sp, " /"); + token = strtok(nullptr, " /"); + ceph_assert(token); + want = strtoul(token, nullptr, 16); + token = strtok(nullptr, " ~"); + ceph_assert(token); + alloc_unit = strtoul(token, nullptr, 16); + if (want == 0 || alloc_unit == 0) { + std::cerr << "error: allocate: " << s << std::endl; + return -1; + } + tmp.clear(); + auto allocated = alloc->allocate(want, alloc_unit, 0, 0, &tmp); + std::cout << "allocated TOTAL: " << allocated << std::endl; + for (auto& ee : tmp) { + std::cerr << "dump extent: " << std::hex + << ee.offset << "~" << ee.length + << std::dec << std::endl; + } + std::cerr << "dump completed." << std::endl; + for (auto& e : tmp) { + if (owned_by_app.intersects(e.offset, e.length)) { + std::cerr << "error: unexpected allocated extent: " << std::hex + << e.offset << "~" << e.length + << " dumping all allocations:" << std::dec << std::endl; + for (auto& ee : tmp) { + std::cerr <<"dump extent: " << std::hex + << ee.offset << "~" << ee.length + << std::dec << std::endl; + } + std::cerr <<"dump completed." 
<< std::endl; + return -1; + } else { + owned_by_app.insert(e.offset, e.length); + } + } + continue; + } + + string alloc_type = "bitmap"; + sp = strstr(s, "BitmapAllocator"); + if (!sp) { + alloc_type = "avl"; + sp = strstr(s, "AvlAllocator"); + } + if (!sp) { + alloc_type = "hybrid"; + sp = strstr(s, "HybridAllocator"); + } + if (!sp) { + alloc_type = "stupid"; + sp = strstr(s, "StupidAllocator"); + } + if (sp) { + // 2019-05-30 03:23:43.460 7f889a5edf00 10 fbmap_alloc 0x5642ed36e900 BitmapAllocator 0x15940000000/100000 + std::cout << s << std::endl; + if (init_done) { + std::cerr << "error: duplicate init: " << s << std::endl; + return -1; + } + uint64_t total, alloc_unit; + strtok(sp, " /"); + token = strtok(nullptr, " /"); + ceph_assert(token); + total = strtoul(token, nullptr, 16); + token = strtok(nullptr, " /"); + ceph_assert(token); + alloc_unit = strtoul(token, nullptr, 16); + if (total == 0 || alloc_unit == 0) { + std::cerr << "error: invalid init: " << s << std::endl; + return -1; + } + alloc.reset(Allocator::create(g_ceph_context, alloc_type, total, + alloc_unit)); + owned_by_app.insert(0, total); + + init_done = true; + continue; + } + } + fclose(f); + return 0; +} + +int replay_free_dump_and_apply_raw( + char* fname, + std::function<void ( + std::string_view, + int64_t, + int64_t, + std::string_view)> create, + std::function<void (uint64_t, uint64_t)> add_ext) +{ + string alloc_type; + string alloc_name; + uint64_t capacity = 0; + uint64_t alloc_unit = 0; + + JSONParser p; + std::cout << "parsing..." 
<< std::endl; + bool b = p.parse(fname); + if (!b) { + std::cerr << "Failed to parse json: " << fname << std::endl; + return -1; + } + + JSONObj::data_val v; + ceph_assert(p.is_object()); + + auto *o = p.find_obj("alloc_type"); + ceph_assert(o); + alloc_type = o->get_data_val().str; + + o = p.find_obj("alloc_name"); + ceph_assert(o); + alloc_name = o->get_data_val().str; + + o = p.find_obj("capacity"); + ceph_assert(o); + decode_json_obj(capacity, o); + o = p.find_obj("alloc_unit"); + ceph_assert(o); + decode_json_obj(alloc_unit, o); + + int fd = -1; + o = p.find_obj("extents_file"); + if (o) { + string filename = o->get_data_val().str; + fd = open(filename.c_str(), O_RDONLY); + if (fd < 0) { + std::cerr << "error: unable to open extents file: " << filename + << ", " << cpp_strerror(-errno) + << std::endl; + return -1; + } + } else { + o = p.find_obj("extents"); + ceph_assert(o); + ceph_assert(o->is_array()); + } + std::cout << "parsing completed!" << std::endl; + + create(alloc_type, capacity, alloc_unit, alloc_name); + int r = 0; + if (fd < 0) { + auto it = o->find_first(); + while (!it.end()) { + auto *item_obj = *it; + uint64_t offset = 0; + uint64_t length = 0; + string offset_str, length_str; + + bool b = JSONDecoder::decode_json("offset", offset_str, item_obj); + ceph_assert(b); + b = JSONDecoder::decode_json("length", length_str, item_obj); + ceph_assert(b); + + char* p; + offset = strtol(offset_str.c_str(), &p, 16); + length = strtol(length_str.c_str(), &p, 16); + + // intentionally skip/trim entries that are above the capacity, + // just to be able to "shrink" allocator by editing that field + if (offset < capacity) { + if (offset + length > capacity) { + length = offset + length - capacity; + } + add_ext(offset, length); + } + ++it; + } + } else { + bufferlist bl; + char buf[4096]; + do { + r = read(fd, buf, sizeof(buf)); + if (r > 0) { + bl.append(buf, r); + } + } while(r > 0); + if (r < 0) { + std::cerr << "error: error reading from extents file: " + 
<< cpp_strerror(-errno) + << std::endl; + } else { + auto p = bl.cbegin(); + binary_alloc_map_t amap; + try { + decode(amap, p); + for (auto p : amap.free_extents) { + add_ext(p.first, p.second); + } + } catch (ceph::buffer::error& e) { + std::cerr << __func__ << " unable to decode extents " + << ": " << e.what() + << std::endl; + r = -1; + } + } + close(fd); + } + return r; +} + +/* +* This replays allocator dump (in JSON) reported by + "ceph daemon <osd> bluestore allocator dump <name>" + command and applies custom method to it +*/ +int replay_free_dump_and_apply(char* fname, + std::function<int (Allocator*, const string& aname)> fn) +{ + unique_ptr<Allocator> alloc; + auto create_fn = [&](std::string_view alloc_type, + int64_t capacity, + int64_t alloc_unit, + std::string_view alloc_name) { + alloc.reset( + Allocator::create( + g_ceph_context, alloc_type, capacity, alloc_unit, 0, 0, alloc_name)); + }; + auto add_fn = [&](uint64_t offset, + uint64_t len) { + alloc->init_add_free(offset, len); + }; + int r = replay_free_dump_and_apply_raw( + fname, + create_fn, + add_fn); + if (r == 0) { + r = fn(alloc.get(), alloc->get_name()); + } + + return r; +} + +void dump_alloc(Allocator* alloc, const string& aname) +{ + AdminSocket* admin_socket = g_ceph_context->get_admin_socket(); + ceph_assert(admin_socket); + + ceph::bufferlist in, out; + ostringstream err; + + string cmd = "{\"prefix\": \"bluestore allocator dump " + aname + "\"}"; + auto r = admin_socket->execute_command( + { cmd }, + in, err, &out); + if (r != 0) { + cerr << "failure querying: " << cpp_strerror(r) << std::endl; + } + else { + std::cout << std::string(out.c_str(), out.length()) << std::endl; + } +} + +int export_as_binary(char* fname, char* target_fname) +{ + int fd = creat(target_fname, 0); + if (fd < 0) { + std::cerr << "error: unable to open target file: " << target_fname + << ", " << cpp_strerror(-errno) + << std::endl; + return -1; + } + + binary_alloc_map_t amap; + auto dummy_create_fn = + 
[&](std::string_view alloc_type, + int64_t capacity, + int64_t alloc_unit, + std::string_view alloc_name) { + }; + auto add_fn = [&](uint64_t offset, + uint64_t len) { + amap.free_extents.emplace_back(offset, len); + }; + int r = replay_free_dump_and_apply_raw( + fname, + dummy_create_fn, + add_fn); + if (r == 0) { + bufferlist out; + ceph::encode(amap, out); + auto w = write(fd, out.c_str(), out.length()); + if (w < 1) { + std::cerr << "error: unable to open target file: " << target_fname + << ", " << cpp_strerror(-errno) + << std::endl; + } + } + close(fd); + return r; +} + +int check_duplicates(char* fname) +{ + interval_set<uint64_t> free_extents; + interval_set<uint64_t> invalid_extentsA; + interval_set<uint64_t> invalid_extentsB; + auto dummy_create_fn = + [&](std::string_view alloc_type, + int64_t capacity, + int64_t alloc_unit, + std::string_view alloc_name) { + }; + size_t errors = 0; + size_t pos = 0; + size_t first_err_pos = 0; + auto add_fn = [&](uint64_t offset, + uint64_t len) { + ++pos; + if (free_extents.intersects(offset, len)) { + invalid_extentsB.insert(offset, len); + ++errors; + if (first_err_pos == 0) { + first_err_pos = pos; + } + } else { + free_extents.insert(offset, len); + } + }; + int r = replay_free_dump_and_apply_raw( + fname, + dummy_create_fn, + add_fn); + if (r < 0) { + return r; + } + pos = 0; + auto add_fn2 = [&](uint64_t offset, + uint64_t len) { + ++pos; + if (pos < first_err_pos) { + if (invalid_extentsB.intersects(offset, len)) { + invalid_extentsA.insert(offset, len); + } + } + }; + r = replay_free_dump_and_apply_raw( + fname, + dummy_create_fn, + add_fn2); + ceph_assert(r >= 0); + auto itA = invalid_extentsA.begin(); + auto itB = invalid_extentsB.begin(); + while (itA != invalid_extentsA.end()) { + std::cerr << "error: overlapping extents: " << std::hex + << itA.get_start() << "~" << itA.get_end() - itA.get_start() + << " vs."; + while (itB != invalid_extentsB.end() && + itB.get_start() >= itA.get_start() && + itB.get_end() 
<= itA.get_end()) { + std::cerr << " " << itB.get_start() << "~" << itB.get_end() - itB.get_start(); + ++itB; + } + std::cerr << std::dec << std::endl; + ++itA; + } + return r >= 0 ? errors != 0 : r; +} + +int main(int argc, char **argv) +{ + vector<const char*> args; + auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, + CODE_ENVIRONMENT_UTILITY, + CINIT_FLAG_NO_DEFAULT_CONFIG_FILE); + common_init_finish(g_ceph_context); + g_ceph_context->_conf.apply_changes(nullptr); + + if (argc < 3) { + usage(argv[0]); + return 1; + } + if (strcmp(argv[2], "raw_duplicates") == 0) { + return replay_and_check_for_duplicate(argv[1]); + } else if (strcmp(argv[2], "free_dump") == 0) { + return replay_free_dump_and_apply(argv[1], + [&](Allocator* a, const string& aname) { + ceph_assert(a); + std::cout << "Fragmentation:" << a->get_fragmentation() + << std::endl; + std::cout << "Fragmentation score:" << a->get_fragmentation_score() + << std::endl; + std::cout << "Free:" << std::hex << a->get_free() << std::dec + << std::endl; + { + // stub to implement various testing stuff on properly initialized allocator + // e.g. one can dump allocator back via dump_alloc(a, aname); + } + return 0; + }); + } else if (strcmp(argv[2], "try_alloc") == 0) { + if (argc < 6) { + std::cerr << "Error: insufficient arguments for \"try_alloc\" operation." 
+ << std::endl; + usage(argv[0]); + return 1; + } + auto count = strtoul(argv[3], nullptr, 10); + auto want = strtoul(argv[4], nullptr, 10); + auto alloc_unit = strtoul(argv[5], nullptr, 10); + + return replay_free_dump_and_apply(argv[1], + [&](Allocator* a, const string& aname) { + ceph_assert(a); + std::cout << "Fragmentation:" << a->get_fragmentation() + << std::endl; + std::cout << "Fragmentation score:" << a->get_fragmentation_score() + << std::endl; + std::cout << "Free:" << std::hex << a->get_free() << std::dec + << std::endl; + { + PExtentVector extents; + for(size_t i = 0; i < count; i++) { + extents.clear(); + auto r = a->allocate(want, alloc_unit, 0, &extents); + if (r < 0) { + std::cerr << "Error: allocation failure at step:" << i + 1 + << ", ret = " << r << std::endl; + return -1; + } + } + } + std::cout << "Successfully allocated: " << count << " * " << want + << ", unit:" << alloc_unit << std::endl; + return 0; + }); + } else if (strcmp(argv[2], "replay_alloc") == 0) { + if (argc < 4) { + std::cerr << "Error: insufficient arguments for \"replay_alloc\" option." + << std::endl; + usage_replay_alloc(argv[0]); + return 1; + } + return replay_free_dump_and_apply(argv[1], + [&](Allocator *a, const string &aname) { + ceph_assert(a); + std::cout << "Fragmentation:" << a->get_fragmentation() + << std::endl; + std::cout << "Fragmentation score:" << a->get_fragmentation_score() + << std::endl; + std::cout << "Free:" << std::hex << a->get_free() << std::dec + << std::endl; + { + /* replay a set of allocation requests */ + char s[4096]; + + FILE *f_alloc_list = fopen(argv[3], "r"); + if (!f_alloc_list) { + std::cerr << "error: unable to open " << argv[3] << std::endl; + return -1; + } + + /* Replay user specified number of times to simulate extended activity + * Defaults to 1 replay. 
+ */ + auto replay_count = 1; + if (argc == 5) { + replay_count = atoi(argv[4]); + } + + for (auto i = 0; i < replay_count; ++i) { + while (fgets(s, sizeof(s), f_alloc_list) != nullptr) { + /* parse allocation request */ + uint64_t want = 0, unit = 0, max = 0, hint = 0; + + if (std::sscanf(s, "%ji %ji %ji %ji", &want, &unit, &max, &hint) < 2) + { + cerr << "Error: malformed allocation request:" << std::endl; + cerr << s << std::endl; + /* do not attempt to allocate a malformed request */ + continue; + } + + /* timestamp for allocation start */ + auto t0 = ceph::mono_clock::now(); + + /* allocate */ + PExtentVector extents; + auto r = a->allocate(want, unit, max, hint, &extents); + if (r < 0) { + /* blind replays of allocations may run out of space, provide info for easy confirmation */ + std::cerr << "Error: allocation failure code: " << r + << " requested want/unit/max/hint (hex): " << std::hex + << want << "/" << unit << "/" << max << "/" << hint + << std::dec << std::endl; + std::cerr << "Fragmentation:" << a->get_fragmentation() + << std::endl; + std::cerr << "Fragmentation score:" << a->get_fragmentation_score() + << std::endl; + std::cerr << "Free:" << std::hex << a->get_free() << std::dec + << std::endl; + /* return 0 if the allocator ran out of space */ + if (r == -ENOSPC) { + return 0; + } + return -1; + } + + /* Outputs the allocation's duration in nanoseconds and the allocation request parameters */ + std::cout << "Duration (ns): " << (ceph::mono_clock::now() - t0).count() + << " want/unit/max/hint (hex): " << std::hex + << want << "/" << unit << "/" << max << "/" << hint + << std::dec << std::endl; + + /* Do not release. 
*/ + //alloc->release(extents); + extents.clear(); + } + fseek(f_alloc_list, 0, SEEK_SET); + } + fclose(f_alloc_list); + std::cout << "Fragmentation:" << a->get_fragmentation() + << std::endl; + std::cout << "Fragmentation score:" << a->get_fragmentation_score() + << std::endl; + std::cout << "Free:" << std::hex << a->get_free() << std::dec + << std::endl; + } + return 0; + }); + } else if (strcmp(argv[2], "export_binary") == 0) { + return export_as_binary(argv[1], argv[3]); + } else if (strcmp(argv[2], "duplicates") == 0) { + return check_duplicates(argv[1]); + } +} diff --git a/src/test/objectstore/fastbmap_allocator_test.cc b/src/test/objectstore/fastbmap_allocator_test.cc new file mode 100644 index 000000000..710b3798f --- /dev/null +++ b/src/test/objectstore/fastbmap_allocator_test.cc @@ -0,0 +1,1145 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <iostream> +#include <gtest/gtest.h> + +#include "os/bluestore/fastbmap_allocator_impl.h" + +class TestAllocatorLevel01 : public AllocatorLevel01Loose +{ +public: + void init(uint64_t capacity, uint64_t alloc_unit) + { + _init(capacity, alloc_unit); + } + interval_t allocate_l1_cont(uint64_t length, uint64_t min_length, + uint64_t pos_start, uint64_t pos_end) + { + return _allocate_l1_contiguous(length, min_length, 0, pos_start, pos_end); + } + void free_l1(const interval_t& r) + { + _free_l1(r.offset, r.length); + } +}; + +class TestAllocatorLevel02 : public AllocatorLevel02<AllocatorLevel01Loose> +{ +public: + void init(uint64_t capacity, uint64_t alloc_unit) + { + _init(capacity, alloc_unit); + } + void allocate_l2(uint64_t length, uint64_t min_length, + uint64_t* allocated0, + interval_vector_t* res) + { + uint64_t allocated = 0; + uint64_t hint = 0; // trigger internal l2 hint support + _allocate_l2(length, min_length, 0, hint, &allocated, res); + *allocated0 += allocated; + } + void free_l2(const interval_vector_t& r) + { + _free_l2(r); + } + 
void mark_free(uint64_t o, uint64_t len) + { + _mark_free(o, len); + } + void mark_allocated(uint64_t o, uint64_t len) + { + _mark_allocated(o, len); + } +}; + +const uint64_t _1m = 1024 * 1024; +const uint64_t _2m = 2 * 1024 * 1024; + +TEST(TestAllocatorLevel01, test_l1) +{ + TestAllocatorLevel01 al1; + uint64_t num_l1_entries = 3 * 256; + uint64_t capacity = num_l1_entries * 512 * 4096; + al1.init(capacity, 0x1000); + ASSERT_EQ(capacity, al1.debug_get_free()); + + auto i1 = al1.allocate_l1_cont(0x1000, 0x1000, 0, num_l1_entries); + ASSERT_EQ(i1.offset, 0u); + ASSERT_EQ(i1.length, 0x1000u); + ASSERT_EQ(capacity - 0x1000, al1.debug_get_free()); + + auto i2 = al1.allocate_l1_cont(0x1000, 0x1000, 0, num_l1_entries); + ASSERT_EQ(i2.offset, 0x1000u); + ASSERT_EQ(i2.length, 0x1000u); + al1.free_l1(i2); + al1.free_l1(i1); + i1 = al1.allocate_l1_cont(0x1000, 0x1000, 0, num_l1_entries); + ASSERT_EQ(i1.offset, 0u); + ASSERT_EQ(i1.length, 0x1000u); + i2 = al1.allocate_l1_cont(0x1000, 0x1000, 0, num_l1_entries); + ASSERT_EQ(i2.offset, 0x1000u); + ASSERT_EQ(i2.length, 0x1000u); + al1.free_l1(i1); + al1.free_l1(i2); + + i1 = al1.allocate_l1_cont(0x2000, 0x1000, 0, num_l1_entries); + ASSERT_EQ(i1.offset, 0u); + ASSERT_EQ(i1.length, 0x2000u); + + i2 = al1.allocate_l1_cont(0x3000, 0x1000, 0, num_l1_entries); + ASSERT_EQ(i2.offset, 0x2000u); + ASSERT_EQ(i2.length, 0x3000u); + + al1.free_l1(i1); + al1.free_l1(i2); + + i1 = al1.allocate_l1_cont(0x2000, 0x1000, 0, num_l1_entries); + ASSERT_EQ(i1.offset, 0u); + ASSERT_EQ(i1.length, 0x2000u); + + i2 = al1.allocate_l1_cont(2 * 1024 * 1024, 0x1000, 0, num_l1_entries); + ASSERT_EQ(i2.offset, 2u * 1024u * 1024u); + ASSERT_EQ(i2.length, 2u * 1024u * 1024u); + + al1.free_l1(i1); + i1 = al1.allocate_l1_cont(1024 * 1024, 0x1000, 0, num_l1_entries); + ASSERT_EQ(i1.offset, 0u); + ASSERT_EQ(i1.length, 1024u * 1024u); + + auto i3 = al1.allocate_l1_cont(1024 * 1024 + 0x1000, 0x1000, 0, num_l1_entries); + ASSERT_EQ(i3.offset, 2u * 2u * 1024u * 
1024u); + ASSERT_EQ(i3.length, 1024u * 1024u + 0x1000u); + + // here we have the following layout: + // Alloc: 0~1M, 2M~2M, 4M~1M+4K + // Free: 1M~1M, 4M+4K ~ 2M-4K, 6M ~... + // + auto i4 = al1.allocate_l1_cont(1024 * 1024, 0x1000, 0, num_l1_entries); + ASSERT_EQ(1 * 1024 * 1024u, i4.offset); + ASSERT_EQ(1024 * 1024u, i4.length); + al1.free_l1(i4); + + i4 = al1.allocate_l1_cont(1024 * 1024 - 0x1000, 0x1000, 0, num_l1_entries); + ASSERT_EQ(i4.offset, 5u * 1024u * 1024u + 0x1000u); + ASSERT_EQ(i4.length, 1024u * 1024u - 0x1000u); + al1.free_l1(i4); + + i4 = al1.allocate_l1_cont(1024 * 1024 + 0x1000, 0x1000, 0, num_l1_entries); + ASSERT_EQ(i4.offset, 6u * 1024u * 1024u); + //ASSERT_EQ(i4.offset, 5 * 1024 * 1024 + 0x1000); + ASSERT_EQ(i4.length, 1024u * 1024u + 0x1000u); + + al1.free_l1(i1); + al1.free_l1(i2); + al1.free_l1(i3); + al1.free_l1(i4); + + i1 = al1.allocate_l1_cont(1024 * 1024, 0x1000, 0, num_l1_entries); + ASSERT_EQ(i1.offset, 0u); + ASSERT_EQ(i1.length, 1024u * 1024u); + + i2 = al1.allocate_l1_cont(1024 * 1024, 0x1000, 0, num_l1_entries); + ASSERT_EQ(i2.offset, 1u * 1024u * 1024u); + ASSERT_EQ(i2.length, 1024u * 1024u); + + i3 = al1.allocate_l1_cont(512 * 1024, 0x1000, 0, num_l1_entries); + ASSERT_EQ(i3.offset, 2u * 1024u * 1024u); + ASSERT_EQ(i3.length, 512u * 1024u); + + i4 = al1.allocate_l1_cont(1536 * 1024, 0x1000, 0, num_l1_entries); + ASSERT_EQ(i4.offset, (2u * 1024u + 512u) * 1024u); + ASSERT_EQ(i4.length, 1536u * 1024u); + // making a hole 1.5 Mb length + al1.free_l1(i2); + al1.free_l1(i3); + // and trying to fill it + i2 = al1.allocate_l1_cont(1536 * 1024, 0x1000, 0, num_l1_entries); + ASSERT_EQ(i2.offset, 1024u * 1024u); + ASSERT_EQ(i2.length, 1536u * 1024u); + + al1.free_l1(i2); + // and trying to fill it partially + i2 = al1.allocate_l1_cont(1528 * 1024, 0x1000, 0, num_l1_entries); + ASSERT_EQ(i2.offset, 1024u * 1024u); + ASSERT_EQ(i2.length, 1528u * 1024u); + + i3 = al1.allocate_l1_cont(8 * 1024, 0x1000, 0, num_l1_entries); + 
ASSERT_EQ(i3.offset, 2552u * 1024u); + ASSERT_EQ(i3.length, 8u * 1024u); + + al1.free_l1(i2); + // here we have the following layout: + // Alloc: 0~1M, 2552K~8K, num_l1_entries0K~1.5M + // Free: 1M~1528K, 4M ~... + // + i2 = al1.allocate_l1_cont(1536 * 1024, 0x1000, 0, num_l1_entries); + ASSERT_EQ(i2.offset, 4u * 1024u * 1024u); + ASSERT_EQ(i2.length, 1536u * 1024u); + + al1.free_l1(i1); + al1.free_l1(i2); + al1.free_l1(i3); + al1.free_l1(i4); + ASSERT_EQ(capacity, al1.debug_get_free()); + + for (uint64_t i = 0; i < capacity; i += _2m) { + i1 = al1.allocate_l1_cont(_2m, _2m, 0, num_l1_entries); + ASSERT_EQ(i1.offset, i); + ASSERT_EQ(i1.length, _2m); + } + ASSERT_EQ(0u, al1.debug_get_free()); + i2 = al1.allocate_l1_cont(_2m, _2m, 0, num_l1_entries); + ASSERT_EQ(i2.length, 0u); + ASSERT_EQ(0u, al1.debug_get_free()); + + al1.free_l1(i1); + i2 = al1.allocate_l1_cont(_2m, _2m, 0, num_l1_entries); + ASSERT_EQ(i2, i1); + al1.free_l1(i2); + i2 = al1.allocate_l1_cont(_1m, _1m, 0, num_l1_entries); + ASSERT_EQ(i2.offset, i1.offset); + ASSERT_EQ(i2.length, _1m); + + i3 = al1.allocate_l1_cont(_2m, _2m, 0, num_l1_entries); + ASSERT_EQ(i3.length, 0u); + + i3 = al1.allocate_l1_cont(_2m, _1m, 0, num_l1_entries); + ASSERT_EQ(i3.length, _1m); + + i4 = al1.allocate_l1_cont(_2m, _1m, 0, num_l1_entries); + ASSERT_EQ(i4.length, 0u); + + al1.free_l1(i2); + i2 = al1.allocate_l1_cont(_2m, _2m, 0, num_l1_entries); + ASSERT_EQ(i2.length, 0u); + + i2 = al1.allocate_l1_cont(_2m, 0x1000, 0, num_l1_entries); + ASSERT_EQ(i2.length, _1m); + + al1.free_l1(i2); + al1.free_l1(i3); + ASSERT_EQ(_2m, al1.debug_get_free()); + + i1 = al1.allocate_l1_cont(_2m - 3 * 0x1000, 0x1000, 0, num_l1_entries); + i2 = al1.allocate_l1_cont(0x1000, 0x1000, 0, num_l1_entries); + i3 = al1.allocate_l1_cont(0x1000, 0x1000, 0, num_l1_entries); + i4 = al1.allocate_l1_cont(0x1000, 0x1000, 0, num_l1_entries); + ASSERT_EQ(0u, al1.debug_get_free()); + + al1.free_l1(i2); + al1.free_l1(i4); + + i2 = al1.allocate_l1_cont(0x4000, 
0x2000, 0, num_l1_entries); + ASSERT_EQ(i2.length, 0u); + i2 = al1.allocate_l1_cont(0x4000, 0x1000, 0, num_l1_entries); + ASSERT_EQ(i2.length, 0x1000u); + + al1.free_l1(i3); + i3 = al1.allocate_l1_cont(0x6000, 0x3000, 0, num_l1_entries); + ASSERT_EQ(i3.length, 0u); + i3 = al1.allocate_l1_cont(0x6000, 0x1000, 0, num_l1_entries); + ASSERT_EQ(i3.length, 0x2000u); + ASSERT_EQ(0u, al1.debug_get_free()); + + std::cout << "Done L1" << std::endl; +} + +TEST(TestAllocatorLevel01, test_l2) +{ + TestAllocatorLevel02 al2; + uint64_t num_l2_entries = 64;// *512; + uint64_t capacity = num_l2_entries * 256 * 512 * 4096; + al2.init(capacity, 0x1000); + std::cout << "Init L2" << std::endl; + + uint64_t allocated1 = 0; + interval_vector_t a1; + al2.allocate_l2(0x2000, 0x2000, &allocated1, &a1); + ASSERT_EQ(allocated1, 0x2000u); + ASSERT_EQ(a1[0].offset, 0u); + ASSERT_EQ(a1[0].length, 0x2000u); + + // limit query range in debug_get_free for the sake of performance + ASSERT_EQ(0x2000u, al2.debug_get_allocated(0, 1)); + ASSERT_EQ(0u, al2.debug_get_allocated(1, 2)); + + uint64_t allocated2 = 0; + interval_vector_t a2; + al2.allocate_l2(0x2000, 0x2000, &allocated2, &a2); + ASSERT_EQ(allocated2, 0x2000u); + ASSERT_EQ(a2[0].offset, 0x2000u); + ASSERT_EQ(a2[0].length, 0x2000u); + // limit query range in debug_get_free for the sake of performance + ASSERT_EQ(0x4000u, al2.debug_get_allocated(0, 1)); + ASSERT_EQ(0u, al2.debug_get_allocated(1, 2)); + + al2.free_l2(a1); + + allocated2 = 0; + a2.clear(); + al2.allocate_l2(0x1000, 0x1000, &allocated2, &a2); + ASSERT_EQ(allocated2, 0x1000u); + ASSERT_EQ(a2[0].offset, 0x0000u); + ASSERT_EQ(a2[0].length, 0x1000u); + // limit query range in debug_get_free for the sake of performance + ASSERT_EQ(0x3000u, al2.debug_get_allocated(0, 1)); + ASSERT_EQ(0u, al2.debug_get_allocated(1, 2)); + + uint64_t allocated3 = 0; + interval_vector_t a3; + al2.allocate_l2(0x2000, 0x1000, &allocated3, &a3); + ASSERT_EQ(allocated3, 0x2000u); + ASSERT_EQ(a3.size(), 2u); + 
ASSERT_EQ(a3[0].offset, 0x1000u); + ASSERT_EQ(a3[0].length, 0x1000u); + ASSERT_EQ(a3[1].offset, 0x4000u); + ASSERT_EQ(a3[1].length, 0x1000u); + // limit query range in debug_get_free for the sake of performance + ASSERT_EQ(0x5000u, al2.debug_get_allocated(0, 1)); + ASSERT_EQ(0u, al2.debug_get_allocated(1, 2)); + { + interval_vector_t r; + r.emplace_back(0x0, 0x5000); + al2.free_l2(r); + } + + a3.clear(); + allocated3 = 0; + al2.allocate_l2(_1m, _1m, &allocated3, &a3); + ASSERT_EQ(a3.size(), 1u); + ASSERT_EQ(a3[0].offset, 0u); + ASSERT_EQ(a3[0].length, _1m); + + al2.free_l2(a3); + + a3.clear(); + allocated3 = 0; + al2.allocate_l2(4 * _1m, _1m, &allocated3, &a3); + ASSERT_EQ(a3.size(), 1u); + ASSERT_EQ(a3[0].offset, 0u); + ASSERT_EQ(a3[0].length, 4 * _1m); + + al2.free_l2(a3); + +#ifndef _DEBUG + for (uint64_t i = 0; i < capacity; i += 0x1000) { + uint64_t allocated4 = 0; + interval_vector_t a4; + al2.allocate_l2(0x1000, 0x1000, &allocated4, &a4); + ASSERT_EQ(a4.size(), 1u); + ASSERT_EQ(a4[0].offset, i); + ASSERT_EQ(a4[0].length, 0x1000u); + if (0 == (i % (1 * 1024 * _1m))) { + std::cout << "alloc1 " << i / 1024 / 1024 << " mb of " + << capacity / 1024 / 1024 << std::endl; + } + } +#else + for (uint64_t i = 0; i < capacity; i += _2m) { + uint64_t allocated4 = 0; + interval_vector_t a4; + al2.allocate_l2(_2m, _2m, &allocated4, &a4); + ASSERT_EQ(a4.size(), 1); + ASSERT_EQ(a4[0].offset, i); + ASSERT_EQ(a4[0].length, _2m); + if (0 == (i % (1 * 1024 * _1m))) { + std::cout << "alloc1 " << i / 1024 / 1024 << " mb of " + << capacity / 1024 / 1024 << std::endl; + } + } +#endif + + ASSERT_EQ(0u, al2.debug_get_free()); + for (uint64_t i = 0; i < capacity; i += _1m) { + interval_vector_t r; + r.emplace_back(i, _1m); + al2.free_l2(r); + if (0 == (i % (1 * 1024 * _1m))) { + std::cout << "free1 " << i / 1024 / 1024 << " mb of " + << capacity / 1024 / 1024 << std::endl; + } + } + ASSERT_EQ(capacity, al2.debug_get_free()); + + for (uint64_t i = 0; i < capacity; i += _1m) { + uint64_t 
allocated4 = 0; + interval_vector_t a4; + al2.allocate_l2(_1m, _1m, &allocated4, &a4); + ASSERT_EQ(a4.size(), 1u); + ASSERT_EQ(allocated4, _1m); + ASSERT_EQ(a4[0].offset, i); + ASSERT_EQ(a4[0].length, _1m); + if (0 == (i % (1 * 1024 * _1m))) { + std::cout << "alloc2 " << i / 1024 / 1024 << " mb of " + << capacity / 1024 / 1024 << std::endl; + } + } + ASSERT_EQ(0u, al2.debug_get_free()); + uint64_t allocated4 = 0; + interval_vector_t a4; + al2.allocate_l2(_1m, _1m, &allocated4, &a4); + ASSERT_EQ(a4.size(), 0u); + al2.allocate_l2(0x1000, 0x1000, &allocated4, &a4); + ASSERT_EQ(a4.size(), 0u); + + for (uint64_t i = 0; i < capacity; i += 0x2000) { + interval_vector_t r; + r.emplace_back(i, 0x1000); + al2.free_l2(r); + if (0 == (i % (1 * 1024 * _1m))) { + std::cout << "free2 " << i / 1024 / 1024 << " mb of " + << capacity / 1024 / 1024 << std::endl; + } + } + ASSERT_EQ(capacity / 2, al2.debug_get_free()); + + // unable to allocate due to fragmentation + al2.allocate_l2(_1m, _1m, &allocated4, &a4); + ASSERT_EQ(a4.size(), 0u); + + for (uint64_t i = 0; i < capacity; i += 2 * _1m) { + a4.clear(); + allocated4 = 0; + al2.allocate_l2(_1m, 0x1000, &allocated4, &a4); + ASSERT_EQ(a4.size(), _1m / 0x1000); + ASSERT_EQ(allocated4, _1m); + ASSERT_EQ(a4[0].offset, i); + ASSERT_EQ(a4[0].length, 0x1000u); + if (0 == (i % (1 * 1024 * _1m))) { + std::cout << "alloc3 " << i / 1024 / 1024 << " mb of " + << capacity / 1024 / 1024 << std::endl; + } + } + ASSERT_EQ(0u, al2.debug_get_free()); + + std::cout << "Done L2" << std::endl; +} + +TEST(TestAllocatorLevel01, test_l2_huge) +{ + TestAllocatorLevel02 al2; + uint64_t num_l2_entries = 4 * 512; + uint64_t capacity = num_l2_entries * 256 * 512 * 4096; // 1 TB + al2.init(capacity, 0x1000); + std::cout << "Init L2 Huge" << std::endl; + + for (uint64_t i = 0; i < capacity; i += _1m) { + uint64_t allocated4 = 0; + interval_vector_t a4; + al2.allocate_l2(0x1000, 0x1000, &allocated4, &a4); + ASSERT_EQ(a4.size(), 1u); + ASSERT_EQ(allocated4, 
0x1000u); + ASSERT_EQ(a4[0].offset, i); + ASSERT_EQ(a4[0].length, 0x1000u); + + allocated4 = 0; + a4.clear(); + al2.allocate_l2(_1m - 0x1000, 0x1000, &allocated4, &a4); + ASSERT_EQ(a4.size(), 1u); + ASSERT_EQ(allocated4, _1m - 0x1000); + ASSERT_EQ(a4[0].offset, i + 0x1000); + ASSERT_EQ(a4[0].length, _1m - 0x1000); + if (0 == (i % (1 * 1024 * _1m))) { + std::cout << "allocH " << i / 1024 / 1024 << " mb of " + << capacity / 1024 / 1024 << std::endl; + } + } + for (uint64_t i = 0; i < capacity; i += _1m) { + interval_vector_t a4; + a4.emplace_back(i, 0x1000); + al2.free_l2(a4); + if (0 == (i % (1 * 1024 * _1m))) { + std::cout << "freeH1 " << i / 1024 / 1024 << " mb of " + << capacity / 1024 / 1024 << std::endl; + } + } + { + std::cout << "Try" << std::endl; + time_t t = time(NULL); + for (int i = 0; i < 10; ++i) { + uint64_t allocated = 0; + interval_vector_t a; + al2.allocate_l2(0x2000, 0x2000, &allocated, &a); + ASSERT_EQ(a.size(), 0u); + } + std::cout << "End try in " << time(NULL) - t << " seconds" << std::endl; + } + { + std::cout << "Try" << std::endl; + time_t t = time(NULL); + for (int i = 0; i < 10; ++i) { + uint64_t allocated = 0; + interval_vector_t a; + al2.allocate_l2(_2m, _2m, &allocated, &a); + ASSERT_EQ(a.size(), 0u); + } + std::cout << "End try in " << time(NULL) - t << " seconds" << std::endl; + } + + ASSERT_EQ((capacity / _1m) * 0x1000, al2.debug_get_free()); + + std::cout << "Done L2 Huge" << std::endl; +} + +TEST(TestAllocatorLevel01, test_l2_unaligned) +{ + { + TestAllocatorLevel02 al2; + uint64_t num_l2_entries = 3; + uint64_t capacity = num_l2_entries * 256 * 512 * 4096; // 3x512 MB + al2.init(capacity, 0x1000); + std::cout << "Init L2 Unaligned" << std::endl; + + for (uint64_t i = 0; i < capacity; i += _1m / 2) { + uint64_t allocated4 = 0; + interval_vector_t a4; + al2.allocate_l2(_1m / 2, _1m / 2, &allocated4, &a4); + ASSERT_EQ(a4.size(), 1u); + ASSERT_EQ(allocated4, _1m / 2); + ASSERT_EQ(a4[0].offset, i); + ASSERT_EQ(a4[0].length, _1m / 2); 
+ if (0 == (i % (1 * 1024 * _1m))) { + std::cout << "allocU " << i / 1024 / 1024 << " mb of " + << capacity / 1024 / 1024 << std::endl; + } + } + ASSERT_EQ(0u, al2.debug_get_free()); + { + // no space to allocate + uint64_t allocated4 = 0; + interval_vector_t a4; + al2.allocate_l2(0x1000, 0x1000, &allocated4, &a4); + ASSERT_EQ(a4.size(), 0u); + } + } + { + TestAllocatorLevel02 al2; + uint64_t capacity = 500 * 512 * 4096; // 500x2 MB + al2.init(capacity, 0x1000); + std::cout << ("Init L2 Unaligned2\n"); + for (uint64_t i = 0; i < capacity; i += _1m / 2) { + uint64_t allocated4 = 0; + interval_vector_t a4; + al2.allocate_l2(_1m / 2, _1m / 2, &allocated4, &a4); + ASSERT_EQ(a4.size(), 1u); + ASSERT_EQ(allocated4, _1m / 2); + ASSERT_EQ(a4[0].offset, i); + ASSERT_EQ(a4[0].length, _1m / 2); + if (0 == (i % (1 * 1024 * _1m))) { + std::cout << "allocU2 " << i / 1024 / 1024 << " mb of " + << capacity / 1024 / 1024 << std::endl; + } + } + ASSERT_EQ(0u, al2.debug_get_free()); + { + // no space to allocate + uint64_t allocated4 = 0; + interval_vector_t a4; + al2.allocate_l2(0x1000, 0x1000, &allocated4, &a4); + ASSERT_EQ(a4.size(), 0u); + } + } + + { + TestAllocatorLevel02 al2; + uint64_t capacity = 100 * 512 * 4096 + 127 * 4096; + al2.init(capacity, 0x1000); + std::cout << "Init L2 Unaligned2" << std::endl; + for (uint64_t i = 0; i < capacity; i += 0x1000) { + uint64_t allocated4 = 0; + interval_vector_t a4; + al2.allocate_l2(0x1000, 0x1000, &allocated4, &a4); + ASSERT_EQ(a4.size(), 1u); + ASSERT_EQ(allocated4, 0x1000u); + ASSERT_EQ(a4[0].offset, i); + ASSERT_EQ(a4[0].length, 0x1000u); + } + ASSERT_EQ(0u, al2.debug_get_free()); + { + // no space to allocate + uint64_t allocated4 = 0; + interval_vector_t a4; + al2.allocate_l2(0x1000, 0x1000, &allocated4, &a4); + ASSERT_EQ(a4.size(), 0u); + } + } + { + TestAllocatorLevel02 al2; + uint64_t capacity = 3 * 4096; + al2.init(capacity, 0x1000); + std::cout << "Init L2 Unaligned2" << std::endl; + for (uint64_t i = 0; i < capacity; i += 
0x1000) { + uint64_t allocated4 = 0; + interval_vector_t a4; + al2.allocate_l2(0x1000, 0x1000, &allocated4, &a4); + ASSERT_EQ(a4.size(), 1u); + ASSERT_EQ(allocated4, 0x1000u); + ASSERT_EQ(a4[0].offset, i); + ASSERT_EQ(a4[0].length, 0x1000u); + } + ASSERT_EQ(0u, al2.debug_get_free()); + { + // no space to allocate + uint64_t allocated4 = 0; + interval_vector_t a4; + al2.allocate_l2(0x1000, 0x1000, &allocated4, &a4); + ASSERT_EQ(a4.size(), 0u); + } + } + + std::cout << "Done L2 Unaligned" << std::endl; +} + +TEST(TestAllocatorLevel01, test_l2_contiguous_alignment) +{ + { + TestAllocatorLevel02 al2; + uint64_t num_l2_entries = 3; + uint64_t capacity = num_l2_entries * 256 * 512 * 4096; // 3x512 MB + uint64_t num_chunks = capacity / 4096; + al2.init(capacity, 4096); + std::cout << "Init L2 cont aligned" << std::endl; + + std::map<size_t, size_t> bins_overall; + al2.collect_stats(bins_overall); + ASSERT_EQ(bins_overall.size(), 1u); +// std::cout<<bins_overall.begin()->first << std::endl; + ASSERT_EQ(bins_overall[cbits(num_chunks) - 1], 1u); + + for (uint64_t i = 0; i < capacity / 2; i += _1m) { + uint64_t allocated4 = 0; + interval_vector_t a4; + al2.allocate_l2(_1m, _1m, &allocated4, &a4); + ASSERT_EQ(a4.size(), 1u); + ASSERT_EQ(allocated4, _1m); + ASSERT_EQ(a4[0].offset, i); + ASSERT_EQ(a4[0].length, _1m); + } + ASSERT_EQ(capacity / 2, al2.debug_get_free()); + + bins_overall.clear(); + al2.collect_stats(bins_overall); + ASSERT_EQ(bins_overall.size(), 1u); + ASSERT_EQ(bins_overall[cbits(num_chunks / 2) - 1], 1u); + + { + // Original free space disposition (start chunk, count): + // <NC/2, NC/2> + size_t to_release = 2 * _1m + 0x1000; + // release 2M + 4K at the beginning + interval_vector_t r; + r.emplace_back(0, to_release); + al2.free_l2(r); + bins_overall.clear(); + al2.collect_stats(bins_overall); + ASSERT_EQ(bins_overall.size(), 2u); + ASSERT_EQ(bins_overall[cbits(to_release / 0x1000) - 1], 1u); + ASSERT_EQ(bins_overall[cbits(num_chunks / 2) - 1], 1u); + } + { + 
// Original free space disposition (start chunk, count): + // <0, 513>, <NC / 2, NC / 2> + // allocate 4K within the deallocated range + uint64_t allocated4 = 0; + interval_vector_t a4; + al2.allocate_l2(0x1000, 0x1000, &allocated4, &a4); + ASSERT_EQ(a4.size(), 1u); + ASSERT_EQ(allocated4, 0x1000u); + ASSERT_EQ(a4[0].offset, 0u); + ASSERT_EQ(a4[0].length, 0x1000u); + bins_overall.clear(); + al2.collect_stats(bins_overall); + ASSERT_EQ(bins_overall.size(), 2u); + ASSERT_EQ(bins_overall[cbits(2 * _1m / 0x1000) - 1], 1u); + ASSERT_EQ(bins_overall[cbits(num_chunks / 2) - 1], 1u); + } + { + // Original free space disposition (start chunk, count): + // <1, 512>, <NC / 2, NC / 2> + // allocate 1M - should go to offset 4096 + uint64_t allocated4 = 0; + interval_vector_t a4; + al2.allocate_l2(_1m, _1m, &allocated4, &a4); + ASSERT_EQ(a4.size(), 1u); + ASSERT_EQ(allocated4, _1m); + ASSERT_EQ(a4[0].offset, 4096); + ASSERT_EQ(a4[0].length, _1m); + bins_overall.clear(); + al2.collect_stats(bins_overall); + ASSERT_EQ(bins_overall.size(), 2u); + ASSERT_EQ(bins_overall[cbits(_1m / 0x1000) - 1], 1u); + ASSERT_EQ(bins_overall[cbits(num_chunks / 2) - 1], 1u); + } + { + // Original free space disposition (start chunk, count): + // <257, 256>, <NC / 2, NC / 2> + // and allocate yet another 8K within the deallocated range + uint64_t allocated4 = 0; + interval_vector_t a4; + al2.allocate_l2(0x2000, 0x1000, &allocated4, &a4); + ASSERT_EQ(a4.size(), 1u); + ASSERT_EQ(allocated4, 0x2000u); + ASSERT_EQ(a4[0].offset, _1m + 0x1000u); + ASSERT_EQ(a4[0].length, 0x2000u); + bins_overall.clear(); + al2.collect_stats(bins_overall); + ASSERT_EQ(bins_overall.size(), 2u); + ASSERT_EQ(bins_overall[cbits((_1m - 0x2000) / 0x1000) - 1], 1u); + ASSERT_EQ(bins_overall[cbits(num_chunks / 2) - 1], 1u); + } + { + // Original free space disposition (start chunk, count): + // <259, 254>, <NC / 2, NC / 2> + // release 4K~1M + interval_vector_t r; + r.emplace_back(0x1000, _1m); + al2.free_l2(r); + 
bins_overall.clear(); + al2.collect_stats(bins_overall); + ASSERT_EQ(bins_overall.size(), 3u); + //ASSERT_EQ(bins_overall[cbits((2 * _1m - 0x3000) / 0x1000) - 1], 1u); + ASSERT_EQ(bins_overall[cbits(_1m / 0x1000) - 1], 1u); + ASSERT_EQ(bins_overall[cbits((_1m - 0x2000) / 0x1000) - 1], 1u); + ASSERT_EQ(bins_overall[cbits(num_chunks / 2) - 1], 1u); + } + { + // Original free space disposition (start chunk, count): + // <1, 257>, <259, 254>, <NC / 2, NC / 2> + // allocate 3M - should go to the first 1M chunk and @capacity/2 + uint64_t allocated4 = 0; + interval_vector_t a4; + al2.allocate_l2(3 * _1m, _1m, &allocated4, &a4); + ASSERT_EQ(a4.size(), 2u); + ASSERT_EQ(allocated4, 3 * _1m); + ASSERT_EQ(a4[0].offset, 0x1000); + ASSERT_EQ(a4[0].length, _1m); + ASSERT_EQ(a4[1].offset, capacity / 2); + ASSERT_EQ(a4[1].length, 2 * _1m); + bins_overall.clear(); + al2.collect_stats(bins_overall); + ASSERT_EQ(bins_overall.size(), 2u); + ASSERT_EQ(bins_overall[cbits((_1m - 0x2000) / 0x1000) - 1], 1u); + ASSERT_EQ(bins_overall[cbits((num_chunks - 512) / 2) - 1], 1u); + } + { + // Original free space disposition (start chunk, count): + // <259, 254>, <NC / 2 - 512, NC / 2 - 512> + // release allocated 1M in the first meg chunk except + // the first 4K chunk + interval_vector_t r; + r.emplace_back(0x1000, _1m); + al2.free_l2(r); + bins_overall.clear(); + al2.collect_stats(bins_overall); + ASSERT_EQ(bins_overall.size(), 3u); + ASSERT_EQ(bins_overall[cbits(_1m / 0x1000) - 1], 1u); + ASSERT_EQ(bins_overall[cbits((_1m - 0x2000) / 0x1000) - 1], 1u); + ASSERT_EQ(bins_overall[cbits((num_chunks - 512) / 2) - 1], 1u); + } + { + // Original free space disposition (start chunk, count): + // <1, 256>, <259, 254>, <NC / 2 - 512, NC / 2 - 512> + // release 2M @(capacity / 2) + interval_vector_t r; + r.emplace_back(capacity / 2, 2 * _1m); + al2.free_l2(r); + bins_overall.clear(); + al2.collect_stats(bins_overall); + ASSERT_EQ(bins_overall.size(), 3u); + ASSERT_EQ(bins_overall[cbits(_1m / 0x1000) - 
1], 1u); + ASSERT_EQ(bins_overall[cbits((_1m - 0x2000) / 0x1000) - 1], 1u); + ASSERT_EQ(bins_overall[cbits((num_chunks) / 2) - 1], 1u); + } + { + // Original free space disposition (start chunk, count): + // <1, 256>, <259, 254>, <NC / 2, NC / 2> + // allocate 4x512K - should go to the second halves of + // the first and second 1M chunks and @(capacity / 2) + uint64_t allocated4 = 0; + interval_vector_t a4; + al2.allocate_l2(2 * _1m, _1m / 2, &allocated4, &a4); + ASSERT_EQ(a4.size(), 3u); + ASSERT_EQ(allocated4, 2 * _1m); + ASSERT_EQ(a4[1].offset, 0x1000); + ASSERT_EQ(a4[1].length, _1m); + ASSERT_EQ(a4[0].offset, _1m + 0x3000); + ASSERT_EQ(a4[0].length, _1m / 2); + ASSERT_EQ(a4[2].offset, capacity / 2); + ASSERT_EQ(a4[2].length, _1m / 2); + + bins_overall.clear(); + al2.collect_stats(bins_overall); + ASSERT_EQ(bins_overall.size(), 2u); + ASSERT_EQ(bins_overall[cbits((_1m - 0x2000 - 0x80000) / 0x1000) - 1], 1u); + ASSERT_EQ(bins_overall[cbits((num_chunks - 256) / 2) - 1], 1u); + + } + { + // Original free space disposition (start chunk, count): + // <387, 126>, <NC / 2 + 128, NC / 2 - 128> + // cleanup first 1536K except the last 4K chunk + interval_vector_t r; + r.emplace_back(0, _1m + _1m / 2 - 0x1000); + al2.free_l2(r); + bins_overall.clear(); + al2.collect_stats(bins_overall); + + ASSERT_EQ(bins_overall.size(), 3u); + ASSERT_EQ(bins_overall[cbits((_1m + _1m / 2 - 0x1000) / 0x1000) - 1], 1u); + ASSERT_EQ(bins_overall[cbits((_1m - 0x2000 - 0x80000) / 0x1000) - 1], 1u); + ASSERT_EQ(bins_overall[cbits((num_chunks - 256) / 2) - 1], 1u); + } + { + // Original free space disposition (start chunk, count): + // <0, 383> <387, 126>, <NC / 2 + 128, NC / 2 - 128> + // release 512K @(capacity / 2) + interval_vector_t r; + r.emplace_back(capacity / 2, _1m / 2); + al2.free_l2(r); + bins_overall.clear(); + al2.collect_stats(bins_overall); + + ASSERT_EQ(bins_overall.size(), 3u); + ASSERT_EQ(bins_overall[cbits((_1m + _1m / 2 - 0x1000) / 0x1000) - 1], 1u); + 
ASSERT_EQ(bins_overall[cbits((_1m - 0x2000 - 0x80000) / 0x1000) - 1], 1u); + ASSERT_EQ(bins_overall[cbits(num_chunks / 2) - 1], 1u); + } + { + // Original free space disposition (start chunk, count): + // <0, 383> <387, 126>, <NC / 2, NC / 2> + // allocate 132M (=33792*4096) = using 4M granularity should go to (capacity / 2) + uint64_t allocated4 = 0; + interval_vector_t a4; + al2.allocate_l2(132 * _1m, 4 * _1m , &allocated4, &a4); + ASSERT_EQ(a4.size(), 1u); + ASSERT_EQ(a4[0].offset, capacity / 2); + ASSERT_EQ(a4[0].length, 132 * _1m); + + bins_overall.clear(); + al2.collect_stats(bins_overall); + ASSERT_EQ(bins_overall.size(), 3u); + ASSERT_EQ(bins_overall[cbits((_1m + _1m / 2 - 0x1000) / 0x1000) - 1], 1u); + ASSERT_EQ(bins_overall[cbits((_1m - 0x2000 - 0x80000) / 0x1000) - 1], 1u); + ASSERT_EQ(bins_overall[cbits(num_chunks / 2 - 33792) - 1], 1u); + } + { + // Original free space disposition (start chunk, count): + // <0, 383> <387, 126>, <NC / 2 + 33792, NC / 2 - 33792> + // cleanup remaining 4*4K chunks in the first 2M + interval_vector_t r; + r.emplace_back(383 * 4096, 4 * 0x1000); + al2.free_l2(r); + bins_overall.clear(); + al2.collect_stats(bins_overall); + + ASSERT_EQ(bins_overall.size(), 2u); + ASSERT_EQ(bins_overall[cbits((2 * _1m + 0x1000) / 0x1000) - 1], 1u); + ASSERT_EQ(bins_overall[cbits(num_chunks / 2 - 33792) - 1], 1u); + } + { + // Original free space disposition (start chunk, count): + // <0, 513>, <NC / 2 + 33792, NC / 2 - 33792> + // release 132M @(capacity / 2) + interval_vector_t r; + r.emplace_back(capacity / 2, 132 * _1m); + al2.free_l2(r); + bins_overall.clear(); + al2.collect_stats(bins_overall); + ASSERT_EQ(bins_overall.size(), 2u); + ASSERT_EQ(bins_overall[cbits((2 * _1m + 0x1000) / 0x1000) - 1], 1u); + ASSERT_EQ(bins_overall[cbits(num_chunks / 2) - 1], 1u); + } + { + // Original free space disposition (start chunk, count): + // <0, 513>, <NC / 2, NC / 2> + // allocate 132M using 2M granularity should go to the first chunk and to + // 
(capacity / 2) + uint64_t allocated4 = 0; + interval_vector_t a4; + al2.allocate_l2(132 * _1m, 2 * _1m , &allocated4, &a4); + ASSERT_EQ(a4.size(), 2u); + ASSERT_EQ(a4[0].offset, 0u); + ASSERT_EQ(a4[0].length, 2 * _1m); + ASSERT_EQ(a4[1].offset, capacity / 2); + ASSERT_EQ(a4[1].length, 130 * _1m); + + bins_overall.clear(); + al2.collect_stats(bins_overall); + + ASSERT_EQ(bins_overall.size(), 2u); + ASSERT_EQ(bins_overall[cbits(0)], 1u); + ASSERT_EQ(bins_overall[cbits(num_chunks / 2 - 33792) - 1], 1u); + } + { + // Original free space disposition (start chunk, count): + // <512, 1>, <NC / 2 + 33792, NC / 2 - 33792> + // release 130M @(capacity / 2) + interval_vector_t r; + r.emplace_back(capacity / 2, 132 * _1m); + al2.free_l2(r); + bins_overall.clear(); + al2.collect_stats(bins_overall); + + ASSERT_EQ(bins_overall.size(), 2u); + ASSERT_EQ(bins_overall[cbits(0)], 1u); + ASSERT_EQ(bins_overall[cbits(num_chunks / 2) - 1], 1u); + } + { + // Original free space disposition (start chunk, count): + // <512,1>, <NC / 2, NC / 2> + // release 4K~16K + // release 28K~32K + // release 68K~24K + interval_vector_t r; + r.emplace_back(0x1000, 0x4000); + r.emplace_back(0x7000, 0x8000); + r.emplace_back(0x11000, 0x6000); + al2.free_l2(r); + + bins_overall.clear(); + al2.collect_stats(bins_overall); + + ASSERT_EQ(bins_overall.size(), 4u); + ASSERT_EQ(bins_overall[cbits(0)], 1u); + ASSERT_EQ(bins_overall[cbits(0x4000 / 0x1000) - 1], 2u); // accounts both 0x4000 & 0x6000 + ASSERT_EQ(bins_overall[cbits(0x8000 / 0x1000) - 1], 1u); + ASSERT_EQ(bins_overall[cbits(num_chunks / 2) - 1], 1u); + } + { + // Original free space disposition (start chunk, count): + // <1, 4>, <7, 8>, <17, 6> <512,1>, <NC / 2, NC / 2> + // allocate 80K using 16K granularity + uint64_t allocated4 = 0; + interval_vector_t a4; + al2.allocate_l2(0x14000, 0x4000, &allocated4, &a4); + + ASSERT_EQ(a4.size(), 4); + ASSERT_EQ(a4[1].offset, 0x1000u); + ASSERT_EQ(a4[1].length, 0x4000u); + ASSERT_EQ(a4[0].offset, 0x7000u); + 
ASSERT_EQ(a4[0].length, 0x8000u); + ASSERT_EQ(a4[2].offset, 0x11000u); + ASSERT_EQ(a4[2].length, 0x4000u); + ASSERT_EQ(a4[3].offset, capacity / 2); + ASSERT_EQ(a4[3].length, 0x4000u); + + bins_overall.clear(); + al2.collect_stats(bins_overall); + + ASSERT_EQ(bins_overall.size(), 3u); + ASSERT_EQ(bins_overall[cbits(0)], 1u); + ASSERT_EQ(bins_overall[cbits(0x2000 / 0x1000) - 1], 1u); + ASSERT_EQ(bins_overall[cbits(num_chunks / 2 - 1) - 1], 1u); + } + { + // Original free space disposition (start chunk, count): + // <21, 2> <512,1>, <NC / 2 + 1, NC / 2 - 1> + } + } + std::cout << "Done L2 cont aligned" << std::endl; +} + +TEST(TestAllocatorLevel01, test_4G_alloc_bug) +{ + { + TestAllocatorLevel02 al2; + uint64_t capacity = 0x8000 * _1m; // = 32GB + al2.init(capacity, 0x10000); + std::cout << "Init L2 cont aligned" << std::endl; + + uint64_t allocated4 = 0; + interval_vector_t a4; + al2.allocate_l2(_1m, _1m, &allocated4, &a4); + ASSERT_EQ(a4.size(), 1u); // the bug caused no allocations here + ASSERT_EQ(allocated4, _1m); + ASSERT_EQ(a4[0].offset, 0u); + ASSERT_EQ(a4[0].length, _1m); + } +} + +TEST(TestAllocatorLevel01, test_4G_alloc_bug2) +{ + { + TestAllocatorLevel02 al2; + uint64_t capacity = 0x8000 * _1m; // = 32GB + al2.init(capacity, 0x10000); + + for (uint64_t i = 0; i < capacity; i += _1m) { + uint64_t allocated4 = 0; + interval_vector_t a4; + al2.allocate_l2(_1m, _1m, &allocated4, &a4); + ASSERT_EQ(a4.size(), 1u); + ASSERT_EQ(allocated4, _1m); + ASSERT_EQ(a4[0].offset, i); + ASSERT_EQ(a4[0].length, _1m); + } + ASSERT_EQ(0u , al2.debug_get_free()); + + interval_vector_t r; + r.emplace_back(0x5fec30000, 0x13d0000); + r.emplace_back(0x628000000, 0x80000000); + r.emplace_back(0x6a8000000, 0x80000000); + r.emplace_back(0x728100000, 0x70000); + al2.free_l2(r); + + std::map<size_t, size_t> bins_overall; + al2.collect_stats(bins_overall); + + uint64_t allocated4 = 0; + interval_vector_t a4; + al2.allocate_l2(0x3e000000, _1m, &allocated4, &a4); + ASSERT_EQ(a4.size(), 
2u); + ASSERT_EQ(allocated4, 0x3e000000u); + ASSERT_EQ(a4[0].offset, 0x5fec30000u); + ASSERT_EQ(a4[0].length, 0x1300000u); + ASSERT_EQ(a4[1].offset, 0x628000000u); + ASSERT_EQ(a4[1].length, 0x3cd00000u); + } +} + +TEST(TestAllocatorLevel01, test_4G_alloc_bug3) +{ + { + TestAllocatorLevel02 al2; + uint64_t capacity = 0x8000 * _1m; // = 32GB + al2.init(capacity, 0x10000); + std::cout << "Init L2 cont aligned" << std::endl; + + uint64_t allocated4 = 0; + interval_vector_t a4; + al2.allocate_l2(4096ull * _1m, _1m, &allocated4, &a4); + ASSERT_EQ(a4.size(), 2u); // allocator has to split into 2 allocations + ASSERT_EQ(allocated4, 4096ull * _1m); + ASSERT_EQ(a4[0].offset, 0u); + ASSERT_EQ(a4[0].length, 2048ull * _1m); + ASSERT_EQ(a4[1].offset, 2048ull * _1m); + ASSERT_EQ(a4[1].length, 2048ull * _1m); + } +} + +TEST(TestAllocatorLevel01, test_claim_free_l2) +{ + TestAllocatorLevel02 al2; + uint64_t num_l2_entries = 64;// *512; + uint64_t capacity = num_l2_entries * 256 * 512 * 4096; + al2.init(capacity, 0x1000); + std::cout << "Init L2" << std::endl; + + uint64_t max_available = 0x20000; + al2.mark_allocated(max_available, capacity - max_available); + + uint64_t allocated1 = 0; + interval_vector_t a1; + al2.allocate_l2(0x2000, 0x2000, &allocated1, &a1); + ASSERT_EQ(allocated1, 0x2000u); + ASSERT_EQ(a1[0].offset, 0u); + ASSERT_EQ(a1[0].length, 0x2000u); + + uint64_t allocated2 = 0; + interval_vector_t a2; + al2.allocate_l2(0x2000, 0x2000, &allocated2, &a2); + ASSERT_EQ(allocated2, 0x2000u); + ASSERT_EQ(a2[0].offset, 0x2000u); + ASSERT_EQ(a2[0].length, 0x2000u); + + uint64_t allocated3 = 0; + interval_vector_t a3; + al2.allocate_l2(0x3000, 0x3000, &allocated3, &a3); + ASSERT_EQ(allocated3, 0x3000u); + ASSERT_EQ(a3[0].offset, 0x4000u); + ASSERT_EQ(a3[0].length, 0x3000u); + + al2.free_l2(a1); + al2.free_l2(a3); + ASSERT_EQ(max_available - 0x2000, al2.debug_get_free()); + + auto claimed = al2.claim_free_to_right(0x4000); + ASSERT_EQ(max_available - 0x4000u, claimed); + 
ASSERT_EQ(0x2000, al2.debug_get_free()); + + claimed = al2.claim_free_to_right(0x4000); + ASSERT_EQ(0, claimed); + ASSERT_EQ(0x2000, al2.debug_get_free()); + + claimed = al2.claim_free_to_left(0x2000); + ASSERT_EQ(0x2000u, claimed); + ASSERT_EQ(0, al2.debug_get_free()); + + claimed = al2.claim_free_to_left(0x2000); + ASSERT_EQ(0, claimed); + ASSERT_EQ(0, al2.debug_get_free()); + + + al2.mark_free(0x3000, 0x4000); + ASSERT_EQ(0x4000, al2.debug_get_free()); + + claimed = al2.claim_free_to_right(0x7000); + ASSERT_EQ(0, claimed); + ASSERT_EQ(0x4000, al2.debug_get_free()); + + claimed = al2.claim_free_to_right(0x6000); + ASSERT_EQ(0x1000, claimed); + ASSERT_EQ(0x3000, al2.debug_get_free()); + + claimed = al2.claim_free_to_right(0x6000); + ASSERT_EQ(0, claimed); + ASSERT_EQ(0x3000, al2.debug_get_free()); + + claimed = al2.claim_free_to_left(0x3000); + ASSERT_EQ(0u, claimed); + ASSERT_EQ(0x3000, al2.debug_get_free()); + + claimed = al2.claim_free_to_left(0x4000); + ASSERT_EQ(0x1000, claimed); + ASSERT_EQ(0x2000, al2.debug_get_free()); + + // claiming on the right boundary + claimed = al2.claim_free_to_right(capacity); + ASSERT_EQ(0x0, claimed); + ASSERT_EQ(0x2000, al2.debug_get_free()); + + // extend allocator space up to 64M + auto max_available2 = 64 * 1024 * 1024; + al2.mark_free(max_available, max_available2 - max_available); + ASSERT_EQ(max_available2 - max_available + 0x2000, al2.debug_get_free()); + + // pin some allocations + al2.mark_allocated(0x400000 + 0x2000, 1000); + al2.mark_allocated(0x400000 + 0x5000, 1000); + al2.mark_allocated(0x400000 + 0x20000, 1000); + ASSERT_EQ(max_available2 - max_available - 0x1000, al2.debug_get_free()); + + claimed = al2.claim_free_to_left(0x403000); + ASSERT_EQ(0x0, claimed); + + claimed = al2.claim_free_to_left(0x404000); + ASSERT_EQ(0x1000, claimed); + ASSERT_EQ(max_available2 - max_available - 0x2000, al2.debug_get_free()); + + claimed = al2.claim_free_to_left(max_available); + ASSERT_EQ(0, claimed); + + claimed = 
al2.claim_free_to_left(0x400000); + ASSERT_EQ(0x3e0000, claimed); + ASSERT_EQ(max_available2 - max_available - 0x3e2000, al2.get_available()); + ASSERT_EQ(max_available2 - max_available - 0x3e2000, al2.debug_get_free()); + + claimed = al2.claim_free_to_right(0x407000); + ASSERT_EQ(0x19000, claimed); + ASSERT_EQ(max_available2 - max_available - 0x3e2000 - 0x19000, + al2.get_available()); + ASSERT_EQ(max_available2 - max_available - 0x3e2000 - 0x19000, + al2.debug_get_free()); + + claimed = al2.claim_free_to_right(0x407000); + ASSERT_EQ(0, claimed); + + claimed = al2.claim_free_to_right(0x430000); + ASSERT_EQ(max_available2 - 0x430000, claimed); + ASSERT_EQ(0x15000, + al2.get_available()); + ASSERT_EQ(0x15000, + al2.debug_get_free()); +} diff --git a/src/test/objectstore/hybrid_allocator_test.cc b/src/test/objectstore/hybrid_allocator_test.cc new file mode 100755 index 000000000..e43d28b28 --- /dev/null +++ b/src/test/objectstore/hybrid_allocator_test.cc @@ -0,0 +1,231 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <iostream> +#include <gtest/gtest.h> + +#include "os/bluestore/HybridAllocator.h" + +class TestHybridAllocator : public HybridAllocator { +public: + TestHybridAllocator(CephContext* cct, + int64_t device_size, + int64_t _block_size, + uint64_t max_entries, + const std::string& name) : + HybridAllocator(cct, device_size, _block_size, + max_entries, + name) { + } + + uint64_t get_bmap_free() { + return get_bmap() ? 
get_bmap()->get_free() : 0; + } + uint64_t get_avl_free() { + return AvlAllocator::get_free(); + } +}; + +const uint64_t _1m = 1024 * 1024; +const uint64_t _4m = 4 * 1024 * 1024; + +TEST(HybridAllocator, basic) +{ + { + uint64_t block_size = 0x1000; + uint64_t capacity = 0x10000 * _1m; // = 64GB + TestHybridAllocator ha(g_ceph_context, capacity, block_size, + 4 * sizeof(range_seg_t), "test_hybrid_allocator"); + + ASSERT_EQ(0, ha.get_free()); + ASSERT_EQ(0, ha.get_avl_free()); + ASSERT_EQ(0, ha.get_bmap_free()); + + ha.init_add_free(0, _4m); + ASSERT_EQ(_4m, ha.get_free()); + ASSERT_EQ(_4m, ha.get_avl_free()); + ASSERT_EQ(0, ha.get_bmap_free()); + + ha.init_add_free(2 * _4m, _4m); + ASSERT_EQ(_4m * 2, ha.get_free()); + ASSERT_EQ(_4m * 2, ha.get_avl_free()); + ASSERT_EQ(0, ha.get_bmap_free()); + + ha.init_add_free(100 * _4m, _4m); + ha.init_add_free(102 * _4m, _4m); + + ASSERT_EQ(_4m * 4, ha.get_free()); + ASSERT_EQ(_4m * 4, ha.get_avl_free()); + ASSERT_EQ(0, ha.get_bmap_free()); + + // next allocs will go to bitmap + ha.init_add_free(4 * _4m, _4m); + ASSERT_EQ(_4m * 5, ha.get_free()); + ASSERT_EQ(_4m * 4, ha.get_avl_free()); + ASSERT_EQ(_4m * 1, ha.get_bmap_free()); + + ha.init_add_free(6 * _4m, _4m); + ASSERT_EQ(_4m * 6, ha.get_free()); + ASSERT_EQ(_4m * 4, ha.get_avl_free()); + ASSERT_EQ(_4m * 2, ha.get_bmap_free()); + + // so we have 6x4M chunks, 4 chunks at AVL and 2 at bitmap + + ha.init_rm_free(_1m, _1m); // take 1M from AVL + ASSERT_EQ(_1m * 23, ha.get_free()); + ASSERT_EQ(_1m * 14, ha.get_avl_free()); + ASSERT_EQ(_1m * 9, ha.get_bmap_free()); + + ha.init_rm_free(6 * _4m + _1m, _1m); // take 1M from bmap + ASSERT_EQ(_1m * 22, ha.get_free()); + ASSERT_EQ(_1m * 14, ha.get_avl_free()); + ASSERT_EQ(_1m * 8, ha.get_bmap_free()); + + // so we have at avl: 2M~2M, 8M~4M, 400M~4M , 408M~4M + // and at bmap: 0~1M, 16M~1M, 18M~2M, 24~4M + + PExtentVector extents; + // allocate 4K, to be served from bitmap + EXPECT_EQ(block_size, ha.allocate(block_size, block_size, + 0, 
(int64_t)0, &extents)); + ASSERT_EQ(1, extents.size()); + ASSERT_EQ(0, extents[0].offset); + + ASSERT_EQ(_1m * 14, ha.get_avl_free()); + ASSERT_EQ(_1m * 8 - block_size, ha.get_bmap_free()); + + interval_set<uint64_t> release_set; + // release 4K, to be returned to bitmap + release_set.insert(extents[0].offset, extents[0].length); + ha.release(release_set); + + ASSERT_EQ(_1m * 14, ha.get_avl_free()); + ASSERT_EQ(_1m * 8, ha.get_bmap_free()); + extents.clear(); + release_set.clear(); + + // again we have at avl: 2M~2M, 8M~4M, 400M~4M , 408M~4M + // and at bmap: 0~1M, 16M~1M, 18M~2M, 24~4M + + // add 12M~3M which will go to avl + ha.init_add_free(3 * _4m, 3 * _1m); + ASSERT_EQ(_1m * 17, ha.get_avl_free()); + ASSERT_EQ(_1m * 8, ha.get_bmap_free()); + + + // add 15M~4K which will be appended to existing slot + ha.init_add_free(15 * _1m, 0x1000); + ASSERT_EQ(_1m * 17 + 0x1000, ha.get_avl_free()); + ASSERT_EQ(_1m * 8, ha.get_bmap_free()); + + + // again we have at avl: 2M~2M, 8M~(7M+4K), 400M~4M , 408M~4M + // and at bmap: 0~1M, 16M~1M, 18M~2M, 24~4M + + //some removals from bmap + ha.init_rm_free(28 * _1m - 0x1000, 0x1000); + ASSERT_EQ(_1m * 17 + 0x1000, ha.get_avl_free()); + ASSERT_EQ(_1m * 8 - 0x1000, ha.get_bmap_free()); + + ha.init_rm_free(24 * _1m + 0x1000, 0x1000); + ASSERT_EQ(_1m * 17 + 0x1000, ha.get_avl_free()); + ASSERT_EQ(_1m * 8 - 0x2000, ha.get_bmap_free()); + + ha.init_rm_free(24 * _1m + 0x1000, _4m - 0x2000); + ASSERT_EQ(_1m * 17 + 0x1000, ha.get_avl_free()); + ASSERT_EQ(_1m * 4, ha.get_bmap_free()); + + //4K removal from avl + ha.init_rm_free(15 * _1m, 0x1000); + ASSERT_EQ(_1m * 17, ha.get_avl_free()); + ASSERT_EQ(_1m * 4, ha.get_bmap_free()); + + //remove highest 4Ms from avl + ha.init_rm_free(_1m * 400, _4m); + ha.init_rm_free(_1m * 408, _4m); + ASSERT_EQ(_1m * 9, ha.get_avl_free()); + ASSERT_EQ(_1m * 4, ha.get_bmap_free()); + + // we have at avl: 2M~2M, 8M~7M + // and at bmap: 0~1M, 16M~1M, 18M~2M + + // this will be merged with neighbors from bmap and 
go to avl + ha.init_add_free(17 * _1m, _1m); + ASSERT_EQ(_1m * 1, ha.get_bmap_free()); + ASSERT_EQ(_1m * 13, ha.get_avl_free()); + + // we have at avl: 2M~2M, 8M~7M, 16M~4M + // and at bmap: 0~1M + + // and now do some cutoffs from 0~1M span + + //cut off 4K from bmap + ha.init_rm_free(0 * _1m, 0x1000); + ASSERT_EQ(_1m * 13, ha.get_avl_free()); + ASSERT_EQ(_1m * 1 - 0x1000, ha.get_bmap_free()); + + //cut off 1M-4K from bmap + ha.init_rm_free(0 * _1m + 0x1000, _1m - 0x1000); + ASSERT_EQ(_1m * 13, ha.get_avl_free()); + ASSERT_EQ(0, ha.get_bmap_free()); + + //cut off 512K from avl + ha.init_rm_free(17 * _1m + 0x1000, _1m / 2); + ASSERT_EQ(_1m * 13 - _1m / 2, ha.get_avl_free()); + ASSERT_EQ(0, ha.get_bmap_free()); + + //cut off the rest from avl + ha.init_rm_free(17 * _1m + 0x1000 + _1m / 2, _1m / 2); + ASSERT_EQ(_1m * 12, ha.get_avl_free()); + ASSERT_EQ(0, ha.get_bmap_free()); + } + + { + uint64_t block_size = 0x1000; + uint64_t capacity = 0x10000 * _1m; // = 64GB + TestHybridAllocator ha(g_ceph_context, capacity, block_size, + 4 * sizeof(range_seg_t), "test_hybrid_allocator"); + + ha.init_add_free(_1m, _1m); + ha.init_add_free(_1m * 3, _1m); + ha.init_add_free(_1m * 5, _1m); + ha.init_add_free(0x4000, 0x1000); + + ASSERT_EQ(_1m * 3 + 0x1000, ha.get_free()); + ASSERT_EQ(_1m * 3 + 0x1000, ha.get_avl_free()); + ASSERT_EQ(0, ha.get_bmap_free()); + + // This will substitute chunk 0x4000~1000. + // Since new chunk insertion into AvlAllocator:range_tree + // happens immediately before 0x4000~1000 chunk, care should be taken + // to order operations properly and not to use an already disposed iterator. 
+ ha.init_add_free(0, 0x2000); + + ASSERT_EQ(_1m * 3 + 0x3000, ha.get_free()); + ASSERT_EQ(_1m * 3 + 0x2000, ha.get_avl_free()); + ASSERT_EQ(0x1000, ha.get_bmap_free()); + } +} + +TEST(HybridAllocator, fragmentation) +{ + { + uint64_t block_size = 0x1000; + uint64_t capacity = 0x1000 * 0x1000; // = 16M + TestHybridAllocator ha(g_ceph_context, capacity, block_size, + 4 * sizeof(range_seg_t), "test_hybrid_allocator"); + + ha.init_add_free(0, 0x2000); + ha.init_add_free(0x4000, 0x2000); + ha.init_add_free(0x8000, 0x2000); + ha.init_add_free(0xc000, 0x1000); + + ASSERT_EQ(0.5, ha.get_fragmentation()); + + // this will go to bmap with fragmentation = 1 + ha.init_add_free(0x10000, 0x1000); + + // which results in the following total fragmentation + ASSERT_EQ(0.5 * 7 / 8 + 1.0 / 8, ha.get_fragmentation()); + } +} diff --git a/src/test/objectstore/run_seed_to.sh b/src/test/objectstore/run_seed_to.sh new file mode 100755 index 000000000..5a624a5d4 --- /dev/null +++ b/src/test/objectstore/run_seed_to.sh @@ -0,0 +1,293 @@ +#!/usr/bin/env bash +# vim: ts=8 sw=2 smarttab +# +# run_seed_to.sh - Run ceph_test_filestore_idempotent_sequence up until an +# injection point, generating a sequence of operations based on a +# provided seed. +# +# We also perform three additional tests, focused on assessing if +# replaying a larger chunk of the journal affects the expected store +# behavior. These tests will be performed by increasing the store's +# journal sync interval to a very large value, allowing the store to +# finish execution before the first sync (unless the store runs for +# over 10 hours, in which case the interval variables must be changed +# to an appropriate value). 
Unless the '--no-journal-test' option is +# specified, we will run the 3 following scenarios: +# +# 1) journal sync'ing for both stores is as good as disabled +# (we call it '00', for store naming purposes) +# 2) journal sync'ing for store A is as good as disabled +# (we call it '01', for store naming purposes) +# 3) journal sync'ing for store B is as good as disabled +# (we call it '10', for store naming purposes) +# +# All log files are also named accordingly (e.g., a.00.fail, +# a.10.recover, or b.01.clean). +# +# By default, the test will not exit on error, although it will show the +# fail message. This behavior is so defined so we run the whole battery of +# tests, and obtain as many mismatches as possible in one go. We may force +# the test to exit on error by specifying the '--exit-on-error' option. +# +# +set -e + +test_opts="" + +usage() { + echo "usage: $1 [options..] <seed> <kill-at>" + echo + echo "options:" + echo " -c, --colls <VAL> # of collections" + echo " -o, --objs <VAL> # of objects" + echo " -b, --btrfs <VAL> seq number for btrfs stores" + echo " --no-journal-test don't perform journal replay tests" + echo " -e, --exit-on-error exit with 1 on error" + echo " -v, --valgrind run commands through valgrind" + echo + echo "env vars:" + echo " OPTS_STORE additional opts for both stores" + echo " OPTS_STORE_A additional opts for store A" + echo " OPTS_STORE_B additional opts for store B" + echo +} + +echo $0 $* + +die_on_missing_arg() { + if [[ "$2" == "" ]]; then + echo "$1: missing required parameter" + exit 1 + fi +} + + +required_args=2 +obtained_args=0 + +seed="" +killat="" +on_btrfs=0 +on_btrfs_seq=0 +journal_test=1 +min_sync_interval="36000" # ten hours, yes.
+max_sync_interval="36001" +exit_on_error=0 +v="" + +do_rm() { + if [[ $on_btrfs -eq 0 ]]; then + rm -fr $* + fi +} + +set_arg() { + if [[ $1 -eq 1 ]]; then + seed=$2 + elif [[ $1 -eq 2 ]]; then + killat=$2 + else + echo "error: unknown purpose for '$2'" + usage $0 + exit 1 + fi +} + +while [[ $# -gt 0 ]]; +do + case "$1" in + -c | --colls) + die_on_missing_arg "$1" "$2" + test_opts="$test_opts --test-num-colls $2" + shift 2 + ;; + -o | --objs) + die_on_missing_arg "$1" "$2" + test_opts="$test_opts --test-num-objs $2" + shift 2 + ;; + -h | --help) + usage $0 ; + exit 0 + ;; + -b | --btrfs) + die_on_missing_arg "$1" "$2" + on_btrfs=1 + on_btrfs_seq=$2 + shift 2 + ;; + --no-journal-test) + journal_test=0 + shift + ;; + -e | --exit-on-error) + exit_on_error=1 + shift + ;; + -v | --valgrind) + v="valgrind --leak-check=full" + shift + ;; + --) + shift + break + ;; + -*) + echo "$1: unknown option" >&2 + usage $0 + exit 1 + ;; + *) + obtained_args=$(($obtained_args+1)) + set_arg $obtained_args $1 + shift + ;; + esac +done + +if [[ $obtained_args -ne $required_args ]]; then + echo "error: missing argument" + usage $0 ; + exit 1 +fi + +if [[ "$OPTS_STORE" != "" ]]; then + test_opts="$test_opts $OPTS_STORE" +fi + +test_opts_a="$test_opts" +test_opts_b="$test_opts" + +if [[ "$OPTS_STORE_A" != "" ]]; then + test_opts_a="$test_opts_a $OPTS_STORE_A" +fi +if [[ "$OPTS_STORE_B" != "" ]]; then + test_opts_b="$test_opts_b $OPTS_STORE_B" +fi + +echo seed $seed +echo kill at $killat + +# run forever, until $killat... +to=1000000000 + +# +# store names +# +# We need these for two reasons: +# 1) if we are running the tests on a btrfs volume, then we need to use +# a seq number for each run. Being on btrfs means we will fail when +# removing the store's directories and it's far simpler to just +# specify different store names such as 'a.$seq' or 'b.$seq'.
+# +# 2) unless the '--no-journal-test' option is specified, we will run +# three additional tests for each store, and we will reuse the same +# command for each one of the runs, but varying the store's name and +# arguments. +# +store_a="a" +store_b="b" + +if [[ $on_btrfs -eq 1 ]]; then + store_a="$store_a.$on_btrfs_seq" + store_b="$store_b.$on_btrfs_seq" +fi + +total_runs=1 + +if [[ $journal_test -eq 1 ]]; then + total_runs=$(($total_runs + 3)) +fi + +num_runs=0 + +opt_min_sync="--filestore-min-sync-interval $min_sync_interval" +opt_max_sync="--filestore-max-sync-interval $max_sync_interval" + +ret=0 + +while [[ $num_runs -lt $total_runs ]]; +do + tmp_name_a=$store_a + tmp_name_b=$store_b + tmp_opts_a=$test_opts_a + tmp_opts_b=$test_opts_b + + # + # We have already tested whether there are diffs when both journals + # are properly working. Now let's try three other scenarios: + # 1) journal sync'ing for both stores is as good as disabled + # (we call it '00') + # 2) journal sync'ing for store A is as good as disabled + # (we call it '01') + # 3) journal sync'ing for store B is as good as disabled + # (we call it '10') + # + if [[ $num_runs -gt 0 && $journal_test -eq 1 ]]; then + echo "run #$num_runs" + case $num_runs in + 1) + tmp_name_a="$tmp_name_a.00" + tmp_name_b="$tmp_name_b.00" + tmp_opts_a="$tmp_opts_a $opt_min_sync $opt_max_sync" + tmp_opts_b="$tmp_opts_b $opt_min_sync $opt_max_sync" + ;; + 2) + tmp_name_a="$tmp_name_a.01" + tmp_name_b="$tmp_name_b.01" + tmp_opts_a="$tmp_opts_a $opt_min_sync $opt_max_sync" + ;; + 3) + tmp_name_a="$tmp_name_a.10" + tmp_name_b="$tmp_name_b.10" + tmp_opts_b="$tmp_opts_b $opt_min_sync $opt_max_sync" + ;; + esac + fi + + do_rm $tmp_name_a $tmp_name_a.fail $tmp_name_a.recover + $v ceph_test_filestore_idempotent_sequence run-sequence-to $to \ + $tmp_name_a $tmp_name_a/journal \ + --test-seed $seed --osd-journal-size 100 \ + --filestore-kill-at $killat $tmp_opts_a \ + --log-file $tmp_name_a.fail --debug-filestore 20
--no-log-to-stderr || true + + stop_at=`ceph_test_filestore_idempotent_sequence get-last-op \ + $tmp_name_a $tmp_name_a/journal \ + --log-file $tmp_name_a.recover \ + --debug-filestore 20 --debug-journal 20 --no-log-to-stderr` + + if [[ "`expr $stop_at - $stop_at 2>/dev/null`" != "0" ]]; then + echo "error: get-last-op returned '$stop_at'" + exit 1 + fi + + echo stopped at $stop_at + + do_rm $tmp_name_b $tmp_name_b.clean + $v ceph_test_filestore_idempotent_sequence run-sequence-to \ + $stop_at $tmp_name_b $tmp_name_b/journal \ + --test-seed $seed --osd-journal-size 100 \ + --log-file $tmp_name_b.clean --debug-filestore 20 --no-log-to-stderr \ + $tmp_opts_b + + if $v ceph_test_filestore_idempotent_sequence diff \ + $tmp_name_a $tmp_name_a/journal $tmp_name_b $tmp_name_b/journal --no-log-to-stderr --log-file $tmp_name_a.diff.log --debug-filestore 20 ; then + echo OK + else + echo "FAIL" + echo " see:" + echo " $tmp_name_a.fail -- leading up to failure" + echo " $tmp_name_a.recover -- journal replay" + echo " $tmp_name_b.clean -- the clean reference" + + ret=1 + if [[ $exit_on_error -eq 1 ]]; then + exit 1 + fi + fi + + num_runs=$(($num_runs+1)) +done + +exit $ret diff --git a/src/test/objectstore/run_seed_to_range.sh b/src/test/objectstore/run_seed_to_range.sh new file mode 100755 index 000000000..7af2e59ce --- /dev/null +++ b/src/test/objectstore/run_seed_to_range.sh @@ -0,0 +1,24 @@ +#!/bin/sh + +set -x +set -e + +seed=$1 +from=$2 +to=$3 +dir=$4 + +mydir=`dirname $0` + +for f in `seq $from $to` +do + if ! $mydir/run_seed_to.sh -o 10 -e $seed $f; then + if [ -d "$dir" ]; then + echo copying evidence to $dir + cp -a . 
$dir + else + echo no dir provided for evidence disposal + fi + exit 1 + fi +done diff --git a/src/test/objectstore/run_smr_bluestore_test.sh b/src/test/objectstore/run_smr_bluestore_test.sh new file mode 100644 index 000000000..d689cf2c5 --- /dev/null +++ b/src/test/objectstore/run_smr_bluestore_test.sh @@ -0,0 +1,48 @@ +#!/bin/bash -ex + +# 1) run_smr_bluestore_test.sh +# Setup smr device, run all tests + +# 2) run_smr_bluestore_test.sh --smr +# Setup smr device but skip tests failing on smr + + +before_creation=$(mktemp) +lsscsi > $before_creation + +echo "cd /backstores/user:zbc +create name=zbc0 size=20G cfgstring=model-HM/zsize-256/conv-10@zbc0.raw +/loopback create +cd /loopback +create naa.50014055e5f25aa0 +cd naa.50014055e5f25aa0/luns +create /backstores/user:zbc/zbc0 0 +" | sudo targetcli + +sleep 1 #if too fast device does not show up +after_creation=$(mktemp) +lsscsi > $after_creation +if [[ $(diff $before_creation $after_creation | wc -l ) != 2 ]] +then + echo New zbc device not created + false +fi + +function cleanup() { + echo "cd /loopback +delete naa.50014055e5f25aa0 +cd /backstores/user:zbc +delete zbc0" | sudo targetcli + sudo rm -f zbc0.raw + rm -f $before_creation $after_creation +} +trap cleanup EXIT + +DEV=$(diff $before_creation $after_creation |grep zbc |sed "s@.* /@/@") +sudo chmod 666 $DEV +# Need sudo +# https://patchwork.kernel.org/project/linux-block/patch/20210811110505.29649-3-Niklas.Cassel@wdc.com/ +sudo ceph_test_objectstore \ + --bluestore-block-path $DEV \ + --gtest_filter=*/2 \ + $* diff --git a/src/test/objectstore/run_test_deferred.sh b/src/test/objectstore/run_test_deferred.sh new file mode 100755 index 000000000..1be4d9104 --- /dev/null +++ b/src/test/objectstore/run_test_deferred.sh @@ -0,0 +1,52 @@ +#!/bin/bash + + +if [[ ! (-x ./bin/unittest_deferred) || ! (-x ./bin/ceph-kvstore-tool) || ! 
(-x ./bin/ceph-bluestore-tool)]] +then + echo Test must be run from ceph build directory + echo with unittest_deferred, ceph-kvstore-tool and ceph-bluestore-tool compiled + exit 1 +fi + +# Create BlueStore, only main block device, 4K AU, forced deferred 4K, 64K AU for BlueFS + +# Create file zapchajdziura, that is 0xe000 in size. +# This adds to 0x0000 - 0x1000 of BlueStore superblock and 0x1000 - 0x2000 of BlueFS superblock, +# making 0x00000 - 0x10000 filled, nicely aligning for 64K BlueFS requirements + +# Prefill 10 objects Object-0 .. Object-9, each 64K. Sync to disk. +# Do transactions like: +# - fill Object-x+1 16 times at offsets 0x0000, 0x1000, ... 0xf000 with 8 bytes, triggering deferred writes +# - fill Object-x with 64K data +# Repeat for Object-0 to Object-8. + +# Right after getting notification on_complete for all 9 transactions, immediately exit(1). +./bin/unittest_deferred --log-to-stderr=false + +# Now we should have a considerable amount of pending deferred writes. +# They refer to disk regions that do not belong to any object. + +# Perform compaction on RocksDB +# This initializes BlueFS, but does not replay deferred writes. +# It jiggles RocksDB files around. CURRENT and MANIFEST are recreated, with some .sst files too. +# The hope here is that newly created RocksDB files will occupy space that is free, +# but targeted by pending deferred writes. +./bin/ceph-kvstore-tool bluestore-kv bluestore.test_temp_dir/ compact --log-to-stderr=false + +# In this step we (hopefully) get RocksDB files overwritten +# We initialize BlueFS and RocksDB, there should be no problem here. +# Then we apply deferred writes. Now some of the RocksDB files might get corrupted. +# It is very likely that this will not cause any problems, since CURRENT and MANIFEST are only read at bootup.
+./bin/ceph-bluestore-tool --path bluestore.test_temp_dir/ --command fsck --deep 1 --debug-bluestore=30/30 --debug-bdev=30/30 --log-file=log-bs-corrupts.txt --log-to-file --log-to-stderr=false + +# If we were lucky, this command now fails +./bin/ceph-bluestore-tool --path bluestore.test_temp_dir/ --command fsck --deep 1 --debug-bluestore=30/30 --debug-bdev=30/30 --log-file=log-bs-crash.txt --log-to-file --log-to-stderr=false +if [[ $? != 0 ]] +then + echo "Deferred writes corruption successfully created !" +else + echo "No deferred write problems detected." +fi + +#cleanup +rm -rf bluestore.test_temp_dir/ diff --git a/src/test/objectstore/store_test.cc b/src/test/objectstore/store_test.cc new file mode 100644 index 000000000..9edfebd6b --- /dev/null +++ b/src/test/objectstore/store_test.cc @@ -0,0 +1,10932 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#include <glob.h> +#include <stdio.h> +#include <string.h> +#include <iostream> +#include <memory> +#include <time.h> +#include <sys/mount.h> +#include <boost/random/mersenne_twister.hpp> +#include <boost/random/uniform_int.hpp> +#include <boost/random/binomial_distribution.hpp> +#include <fmt/format.h> +#include <gtest/gtest.h> + +#include "os/ObjectStore.h" +#if defined(WITH_BLUESTORE) +#include "os/bluestore/BlueStore.h" +#include "os/bluestore/BlueFS.h" +#endif +#include "include/Context.h" +#include "common/buffer_instrumentation.h" +#include "common/ceph_argparse.h" +#include "common/admin_socket.h" +#include "global/global_init.h" +#include "common/ceph_mutex.h" +#include "common/Cond.h" +#include "common/errno.h" +#include "common/options.h" // for the size literals +#include "common/pretty_binary.h" +#include "include/stringify.h" +#include "include/coredumpctl.h" +#include "include/unordered_map.h" +#include "os/kv.h" +#include "store_test_fixture.h" + + +using namespace std; +using namespace std::placeholders; + +typedef boost::mt11213b gen_type; + +const uint64_t DEF_STORE_TEST_BLOCKDEV_SIZE = 10240000000; +#define dout_context g_ceph_context + +bool smr = false; + +static bool bl_eq(bufferlist& expected, bufferlist& actual) +{ + if (expected.contents_equal(actual)) + return true; + + unsigned first = 0; + if(expected.length() != actual.length()) { + cout << "--- buffer lengths mismatch " << std::hex + << "expected 0x" << expected.length() << " != actual 0x" + << actual.length() << std::dec << std::endl; + derr << "--- buffer lengths mismatch " << std::hex + << "expected 0x" << expected.length() << " != actual 0x" + << actual.length() << std::dec << dendl; + } + auto len = std::min(expected.length(), actual.length()); + while ( first<len && expected[first] == actual[first]) + ++first; + unsigned last = len; + while (last > 0 && expected[last-1] == actual[last-1]) + --last; + if(len > 0) { + cout << "--- buffer mismatch between offset 0x" << 
std::hex << first + << " and 0x" << last << ", total 0x" << len << std::dec + << std::endl; + derr << "--- buffer mismatch between offset 0x" << std::hex << first + << " and 0x" << last << ", total 0x" << len << std::dec + << dendl; + cout << "--- expected:\n"; + expected.hexdump(cout); + cout << "--- actual:\n"; + actual.hexdump(cout); + } + return false; +} + + + +template <typename T> +int queue_transaction( + T &store, + ObjectStore::CollectionHandle ch, + ObjectStore::Transaction &&t) { + if (rand() % 2) { + ObjectStore::Transaction t2; + t2.append(t); + return store->queue_transaction(ch, std::move(t2)); + } else { + return store->queue_transaction(ch, std::move(t)); + } +} + +template <typename T> +int collection_list(T &store, ObjectStore::CollectionHandle &c, + const ghobject_t& start, const ghobject_t& end, int max, + vector<ghobject_t> *ls, ghobject_t *pnext, + bool disable_legacy = false) { + if (disable_legacy || rand() % 2) { + return store->collection_list(c, start, end, max, ls, pnext); + } else { + return store->collection_list_legacy(c, start, end, max, ls, pnext); + } +} + +bool sorted(const vector<ghobject_t> &in) { + ghobject_t start; + for (vector<ghobject_t>::const_iterator i = in.begin(); + i != in.end(); + ++i) { + if (start > *i) { + cout << start << " should follow " << *i << std::endl; + return false; + } + start = *i; + } + return true; +} + +class StoreTest : public StoreTestFixture, + public ::testing::WithParamInterface<const char*> { +public: + StoreTest() + : StoreTestFixture(GetParam()) + {} + void doCompressionTest(); + void doSyntheticTest( + int num_ops, + uint64_t max_obj, uint64_t max_wr, uint64_t align); +}; + +class StoreTestDeferredSetup : public StoreTest { + void SetUp() override { + //do nothing + } + +protected: + void DeferredSetup() { + StoreTest::SetUp(); + } + +public: +}; + + +class StoreTestSpecificAUSize : public StoreTestDeferredSetup { + +public: + typedef + std::function<void( + uint64_t num_ops, + uint64_t 
max_obj, + uint64_t max_wr, + uint64_t align)> MatrixTest; + + void StartDeferred(size_t min_alloc_size) { + SetVal(g_conf(), "bluestore_min_alloc_size", stringify(min_alloc_size).c_str()); + DeferredSetup(); + } + +private: + // bluestore matrix testing + uint64_t max_write = 40 * 1024; + uint64_t max_size = 400 * 1024; + uint64_t alignment = 0; + uint64_t num_ops = 10000; + +protected: + string matrix_get(const char *k) { + if (string(k) == "max_write") { + return stringify(max_write); + } else if (string(k) == "max_size") { + return stringify(max_size); + } else if (string(k) == "alignment") { + return stringify(alignment); + } else if (string(k) == "num_ops") { + return stringify(num_ops); + } else { + char *buf; + g_conf().get_val(k, &buf, -1); + string v = buf; + free(buf); + return v; + } + } + + void matrix_set(const char *k, const char *v) { + if (string(k) == "max_write") { + max_write = atoll(v); + } else if (string(k) == "max_size") { + max_size = atoll(v); + } else if (string(k) == "alignment") { + alignment = atoll(v); + } else if (string(k) == "num_ops") { + num_ops = atoll(v); + } else { + SetVal(g_conf(), k, v); + } + } + + void do_matrix_choose(const char *matrix[][10], + int i, int pos, int num, + MatrixTest fn) { + if (matrix[i][0]) { + int count; + for (count = 0; matrix[i][count+1]; ++count) ; + for (int j = 1; matrix[i][j]; ++j) { + matrix_set(matrix[i][0], matrix[i][j]); + do_matrix_choose(matrix, + i + 1, + pos * count + j - 1, + num * count, + fn); + } + } else { + cout << "---------------------- " << (pos + 1) << " / " << num + << " ----------------------" << std::endl; + for (unsigned k=0; matrix[k][0]; ++k) { + cout << " " << matrix[k][0] << " = " << matrix_get(matrix[k][0]) + << std::endl; + } + g_ceph_context->_conf.apply_changes(nullptr); + fn(num_ops, max_size, max_write, alignment); + } + } + + void do_matrix(const char *matrix[][10], + MatrixTest fn) { + + if (strcmp(matrix[0][0], "bluestore_min_alloc_size") == 0) { + int count; + 
for (count = 0; matrix[0][count+1]; ++count) ; + for (size_t j = 1; matrix[0][j]; ++j) { + if (j > 1) { + TearDown(); + } + StartDeferred(strtoll(matrix[0][j], NULL, 10)); + do_matrix_choose(matrix, 1, j - 1, count, fn); + } + } else { + StartDeferred(0); + do_matrix_choose(matrix, 0, 0, 1, fn); + } + } + +}; + +class StoreTestOmapUpgrade : public StoreTestDeferredSetup { +protected: + void StartDeferred() { + DeferredSetup(); + } + +public: + struct generator { + double r = 3.6; + double x = 0.5; + double operator()(){ + double v = x; + x = r * x * (1 - x); + return v; + } + }; + + std::string generate_monotonic_name(uint32_t SUM, uint32_t i, double r, double x) + { + generator gen{r, x}; + //std::cout << "r=" << r << " x=" << x << std::endl; + std::string s; + while (SUM > 1) { + uint32_t lo = 0; + uint32_t hi = 1 + gen() * 10; + uint32_t start = ('z' - 'a' + 1 - hi) * gen(); + while (hi - lo > 0) { + uint32_t mid = (lo + hi + 1 + (SUM&1)) / 2; // round up or down, depending on SUM + // std::cout << "SUM=" << SUM << " x=" << gen.x << std::endl; + uint32_t mid_val = gen() * (SUM - 1) + 1; + // LEFT = lo .. mid - 1 + // RIGHT = mid .. 
hi + // std::cout << "lo=" << lo << " hi=" << hi << " mid=" << mid + // << " SUM=" << SUM << " i=" << i << " x=" << gen.x << " mid_val=" << mid_val << std::endl; + if (i < mid_val) { + hi = mid - 1; + SUM = mid_val; + } else { + lo = mid; + SUM = SUM - mid_val; + i = i - mid_val; + } + } + //std::cout << "lo=" << lo << " hi=" << hi + // << " SUM=" << SUM << " i=" << i << std::endl; + + s.push_back('a' + lo + start); // to keep alphabetic order + uint32_t cnt = gen() * 8; + for (uint32_t j = 0; j < cnt; j++) { + s.push_back('a' + ('z' - 'a' + 1) * gen()); + } + s.push_back('.'); + } + return s; + } + + std::string gen_string(size_t size, generator& gen) { + std::string s; + for (size_t i = 0; i < size; i++) { + s.push_back('a' + ('z' - 'a' + 1 ) * gen()); + } + return s; + } + + void make_omap_data(size_t object_count, + int64_t poolid, + coll_t cid) { + int r; + ObjectStore::CollectionHandle ch = store->open_collection(cid); + for (size_t o = 0; o < object_count; o++) + { + ObjectStore::Transaction t; + std::string oid = generate_monotonic_name(object_count, o, 3.71, 0.5); + ghobject_t hoid(hobject_t(oid, "", CEPH_NOSNAP, 0, poolid, "")); + t.touch(cid, hoid); + generator gen{3.85 + 0.1 * o / object_count, 1 - double(o) / object_count}; + + map<string, bufferlist> start_set; + size_t omap_count = 1 + gen() * 20; + bool do_omap_header = gen() > 0.5; + if (do_omap_header) { + bufferlist header; + header.append(gen_string(50, gen)); + t.omap_setheader(cid, hoid, header); + } + for (size_t i = 0; i < omap_count; i++) { + std::string name = generate_monotonic_name(omap_count, i, 3.66 + 0.22 * o / object_count, 0.5); + bufferlist val; + val.append(gen_string(100, gen)); + start_set.emplace(name, val); + } + t.omap_setkeys(cid, hoid, start_set); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + } + + void check_omap_data(size_t object_count, + int64_t poolid, + coll_t cid) { + int r; + ObjectStore::CollectionHandle ch = 
store->open_collection(cid); + + for (size_t o = 0; o < object_count; o++) + { + ObjectStore::Transaction t; + std::string oid = generate_monotonic_name(object_count, o, 3.71, 0.5); + ghobject_t hoid(hobject_t(oid, "", CEPH_NOSNAP, 0, poolid, "")); + generator gen{3.85 + 0.1 * o / object_count, 1 - double(o) / object_count}; + + bufferlist omap_header; + map<string, bufferlist> omap_set; + r = store->omap_get(ch, hoid, &omap_header, &omap_set); + ASSERT_EQ(r, 0); + size_t omap_count = 1 + gen() * 20; + bool do_omap_header = gen() > 0.5; + if (do_omap_header) { + std::string header_str = gen_string(50, gen); + ASSERT_EQ(header_str, omap_header.to_str()); + } + auto it = omap_set.begin(); + for (size_t i = 0; i < omap_count; i++) { + ASSERT_TRUE(it != omap_set.end()); + std::string name = generate_monotonic_name(omap_count, i, 3.66 + 0.22 * o / object_count, 0.5); + std::string val_gen = gen_string(100, gen); + ASSERT_EQ(it->first, name); + ASSERT_EQ(it->second.to_str(), val_gen); + ++it; + } + } + } +}; + +TEST_P(StoreTest, collect_metadata) { + map<string,string> pm; + store->collect_metadata(&pm); + if (GetParam() == string("filestore")) { + ASSERT_NE(pm.count("filestore_backend"), 0u); + ASSERT_NE(pm.count("filestore_f_type"), 0u); + ASSERT_NE(pm.count("backend_filestore_partition_path"), 0u); + ASSERT_NE(pm.count("backend_filestore_dev_node"), 0u); + } +} + +TEST_P(StoreTest, Trivial) { +} + +TEST_P(StoreTest, TrivialRemount) { + int r = store->umount(); + ASSERT_EQ(0, r); + r = store->mount(); + ASSERT_EQ(0, r); +} + +TEST_P(StoreTest, TrivialRemountFsck) { + if(string(GetParam()) != "bluestore") + return; + int r = store->umount(); + ASSERT_EQ(0, r); + r = store->fsck(false); + ASSERT_EQ(0, r); + r = store->mount(); + ASSERT_EQ(0, r); +} + +TEST_P(StoreTest, SimpleRemount) { + coll_t cid; + ghobject_t hoid(hobject_t(sobject_t("Object 1", CEPH_NOSNAP))); + ghobject_t hoid2(hobject_t(sobject_t("Object 2", CEPH_NOSNAP))); + bufferlist bl; + 
bl.append("1234512345"); + int r; + auto ch = store->create_new_collection(cid); + { + cerr << "create collection + write" << std::endl; + ObjectStore::Transaction t; + t.create_collection(cid, 0); + t.write(cid, hoid, 0, bl.length(), bl); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + ch.reset(); + r = store->umount(); + ASSERT_EQ(0, r); + r = store->mount(); + ASSERT_EQ(0, r); + ch = store->open_collection(cid); + { + ObjectStore::Transaction t; + t.write(cid, hoid2, 0, bl.length(), bl); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + ObjectStore::Transaction t; + t.remove(cid, hoid); + t.remove(cid, hoid2); + t.remove_collection(cid); + cerr << "remove collection" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + ch.reset(); + r = store->umount(); + ASSERT_EQ(0, r); + r = store->mount(); + ASSERT_EQ(0, r); + ch = store->create_new_collection(cid); + { + ObjectStore::Transaction t; + t.create_collection(cid, 0); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + bool exists = store->exists(ch, hoid); + ASSERT_TRUE(!exists); + } + { + ObjectStore::Transaction t; + t.remove_collection(cid); + cerr << "remove collection" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } +} + +TEST_P(StoreTest, IORemount) { + coll_t cid; + bufferlist bl; + bl.append("1234512345"); + int r; + auto ch = store->create_new_collection(cid); + { + cerr << "create collection + objects" << std::endl; + ObjectStore::Transaction t; + t.create_collection(cid, 0); + for (int n=1; n<=100; ++n) { + ghobject_t hoid(hobject_t(sobject_t("Object " + stringify(n), CEPH_NOSNAP))); + t.write(cid, hoid, 0, bl.length(), bl); + } + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + // overwrites + { + cout << "overwrites" << std::endl; + for (int n=1; n<=100; ++n) { + ObjectStore::Transaction t; + ghobject_t 
hoid(hobject_t(sobject_t("Object " + stringify(n), CEPH_NOSNAP))); + t.write(cid, hoid, 1, bl.length(), bl); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + } + ch.reset(); + r = store->umount(); + ASSERT_EQ(0, r); + r = store->mount(); + ASSERT_EQ(0, r); + { + ObjectStore::Transaction t; + for (int n=1; n<=100; ++n) { + ghobject_t hoid(hobject_t(sobject_t("Object " + stringify(n), CEPH_NOSNAP))); + t.remove(cid, hoid); + } + t.remove_collection(cid); + auto ch = store->open_collection(cid); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } +} + +TEST_P(StoreTest, UnprintableCharsName) { + coll_t cid; + string name = "funnychars_"; + for (unsigned i = 0; i < 256; ++i) { + name.push_back(i); + } + ghobject_t oid(hobject_t(sobject_t(name, CEPH_NOSNAP))); + int r; + auto ch = store->create_new_collection(cid); + { + cerr << "create collection + object" << std::endl; + ObjectStore::Transaction t; + t.create_collection(cid, 0); + t.touch(cid, oid); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + ch.reset(); + r = store->umount(); + ASSERT_EQ(0, r); + r = store->mount(); + ASSERT_EQ(0, r); + { + cout << "removing" << std::endl; + ObjectStore::Transaction t; + t.remove(cid, oid); + t.remove_collection(cid); + auto ch = store->open_collection(cid); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } +} + +TEST_P(StoreTest, FiemapEmpty) { + coll_t cid; + int r = 0; + ghobject_t oid(hobject_t(sobject_t("fiemap_object", CEPH_NOSNAP))); + auto ch = store->create_new_collection(cid); + { + ObjectStore::Transaction t; + t.create_collection(cid, 0); + t.touch(cid, oid); + t.truncate(cid, oid, 100000); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + bufferlist bl; + store->fiemap(ch, oid, 0, 100000, bl); + map<uint64_t,uint64_t> m, e; + auto p = bl.cbegin(); + decode(m, p); + cout << " got " << m << std::endl; + e[0] = 100000; + EXPECT_TRUE(m == e || 
m.empty()); + } + { + ObjectStore::Transaction t; + t.remove(cid, oid); + t.remove_collection(cid); + cerr << "remove collection" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } +} + +TEST_P(StoreTest, FiemapHoles) { + const uint64_t MAX_EXTENTS = 4000; + const uint64_t SKIP_STEP = 65536; + coll_t cid; + int r = 0; + ghobject_t oid(hobject_t(sobject_t("fiemap_object", CEPH_NOSNAP))); + bufferlist bl; + bl.append("foo"); + auto ch = store->create_new_collection(cid); + { + ObjectStore::Transaction t; + t.create_collection(cid, 0); + t.touch(cid, oid); + for (uint64_t i = 0; i < MAX_EXTENTS; i++) + t.write(cid, oid, SKIP_STEP * i, 3, bl); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + //fiemap test from 0 to SKIP_STEP * (MAX_EXTENTS - 1) + 3 + bufferlist bl; + store->fiemap(ch, oid, 0, SKIP_STEP * (MAX_EXTENTS - 1) + 3, bl); + map<uint64_t,uint64_t> m, e; + auto p = bl.cbegin(); + decode(m, p); + cout << " got " << m << std::endl; + ASSERT_TRUE(!m.empty()); + ASSERT_GE(m[0], 3u); + auto last = m.crbegin(); + if (m.size() == 1) { + ASSERT_EQ(0u, last->first); + } else if (m.size() == MAX_EXTENTS) { + for (uint64_t i = 0; i < MAX_EXTENTS; i++) { + ASSERT_TRUE(m.count(SKIP_STEP * i)); + } + } + ASSERT_GT(last->first + last->second, SKIP_STEP * (MAX_EXTENTS - 1)); + } + { + // fiemap test from SKIP_STEP to SKIP_STEP * (MAX_EXTENTS - 2) + 3 + bufferlist bl; + store->fiemap(ch, oid, SKIP_STEP, SKIP_STEP * (MAX_EXTENTS - 2) + 3, bl); + map<uint64_t,uint64_t> m, e; + auto p = bl.cbegin(); + decode(m, p); + cout << " got " << m << std::endl; + ASSERT_TRUE(!m.empty()); + // kstore always returns [0, object_size] regardless of offset and length + // FIXME: if fiemap logic in kstore is refined + if (string(GetParam()) != "kstore") { + ASSERT_GE(m[SKIP_STEP], 3u); + auto last = m.crbegin(); + if (m.size() == 1) { + ASSERT_EQ(SKIP_STEP, last->first); + } else if (m.size() == MAX_EXTENTS - 2) { + for (uint64_t i 
= 1; i < MAX_EXTENTS - 1; i++) { + ASSERT_TRUE(m.count(SKIP_STEP*i)); + } + } + ASSERT_GT(last->first + last->second, SKIP_STEP * (MAX_EXTENTS - 1)); + } + } + { + ObjectStore::Transaction t; + t.remove(cid, oid); + t.remove_collection(cid); + cerr << "remove collection" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } +} + +TEST_P(StoreTest, SimpleMetaColTest) { + coll_t cid; + int r = 0; + { + auto ch = store->create_new_collection(cid); + ObjectStore::Transaction t; + t.create_collection(cid, 0); + cerr << "create collection" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + ObjectStore::Transaction t; + t.remove_collection(cid); + cerr << "remove collection" << std::endl; + auto ch = store->open_collection(cid); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + auto ch = store->create_new_collection(cid); + ObjectStore::Transaction t; + t.create_collection(cid, 0); + cerr << "add collection" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + ObjectStore::Transaction t; + t.remove_collection(cid); + cerr << "remove collection" << std::endl; + auto ch = store->open_collection(cid); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } +} + +TEST_P(StoreTest, SimplePGColTest) { + coll_t cid(spg_t(pg_t(1,2), shard_id_t::NO_SHARD)); + int r = 0; + { + ObjectStore::Transaction t; + auto ch = store->create_new_collection(cid); + t.create_collection(cid, 4); + cerr << "create collection" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + ObjectStore::Transaction t; + t.remove_collection(cid); + cerr << "remove collection" << std::endl; + auto ch = store->open_collection(cid); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + ObjectStore::Transaction t; + t.create_collection(cid, 4); + cerr << "add collection" << std::endl; + auto 
ch = store->create_new_collection(cid); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + ObjectStore::Transaction t; + t.remove_collection(cid); + cerr << "remove collection" << std::endl; + auto ch = store->open_collection(cid); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } +} + +TEST_P(StoreTest, SimpleColPreHashTest) { + // Firstly we will need to revert the value making sure + // collection hint actually works + int merge_threshold = g_ceph_context->_conf->filestore_merge_threshold; + std::ostringstream oss; + if (merge_threshold > 0) { + oss << "-" << merge_threshold; + SetVal(g_conf(), "filestore_merge_threshold", oss.str().c_str()); + } + + uint32_t pg_num = 128; + + boost::uniform_int<> pg_id_range(0, pg_num); + gen_type rng(time(NULL)); + int pg_id = pg_id_range(rng); + + int objs_per_folder = abs(merge_threshold) * 16 * g_ceph_context->_conf->filestore_split_multiple; + boost::uniform_int<> folders_range(5, 256); + uint64_t expected_num_objs = (uint64_t)objs_per_folder * (uint64_t)folders_range(rng); + + coll_t cid(spg_t(pg_t(pg_id, 15), shard_id_t::NO_SHARD)); + int r; + auto ch = store->create_new_collection(cid); + { + // Create a collection along with a hint + ObjectStore::Transaction t; + t.create_collection(cid, 5); + cerr << "create collection" << std::endl; + bufferlist hint; + encode(pg_num, hint); + encode(expected_num_objs, hint); + t.collection_hint(cid, ObjectStore::Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS, hint); + cerr << "collection hint" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + // Remove the collection + ObjectStore::Transaction t; + t.remove_collection(cid); + cerr << "remove collection" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } +} + +TEST_P(StoreTest, SmallBlockWrites) { + int r; + coll_t cid; + auto ch = store->create_new_collection(cid); + ghobject_t 
hoid(hobject_t(sobject_t("foo", CEPH_NOSNAP))); + { + ObjectStore::Transaction t; + t.create_collection(cid, 0); + cerr << "Creating collection " << cid << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + bufferlist a; + bufferptr ap(0x1000); + memset(ap.c_str(), 'a', 0x1000); + a.append(ap); + bufferlist b; + bufferptr bp(0x1000); + memset(bp.c_str(), 'b', 0x1000); + b.append(bp); + bufferlist c; + bufferptr cp(0x1000); + memset(cp.c_str(), 'c', 0x1000); + c.append(cp); + bufferptr zp(0x1000); + zp.zero(); + bufferlist z; + z.append(zp); + { + ObjectStore::Transaction t; + t.write(cid, hoid, 0, 0x1000, a); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + + bufferlist in, exp; + r = store->read(ch, hoid, 0, 0x4000, in); + ASSERT_EQ(0x1000, r); + exp.append(a); + ASSERT_TRUE(bl_eq(exp, in)); + } + { + ObjectStore::Transaction t; + t.write(cid, hoid, 0x1000, 0x1000, b); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + + bufferlist in, exp; + r = store->read(ch, hoid, 0, 0x4000, in); + ASSERT_EQ(0x2000, r); + exp.append(a); + exp.append(b); + ASSERT_TRUE(bl_eq(exp, in)); + } + { + ObjectStore::Transaction t; + t.write(cid, hoid, 0x3000, 0x1000, c); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + + bufferlist in, exp; + r = store->read(ch, hoid, 0, 0x4000, in); + ASSERT_EQ(0x4000, r); + exp.append(a); + exp.append(b); + exp.append(z); + exp.append(c); + ASSERT_TRUE(bl_eq(exp, in)); + } + { + ObjectStore::Transaction t; + t.write(cid, hoid, 0x2000, 0x1000, a); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + + bufferlist in, exp; + r = store->read(ch, hoid, 0, 0x4000, in); + ASSERT_EQ(0x4000, r); + exp.append(a); + exp.append(b); + exp.append(a); + exp.append(c); + ASSERT_TRUE(bl_eq(exp, in)); + } + { + ObjectStore::Transaction t; + t.write(cid, hoid, 0, 0x1000, c); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + 
bufferlist in, exp; + r = store->read(ch, hoid, 0, 0x4000, in); + ASSERT_EQ(0x4000, r); + exp.append(c); + exp.append(b); + exp.append(a); + exp.append(c); + ASSERT_TRUE(bl_eq(exp, in)); + } + { + ObjectStore::Transaction t; + t.remove(cid, hoid); + t.remove_collection(cid); + cerr << "Cleaning" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } +} + +TEST_P(StoreTest, BufferCacheReadTest) { + int r; + coll_t cid; + ghobject_t hoid(hobject_t(sobject_t("Object 1", CEPH_NOSNAP))); + { + auto ch = store->open_collection(cid); + ASSERT_FALSE(ch); + } + auto ch = store->create_new_collection(cid); + { + ObjectStore::Transaction t; + t.create_collection(cid, 0); + cerr << "Creating collection " << cid << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + bool exists = store->exists(ch, hoid); + ASSERT_TRUE(!exists); + + ObjectStore::Transaction t; + t.touch(cid, hoid); + cerr << "Creating object " << hoid << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + + exists = store->exists(ch, hoid); + ASSERT_EQ(true, exists); + } + { + ObjectStore::Transaction t; + bufferlist bl, newdata; + bl.append("abcde"); + t.write(cid, hoid, 0, 5, bl); + t.write(cid, hoid, 10, 5, bl); + cerr << "TwinWrite" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + + r = store->read(ch, hoid, 0, 15, newdata); + ASSERT_EQ(r, 15); + { + bufferlist expected; + expected.append(bl); + expected.append_zero(5); + expected.append(bl); + ASSERT_TRUE(bl_eq(expected, newdata)); + } + } + //overwrite over the same extents + { + ObjectStore::Transaction t; + bufferlist bl, newdata; + bl.append("edcba"); + t.write(cid, hoid, 0, 5, bl); + t.write(cid, hoid, 10, 5, bl); + cerr << "TwinWrite" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + + r = store->read(ch, hoid, 0, 15, newdata); + ASSERT_EQ(r, 15); + { + bufferlist expected; + 
expected.append(bl); + expected.append_zero(5); + expected.append(bl); + ASSERT_TRUE(bl_eq(expected, newdata)); + } + } + //additional write to an unused region of some blob + { + ObjectStore::Transaction t; + bufferlist bl2, newdata; + bl2.append("1234567890"); + + t.write(cid, hoid, 20, bl2.length(), bl2); + cerr << "Append" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + + r = store->read(ch, hoid, 0, 30, newdata); + ASSERT_EQ(r, 30); + { + bufferlist expected; + expected.append("edcba"); + expected.append_zero(5); + expected.append("edcba"); + expected.append_zero(5); + expected.append(bl2); + + ASSERT_TRUE(bl_eq(expected, newdata)); + } + } + //additional write to an unused region of some blob and partial owerite over existing extents + { + ObjectStore::Transaction t; + bufferlist bl, bl2, bl3, newdata; + bl.append("DCB"); + bl2.append("1234567890"); + bl3.append("BA"); + + t.write(cid, hoid, 30, bl2.length(), bl2); + t.write(cid, hoid, 1, bl.length(), bl); + t.write(cid, hoid, 13, bl3.length(), bl3); + cerr << "TripleWrite" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + + r = store->read(ch, hoid, 0, 40, newdata); + ASSERT_EQ(r, 40); + { + bufferlist expected; + expected.append("eDCBa"); + expected.append_zero(5); + expected.append("edcBA"); + expected.append_zero(5); + expected.append(bl2); + expected.append(bl2); + + ASSERT_TRUE(bl_eq(expected, newdata)); + } + } +} + +void StoreTest::doCompressionTest() +{ + int r; + coll_t cid; + ghobject_t hoid(hobject_t(sobject_t("Object 1", CEPH_NOSNAP))); + { + auto ch = store->open_collection(cid); + ASSERT_FALSE(ch); + } + auto ch = store->create_new_collection(cid); + { + ObjectStore::Transaction t; + t.create_collection(cid, 0); + cerr << "Creating collection " << cid << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + bool exists = store->exists(ch, hoid); + ASSERT_TRUE(!exists); + + 
ObjectStore::Transaction t; + t.touch(cid, hoid); + cerr << "Creating object " << hoid << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + + exists = store->exists(ch, hoid); + ASSERT_EQ(true, exists); + } + std::string data; + data.resize(0x10000 * 4); + for(size_t i = 0;i < data.size(); i++) + data[i] = i / 256; + { + ObjectStore::Transaction t; + bufferlist bl, newdata; + bl.append(data); + t.write(cid, hoid, 0, bl.length(), bl); + cerr << "CompressibleData (4xAU) Write" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + + r = store->read(ch, hoid, 0, data.size() , newdata); + + ASSERT_EQ(r, (int)data.size()); + { + bufferlist expected; + expected.append(data); + ASSERT_TRUE(bl_eq(expected, newdata)); + } + newdata.clear(); + r = store->read(ch, hoid, 0, 711 , newdata); + ASSERT_EQ(r, 711); + { + bufferlist expected; + expected.append(data.substr(0,711)); + ASSERT_TRUE(bl_eq(expected, newdata)); + } + newdata.clear(); + r = store->read(ch, hoid, 0xf00f, data.size(), newdata); + ASSERT_EQ(r, int(data.size() - 0xf00f) ); + { + bufferlist expected; + expected.append(data.substr(0xf00f)); + ASSERT_TRUE(bl_eq(expected, newdata)); + } + { + struct store_statfs_t statfs; + int r = store->statfs(&statfs); + ASSERT_EQ(r, 0); + ASSERT_EQ(statfs.data_stored, (unsigned)data.size()); + ASSERT_LE(statfs.data_compressed, (unsigned)data.size()); + ASSERT_EQ(statfs.data_compressed_original, (unsigned)data.size()); + ASSERT_LE(statfs.data_compressed_allocated, (unsigned)data.size()); + } + } + std::string data2; + data2.resize(0x10000 * 4 - 0x9000); + for(size_t i = 0;i < data2.size(); i++) + data2[i] = (i+1) / 256; + { + ObjectStore::Transaction t; + bufferlist bl, newdata; + bl.append(data2); + t.write(cid, hoid, 0x8000, bl.length(), bl); + cerr << "CompressibleData partial overwrite" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + + r = store->read(ch, hoid, 0, 0x10000, 
newdata); + ASSERT_EQ(r, (int)0x10000); + { + bufferlist expected; + expected.append(data.substr(0, 0x8000)); + expected.append(data2.substr(0, 0x8000)); + ASSERT_TRUE(bl_eq(expected, newdata)); + } + newdata.clear(); + r = store->read(ch, hoid, 0x9000, 711 , newdata); + ASSERT_EQ(r, 711); + { + bufferlist expected; + expected.append(data2.substr(0x1000,711)); + ASSERT_TRUE(bl_eq(expected, newdata)); + } + newdata.clear(); + r = store->read(ch, hoid, 0x0, 0x40000, newdata); + ASSERT_EQ(r, int(0x40000) ); + { + bufferlist expected; + expected.append(data.substr(0, 0x8000)); + expected.append(data2.substr(0, 0x37000)); + expected.append(data.substr(0x3f000, 0x1000)); + ASSERT_TRUE(bl_eq(expected, newdata)); + } + } + data2.resize(0x3f000); + for(size_t i = 0;i < data2.size(); i++) + data2[i] = (i+2) / 256; + { + ObjectStore::Transaction t; + bufferlist bl, newdata; + bl.append(data2); + t.write(cid, hoid, 0, bl.length(), bl); + cerr << "CompressibleData partial overwrite, two extents overlapped, single one to be removed" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + + r = store->read(ch, hoid, 0, 0x3e000 - 1, newdata); + ASSERT_EQ(r, (int)0x3e000 - 1); + { + bufferlist expected; + expected.append(data2.substr(0, 0x3e000 - 1)); + ASSERT_TRUE(bl_eq(expected, newdata)); + } + newdata.clear(); + r = store->read(ch, hoid, 0x3e000-1, 0x2001, newdata); + ASSERT_EQ(r, 0x2001); + { + bufferlist expected; + expected.append(data2.substr(0x3e000-1, 0x1001)); + expected.append(data.substr(0x3f000, 0x1000)); + ASSERT_TRUE(bl_eq(expected, newdata)); + } + newdata.clear(); + r = store->read(ch, hoid, 0x0, 0x40000, newdata); + ASSERT_EQ(r, int(0x40000) ); + { + bufferlist expected; + expected.append(data2.substr(0, 0x3f000)); + expected.append(data.substr(0x3f000, 0x1000)); + ASSERT_TRUE(bl_eq(expected, newdata)); + } + } + data.resize(0x1001); + for(size_t i = 0;i < data.size(); i++) + data[i] = (i+3) / 256; + { + ObjectStore::Transaction t; + 
bufferlist bl, newdata; + bl.append(data); + t.write(cid, hoid, 0x3f000-1, bl.length(), bl); + cerr << "Small chunk partial overwrite, two extents overlapped, single one to be removed" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + + r = store->read(ch, hoid, 0x3e000, 0x2000, newdata); + ASSERT_EQ(r, (int)0x2000); + { + bufferlist expected; + expected.append(data2.substr(0x3e000, 0x1000 - 1)); + expected.append(data.substr(0, 0x1001)); + ASSERT_TRUE(bl_eq(expected, newdata)); + } + } + { + ObjectStore::Transaction t; + t.remove(cid, hoid); + cerr << "Cleaning object" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + //force fsck + ch.reset(); + EXPECT_EQ(store->umount(), 0); + ASSERT_EQ(store->fsck(false), 0); // do fsck explicitly + EXPECT_EQ(store->mount(), 0); + ch = store->open_collection(cid); + auto settingsBookmark = BookmarkSettings(); + SetVal(g_conf(), "bluestore_compression_min_blob_size", "262144"); + g_ceph_context->_conf.apply_changes(nullptr); + { + data.resize(0x10000*6); + + for(size_t i = 0;i < data.size(); i++) + data[i] = i / 256; + ObjectStore::Transaction t; + bufferlist bl, newdata; + bl.append(data); + t.write(cid, hoid, 0, bl.length(), bl); + cerr << "CompressibleData large blob" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + //force fsck + ch.reset(); + EXPECT_EQ(store->umount(), 0); + ASSERT_EQ(store->fsck(false), 0); // do fsck explicitly + EXPECT_EQ(store->mount(), 0); + ch = store->open_collection(cid); + { + ObjectStore::Transaction t; + t.remove(cid, hoid); + t.remove_collection(cid); + cerr << "Cleaning" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } +} + +TEST_P(StoreTest, CompressionTest) { + if (string(GetParam()) != "bluestore") + return; + if (smr) { + cout << "TODO: need to adjust statfs check for smr" << std::endl; + return; + } + + SetVal(g_conf(), 
"bluestore_compression_algorithm", "snappy"); + SetVal(g_conf(), "bluestore_compression_mode", "force"); + g_ceph_context->_conf.apply_changes(nullptr); + doCompressionTest(); + + SetVal(g_conf(), "bluestore_compression_algorithm", "zlib"); + SetVal(g_conf(), "bluestore_compression_mode", "aggressive"); + g_ceph_context->_conf.apply_changes(nullptr); + doCompressionTest(); +} + +TEST_P(StoreTest, SimpleObjectTest) { + int r; + coll_t cid; + ghobject_t hoid(hobject_t(sobject_t("Object 1", CEPH_NOSNAP))); + { + auto ch = store->open_collection(cid); + ASSERT_FALSE(ch); + } + auto ch = store->create_new_collection(cid); + { + ObjectStore::Transaction t; + t.create_collection(cid, 0); + cerr << "Creating collection " << cid << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + bool exists = store->exists(ch, hoid); + ASSERT_TRUE(!exists); + + ObjectStore::Transaction t; + t.touch(cid, hoid); + cerr << "Creating object " << hoid << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + + exists = store->exists(ch, hoid); + ASSERT_EQ(true, exists); + } + { + ObjectStore::Transaction t; + t.remove(cid, hoid); + t.touch(cid, hoid); + cerr << "Remove then create" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + ObjectStore::Transaction t; + bufferlist bl, orig; + bl.append("abcde"); + orig = bl; + t.remove(cid, hoid); + t.write(cid, hoid, 0, 5, bl); + cerr << "Remove then create" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + + bufferlist in; + r = store->read(ch, hoid, 0, 5, in); + ASSERT_EQ(5, r); + ASSERT_TRUE(bl_eq(orig, in)); + } + { + ObjectStore::Transaction t; + bufferlist bl, exp; + bl.append("abcde"); + exp = bl; + exp.append(bl); + t.write(cid, hoid, 5, 5, bl); + cerr << "Append" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + + bufferlist in; + r = store->read(ch, hoid, 0, 10, 
in); + ASSERT_EQ(10, r); + ASSERT_TRUE(bl_eq(exp, in)); + } + { + ObjectStore::Transaction t; + bufferlist bl, exp; + bl.append("abcdeabcde"); + exp = bl; + t.write(cid, hoid, 0, 10, bl); + cerr << "Full overwrite" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + + bufferlist in; + r = store->read(ch, hoid, 0, 10, in); + ASSERT_EQ(10, r); + ASSERT_TRUE(bl_eq(exp, in)); + } + { + ObjectStore::Transaction t; + bufferlist bl; + bl.append("abcde"); + t.write(cid, hoid, 3, 5, bl); + cerr << "Partial overwrite" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + + bufferlist in, exp; + exp.append("abcabcdede"); + r = store->read(ch, hoid, 0, 10, in); + ASSERT_EQ(10, r); + in.hexdump(cout); + ASSERT_TRUE(bl_eq(exp, in)); + } + { + { + ObjectStore::Transaction t; + bufferlist bl; + bl.append("fghij"); + t.truncate(cid, hoid, 0); + t.write(cid, hoid, 5, 5, bl); + cerr << "Truncate + hole" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + ObjectStore::Transaction t; + bufferlist bl; + bl.append("abcde"); + t.write(cid, hoid, 0, 5, bl); + cerr << "Reverse fill-in" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + + bufferlist in, exp; + exp.append("abcdefghij"); + r = store->read(ch, hoid, 0, 10, in); + ASSERT_EQ(10, r); + in.hexdump(cout); + ASSERT_TRUE(bl_eq(exp, in)); + } + { + ObjectStore::Transaction t; + bufferlist bl; + bl.append("abcde01234012340123401234abcde01234012340123401234abcde01234012340123401234abcde01234012340123401234"); + t.write(cid, hoid, 0, bl.length(), bl); + cerr << "larger overwrite" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + + bufferlist in; + r = store->read(ch, hoid, 0, bl.length(), in); + ASSERT_EQ((int)bl.length(), r); + in.hexdump(cout); + ASSERT_TRUE(bl_eq(bl, in)); + } + { + bufferlist bl; + 
bl.append("abcde01234012340123401234abcde01234012340123401234abcde01234012340123401234abcde01234012340123401234"); + + //test: offset=len=0 mean read all data + bufferlist in; + r = store->read(ch, hoid, 0, 0, in); + ASSERT_EQ((int)bl.length(), r); + in.hexdump(cout); + ASSERT_TRUE(bl_eq(bl, in)); + } + { + //verifying unaligned csums + std::string s1("1"), s2(0x1000, '2'), s3("00"); + { + ObjectStore::Transaction t; + bufferlist bl; + bl.append(s1); + bl.append(s2); + t.truncate(cid, hoid, 0); + t.write(cid, hoid, 0x1000-1, bl.length(), bl); + cerr << "Write unaligned csum, stage 1" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + + bufferlist in, exp1, exp2, exp3; + exp1.append(s1); + exp2.append(s2); + exp3.append(s3); + r = store->read(ch, hoid, 0x1000-1, 1, in); + ASSERT_EQ(1, r); + ASSERT_TRUE(bl_eq(exp1, in)); + in.clear(); + r = store->read(ch, hoid, 0x1000, 0x1000, in); + ASSERT_EQ(0x1000, r); + ASSERT_TRUE(bl_eq(exp2, in)); + + { + ObjectStore::Transaction t; + bufferlist bl; + bl.append(s3); + t.write(cid, hoid, 1, bl.length(), bl); + cerr << "Write unaligned csum, stage 2" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + in.clear(); + r = store->read(ch, hoid, 1, 2, in); + ASSERT_EQ(2, r); + ASSERT_TRUE(bl_eq(exp3, in)); + in.clear(); + r = store->read(ch, hoid, 0x1000-1, 1, in); + ASSERT_EQ(1, r); + ASSERT_TRUE(bl_eq(exp1, in)); + in.clear(); + r = store->read(ch, hoid, 0x1000, 0x1000, in); + ASSERT_EQ(0x1000, r); + ASSERT_TRUE(bl_eq(exp2, in)); + + } + + { + ObjectStore::Transaction t; + t.remove(cid, hoid); + t.remove_collection(cid); + cerr << "Cleaning" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } +} + +#if defined(WITH_BLUESTORE) + +TEST_P(StoreTestSpecificAUSize, ReproBug41901Test) { + if(string(GetParam()) != "bluestore") + return; + if (smr) { + cout << "SKIP (smr)" << std::endl; + return; + } + + SetVal(g_conf(), 
"bluestore_max_blob_size", "524288"); + SetVal(g_conf(), "bluestore_debug_enforce_settings", "hdd"); + g_conf().apply_changes(nullptr); + StartDeferred(65536); + + int r; + coll_t cid; + ghobject_t hoid(hobject_t(sobject_t("Object 1", CEPH_NOSNAP))); + const PerfCounters* logger = store->get_perf_counters(); + auto ch = store->create_new_collection(cid); + { + ObjectStore::Transaction t; + t.create_collection(cid, 0); + cerr << "Creating collection " << cid << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + bool exists = store->exists(ch, hoid); + ASSERT_TRUE(!exists); + + ObjectStore::Transaction t; + t.touch(cid, hoid); + cerr << "Creating object " << hoid << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + + exists = store->exists(ch, hoid); + ASSERT_EQ(true, exists); + } + { + ObjectStore::Transaction t; + bufferlist bl, orig; + string s(4096, 'a'); + bl.append(s); + t.write(cid, hoid, 0x11000, bl.length(), bl); + cerr << "write1" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + ObjectStore::Transaction t; + bufferlist bl, orig; + string s(4096 * 3, 'a'); + bl.append(s); + t.write(cid, hoid, 0x15000, bl.length(), bl); + cerr << "write2" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + ASSERT_EQ(logger->get(l_bluestore_write_small), 2u); + ASSERT_EQ(logger->get(l_bluestore_write_small_unused), 1u); + + { + ObjectStore::Transaction t; + bufferlist bl, orig; + string s(4096 * 2, 'a'); + bl.append(s); + t.write(cid, hoid, 0xe000, bl.length(), bl); + cerr << "write3" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + ASSERT_EQ(logger->get(l_bluestore_write_small), 3u); + ASSERT_EQ(logger->get(l_bluestore_write_small_unused), 2u); + + + { + ObjectStore::Transaction t; + bufferlist bl, orig; + string s(4096, 'a'); + bl.append(s); + t.write(cid, hoid, 0xf000, bl.length(), 
bl); + t.write(cid, hoid, 0x10000, bl.length(), bl); + cerr << "write3" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + ASSERT_EQ(logger->get(l_bluestore_write_small), 5u); + ASSERT_EQ(logger->get(l_bluestore_write_small_unused), 2u); + { + ObjectStore::Transaction t; + t.remove(cid, hoid); + t.remove_collection(cid); + cerr << "Cleaning" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } +} + + +TEST_P(StoreTestSpecificAUSize, BluestoreStatFSTest) { + if(string(GetParam()) != "bluestore") + return; + if (smr) { + cout << "TODO: fix this for smr" << std::endl; + return; + } + SetVal(g_conf(), "bluestore_block_db_path", ""); + StartDeferred(65536); + SetVal(g_conf(), "bluestore_compression_mode", "force"); + SetVal(g_conf(), "bluestore_max_blob_size", "524288"); + // just a big number to disble gc + SetVal(g_conf(), "bluestore_gc_enable_total_threshold", "100000"); + SetVal(g_conf(), "bluestore_fsck_on_umount", "true"); + g_conf().apply_changes(nullptr); + int r; + + int poolid = 4373; + coll_t cid = coll_t(spg_t(pg_t(0, poolid), shard_id_t::NO_SHARD)); + ghobject_t hoid(hobject_t(sobject_t("Object 1", CEPH_NOSNAP), + string(), + 0, + poolid, + string())); + ghobject_t hoid2 = hoid; + hoid2.hobj.snap = 1; + { + auto ch = store->open_collection(cid); + ASSERT_FALSE(ch); + } + auto ch = store->create_new_collection(cid); + { + ObjectStore::Transaction t; + t.create_collection(cid, 0); + cerr << "Creating collection " << cid << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + bool exists = store->exists(ch, hoid); + ASSERT_TRUE(!exists); + + ObjectStore::Transaction t; + t.touch(cid, hoid); + cerr << "Creating object " << hoid << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + + exists = store->exists(ch, hoid); + ASSERT_EQ(true, exists); + } + { + struct store_statfs_t statfs; + int r = store->statfs(&statfs); + 
ASSERT_EQ(r, 0); + ASSERT_EQ( 0u, statfs.allocated); + ASSERT_EQ( 0u, statfs.data_stored); + ASSERT_EQ(g_conf()->bluestore_block_size, statfs.total); + ASSERT_TRUE(statfs.available > 0u && statfs.available < g_conf()->bluestore_block_size); + + struct store_statfs_t statfs_pool; + bool per_pool_omap; + r = store->pool_statfs(poolid, &statfs_pool, &per_pool_omap); + ASSERT_EQ(r, 0); + ASSERT_EQ( 0u, statfs_pool.allocated); + ASSERT_EQ( 0u, statfs_pool.data_stored); + + //force fsck + ch.reset(); + EXPECT_EQ(store->umount(), 0); + ASSERT_EQ(store->fsck(false), 0); // do fsck explicitly + EXPECT_EQ(store->mount(), 0); + ch = store->open_collection(cid); + } + { + ObjectStore::Transaction t; + bufferlist bl; + bl.append("abcde"); + t.write(cid, hoid, 0, 5, bl); + cerr << "Append 5 bytes" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + + struct store_statfs_t statfs; + int r = store->statfs(&statfs); + ASSERT_EQ(r, 0); + ASSERT_EQ(5, statfs.data_stored); + ASSERT_EQ(0x10000, statfs.allocated); + ASSERT_EQ(0, statfs.data_compressed); + ASSERT_EQ(0, statfs.data_compressed_original); + ASSERT_EQ(0, statfs.data_compressed_allocated); + + struct store_statfs_t statfs_pool; + bool per_pool_omap; + r = store->pool_statfs(poolid, &statfs_pool, &per_pool_omap); + ASSERT_EQ(r, 0); + ASSERT_EQ(5, statfs_pool.data_stored); + ASSERT_EQ(0x10000, statfs_pool.allocated); + ASSERT_EQ(0, statfs_pool.data_compressed); + ASSERT_EQ(0, statfs_pool.data_compressed_original); + ASSERT_EQ(0, statfs_pool.data_compressed_allocated); + + // accessing unknown pool + r = store->pool_statfs(poolid + 1, &statfs_pool, &per_pool_omap); + ASSERT_EQ(r, 0); + ASSERT_EQ(0, statfs_pool.data_stored); + ASSERT_EQ(0, statfs_pool.allocated); + ASSERT_EQ(0, statfs_pool.data_compressed); + ASSERT_EQ(0, statfs_pool.data_compressed_original); + ASSERT_EQ(0, statfs_pool.data_compressed_allocated); + + //force fsck + ch.reset(); + EXPECT_EQ(store->umount(), 0); + 
ASSERT_EQ(store->fsck(false), 0); // do fsck explicitly + EXPECT_EQ(store->mount(), 0); + ch = store->open_collection(cid); + } + { + ObjectStore::Transaction t; + std::string s(0x30000, 'a'); + bufferlist bl; + bl.append(s); + t.write(cid, hoid, 0x10000, bl.length(), bl); + cerr << "Append 0x30000 compressible bytes" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + + struct store_statfs_t statfs; + int r = store->statfs(&statfs); + ASSERT_EQ(r, 0); + ASSERT_EQ(0x30005, statfs.data_stored); + ASSERT_EQ(0x30000, statfs.allocated); + ASSERT_LE(statfs.data_compressed, 0x10000); + ASSERT_EQ(0x20000, statfs.data_compressed_original); + ASSERT_EQ(statfs.data_compressed_allocated, 0x10000); + + struct store_statfs_t statfs_pool; + bool per_pool_omap; + r = store->pool_statfs(poolid, &statfs_pool, &per_pool_omap); + ASSERT_EQ(r, 0); + ASSERT_EQ(0x30005, statfs_pool.data_stored); + ASSERT_EQ(0x30000, statfs_pool.allocated); + ASSERT_LE(statfs_pool.data_compressed, 0x10000); + ASSERT_EQ(0x20000, statfs_pool.data_compressed_original); + ASSERT_EQ(statfs_pool.data_compressed_allocated, 0x10000); + //force fsck + ch.reset(); + EXPECT_EQ(store->umount(), 0); + ASSERT_EQ(store->fsck(false), 0); // do fsck explicitly + EXPECT_EQ(store->mount(), 0); + ch = store->open_collection(cid); + } + { + ObjectStore::Transaction t; + t.zero(cid, hoid, 1, 3); + t.zero(cid, hoid, 0x20000, 9); + cerr << "Punch hole at 1~3, 0x20000~9" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + + struct store_statfs_t statfs; + int r = store->statfs(&statfs); + ASSERT_EQ(r, 0); + ASSERT_EQ(0x30005 - 3 - 9, statfs.data_stored); + ASSERT_EQ(0x30000, statfs.allocated); + ASSERT_LE(statfs.data_compressed, 0x10000); + ASSERT_EQ(0x20000 - 9, statfs.data_compressed_original); + ASSERT_EQ(statfs.data_compressed_allocated, 0x10000); + + struct store_statfs_t statfs_pool; + bool per_pool_omap; + r = store->pool_statfs(poolid, &statfs_pool, 
&per_pool_omap); + ASSERT_EQ(r, 0); + ASSERT_EQ(0x30005 - 3 - 9, statfs_pool.data_stored); + ASSERT_EQ(0x30000, statfs_pool.allocated); + ASSERT_LE(statfs_pool.data_compressed, 0x10000); + ASSERT_EQ(0x20000 - 9, statfs_pool.data_compressed_original); + ASSERT_EQ(statfs_pool.data_compressed_allocated, 0x10000); + //force fsck + ch.reset(); + EXPECT_EQ(store->umount(), 0); + ASSERT_EQ(store->fsck(false), 0); // do fsck explicitly + EXPECT_EQ(store->mount(), 0); + ch = store->open_collection(cid); + } + { + ObjectStore::Transaction t; + std::string s(0x1000, 'b'); + bufferlist bl; + bl.append(s); + t.write(cid, hoid, 1, bl.length(), bl); + t.write(cid, hoid, 0x10001, bl.length(), bl); + cerr << "Overwrite first and second(compressible) extents" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + + struct store_statfs_t statfs; + int r = store->statfs(&statfs); + ASSERT_EQ(r, 0); + ASSERT_EQ(0x30001 - 9 + 0x1000, statfs.data_stored); + ASSERT_EQ(0x40000, statfs.allocated); + ASSERT_LE(statfs.data_compressed, 0x10000); + ASSERT_EQ(0x20000 - 9 - 0x1000, statfs.data_compressed_original); + ASSERT_EQ(statfs.data_compressed_allocated, 0x10000); + + struct store_statfs_t statfs_pool; + bool per_pool_omap; + r = store->pool_statfs(poolid, &statfs_pool, &per_pool_omap); + ASSERT_EQ(r, 0); + ASSERT_EQ(0x30001 - 9 + 0x1000, statfs_pool.data_stored); + ASSERT_EQ(0x40000, statfs_pool.allocated); + ASSERT_LE(statfs_pool.data_compressed, 0x10000); + ASSERT_EQ(0x20000 - 9 - 0x1000, statfs_pool.data_compressed_original); + ASSERT_EQ(statfs_pool.data_compressed_allocated, 0x10000); + //force fsck + ch.reset(); + EXPECT_EQ(store->umount(), 0); + ASSERT_EQ(store->fsck(false), 0); // do fsck explicitly + EXPECT_EQ(store->mount(), 0); + ch = store->open_collection(cid); + } + { + ObjectStore::Transaction t; + std::string s(0x10000, 'c'); + bufferlist bl; + bl.append(s); + t.write(cid, hoid, 0x10000, bl.length(), bl); + t.write(cid, hoid, 0x20000, 
bl.length(), bl); + t.write(cid, hoid, 0x30000, bl.length(), bl); + cerr << "Overwrite compressed extent with 3 uncompressible ones" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + + struct store_statfs_t statfs; + int r = store->statfs(&statfs); + ASSERT_EQ(r, 0); + ASSERT_EQ(0x30000 + 0x1001, statfs.data_stored); + ASSERT_EQ(0x40000, statfs.allocated); + ASSERT_LE(statfs.data_compressed, 0); + ASSERT_EQ(0, statfs.data_compressed_original); + ASSERT_EQ(0, statfs.data_compressed_allocated); + + struct store_statfs_t statfs_pool; + bool per_pool_omap; + r = store->pool_statfs(poolid, &statfs_pool, &per_pool_omap); + ASSERT_EQ(r, 0); + ASSERT_EQ(0x30000 + 0x1001, statfs_pool.data_stored); + ASSERT_EQ(0x40000, statfs_pool.allocated); + ASSERT_LE(statfs_pool.data_compressed, 0); + ASSERT_EQ(0, statfs_pool.data_compressed_original); + ASSERT_EQ(0, statfs_pool.data_compressed_allocated); + //force fsck + ch.reset(); + EXPECT_EQ(store->umount(), 0); + ASSERT_EQ(store->fsck(false), 0); // do fsck explicitly + EXPECT_EQ(store->mount(), 0); + ch = store->open_collection(cid); + } + { + ObjectStore::Transaction t; + t.zero(cid, hoid, 0, 0x40000); + cerr << "Zero object" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + struct store_statfs_t statfs; + int r = store->statfs(&statfs); + ASSERT_EQ(r, 0); + ASSERT_EQ(0u, statfs.allocated); + ASSERT_EQ(0u, statfs.data_stored); + ASSERT_EQ(0u, statfs.data_compressed_original); + ASSERT_EQ(0u, statfs.data_compressed); + ASSERT_EQ(0u, statfs.data_compressed_allocated); + + struct store_statfs_t statfs_pool; + bool per_pool_omap; + r = store->pool_statfs(poolid, &statfs_pool, &per_pool_omap); + ASSERT_EQ(r, 0); + ASSERT_EQ(0u, statfs_pool.allocated); + ASSERT_EQ(0u, statfs_pool.data_stored); + ASSERT_EQ(0u, statfs_pool.data_compressed_original); + ASSERT_EQ(0u, statfs_pool.data_compressed); + ASSERT_EQ(0u, statfs_pool.data_compressed_allocated); + //force fsck + 
ch.reset(); + EXPECT_EQ(store->umount(), 0); + ASSERT_EQ(store->fsck(false), 0); // do fsck explicitly + EXPECT_EQ(store->mount(), 0); + ch = store->open_collection(cid); + } + { + ObjectStore::Transaction t; + std::string s(0x10000, 'c'); + bufferlist bl; + bl.append(s); + bl.append(s); + bl.append(s); + bl.append(s.substr(0, 0x10000-2)); + t.write(cid, hoid, 0, bl.length(), bl); + cerr << "Yet another compressible write" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + struct store_statfs_t statfs; + r = store->statfs(&statfs); + ASSERT_EQ(r, 0); + ASSERT_EQ(0x40000 - 2, statfs.data_stored); + ASSERT_EQ(0x30000, statfs.allocated); + ASSERT_LE(statfs.data_compressed, 0x10000); + ASSERT_EQ(0x20000, statfs.data_compressed_original); + ASSERT_EQ(0x10000, statfs.data_compressed_allocated); + + struct store_statfs_t statfs_pool; + bool per_pool_omap; + r = store->pool_statfs(poolid, &statfs_pool, &per_pool_omap); + ASSERT_EQ(r, 0); + ASSERT_EQ(0x40000 - 2, statfs_pool.data_stored); + ASSERT_EQ(0x30000, statfs_pool.allocated); + ASSERT_LE(statfs_pool.data_compressed, 0x10000); + ASSERT_EQ(0x20000, statfs_pool.data_compressed_original); + ASSERT_EQ(0x10000, statfs_pool.data_compressed_allocated); + //force fsck + ch.reset(); + EXPECT_EQ(store->umount(), 0); + ASSERT_EQ(store->fsck(false), 0); // do fsck explicitly + EXPECT_EQ(store->mount(), 0); + ch = store->open_collection(cid); + } + { + struct store_statfs_t statfs; + r = store->statfs(&statfs); + ASSERT_EQ(r, 0); + + struct store_statfs_t statfs_pool; + bool per_pool_omap; + r = store->pool_statfs(poolid, &statfs_pool, &per_pool_omap); + ASSERT_EQ(r, 0); + + ObjectStore::Transaction t; + t.clone(cid, hoid, hoid2); + cerr << "Clone compressed objecte" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + struct store_statfs_t statfs2; + r = store->statfs(&statfs2); + ASSERT_EQ(r, 0); + ASSERT_GT(statfs2.data_stored, statfs.data_stored); + 
ASSERT_EQ(statfs2.allocated, statfs.allocated); + ASSERT_GT(statfs2.data_compressed, statfs.data_compressed); + ASSERT_GT(statfs2.data_compressed_original, statfs.data_compressed_original); + ASSERT_EQ(statfs2.data_compressed_allocated, statfs.data_compressed_allocated); + + struct store_statfs_t statfs2_pool; + r = store->pool_statfs(poolid, &statfs2_pool, &per_pool_omap); + ASSERT_EQ(r, 0); + ASSERT_GT(statfs2_pool.data_stored, statfs_pool.data_stored); + ASSERT_EQ(statfs2_pool.allocated, statfs_pool.allocated); + ASSERT_GT(statfs2_pool.data_compressed, statfs_pool.data_compressed); + ASSERT_GT(statfs2_pool.data_compressed_original, + statfs_pool.data_compressed_original); + ASSERT_EQ(statfs2_pool.data_compressed_allocated, + statfs_pool.data_compressed_allocated); + } + + { + // verify no + auto poolid2 = poolid + 1; + coll_t cid2 = coll_t(spg_t(pg_t(20, poolid2), shard_id_t::NO_SHARD)); + ghobject_t hoid(hobject_t(sobject_t("Object 2", CEPH_NOSNAP), + string(), + 0, + poolid2, + string())); + auto ch = store->create_new_collection(cid2); + + { + + struct store_statfs_t statfs1_pool; + bool per_pool_omap; + int r = store->pool_statfs(poolid, &statfs1_pool, &per_pool_omap); + ASSERT_EQ(r, 0); + + cerr << "Creating second collection " << cid2 << std::endl; + ObjectStore::Transaction t; + t.create_collection(cid2, 0); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + + t = ObjectStore::Transaction(); + bufferlist bl; + bl.append("abcde"); + t.write(cid2, hoid, 0, 5, bl); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + + struct store_statfs_t statfs2_pool; + r = store->pool_statfs(poolid2, &statfs2_pool, &per_pool_omap); + ASSERT_EQ(r, 0); + ASSERT_EQ(5, statfs2_pool.data_stored); + ASSERT_EQ(0x10000, statfs2_pool.allocated); + ASSERT_EQ(0, statfs2_pool.data_compressed); + ASSERT_EQ(0, statfs2_pool.data_compressed_original); + ASSERT_EQ(0, statfs2_pool.data_compressed_allocated); + + struct store_statfs_t 
statfs1_pool_again; + r = store->pool_statfs(poolid, &statfs1_pool_again, &per_pool_omap); + ASSERT_EQ(r, 0); + // adjust 'available' since it has changed + statfs1_pool_again.available = statfs1_pool.available; + ASSERT_EQ(statfs1_pool_again, statfs1_pool); + + t = ObjectStore::Transaction(); + t.remove(cid2, hoid); + t.remove_collection(cid2); + cerr << "Cleaning" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + } + + { + // verify ops on temporary object + + auto poolid3 = poolid + 2; + coll_t cid3 = coll_t(spg_t(pg_t(20, poolid3), shard_id_t::NO_SHARD)); + ghobject_t hoid3(hobject_t(sobject_t("Object 3", CEPH_NOSNAP), + string(), + 0, + poolid3, + string())); + ghobject_t hoid3_temp; + hoid3_temp.hobj = hoid3.hobj.make_temp_hobject("Object 3 temp"); + auto ch3 = store->create_new_collection(cid3); + { + struct store_statfs_t statfs1_pool; + bool per_pool_omap; + int r = store->pool_statfs(poolid, &statfs1_pool, &per_pool_omap); + ASSERT_EQ(r, 0); + + cerr << "Creating third collection " << cid3 << std::endl; + ObjectStore::Transaction t; + t.create_collection(cid3, 0); + r = queue_transaction(store, ch3, std::move(t)); + ASSERT_EQ(r, 0); + + t = ObjectStore::Transaction(); + bufferlist bl; + bl.append("abcde"); + t.write(cid3, hoid3_temp, 0, 5, bl); + r = queue_transaction(store, ch3, std::move(t)); + ASSERT_EQ(r, 0); + + struct store_statfs_t statfs3_pool; + r = store->pool_statfs(poolid3, &statfs3_pool, &per_pool_omap); + ASSERT_EQ(r, 0); + ASSERT_EQ(5, statfs3_pool.data_stored); + ASSERT_EQ(0x10000, statfs3_pool.allocated); + ASSERT_EQ(0, statfs3_pool.data_compressed); + ASSERT_EQ(0, statfs3_pool.data_compressed_original); + ASSERT_EQ(0, statfs3_pool.data_compressed_allocated); + + struct store_statfs_t statfs1_pool_again; + r = store->pool_statfs(poolid, &statfs1_pool_again, &per_pool_omap); + ASSERT_EQ(r, 0); + // adjust 'available' since it has changed + statfs1_pool_again.available = statfs1_pool.available; + 
ASSERT_EQ(statfs1_pool_again, statfs1_pool); + + //force fsck + ch.reset(); + ch3.reset(); + EXPECT_EQ(store->umount(), 0); + EXPECT_EQ(store->mount(), 0); + ch = store->open_collection(cid); + ch3 = store->open_collection(cid3); + + t = ObjectStore::Transaction(); + t.collection_move_rename( + cid3, hoid3_temp, + cid3, hoid3); + r = queue_transaction(store, ch3, std::move(t)); + ASSERT_EQ(r, 0); + + struct store_statfs_t statfs3_pool_again; + r = store->pool_statfs(poolid3, &statfs3_pool_again, &per_pool_omap); + ASSERT_EQ(r, 0); + ASSERT_EQ(statfs3_pool_again, statfs3_pool); + + //force fsck + ch.reset(); + ch3.reset(); + EXPECT_EQ(store->umount(), 0); + EXPECT_EQ(store->mount(), 0); + ch = store->open_collection(cid); + ch3 = store->open_collection(cid3); + + t = ObjectStore::Transaction(); + t.remove(cid3, hoid3); + t.remove_collection(cid3); + cerr << "Cleaning" << std::endl; + r = queue_transaction(store, ch3, std::move(t)); + ASSERT_EQ(r, 0); + } + } + + { + ObjectStore::Transaction t; + t.remove(cid, hoid); + t.remove(cid, hoid2); + t.remove_collection(cid); + cerr << "Cleaning" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + + struct store_statfs_t statfs; + r = store->statfs(&statfs); + ASSERT_EQ(r, 0); + ASSERT_EQ( 0u, statfs.allocated); + ASSERT_EQ( 0u, statfs.data_stored); + ASSERT_EQ( 0u, statfs.data_compressed_original); + ASSERT_EQ( 0u, statfs.data_compressed); + ASSERT_EQ( 0u, statfs.data_compressed_allocated); + + struct store_statfs_t statfs_pool; + bool per_pool_omap; + r = store->pool_statfs(poolid, &statfs_pool, &per_pool_omap); + ASSERT_EQ(r, 0); + ASSERT_EQ( 0u, statfs_pool.allocated); + ASSERT_EQ( 0u, statfs_pool.data_stored); + ASSERT_EQ( 0u, statfs_pool.data_compressed_original); + ASSERT_EQ( 0u, statfs_pool.data_compressed); + ASSERT_EQ( 0u, statfs_pool.data_compressed_allocated); + } +} + +TEST_P(StoreTestSpecificAUSize, BluestoreFragmentedBlobTest) { + if(string(GetParam()) != "bluestore") + return; 
+ if (smr) { + cout << "TODO: fix this for smr" << std::endl; + return; + } + SetVal(g_conf(), "bluestore_block_db_path", ""); + StartDeferred(0x10000); + + int r; + coll_t cid; + ghobject_t hoid(hobject_t(sobject_t("Object 1", CEPH_NOSNAP))); + auto ch = store->create_new_collection(cid); + { + ObjectStore::Transaction t; + t.create_collection(cid, 0); + cerr << "Creating collection " << cid << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + bool exists = store->exists(ch, hoid); + ASSERT_TRUE(!exists); + + ObjectStore::Transaction t; + t.touch(cid, hoid); + cerr << "Creating object " << hoid << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + + exists = store->exists(ch, hoid); + ASSERT_EQ(true, exists); + } + { + struct store_statfs_t statfs; + int r = store->statfs(&statfs); + ASSERT_EQ(r, 0); + ASSERT_EQ(g_conf()->bluestore_block_size, statfs.total); + ASSERT_EQ(0u, statfs.allocated); + ASSERT_EQ(0u, statfs.data_stored); + ASSERT_TRUE(statfs.available > 0u && statfs.available < g_conf()->bluestore_block_size); + } + std::string data; + data.resize(0x10000 * 3); + { + ObjectStore::Transaction t; + for(size_t i = 0;i < data.size(); i++) + data[i] = i / 256 + 1; + bufferlist bl, newdata; + bl.append(data); + t.write(cid, hoid, 0, bl.length(), bl); + t.zero(cid, hoid, 0x10000, 0x10000); + cerr << "Append 3*0x10000 bytes and punch a hole 0x10000~10000" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + + struct store_statfs_t statfs; + int r = store->statfs(&statfs); + ASSERT_EQ(r, 0); + ASSERT_EQ(0x20000, statfs.data_stored); + ASSERT_EQ(0x20000, statfs.allocated); + + r = store->read(ch, hoid, 0, data.size(), newdata); + ASSERT_EQ(r, (int)data.size()); + { + bufferlist expected; + expected.append(data.substr(0, 0x10000)); + expected.append(string(0x10000, 0)); + expected.append(data.substr(0x20000, 0x10000)); + ASSERT_TRUE(bl_eq(expected, newdata)); + } + 
newdata.clear(); + + r = store->read(ch, hoid, 1, data.size()-2, newdata); + ASSERT_EQ(r, (int)data.size()-2); + { + bufferlist expected; + expected.append(data.substr(1, 0x10000-1)); + expected.append(string(0x10000, 0)); + expected.append(data.substr(0x20000, 0x10000 - 1)); + ASSERT_TRUE(bl_eq(expected, newdata)); + } + newdata.clear(); + } + //force fsck + ch.reset(); + EXPECT_EQ(store->umount(), 0); + ASSERT_EQ(store->fsck(false), 0); // do fsck explicitly + EXPECT_EQ(store->mount(), 0); + ch = store->open_collection(cid); + + { + ObjectStore::Transaction t; + std::string data2(3, 'b'); + bufferlist bl, newdata; + bl.append(data2); + t.write(cid, hoid, 0x20000, bl.length(), bl); + cerr << "Write 3 bytes after the hole" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + + struct store_statfs_t statfs; + int r = store->statfs(&statfs); + ASSERT_EQ(r, 0); + ASSERT_EQ(0x20000, statfs.allocated); + ASSERT_EQ(0x20000, statfs.data_stored); + + r = store->read(ch, hoid, 0x20000-1, 21, newdata); + ASSERT_EQ(r, (int)21); + { + bufferlist expected; + expected.append(string(0x1, 0)); + expected.append(string(data2)); + expected.append(data.substr(0x20003, 21-4)); + ASSERT_TRUE(bl_eq(expected, newdata)); + } + newdata.clear(); + } + //force fsck + ch.reset(); + EXPECT_EQ(store->umount(), 0); + ASSERT_EQ(store->fsck(false), 0); // do fsck explicitly + EXPECT_EQ(store->mount(), 0); + ch = store->open_collection(cid); + + { + ObjectStore::Transaction t; + std::string data2(3, 'a'); + bufferlist bl, newdata; + bl.append(data2); + t.write(cid, hoid, 0x10000+1, bl.length(), bl); + cerr << "Write 3 bytes to the hole" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + + struct store_statfs_t statfs; + int r = store->statfs(&statfs); + ASSERT_EQ(r, 0); + ASSERT_EQ(0x30000, statfs.allocated); + ASSERT_EQ(0x20003, statfs.data_stored); + + r = store->read(ch, hoid, 0x10000-1, 0x10000+22, newdata); + ASSERT_EQ(r, 
(int)0x10000+22); + { + bufferlist expected; + expected.append(data.substr(0x10000-1, 1)); + expected.append(string(0x1, 0)); + expected.append(data2); + expected.append(string(0x10000-4, 0)); + expected.append(string(0x3, 'b')); + expected.append(data.substr(0x20004, 21-3)); + ASSERT_TRUE(bl_eq(expected, newdata)); + } + newdata.clear(); + } + { + ObjectStore::Transaction t; + bufferlist bl, newdata; + bl.append(string(0x30000, 'c')); + t.write(cid, hoid, 0, 0x30000, bl); + t.zero(cid, hoid, 0, 0x10000); + t.zero(cid, hoid, 0x20000, 0x10000); + cerr << "Rewrite an object and create two holes at the beginning and the end" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + + struct store_statfs_t statfs; + int r = store->statfs(&statfs); + ASSERT_EQ(r, 0); + ASSERT_EQ(0x10000, statfs.allocated); + ASSERT_EQ(0x10000, statfs.data_stored); + + r = store->read(ch, hoid, 0, 0x30000, newdata); + ASSERT_EQ(r, (int)0x30000); + { + bufferlist expected; + expected.append(string(0x10000, 0)); + expected.append(string(0x10000, 'c')); + expected.append(string(0x10000, 0)); + ASSERT_TRUE(bl_eq(expected, newdata)); + } + newdata.clear(); + } + + //force fsck + ch.reset(); + EXPECT_EQ(store->umount(), 0); + ASSERT_EQ(store->fsck(false), 0); // do fsck explicitly + EXPECT_EQ(store->mount(), 0); + ch = store->open_collection(cid); + + { + ObjectStore::Transaction t; + t.remove(cid, hoid); + t.remove_collection(cid); + cerr << "Cleaning" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + + struct store_statfs_t statfs; + r = store->statfs(&statfs); + ASSERT_EQ(r, 0); + ASSERT_EQ( 0u, statfs.allocated); + ASSERT_EQ( 0u, statfs.data_stored); + ASSERT_EQ( 0u, statfs.data_compressed_original); + ASSERT_EQ( 0u, statfs.data_compressed); + ASSERT_EQ( 0u, statfs.data_compressed_allocated); + } +} +#endif + +TEST_P(StoreTest, ManySmallWrite) { + int r; + coll_t cid; + ghobject_t a(hobject_t(sobject_t("Object 1", 
CEPH_NOSNAP))); + ghobject_t b(hobject_t(sobject_t("Object 2", CEPH_NOSNAP))); + auto ch = store->create_new_collection(cid); + { + ObjectStore::Transaction t; + t.create_collection(cid, 0); + cerr << "Creating collection " << cid << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + bufferlist bl; + bufferptr bp(4096); + bp.zero(); + bl.append(bp); + for (int i=0; i<100; ++i) { + ObjectStore::Transaction t; + t.write(cid, a, i*4096, 4096, bl, 0); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + for (int i=0; i<100; ++i) { + ObjectStore::Transaction t; + t.write(cid, b, (rand() % 1024)*4096, 4096, bl, 0); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + ObjectStore::Transaction t; + t.remove(cid, a); + t.remove(cid, b); + t.remove_collection(cid); + cerr << "Cleaning" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } +} + +TEST_P(StoreTest, MultiSmallWriteSameBlock) { + int r; + coll_t cid; + ghobject_t a(hobject_t(sobject_t("Object 1", CEPH_NOSNAP))); + auto ch = store->create_new_collection(cid); + { + ObjectStore::Transaction t; + t.create_collection(cid, 0); + cerr << "Creating collection " << cid << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + bufferlist bl; + bl.append("short"); + C_SaferCond c, d; + // touch same block in both same transaction, tls, and pipelined txns + { + ObjectStore::Transaction t, u; + t.write(cid, a, 0, 5, bl, 0); + t.write(cid, a, 5, 5, bl, 0); + t.write(cid, a, 4094, 5, bl, 0); + t.write(cid, a, 9000, 5, bl, 0); + u.write(cid, a, 10, 5, bl, 0); + u.write(cid, a, 7000, 5, bl, 0); + t.register_on_commit(&c); + vector<ObjectStore::Transaction> v = {t, u}; + store->queue_transactions(ch, v); + } + { + ObjectStore::Transaction t, u; + t.write(cid, a, 40, 5, bl, 0); + t.write(cid, a, 45, 5, bl, 0); + t.write(cid, a, 4094, 5, bl, 0); + t.write(cid, a, 6000, 5, bl, 0); + 
u.write(cid, a, 610, 5, bl, 0); + u.write(cid, a, 11000, 5, bl, 0); + t.register_on_commit(&d); + vector<ObjectStore::Transaction> v = {t, u}; + store->queue_transactions(ch, v); + } + c.wait(); + d.wait(); + { + bufferlist bl2; + r = store->read(ch, a, 0, 16000, bl2); + ASSERT_GE(r, 0); + } + { + ObjectStore::Transaction t; + t.remove(cid, a); + t.remove_collection(cid); + cerr << "Cleaning" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } +} + +TEST_P(StoreTest, SmallSkipFront) { + int r; + coll_t cid; + ghobject_t a(hobject_t(sobject_t("Object 1", CEPH_NOSNAP))); + auto ch = store->create_new_collection(cid); + { + ObjectStore::Transaction t; + t.create_collection(cid, 0); + cerr << "Creating collection " << cid << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + ObjectStore::Transaction t; + t.touch(cid, a); + t.truncate(cid, a, 3000); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + bufferlist bl; + bufferptr bp(4096); + memset(bp.c_str(), 1, 4096); + bl.append(bp); + ObjectStore::Transaction t; + t.write(cid, a, 4096, 4096, bl); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + bufferlist bl; + ASSERT_EQ(8192, store->read(ch, a, 0, 8192, bl)); + for (unsigned i=0; i<4096; ++i) + ASSERT_EQ(0, bl[i]); + for (unsigned i=4096; i<8192; ++i) + ASSERT_EQ(1, bl[i]); + } + { + ObjectStore::Transaction t; + t.remove(cid, a); + t.remove_collection(cid); + cerr << "Cleaning" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } +} + +TEST_P(StoreTest, AppendDeferredVsTailCache) { + int r; + coll_t cid; + ghobject_t a(hobject_t(sobject_t("fooo", CEPH_NOSNAP))); + auto ch = store->create_new_collection(cid); + { + ObjectStore::Transaction t; + t.create_collection(cid, 0); + cerr << "Creating collection " << cid << std::endl; + r = store->queue_transaction(ch, std::move(t)); + ASSERT_EQ(r, 0); + } + 
unsigned min_alloc = g_conf()->bluestore_min_alloc_size; + unsigned size = min_alloc / 3; + bufferptr bpa(size); + memset(bpa.c_str(), 1, bpa.length()); + bufferlist bla; + bla.append(bpa); + { + ObjectStore::Transaction t; + t.write(cid, a, 0, bla.length(), bla, 0); + r = store->queue_transaction(ch, std::move(t)); + ASSERT_EQ(r, 0); + } + + // force cached tail to clear ... + { + ch.reset(); + int r = store->umount(); + ASSERT_EQ(0, r); + r = store->mount(); + ASSERT_EQ(0, r); + ch = store->open_collection(cid); + } + + bufferptr bpb(size); + memset(bpb.c_str(), 2, bpb.length()); + bufferlist blb; + blb.append(bpb); + { + ObjectStore::Transaction t; + t.write(cid, a, bla.length(), blb.length(), blb, 0); + r = store->queue_transaction(ch, std::move(t)); + ASSERT_EQ(r, 0); + } + bufferptr bpc(size); + memset(bpc.c_str(), 3, bpc.length()); + bufferlist blc; + blc.append(bpc); + { + ObjectStore::Transaction t; + t.write(cid, a, bla.length() + blb.length(), blc.length(), blc, 0); + r = store->queue_transaction(ch, std::move(t)); + ASSERT_EQ(r, 0); + } + bufferlist final; + final.append(bla); + final.append(blb); + final.append(blc); + bufferlist actual; + { + ASSERT_EQ((int)final.length(), + store->read(ch, a, 0, final.length(), actual)); + ASSERT_TRUE(bl_eq(final, actual)); + } + { + ObjectStore::Transaction t; + t.remove(cid, a); + t.remove_collection(cid); + cerr << "Cleaning" << std::endl; + r = store->queue_transaction(ch, std::move(t)); + ASSERT_EQ(r, 0); + } +} + +TEST_P(StoreTest, AppendZeroTrailingSharedBlock) { + int r; + coll_t cid; + ghobject_t a(hobject_t(sobject_t("fooo", CEPH_NOSNAP))); + ghobject_t b = a; + b.hobj.snap = 1; + auto ch = store->create_new_collection(cid); + { + ObjectStore::Transaction t; + t.create_collection(cid, 0); + cerr << "Creating collection " << cid << std::endl; + r = store->queue_transaction(ch, std::move(t)); + ASSERT_EQ(r, 0); + } + unsigned min_alloc = g_conf()->bluestore_min_alloc_size; + unsigned size = min_alloc / 3; + 
bufferptr bpa(size); + memset(bpa.c_str(), 1, bpa.length()); + bufferlist bla; + bla.append(bpa); + // make sure there is some trailing gunk in the last block + { + bufferlist bt; + bt.append(bla); + bt.append("BADBADBADBAD"); + ObjectStore::Transaction t; + t.write(cid, a, 0, bt.length(), bt, 0); + r = store->queue_transaction(ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + ObjectStore::Transaction t; + t.truncate(cid, a, size); + r = store->queue_transaction(ch, std::move(t)); + ASSERT_EQ(r, 0); + } + + // clone + { + ObjectStore::Transaction t; + t.clone(cid, a, b); + r = store->queue_transaction(ch, std::move(t)); + ASSERT_EQ(r, 0); + } + + // append with implicit zeroing + bufferptr bpb(size); + memset(bpb.c_str(), 2, bpb.length()); + bufferlist blb; + blb.append(bpb); + { + ObjectStore::Transaction t; + t.write(cid, a, min_alloc * 3, blb.length(), blb, 0); + r = store->queue_transaction(ch, std::move(t)); + ASSERT_EQ(r, 0); + } + bufferlist final; + final.append(bla); + bufferlist zeros; + zeros.append_zero(min_alloc * 3 - size); + final.append(zeros); + final.append(blb); + bufferlist actual; + { + ASSERT_EQ((int)final.length(), + store->read(ch, a, 0, final.length(), actual)); + final.hexdump(cout); + actual.hexdump(cout); + ASSERT_TRUE(bl_eq(final, actual)); + } + { + ObjectStore::Transaction t; + t.remove(cid, a); + t.remove(cid, b); + t.remove_collection(cid); + cerr << "Cleaning" << std::endl; + r = store->queue_transaction(ch, std::move(t)); + ASSERT_EQ(r, 0); + } +} + +TEST_P(StoreTest, SmallSequentialUnaligned) { + int r; + coll_t cid; + ghobject_t a(hobject_t(sobject_t("Object 1", CEPH_NOSNAP))); + auto ch = store->create_new_collection(cid); + { + ObjectStore::Transaction t; + t.create_collection(cid, 0); + cerr << "Creating collection " << cid << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + bufferlist bl; + int len = 1000; + bufferptr bp(len); + bp.zero(); + bl.append(bp); + for (int i=0; i<1000; ++i) { + 
ObjectStore::Transaction t; + t.write(cid, a, i*len, len, bl, 0); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + ObjectStore::Transaction t; + t.remove(cid, a); + t.remove_collection(cid); + cerr << "Cleaning" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } +} + +// Large-write coverage: 4 MiB writes laid out back to back on object a, then randomly placed aligned and unaligned 4 MiB writes plus 16 MiB zero ranges on object b. +TEST_P(StoreTest, ManyBigWrite) { + int r; + coll_t cid; + ghobject_t a(hobject_t(sobject_t("Object 1", CEPH_NOSNAP))); + ghobject_t b(hobject_t(sobject_t("Object 2", CEPH_NOSNAP))); + auto ch = store->create_new_collection(cid); + { + ObjectStore::Transaction t; + t.create_collection(cid, 0); + cerr << "Creating collection " << cid << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + bufferlist bl; + bufferptr bp(4 * 1048576); + bp.zero(); + bl.append(bp); + // sequential: the stride equals the 4 MiB write length (was 1048586, a digit typo) + for (int i=0; i<10; ++i) { + ObjectStore::Transaction t; + t.write(cid, a, i*4*1048576, 4*1048576, bl, 0); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + // aligned + for (int i=0; i<10; ++i) { + ObjectStore::Transaction t; + t.write(cid, b, (rand() % 256)*4*1048576, 4*1048576, bl, 0); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + // unaligned + for (int i=0; i<10; ++i) { + ObjectStore::Transaction t; + t.write(cid, b, (rand() % (256*4096))*1024, 4*1048576, bl, 0); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + // do some zeros + for (int i=0; i<10; ++i) { + ObjectStore::Transaction t; + t.zero(cid, b, (rand() % (256*4096))*1024, 16*1048576); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + ObjectStore::Transaction t; + t.remove(cid, a); + t.remove(cid, b); + t.remove_collection(cid); + cerr << "Cleaning" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } +} + +TEST_P(StoreTest, BigWriteBigZero) { + int r; + coll_t cid; + ghobject_t a(hobject_t(sobject_t("foo", CEPH_NOSNAP))); + auto ch = 
store->create_new_collection(cid); + { + ObjectStore::Transaction t; + t.create_collection(cid, 0); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + bufferlist bl; + bufferptr bp(1048576); + memset(bp.c_str(), 'b', bp.length()); + bl.append(bp); + bufferlist s; + bufferptr sp(4096); + memset(sp.c_str(), 's', sp.length()); + s.append(sp); + { + ObjectStore::Transaction t; + t.write(cid, a, 0, bl.length(), bl); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + ObjectStore::Transaction t; + t.zero(cid, a, bl.length() / 4, bl.length() / 2); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + ObjectStore::Transaction t; + t.write(cid, a, bl.length() / 2, s.length(), s); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + ObjectStore::Transaction t; + t.remove(cid, a); + t.remove_collection(cid); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } +} + +TEST_P(StoreTest, MiscFragmentTests) { + int r; + coll_t cid; + ghobject_t a(hobject_t(sobject_t("Object 1", CEPH_NOSNAP))); + auto ch = store->create_new_collection(cid); + { + ObjectStore::Transaction t; + t.create_collection(cid, 0); + cerr << "Creating collection " << cid << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + bufferlist bl; + bufferptr bp(524288); + bp.zero(); + bl.append(bp); + { + ObjectStore::Transaction t; + t.write(cid, a, 0, 524288, bl, 0); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + ObjectStore::Transaction t; + t.write(cid, a, 1048576, 524288, bl, 0); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + bufferlist inbl; + int r = store->read(ch, a, 524288 + 131072, 1024, inbl); + ASSERT_EQ(r, 1024); + ASSERT_EQ(inbl.length(), 1024u); + ASSERT_TRUE(inbl.is_zero()); + } + { + ObjectStore::Transaction t; + t.write(cid, a, 1048576 - 4096, 524288, bl, 0); + r = 
queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + ObjectStore::Transaction t; + t.remove(cid, a); + t.remove_collection(cid); + cerr << "Cleaning" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + +} + +TEST_P(StoreTest, ZeroVsObjectSize) { + int r; + coll_t cid; + struct stat stat; + ghobject_t hoid(hobject_t(sobject_t("foo", CEPH_NOSNAP))); + auto ch = store->create_new_collection(cid); + { + ObjectStore::Transaction t; + t.create_collection(cid, 0); + cerr << "Creating collection " << cid << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + bufferlist a; + a.append("stuff"); + { + ObjectStore::Transaction t; + t.write(cid, hoid, 0, 5, a); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + ASSERT_EQ(0, store->stat(ch, hoid, &stat)); + ASSERT_EQ(5, stat.st_size); + { + ObjectStore::Transaction t; + t.zero(cid, hoid, 1, 2); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + ASSERT_EQ(0, store->stat(ch, hoid, &stat)); + ASSERT_EQ(5, stat.st_size); + { + ObjectStore::Transaction t; + t.zero(cid, hoid, 3, 200); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + ASSERT_EQ(0, store->stat(ch, hoid, &stat)); + ASSERT_EQ(203, stat.st_size); + { + ObjectStore::Transaction t; + t.zero(cid, hoid, 100000, 200); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + ASSERT_EQ(0, store->stat(ch, hoid, &stat)); + ASSERT_EQ(100200, stat.st_size); +} + +TEST_P(StoreTest, ZeroLengthWrite) { + int r; + coll_t cid; + ghobject_t hoid(hobject_t(sobject_t("foo", CEPH_NOSNAP))); + auto ch = store->create_new_collection(cid); + { + ObjectStore::Transaction t; + t.create_collection(cid, 0); + t.touch(cid, hoid); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + ObjectStore::Transaction t; + bufferlist empty; + t.write(cid, hoid, 1048576, 0, empty); + r = 
queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + struct stat stat; + r = store->stat(ch, hoid, &stat); + ASSERT_EQ(0, r); + ASSERT_EQ(0, stat.st_size); + + bufferlist newdata; + r = store->read(ch, hoid, 0, 1048576, newdata); + ASSERT_EQ(0, r); +} + +// A zero-length zero() at a non-zero offset must leave the object empty: stat reports size 0 and read returns 0 bytes. +TEST_P(StoreTest, ZeroLengthZero) { + int r; + coll_t cid; + ghobject_t hoid(hobject_t(sobject_t("foo", CEPH_NOSNAP))); + auto ch = store->create_new_collection(cid); + { + ObjectStore::Transaction t; + t.create_collection(cid, 0); + t.touch(cid, hoid); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(0, r); + } + { + ObjectStore::Transaction t; + t.zero(cid, hoid, 1048576, 0); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(0, r); + } + struct stat stat; + r = store->stat(ch, hoid, &stat); + ASSERT_EQ(0, r); + ASSERT_EQ(0, stat.st_size); + + bufferlist newdata; + r = store->read(ch, hoid, 0, 1048576, newdata); + ASSERT_EQ(0, r); +} + +// Attribute round trip: getattr on a missing object, setattr of "foo" and "bar", then getattr/getattrs on the created object. +TEST_P(StoreTest, SimpleAttrTest) { + int r; + coll_t cid; + ghobject_t hoid(hobject_t(sobject_t("attr object 1", CEPH_NOSNAP))); + bufferlist val, val2; + val.append("value"); + // fill val2 (stored as "bar" below); it was previously left empty because "value2" was appended to val + val2.append("value2"); + { + auto ch = store->open_collection(cid); + ASSERT_FALSE(ch); + } + auto ch = store->create_new_collection(cid); + { + ObjectStore::Transaction t; + t.create_collection(cid, 0); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + bool empty; + int r = store->collection_empty(ch, &empty); + ASSERT_EQ(0, r); + ASSERT_TRUE(empty); + } + { + bufferptr bp; + r = store->getattr(ch, hoid, "nofoo", bp); + ASSERT_EQ(-ENOENT, r); + } + { + ObjectStore::Transaction t; + t.touch(cid, hoid); + t.setattr(cid, hoid, "foo", val); + t.setattr(cid, hoid, "bar", val2); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + bool empty; + int r = store->collection_empty(ch, &empty); + ASSERT_EQ(0, r); + ASSERT_TRUE(!empty); + } + { + bufferptr bp; + r = store->getattr(ch, hoid, "nofoo", bp); + 
ASSERT_EQ(-ENODATA, r); + + r = store->getattr(ch, hoid, "foo", bp); + ASSERT_EQ(0, r); + bufferlist bl; + bl.append(bp); + ASSERT_TRUE(bl_eq(val, bl)); + + map<string,bufferptr,less<>> bm; + r = store->getattrs(ch, hoid, bm); + ASSERT_EQ(0, r); + + } + { + ObjectStore::Transaction t; + t.remove(cid, hoid); + t.remove_collection(cid); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } +} + +TEST_P(StoreTest, SimpleListTest) { + int r; + coll_t cid(spg_t(pg_t(0, 1), shard_id_t(1))); + auto ch = store->create_new_collection(cid); + { + ObjectStore::Transaction t; + t.create_collection(cid, 0); + cerr << "Creating collection " << cid << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + set<ghobject_t> all; + { + ObjectStore::Transaction t; + for (int i=0; i<200; ++i) { + string name("object_"); + name += stringify(i); + ghobject_t hoid(hobject_t(sobject_t(name, CEPH_NOSNAP)), + ghobject_t::NO_GEN, shard_id_t(1)); + hoid.hobj.pool = 1; + all.insert(hoid); + t.touch(cid, hoid); + cerr << "Creating object " << hoid << std::endl; + } + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + set<ghobject_t> saw; + vector<ghobject_t> objects; + ghobject_t next, current; + while (!next.is_max()) { + int r = collection_list(store, ch, current, ghobject_t::get_max(), 50, + &objects, &next); + ASSERT_EQ(r, 0); + ASSERT_TRUE(sorted(objects)); + cout << " got " << objects.size() << " next " << next << std::endl; + for (vector<ghobject_t>::iterator p = objects.begin(); p != objects.end(); + ++p) { + if (saw.count(*p)) { + cout << "got DUP " << *p << std::endl; + } else { + //cout << "got new " << *p << std::endl; + } + saw.insert(*p); + } + objects.clear(); + current = next; + } + ASSERT_EQ(saw.size(), all.size()); + ASSERT_EQ(saw, all); + } + { + ObjectStore::Transaction t; + for (set<ghobject_t>::iterator p = all.begin(); p != all.end(); ++p) + t.remove(cid, *p); + t.remove_collection(cid); + cerr << 
"Cleaning" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } +} + +TEST_P(StoreTest, ListEndTest) { + int r; + coll_t cid(spg_t(pg_t(0, 1), shard_id_t(1))); + auto ch = store->create_new_collection(cid); + { + ObjectStore::Transaction t; + t.create_collection(cid, 0); + cerr << "Creating collection " << cid << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + set<ghobject_t> all; + { + ObjectStore::Transaction t; + for (int i=0; i<200; ++i) { + string name("object_"); + name += stringify(i); + ghobject_t hoid(hobject_t(sobject_t(name, CEPH_NOSNAP)), + ghobject_t::NO_GEN, shard_id_t(1)); + hoid.hobj.pool = 1; + all.insert(hoid); + t.touch(cid, hoid); + cerr << "Creating object " << hoid << std::endl; + } + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + ghobject_t end(hobject_t(sobject_t("object_100", CEPH_NOSNAP)), + ghobject_t::NO_GEN, shard_id_t(1)); + end.hobj.pool = 1; + vector<ghobject_t> objects; + ghobject_t next; + int r = collection_list(store, ch, ghobject_t(), end, 500, &objects, &next); + ASSERT_EQ(r, 0); + for (auto &p : objects) { + ASSERT_NE(p, end); + } + } + { + ObjectStore::Transaction t; + for (set<ghobject_t>::iterator p = all.begin(); p != all.end(); ++p) + t.remove(cid, *p); + t.remove_collection(cid); + cerr << "Cleaning" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } +} + +TEST_P(StoreTest, List_0xfffffff_Hash_Test_in_meta) { + int r = 0; + coll_t cid; + auto ch = store->create_new_collection(cid); + { + ObjectStore::Transaction t; + t.create_collection(cid, 0); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + ObjectStore::Transaction t; + ghobject_t hoid(hobject_t(sobject_t("obj", CEPH_NOSNAP), + "", UINT32_C(0xffffffff), -1, "nspace")); + t.touch(cid, hoid); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + vector<ghobject_t> objects; + r = 
collection_list(store, ch, ghobject_t(), ghobject_t::get_max(), INT_MAX, + &objects, nullptr, true); + ASSERT_EQ(r, 0); + ASSERT_EQ(objects.size(), 1); + } +} + +TEST_P(StoreTest, List_0xfffffff_Hash_Test_in_PG) { + int r = 0; + const int64_t poolid = 1; + coll_t cid(spg_t(pg_t(0, poolid), shard_id_t::NO_SHARD)); + auto ch = store->create_new_collection(cid); + { + ObjectStore::Transaction t; + t.create_collection(cid, 0); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + ObjectStore::Transaction t; + ghobject_t hoid(hobject_t(sobject_t("obj", CEPH_NOSNAP), + "", UINT32_C(0xffffffff), poolid, "nspace")); + t.touch(cid, hoid); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + vector<ghobject_t> objects; + r = collection_list(store, ch, ghobject_t(), ghobject_t::get_max(), INT_MAX, + &objects, nullptr, true); + ASSERT_EQ(r, 0); + ASSERT_EQ(objects.size(), 1); + } +} + +TEST_P(StoreTest, Sort) { + { + hobject_t a(sobject_t("a", CEPH_NOSNAP)); + hobject_t b = a; + ASSERT_EQ(a, b); + b.oid.name = "b"; + ASSERT_NE(a, b); + ASSERT_TRUE(a < b); + a.pool = 1; + b.pool = 2; + ASSERT_TRUE(a < b); + a.pool = 3; + ASSERT_TRUE(a > b); + } + { + ghobject_t a(hobject_t(sobject_t("a", CEPH_NOSNAP))); + ghobject_t b(hobject_t(sobject_t("b", CEPH_NOSNAP))); + a.hobj.pool = 1; + b.hobj.pool = 1; + ASSERT_TRUE(a < b); + a.hobj.pool = -3; + ASSERT_TRUE(a < b); + a.hobj.pool = 1; + b.hobj.pool = -3; + ASSERT_TRUE(a > b); + } +} + +TEST_P(StoreTest, MultipoolListTest) { + int r; + int poolid = 4373; + coll_t cid = coll_t(spg_t(pg_t(0, poolid), shard_id_t::NO_SHARD)); + auto ch = store->create_new_collection(cid); + { + ObjectStore::Transaction t; + t.create_collection(cid, 0); + cerr << "Creating collection " << cid << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + set<ghobject_t> all, saw; + { + ObjectStore::Transaction t; + for (int i=0; i<200; ++i) { + string name("object_"); + name += 
stringify(i); + ghobject_t hoid(hobject_t(sobject_t(name, CEPH_NOSNAP))); + if (rand() & 1) + hoid.hobj.pool = -2 - poolid; + else + hoid.hobj.pool = poolid; + all.insert(hoid); + t.touch(cid, hoid); + cerr << "Creating object " << hoid << std::endl; + } + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + vector<ghobject_t> objects; + ghobject_t next, current; + while (!next.is_max()) { + int r = collection_list(store, ch, current, ghobject_t::get_max(), 50, + &objects, &next); + ASSERT_EQ(r, 0); + cout << " got " << objects.size() << " next " << next << std::endl; + for (vector<ghobject_t>::iterator p = objects.begin(); p != objects.end(); + ++p) { + saw.insert(*p); + } + objects.clear(); + current = next; + } + ASSERT_EQ(saw, all); + } + { + ObjectStore::Transaction t; + for (set<ghobject_t>::iterator p = all.begin(); p != all.end(); ++p) + t.remove(cid, *p); + t.remove_collection(cid); + cerr << "Cleaning" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } +} + +TEST_P(StoreTest, SimpleCloneTest) { + int r; + coll_t cid; + + SetDeathTestStyle("threadsafe"); + + auto ch = store->create_new_collection(cid); + { + ObjectStore::Transaction t; + t.create_collection(cid, 0); + cerr << "Creating collection " << cid << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + ghobject_t hoid(hobject_t(sobject_t("Object 1", CEPH_NOSNAP), + "key", 123, -1, "")); + bufferlist small, large, xlarge, newdata, attr; + small.append("small"); + large.append("large"); + xlarge.append("xlarge"); + { + ObjectStore::Transaction t; + t.touch(cid, hoid); + t.setattr(cid, hoid, "attr1", small); + t.setattr(cid, hoid, "attr2", large); + t.setattr(cid, hoid, "attr3", xlarge); + t.write(cid, hoid, 0, small.length(), small); + t.write(cid, hoid, 10, small.length(), small); + cerr << "Creating object and set attr " << hoid << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 
0); + } + + ghobject_t hoid2(hobject_t(sobject_t("Object 2", CEPH_NOSNAP), + "key", 123, -1, "")); + ghobject_t hoid3(hobject_t(sobject_t("Object 3", CEPH_NOSNAP))); + { + ObjectStore::Transaction t; + t.clone(cid, hoid, hoid2); + t.setattr(cid, hoid2, "attr2", small); + t.rmattr(cid, hoid2, "attr1"); + t.write(cid, hoid, 10, large.length(), large); + t.setattr(cid, hoid, "attr1", large); + t.setattr(cid, hoid, "attr2", small); + cerr << "Clone object and rm attr" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + + r = store->read(ch, hoid, 10, 5, newdata); + ASSERT_EQ(r, 5); + ASSERT_TRUE(bl_eq(large, newdata)); + + newdata.clear(); + r = store->read(ch, hoid, 0, 5, newdata); + ASSERT_EQ(r, 5); + ASSERT_TRUE(bl_eq(small, newdata)); + + newdata.clear(); + r = store->read(ch, hoid2, 10, 5, newdata); + ASSERT_EQ(r, 5); + ASSERT_TRUE(bl_eq(small, newdata)); + + r = store->getattr(ch, hoid2, "attr2", attr); + ASSERT_EQ(r, 0); + ASSERT_TRUE(bl_eq(small, attr)); + + attr.clear(); + r = store->getattr(ch, hoid2, "attr3", attr); + ASSERT_EQ(r, 0); + ASSERT_TRUE(bl_eq(xlarge, attr)); + + attr.clear(); + r = store->getattr(ch, hoid, "attr1", attr); + ASSERT_EQ(r, 0); + ASSERT_TRUE(bl_eq(large, attr)); + } + { + ObjectStore::Transaction t; + t.remove(cid, hoid); + t.remove(cid, hoid2); + ASSERT_EQ(0, queue_transaction(store, ch, std::move(t))); + } + { + bufferlist final; + bufferptr p(16384); + memset(p.c_str(), 1, p.length()); + bufferlist pl; + pl.append(p); + final.append(p); + ObjectStore::Transaction t; + t.write(cid, hoid, 0, pl.length(), pl); + t.clone(cid, hoid, hoid2); + bufferptr a(4096); + memset(a.c_str(), 2, a.length()); + bufferlist al; + al.append(a); + final.append(a); + t.write(cid, hoid, pl.length(), a.length(), al); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + bufferlist rl; + ASSERT_EQ((int)final.length(), + store->read(ch, hoid, 0, final.length(), rl)); + ASSERT_TRUE(bl_eq(rl, final)); + } + { 
+ ObjectStore::Transaction t; + t.remove(cid, hoid); + t.remove(cid, hoid2); + ASSERT_EQ(0, queue_transaction(store, ch, std::move(t))); + } + { + bufferlist final; + bufferptr p(16384); + memset(p.c_str(), 111, p.length()); + bufferlist pl; + pl.append(p); + final.append(p); + ObjectStore::Transaction t; + t.write(cid, hoid, 0, pl.length(), pl); + t.clone(cid, hoid, hoid2); + bufferptr z(4096); + z.zero(); + final.append(z); + bufferptr a(4096); + memset(a.c_str(), 112, a.length()); + bufferlist al; + al.append(a); + final.append(a); + t.write(cid, hoid, pl.length() + z.length(), a.length(), al); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + bufferlist rl; + ASSERT_EQ((int)final.length(), + store->read(ch, hoid, 0, final.length(), rl)); + ASSERT_TRUE(bl_eq(rl, final)); + } + { + ObjectStore::Transaction t; + t.remove(cid, hoid); + t.remove(cid, hoid2); + ASSERT_EQ(0, queue_transaction(store, ch, std::move(t))); + } + { + bufferlist final; + bufferptr p(16000); + memset(p.c_str(), 5, p.length()); + bufferlist pl; + pl.append(p); + final.append(p); + ObjectStore::Transaction t; + t.write(cid, hoid, 0, pl.length(), pl); + t.clone(cid, hoid, hoid2); + bufferptr z(1000); + z.zero(); + final.append(z); + bufferptr a(8000); + memset(a.c_str(), 6, a.length()); + bufferlist al; + al.append(a); + final.append(a); + t.write(cid, hoid, 17000, a.length(), al); + ASSERT_EQ(0, queue_transaction(store, ch, std::move(t))); + bufferlist rl; + ASSERT_EQ((int)final.length(), + store->read(ch, hoid, 0, final.length(), rl)); + /*cout << "expected:\n"; + final.hexdump(cout); + cout << "got:\n"; + rl.hexdump(cout);*/ + ASSERT_TRUE(bl_eq(rl, final)); + } + { + ObjectStore::Transaction t; + t.remove(cid, hoid); + t.remove(cid, hoid2); + ASSERT_EQ(0, queue_transaction(store, ch, std::move(t))); + } + { + bufferptr p(1048576); + memset(p.c_str(), 3, p.length()); + bufferlist pl; + pl.append(p); + ObjectStore::Transaction t; + t.write(cid, hoid, 0, pl.length(), pl); + 
t.clone(cid, hoid, hoid2); + bufferptr a(65536); + memset(a.c_str(), 4, a.length()); + bufferlist al; + al.append(a); + t.write(cid, hoid, a.length(), a.length(), al); + ASSERT_EQ(0, queue_transaction(store, ch, std::move(t))); + bufferlist rl; + bufferlist final; + final.substr_of(pl, 0, al.length()); + final.append(al); + bufferlist end; + end.substr_of(pl, al.length()*2, pl.length() - al.length()*2); + final.append(end); + ASSERT_EQ((int)final.length(), + store->read(ch, hoid, 0, final.length(), rl)); + /*cout << "expected:\n"; + final.hexdump(cout); + cout << "got:\n"; + rl.hexdump(cout);*/ + ASSERT_TRUE(bl_eq(rl, final)); + } + { + ObjectStore::Transaction t; + t.remove(cid, hoid); + t.remove(cid, hoid2); + ASSERT_EQ(0, queue_transaction(store, ch, std::move(t))); + } + { + bufferptr p(65536); + memset(p.c_str(), 7, p.length()); + bufferlist pl; + pl.append(p); + ObjectStore::Transaction t; + t.write(cid, hoid, 0, pl.length(), pl); + t.clone(cid, hoid, hoid2); + bufferptr a(4096); + memset(a.c_str(), 8, a.length()); + bufferlist al; + al.append(a); + t.write(cid, hoid, 32768, a.length(), al); + ASSERT_EQ(0, queue_transaction(store, ch, std::move(t))); + bufferlist rl; + bufferlist final; + final.substr_of(pl, 0, 32768); + final.append(al); + bufferlist end; + end.substr_of(pl, final.length(), pl.length() - final.length()); + final.append(end); + ASSERT_EQ((int)final.length(), + store->read(ch, hoid, 0, final.length(), rl)); + /*cout << "expected:\n"; + final.hexdump(cout); + cout << "got:\n"; + rl.hexdump(cout);*/ + ASSERT_TRUE(bl_eq(rl, final)); + } + { + ObjectStore::Transaction t; + t.remove(cid, hoid); + t.remove(cid, hoid2); + ASSERT_EQ(0, queue_transaction(store, ch, std::move(t))); + } + { + bufferptr p(65536); + memset(p.c_str(), 9, p.length()); + bufferlist pl; + pl.append(p); + ObjectStore::Transaction t; + t.write(cid, hoid, 0, pl.length(), pl); + t.clone(cid, hoid, hoid2); + bufferptr a(4096); + memset(a.c_str(), 10, a.length()); + bufferlist al; + 
al.append(a); + t.write(cid, hoid, 33768, a.length(), al); + ASSERT_EQ(0, queue_transaction(store, ch, std::move(t))); + bufferlist rl; + bufferlist final; + final.substr_of(pl, 0, 33768); + final.append(al); + bufferlist end; + end.substr_of(pl, final.length(), pl.length() - final.length()); + final.append(end); + ASSERT_EQ((int)final.length(), + store->read(ch, hoid, 0, final.length(), rl)); + /*cout << "expected:\n"; + final.hexdump(cout); + cout << "got:\n"; + rl.hexdump(cout);*/ + ASSERT_TRUE(bl_eq(rl, final)); + } + + { + //verify if non-empty collection is properly handled after store reload + ch.reset(); + r = store->umount(); + ASSERT_EQ(r, 0); + r = store->mount(); + ASSERT_EQ(r, 0); + ch = store->open_collection(cid); + + ObjectStore::Transaction t; + t.remove_collection(cid); + cerr << "Invalid rm coll" << std::endl; + PrCtl unset_dumpable; + EXPECT_DEATH(queue_transaction(store, ch, std::move(t)), ""); + } + { + ObjectStore::Transaction t; + t.touch(cid, hoid3); //new record in db + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + ObjectStore::Transaction t; + //verify if non-empty collection is properly handled when there are some pending removes and live records in db + cerr << "Invalid rm coll again" << std::endl; + ch.reset(); + r = store->umount(); + ASSERT_EQ(r, 0); + r = store->mount(); + ASSERT_EQ(r, 0); + ch = store->open_collection(cid); + + t.remove(cid, hoid); + t.remove(cid, hoid2); + t.remove_collection(cid); + PrCtl unset_dumpable; + EXPECT_DEATH(queue_transaction(store, ch, std::move(t)), ""); + } + { + ObjectStore::Transaction t; + t.remove(cid, hoid); + t.remove(cid, hoid2); + t.remove(cid, hoid3); + t.remove_collection(cid); + cerr << "Cleaning" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } +} + +TEST_P(StoreTest, OmapSimple) { + int r; + coll_t cid; + auto ch = store->create_new_collection(cid); + { + ObjectStore::Transaction t; + t.create_collection(cid, 0); + 
cerr << "Creating collection " << cid << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + ghobject_t hoid(hobject_t(sobject_t("omap_obj", CEPH_NOSNAP), + "key", 123, -1, "")); + bufferlist small; + small.append("small"); + map<string,bufferlist> km; + km["foo"] = small; + km["bar"].append("asdfjkasdkjdfsjkafskjsfdj"); + bufferlist header; + header.append("this is a header"); + { + ObjectStore::Transaction t; + t.touch(cid, hoid); + t.omap_setkeys(cid, hoid, km); + t.omap_setheader(cid, hoid, header); + cerr << "Creating object and set omap " << hoid << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + // get header, keys + { + bufferlist h; + map<string,bufferlist> r; + store->omap_get(ch, hoid, &h, &r); + ASSERT_TRUE(bl_eq(header, h)); + ASSERT_EQ(r.size(), km.size()); + cout << "r: " << r << std::endl; + } + // test iterator with seek_to_first + { + map<string,bufferlist> r; + ObjectMap::ObjectMapIterator iter = store->get_omap_iterator(ch, hoid); + for (iter->seek_to_first(); iter->valid(); iter->next()) { + r[iter->key()] = iter->value(); + } + cout << "r: " << r << std::endl; + ASSERT_EQ(r.size(), km.size()); + } + // test iterator with initial lower_bound + { + map<string,bufferlist> r; + ObjectMap::ObjectMapIterator iter = store->get_omap_iterator(ch, hoid); + for (iter->lower_bound(string()); iter->valid(); iter->next()) { + r[iter->key()] = iter->value(); + } + cout << "r: " << r << std::endl; + ASSERT_EQ(r.size(), km.size()); + } + { + ObjectStore::Transaction t; + t.remove(cid, hoid); + t.remove_collection(cid); + cerr << "Cleaning" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } +} + +TEST_P(StoreTest, OmapCloneTest) { + int r; + coll_t cid; + auto ch = store->create_new_collection(cid); + { + ObjectStore::Transaction t; + t.create_collection(cid, 0); + cerr << "Creating collection " << cid << std::endl; + r = queue_transaction(store, ch, 
std::move(t)); + ASSERT_EQ(r, 0); + } + ghobject_t hoid(hobject_t(sobject_t("Object 1", CEPH_NOSNAP), + "key", 123, -1, "")); + bufferlist small; + small.append("small"); + map<string,bufferlist> km; + km["foo"] = small; + km["bar"].append("asdfjkasdkjdfsjkafskjsfdj"); + bufferlist header; + header.append("this is a header"); + { + ObjectStore::Transaction t; + t.touch(cid, hoid); + t.omap_setkeys(cid, hoid, km); + t.omap_setheader(cid, hoid, header); + cerr << "Creating object and set omap " << hoid << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + ghobject_t hoid2(hobject_t(sobject_t("Object 2", CEPH_NOSNAP), + "key", 123, -1, "")); + { + ObjectStore::Transaction t; + t.clone(cid, hoid, hoid2); + cerr << "Clone object" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + map<string,bufferlist> r; + bufferlist h; + store->omap_get(ch, hoid2, &h, &r); + ASSERT_TRUE(bl_eq(header, h)); + ASSERT_EQ(r.size(), km.size()); + } + { + ObjectStore::Transaction t; + t.remove(cid, hoid); + t.remove(cid, hoid2); + t.remove_collection(cid); + cerr << "Cleaning" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } +} + +TEST_P(StoreTest, SimpleCloneRangeTest) { + int r; + coll_t cid; + auto ch = store->create_new_collection(cid); + { + ObjectStore::Transaction t; + t.create_collection(cid, 0); + cerr << "Creating collection " << cid << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + ghobject_t hoid(hobject_t(sobject_t("Object 1", CEPH_NOSNAP))); + hoid.hobj.pool = -1; + bufferlist small, newdata; + small.append("small"); + { + ObjectStore::Transaction t; + t.write(cid, hoid, 10, 5, small); + cerr << "Creating object and write bl " << hoid << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + ghobject_t hoid2(hobject_t(sobject_t("Object 2", CEPH_NOSNAP))); + hoid2.hobj.pool = -1; + { + 
ObjectStore::Transaction t; + t.clone_range(cid, hoid, hoid2, 10, 5, 10); + cerr << "Clone range object" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + r = store->read(ch, hoid2, 10, 5, newdata); + ASSERT_EQ(r, 5); + ASSERT_TRUE(bl_eq(small, newdata)); + } + { + ObjectStore::Transaction t; + t.truncate(cid, hoid, 1024*1024); + t.clone_range(cid, hoid, hoid2, 0, 1024*1024, 0); + cerr << "Clone range object" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + struct stat stat, stat2; + r = store->stat(ch, hoid, &stat); + r = store->stat(ch, hoid2, &stat2); + ASSERT_EQ(stat.st_size, stat2.st_size); + ASSERT_EQ(1024*1024, stat2.st_size); + } + { + ObjectStore::Transaction t; + t.remove(cid, hoid); + t.remove(cid, hoid2); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } +} + +#if defined(WITH_BLUESTORE) +TEST_P(StoreTest, BlueStoreUnshareBlobTest) { + if (string(GetParam()) != "bluestore") + return; + if (smr) { + cout << "SKIP: non-deterministic behavior with smr" << std::endl; + return; + } + int r; + coll_t cid; + auto ch = store->create_new_collection(cid); + { + ObjectStore::Transaction t; + t.create_collection(cid, 0); + cerr << "Creating collection " << cid << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + ghobject_t hoid(hobject_t(sobject_t("Object 1", CEPH_NOSNAP))); + hoid.hobj.pool = -1; + ghobject_t hoid2(hobject_t(sobject_t("Object 1", CEPH_NOSNAP))); + hoid2.hobj.pool = -1; + hoid2.generation = 2; + { + // check if blob is unshared properly + bufferlist data, newdata; + data.append(string(8192, 'a')); + + ObjectStore::Transaction t; + t.write(cid, hoid, 0, data.length(), data); + cerr << "Creating object and write 8K " << hoid << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + + ObjectStore::Transaction t2; + t2.clone_range(cid, hoid, hoid2, 0, 4096, 0); + cerr << "Clone range object" << 
std::endl; + r = queue_transaction(store, ch, std::move(t2)); + ASSERT_EQ(r, 0); + + data.clear(); + data.append(string(4096, 'b')); + + ObjectStore::Transaction t3; + t3.write(cid, hoid, 0, data.length(), data); + cerr << "Writing 4k to source object " << hoid << std::endl; + r = queue_transaction(store, ch, std::move(t3)); + ASSERT_EQ(r, 0); + + { + // this trims hoid one out of onode cache + EXPECT_EQ(store->umount(), 0); + EXPECT_EQ(store->mount(), 0); + ch = store->open_collection(cid); + } + + ObjectStore::Transaction t4; + t4.remove(cid, hoid2); + cerr << "Deleting dest object" << hoid2 << std::endl; + r = queue_transaction(store, ch, std::move(t4)); + ASSERT_EQ(r, 0); + + { + // this ensures remove operation submitted to kv store + EXPECT_EQ(store->umount(), 0); + EXPECT_EQ(store->mount(), 0); + ch = store->open_collection(cid); + } + + bufferlist resdata; + r = store->read(ch, hoid, 0, 0x2000, resdata); + ASSERT_EQ(r, 0x2000); + + { + BlueStore* bstore = dynamic_cast<BlueStore*> (store.get()); + auto* kv = bstore->get_kv(); + + // to be inline with BlueStore.cc + const string PREFIX_SHARED_BLOB = "X"; + + size_t cnt = 0; + auto it = kv->get_iterator(PREFIX_SHARED_BLOB); + ceph_assert(it); + for (it->lower_bound(string()); it->valid(); it->next()) { + ++cnt; + } + ASSERT_EQ(cnt, 0); + } + } + { + ObjectStore::Transaction t; + t.remove(cid, hoid); + t.remove_collection(cid); + cerr << "Cleaning" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } +} + +TEST_P(StoreTest, BlueStoreUnshareBlobBugTest) { + if (string(GetParam()) != "bluestore") + return; + int r; + coll_t cid; + auto ch = store->create_new_collection(cid); + { + ObjectStore::Transaction t; + t.create_collection(cid, 0); + cerr << "Creating collection " << cid << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + ghobject_t hoid(hobject_t(sobject_t("Object 1", CEPH_NOSNAP))); + hoid.hobj.pool = -1; + ghobject_t 
hoid2(hobject_t(sobject_t("Object 1", CEPH_NOSNAP))); + hoid2.hobj.pool = -1; + hoid2.generation = 2; + { + // check if blob is unshared properly + bufferlist data, newdata; + data.append(string(8192, 'a')); + + ObjectStore::Transaction t; + t.write(cid, hoid, 0, data.length(), data); + cerr << "Creating object and write 8K " << hoid << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + + ObjectStore::Transaction t2; + t2.clone_range(cid, hoid, hoid2, 0, 4096, 0); + cerr << "Clone range object" << std::endl; + r = queue_transaction(store, ch, std::move(t2)); + ASSERT_EQ(r, 0); + + data.clear(); + data.append(string(4096, 'b')); + + ObjectStore::Transaction t3; + t3.write(cid, hoid, 0, data.length(), data); + cerr << "Writing 4k to source object " << hoid << std::endl; + r = queue_transaction(store, ch, std::move(t3)); + ASSERT_EQ(r, 0); + + { + // this trims hoid one out of onode cache + EXPECT_EQ(store->umount(), 0); + EXPECT_EQ(store->mount(), 0); + ch = store->open_collection(cid); + } + + ObjectStore::Transaction t4; + t4.write(cid, hoid2, 0, data.length(), data); + cerr << "Writing 4k to second object " << hoid2 << std::endl; + r = queue_transaction(store, ch, std::move(t4)); + ASSERT_EQ(r, 0); + + bufferlist resdata; + r = store->read(ch, hoid, 0, 0x2000, resdata); + ASSERT_EQ(r, 0x2000); + + { + BlueStore* bstore = dynamic_cast<BlueStore*> (store.get()); + auto* kv = bstore->get_kv(); + + // to be inline with BlueStore.cc + const string PREFIX_SHARED_BLOB = "X"; + + size_t cnt = 0; + auto it = kv->get_iterator(PREFIX_SHARED_BLOB); + ceph_assert(it); + for (it->lower_bound(string()); it->valid(); it->next()) { + ++cnt; + } + // This shows a bug in unsharing a blob, + // after writing to 0x0~1000 to hoid2 share blob at hoid should be + //unshared but it doesn't in the current implementation + ASSERT_EQ(cnt, 1); + } + } + { + ObjectStore::Transaction t; + t.remove(cid, hoid); + t.remove(cid, hoid2); + t.remove_collection(cid); + 
cerr << "Cleaning" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } +} +#endif + +TEST_P(StoreTest, SimpleObjectLongnameTest) { + int r; + coll_t cid; + auto ch = store->create_new_collection(cid); + { + ObjectStore::Transaction t; + t.create_collection(cid, 0); + cerr << "Creating collection " << cid << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + ghobject_t hoid(hobject_t(sobject_t("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaObjectaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa 1", CEPH_NOSNAP))); + { + ObjectStore::Transaction t; + t.touch(cid, hoid); + cerr << "Creating object " << hoid << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + ObjectStore::Transaction t; + t.remove(cid, hoid); + t.remove_collection(cid); + cerr << "Cleaning" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } +} + +ghobject_t generate_long_name(unsigned i) +{ + stringstream name; + name << "object id " << i << " "; + for (unsigned j = 0; j < 500; ++j) name << 'a'; + ghobject_t hoid(hobject_t(sobject_t(name.str(), CEPH_NOSNAP))); + hoid.hobj.set_hash(i % 2); + return hoid; +} + +TEST_P(StoreTest, LongnameSplitTest) { + int r; + coll_t cid; + auto ch = store->create_new_collection(cid); + { + ObjectStore::Transaction t; + t.create_collection(cid, 0); + cerr << "Creating collection " << cid << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(0, r); + } + for (unsigned i = 0; i < 320; ++i) { + ObjectStore::Transaction t; + ghobject_t hoid = generate_long_name(i); + t.touch(cid, hoid); + cerr << "Creating object " << hoid << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(0, r); + } + 
+ ghobject_t test_obj = generate_long_name(319); + ghobject_t test_obj_2 = test_obj; + test_obj_2.generation = 0; + { + ObjectStore::Transaction t; + // should cause a split + t.collection_move_rename( + cid, test_obj, + cid, test_obj_2); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(0, r); + } + + for (unsigned i = 0; i < 319; ++i) { + ObjectStore::Transaction t; + ghobject_t hoid = generate_long_name(i); + t.remove(cid, hoid); + cerr << "Removing object " << hoid << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(0, r); + } + { + ObjectStore::Transaction t; + t.remove(cid, test_obj_2); + t.remove_collection(cid); + cerr << "Cleaning" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(0, r); + } + +} + +TEST_P(StoreTest, ManyObjectTest) { + int NUM_OBJS = 2000; + int r = 0; + coll_t cid; + string base = ""; + for (int i = 0; i < 100; ++i) base.append("aaaaa"); + set<ghobject_t> created; + auto ch = store->create_new_collection(cid); + { + ObjectStore::Transaction t; + t.create_collection(cid, 0); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + for (int i = 0; i < NUM_OBJS; ++i) { + if (!(i % 5)) { + cerr << "Object " << i << std::endl; + } + ObjectStore::Transaction t; + char buf[100]; + snprintf(buf, sizeof(buf), "%d", i); + ghobject_t hoid(hobject_t(sobject_t(string(buf) + base, CEPH_NOSNAP))); + t.touch(cid, hoid); + created.insert(hoid); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + + for (set<ghobject_t>::iterator i = created.begin(); + i != created.end(); + ++i) { + struct stat buf; + ASSERT_TRUE(!store->stat(ch, *i, &buf)); + } + + set<ghobject_t> listed, listed2; + vector<ghobject_t> objects; + r = collection_list(store, ch, ghobject_t(), ghobject_t::get_max(), INT_MAX, + &objects, 0); + ASSERT_EQ(r, 0); + + cerr << "objects.size() is " << objects.size() << std::endl; + for (vector<ghobject_t> ::iterator i = objects.begin(); + i != 
objects.end(); + ++i) { + listed.insert(*i); + ASSERT_TRUE(created.count(*i)); + } + ASSERT_TRUE(listed.size() == created.size()); + + ghobject_t start, next; + objects.clear(); + r = collection_list( + store, + ch, + ghobject_t::get_max(), + ghobject_t::get_max(), + 50, + &objects, + &next + ); + ASSERT_EQ(r, 0); + ASSERT_TRUE(objects.empty()); + + objects.clear(); + listed.clear(); + ghobject_t start2, next2; + while (1) { + r = collection_list(store, ch, start, ghobject_t::get_max(), 50, &objects, + &next); + ASSERT_TRUE(sorted(objects)); + ASSERT_EQ(r, 0); + listed.insert(objects.begin(), objects.end()); + if (objects.size() < 50) { + ASSERT_TRUE(next.is_max()); + break; + } + objects.clear(); + + start = next; + } + cerr << "listed.size() is " << listed.size() << std::endl; + ASSERT_TRUE(listed.size() == created.size()); + if (listed2.size()) { + ASSERT_EQ(listed.size(), listed2.size()); + } + for (set<ghobject_t>::iterator i = listed.begin(); + i != listed.end(); + ++i) { + ASSERT_TRUE(created.count(*i)); + } + + for (set<ghobject_t>::iterator i = created.begin(); + i != created.end(); + ++i) { + ObjectStore::Transaction t; + t.remove(cid, *i); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + cerr << "cleaning up" << std::endl; + { + ObjectStore::Transaction t; + t.remove_collection(cid); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } +} + + +class ObjectGenerator { +public: + virtual ghobject_t create_object(gen_type *gen) = 0; + virtual ~ObjectGenerator() {} +}; + +class MixedGenerator : public ObjectGenerator { +public: + unsigned seq; + int64_t poolid; + explicit MixedGenerator(int64_t p) : seq(0), poolid(p) {} + ghobject_t create_object(gen_type *gen) override { + char buf[100]; + snprintf(buf, sizeof(buf), "OBJ_%u", seq); + string name(buf); + if (seq % 2) { + for (unsigned i = 0; i < 300; ++i) { + name.push_back('a'); + } + } + ++seq; + return ghobject_t( + hobject_t( + name, string(), rand() & 2 ? 
CEPH_NOSNAP : rand(), + (((seq / 1024) % 2) * 0xF00 ) + + (seq & 0xFF), + poolid, "")); + } +}; + +class SyntheticWorkloadState { + struct Object { + bufferlist data; + map<string, bufferlist> attrs; + }; +public: + static const unsigned max_in_flight = 16; + static const unsigned max_objects = 3000; + static const unsigned max_attr_size = 5; + static const unsigned max_attr_name_len = 100; + static const unsigned max_attr_value_len = 1024 * 64; + coll_t cid; + unsigned write_alignment; + unsigned max_object_len, max_write_len; + unsigned in_flight; + map<ghobject_t, Object> contents; + set<ghobject_t> available_objects; + set<ghobject_t>::iterator next_available_object; + set<ghobject_t> in_flight_objects; + ObjectGenerator *object_gen; + gen_type *rng; + ObjectStore *store; + ObjectStore::CollectionHandle ch; + + ceph::mutex lock = ceph::make_mutex("State lock"); + ceph::condition_variable cond; + + struct EnterExit { + const char *msg; + explicit EnterExit(const char *m) : msg(m) { + //cout << pthread_self() << " enter " << msg << std::endl; + } + ~EnterExit() { + //cout << pthread_self() << " exit " << msg << std::endl; + } + }; + + class C_SyntheticOnReadable : public Context { + public: + SyntheticWorkloadState *state; + ghobject_t hoid; + C_SyntheticOnReadable(SyntheticWorkloadState *state, ghobject_t hoid) + : state(state), hoid(hoid) {} + + void finish(int r) override { + std::lock_guard locker{state->lock}; + EnterExit ee("onreadable finish"); + ASSERT_TRUE(state->in_flight_objects.count(hoid)); + ASSERT_EQ(r, 0); + state->in_flight_objects.erase(hoid); + if (state->contents.count(hoid)) + state->available_objects.insert(hoid); + --(state->in_flight); + state->cond.notify_all(); + + bufferlist r2; + r = state->store->read(state->ch, hoid, 0, state->contents[hoid].data.length(), r2); + ceph_assert(bl_eq(state->contents[hoid].data, r2)); + state->cond.notify_all(); + } + }; + + class C_SyntheticOnStash : public Context { + public: + SyntheticWorkloadState 
*state; + ghobject_t oid, noid; + + C_SyntheticOnStash(SyntheticWorkloadState *state, + ghobject_t oid, ghobject_t noid) + : state(state), oid(oid), noid(noid) {} + + void finish(int r) override { + std::lock_guard locker{state->lock}; + EnterExit ee("stash finish"); + ASSERT_TRUE(state->in_flight_objects.count(oid)); + ASSERT_EQ(r, 0); + state->in_flight_objects.erase(oid); + if (state->contents.count(noid)) + state->available_objects.insert(noid); + --(state->in_flight); + bufferlist r2; + r = state->store->read( + state->ch, noid, 0, + state->contents[noid].data.length(), r2); + ceph_assert(bl_eq(state->contents[noid].data, r2)); + state->cond.notify_all(); + } + }; + + class C_SyntheticOnClone : public Context { + public: + SyntheticWorkloadState *state; + ghobject_t oid, noid; + + C_SyntheticOnClone(SyntheticWorkloadState *state, + ghobject_t oid, ghobject_t noid) + : state(state), oid(oid), noid(noid) {} + + void finish(int r) override { + std::lock_guard locker{state->lock}; + EnterExit ee("clone finish"); + ASSERT_TRUE(state->in_flight_objects.count(oid)); + ASSERT_EQ(r, 0); + state->in_flight_objects.erase(oid); + if (state->contents.count(oid)) + state->available_objects.insert(oid); + if (state->contents.count(noid)) + state->available_objects.insert(noid); + --(state->in_flight); + bufferlist r2; + r = state->store->read(state->ch, noid, 0, state->contents[noid].data.length(), r2); + ceph_assert(bl_eq(state->contents[noid].data, r2)); + state->cond.notify_all(); + } + }; + + static void filled_byte_array(bufferlist& bl, size_t size) + { + static const char alphanum[] = "0123456789" + "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "abcdefghijklmnopqrstuvwxyz"; + if (!size) { + return; + } + bufferptr bp(size); + for (unsigned int i = 0; i < size - 1; i++) { + // severely limit entropy so we can compress... 
+ bp[i] = alphanum[rand() % 10]; //(sizeof(alphanum) - 1)]; + } + bp[size - 1] = '\0'; + + bl.append(bp); + } + + SyntheticWorkloadState(ObjectStore *store, + ObjectGenerator *gen, + gen_type *rng, + coll_t cid, + unsigned max_size, + unsigned max_write, + unsigned alignment) + : cid(cid), write_alignment(alignment), max_object_len(max_size), + max_write_len(max_write), in_flight(0), + next_available_object(available_objects.end()), + object_gen(gen), rng(rng), store(store) {} + + int init() { + ObjectStore::Transaction t; + ch = store->create_new_collection(cid); + t.create_collection(cid, 0); + return queue_transaction(store, ch, std::move(t)); + } + void shutdown() { + ghobject_t next; + while (1) { + vector<ghobject_t> objects; + int r = collection_list(store, ch, next, ghobject_t::get_max(), 10, + &objects, &next); + ceph_assert(r >= 0); + if (objects.size() == 0) + break; + ObjectStore::Transaction t; + std::map<std::string, ceph::buffer::list> attrset; + for (vector<ghobject_t>::iterator p = objects.begin(); + p != objects.end(); ++p) { + t.remove(cid, *p); + } + queue_transaction(store, ch, std::move(t)); + } + ObjectStore::Transaction t; + t.remove_collection(cid); + queue_transaction(store, ch, std::move(t)); + } + void statfs(store_statfs_t& stat) { + store->statfs(&stat); + } + + ghobject_t get_uniform_random_object(std::unique_lock<ceph::mutex>& locker) { + cond.wait(locker, [this] { + return in_flight < max_in_flight && !available_objects.empty(); + }); + boost::uniform_int<> choose(0, available_objects.size() - 1); + int index = choose(*rng); + set<ghobject_t>::iterator i = available_objects.begin(); + for ( ; index > 0; --index, ++i) ; + ghobject_t ret = *i; + return ret; + } + + ghobject_t get_next_object(std::unique_lock<ceph::mutex>& locker) { + cond.wait(locker, [this] { + return in_flight < max_in_flight && !available_objects.empty(); + }); + + if (next_available_object == available_objects.end()) { + next_available_object = 
available_objects.begin(); + } + + ghobject_t ret = *next_available_object; + ++next_available_object; + return ret; + } + + void wait_for_ready(std::unique_lock<ceph::mutex>& locker) { + cond.wait(locker, [this] { return in_flight < max_in_flight; }); + } + + void wait_for_done() { + std::unique_lock locker{lock}; + cond.wait(locker, [this] { return in_flight == 0; }); + } + + bool can_create() { + return (available_objects.size() + in_flight_objects.size()) < max_objects; + } + + bool can_unlink() { + return (available_objects.size() + in_flight_objects.size()) > 0; + } + + unsigned get_random_alloc_hints() { + unsigned f = 0; + { + boost::uniform_int<> u(0, 3); + switch (u(*rng)) { + case 1: + f |= CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_WRITE; + break; + case 2: + f |= CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_WRITE; + break; + } + } + { + boost::uniform_int<> u(0, 3); + switch (u(*rng)) { + case 1: + f |= CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_READ; + break; + case 2: + f |= CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_READ; + break; + } + } + { + // append_only, immutable + boost::uniform_int<> u(0, 4); + f |= u(*rng) << 4; + } + { + boost::uniform_int<> u(0, 3); + switch (u(*rng)) { + case 1: + f |= CEPH_OSD_ALLOC_HINT_FLAG_SHORTLIVED; + break; + case 2: + f |= CEPH_OSD_ALLOC_HINT_FLAG_LONGLIVED; + break; + } + } + { + boost::uniform_int<> u(0, 3); + switch (u(*rng)) { + case 1: + f |= CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE; + break; + case 2: + f |= CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE; + break; + } + } + return f; + } + + int touch() { + std::unique_lock locker{lock}; + EnterExit ee("touch"); + if (!can_create()) + return -ENOSPC; + wait_for_ready(locker); + ghobject_t new_obj = object_gen->create_object(rng); + available_objects.erase(new_obj); + ObjectStore::Transaction t; + t.touch(cid, new_obj); + boost::uniform_int<> u(17, 22); + boost::uniform_int<> v(12, 17); + t.set_alloc_hint(cid, new_obj, + 1ull << u(*rng), + 1ull << v(*rng), + get_random_alloc_hints()); + ++in_flight; + 
in_flight_objects.insert(new_obj);
+    // Make sure the in-memory model has an entry for the object.
+    if (!contents.count(new_obj))
+      contents[new_obj] = Object();
+    t.register_on_applied(new C_SyntheticOnReadable(this, new_obj));
+    int status = store->queue_transaction(ch, std::move(t));
+    return status;
+  }
+
+  // Renames a random object to the same object with generation + 1 using
+  // collection_move_rename, and moves its attrs/data in `contents` to the
+  // new name. Up to 20 random picks are made looking for an object with
+  // non-empty data. Returns -ENOENT / -ENOSPC when can_unlink() /
+  // can_create() is false, otherwise the queue_transaction() status.
+  int stash() {
+    std::unique_lock locker{lock};
+    EnterExit ee("stash");
+    if (!can_unlink())
+      return -ENOENT;
+    if (!can_create())
+      return -ENOSPC;
+    wait_for_ready(locker);
+
+    ghobject_t old_obj;
+    int max = 20;
+    do {
+      old_obj = get_uniform_random_object(locker);
+    } while (--max && !contents[old_obj].data.length());
+    available_objects.erase(old_obj);
+    ghobject_t new_obj = old_obj;
+    new_obj.generation++;
+    available_objects.erase(new_obj);
+
+    ObjectStore::Transaction t;
+    t.collection_move_rename(cid, old_obj, cid, new_obj);
+    ++in_flight;
+    in_flight_objects.insert(old_obj);
+
+    contents[new_obj].attrs = contents[old_obj].attrs;
+    contents[new_obj].data = contents[old_obj].data;
+    contents.erase(old_obj);
+    t.register_on_applied(new C_SyntheticOnStash(this, old_obj, new_obj));
+    int status = store->queue_transaction(ch, std::move(t));
+    return status;
+  }
+
+  // Clones a random object into a freshly generated object whose hash is
+  // set to the source's hash, and copies attrs and data in `contents`.
+  // Same early-return codes as stash().
+  int clone() {
+    std::unique_lock locker{lock};
+    EnterExit ee("clone");
+    if (!can_unlink())
+      return -ENOENT;
+    if (!can_create())
+      return -ENOSPC;
+    wait_for_ready(locker);
+
+    ghobject_t old_obj;
+    int max = 20;
+    do {
+      old_obj = get_uniform_random_object(locker);
+    } while (--max && !contents[old_obj].data.length());
+    available_objects.erase(old_obj);
+    ghobject_t new_obj = object_gen->create_object(rng);
+    // make the hash match
+    new_obj.hobj.set_hash(old_obj.hobj.get_hash());
+    available_objects.erase(new_obj);
+
+    ObjectStore::Transaction t;
+    t.clone(cid, old_obj, new_obj);
+    ++in_flight;
+    in_flight_objects.insert(old_obj);
+
+    contents[new_obj].attrs = contents[old_obj].attrs;
+    contents[new_obj].data = contents[old_obj].data;
+
+    t.register_on_applied(new C_SyntheticOnClone(this, old_obj, new_obj));
+    int status = store->queue_transaction(ch, std::move(t));
+    return status;
+  }
+
+  // Clones a byte range from one random object into another random object
+  // at the same offset, then applies the equivalent splice to the
+  // destination's model data. Returns 0 without queuing anything when no
+  // source with data was found after 20 picks.
+  int clone_range() {
+    std::unique_lock locker{lock};
+    EnterExit ee("clone_range");
+    if (!can_unlink())
+      return -ENOENT;
+    if (!can_create())
+      return -ENOSPC;
+    wait_for_ready(locker);
+
+    ghobject_t old_obj;
+    int max = 20;
+    do {
+      old_obj = get_uniform_random_object(locker);
+    } while (--max && !contents[old_obj].data.length());
+    bufferlist &srcdata = contents[old_obj].data;
+    if (srcdata.length() == 0) {
+      return 0;
+    }
+    available_objects.erase(old_obj);
+    ghobject_t new_obj = get_uniform_random_object(locker);
+    available_objects.erase(new_obj);
+
+    boost::uniform_int<> u1(0, max_object_len - max_write_len);
+    boost::uniform_int<> u2(0, max_write_len);
+    uint64_t srcoff = u1(*rng);
+    // make src and dst offsets match, since that's what the osd does
+    uint64_t dstoff = srcoff; //u1(*rng);
+    uint64_t len = u2(*rng);
+    if (write_alignment) {
+      srcoff = round_up_to(srcoff, write_alignment);
+      dstoff = round_up_to(dstoff, write_alignment);
+      len = round_up_to(len, write_alignment);
+    }
+
+    // Clamp the range to the source; srcdata is non-empty here, so
+    // length() - 1 cannot wrap.
+    if (srcoff > srcdata.length() - 1) {
+      srcoff = srcdata.length() - 1;
+    }
+    if (srcoff + len > srcdata.length()) {
+      len = srcdata.length() - srcoff;
+    }
+    if (0)
+      cout << __func__ << " from " << srcoff << "~" << len
+        << " (size " << srcdata.length() << ") to "
+        << dstoff << "~" << len << std::endl;
+
+    ObjectStore::Transaction t;
+    t.clone_range(cid, old_obj, new_obj, srcoff, len, dstoff);
+    ++in_flight;
+    in_flight_objects.insert(old_obj);
+
+    // bl = the bytes the store is expected to copy.
+    bufferlist bl;
+    if (srcoff < srcdata.length()) {
+      if (srcoff + len > srcdata.length()) {
+        bl.substr_of(srcdata, srcoff, srcdata.length() - srcoff);
+      } else {
+        bl.substr_of(srcdata, srcoff, len);
+      }
+    }
+
+    bufferlist& dstdata = contents[new_obj].data;
+    if (dstdata.length() <= dstoff) {
+      // Destination ends at or before dstoff: zero-pad, then append.
+      if (bl.length() > 0) {
+        dstdata.append_zero(dstoff - dstdata.length());
+        dstdata.append(bl);
+      }
+    } else {
+      bufferlist value;
+      ceph_assert(dstdata.length() > dstoff);
+      // Rebuild as: old prefix [0, dstoff) + bl + any old tail beyond it.
+      dstdata.cbegin().copy(dstoff, value);
+      value.append(bl);
+      if (value.length() < dstdata.length())
+        dstdata.cbegin(value.length()).copy(
+          dstdata.length() - value.length(), value);
+      value.swap(dstdata);
+    }
+
+    t.register_on_applied(new C_SyntheticOnClone(this, old_obj, new_obj));
+    int status = store->queue_transaction(ch, std::move(t));
+    return status;
+  }
+
+
+  // Writes `len` bytes produced by filled_byte_array() at a random offset
+  // of a random object and applies the same overwrite to the model data.
+  // Returns -ENOENT when can_unlink() is false, otherwise the
+  // queue_transaction() status.
+  int write() {
+    std::unique_lock locker{lock};
+    EnterExit ee("write");
+    if (!can_unlink())
+      return -ENOENT;
+    wait_for_ready(locker);
+
+    ghobject_t new_obj = get_uniform_random_object(locker);
+    available_objects.erase(new_obj);
+    ObjectStore::Transaction t;
+
+    boost::uniform_int<> u1(0, max_object_len - max_write_len);
+    boost::uniform_int<> u2(0, max_write_len);
+    uint64_t offset = u1(*rng);
+    uint64_t len = u2(*rng);
+    bufferlist bl;
+    if (write_alignment) {
+      offset = round_up_to(offset, write_alignment);
+      len = round_up_to(len, write_alignment);
+    }
+
+    filled_byte_array(bl, len);
+
+    bufferlist& data = contents[new_obj].data;
+    if (data.length() <= offset) {
+      // Write starts at or past the current end: zero-pad, then append.
+      if (len > 0) {
+        data.append_zero(offset-data.length());
+        data.append(bl);
+      }
+    } else {
+      // Overwrite in the middle: prefix + bl + any remaining old tail.
+      bufferlist value;
+      ceph_assert(data.length() > offset);
+      data.cbegin().copy(offset, value);
+      value.append(bl);
+      if (value.length() < data.length())
+        data.cbegin(value.length()).copy(
+          data.length()-value.length(), value);
+      value.swap(data);
+    }
+
+    t.write(cid, new_obj, offset, len, bl);
+    ++in_flight;
+    in_flight_objects.insert(new_obj);
+    t.register_on_applied(new C_SyntheticOnReadable(this, new_obj));
+    int status = store->queue_transaction(ch, std::move(t));
+    return status;
+  }
+
+  // Truncates a random object to a random length drawn from
+  // [0, max_object_len]. Returns -ENOENT when can_unlink() is false.
+  int truncate() {
+    std::unique_lock locker{lock};
+    EnterExit ee("truncate");
+    if (!can_unlink())
+      return -ENOENT;
+    wait_for_ready(locker);
+
+    ghobject_t obj = get_uniform_random_object(locker);
+    available_objects.erase(obj);
+    ObjectStore::Transaction t;
+
+    boost::uniform_int<> choose(0, max_object_len);
+    size_t len = choose(*rng);
+    if
(write_alignment) {
+      len = round_up_to(len, write_alignment);
+    }
+
+    t.truncate(cid, obj, len);
+    ++in_flight;
+    in_flight_objects.insert(obj);
+    // Mirror the truncate in the model: grow with zeros or keep the
+    // first `len` bytes.
+    bufferlist& data = contents[obj].data;
+    if (data.length() <= len) {
+      data.append_zero(len - data.length());
+    } else {
+      bufferlist bl;
+      data.cbegin().copy(len, bl);
+      bl.swap(data);
+    }
+
+    t.register_on_applied(new C_SyntheticOnReadable(this, obj));
+    int status = store->queue_transaction(ch, std::move(t));
+    return status;
+  }
+
+  // Zeroes a random range of a random object. In the model the data is
+  // first extended with zeros up to offset + len if shorter, then the
+  // range [offset, offset + len) is replaced by zeros. Nothing changes in
+  // the model when len == 0, but the zero op is still queued.
+  // Returns -ENOENT when can_unlink() is false.
+  int zero() {
+    std::unique_lock locker{lock};
+    EnterExit ee("zero");
+    if (!can_unlink())
+      return -ENOENT;
+    wait_for_ready(locker);
+
+    ghobject_t new_obj = get_uniform_random_object(locker);
+    available_objects.erase(new_obj);
+    ObjectStore::Transaction t;
+
+    boost::uniform_int<> u1(0, max_object_len - max_write_len);
+    boost::uniform_int<> u2(0, max_write_len);
+    uint64_t offset = u1(*rng);
+    uint64_t len = u2(*rng);
+    if (write_alignment) {
+      offset = round_up_to(offset, write_alignment);
+      len = round_up_to(len, write_alignment);
+    }
+
+    if (len > 0) {
+      auto& data = contents[new_obj].data;
+      if (data.length() < offset + len) {
+        data.append_zero(offset+len-data.length());
+      }
+      bufferlist n;
+      n.substr_of(data, 0, offset);
+      n.append_zero(len);
+      if (data.length() > offset + len)
+        data.cbegin(offset + len).copy(data.length() - offset - len, n);
+      data.swap(n);
+    }
+
+    t.zero(cid, new_obj, offset, len);
+    ++in_flight;
+    in_flight_objects.insert(new_obj);
+    t.register_on_applied(new C_SyntheticOnReadable(this, new_obj));
+    int status = store->queue_transaction(ch, std::move(t));
+    return status;
+  }
+
+  // Reads a random range from a random object and checks it against the
+  // model data. The two draws are swapped when offset > len, so the
+  // request always has offset <= len.
+  void read() {
+    EnterExit ee("read");
+    boost::uniform_int<> u1(0, max_object_len/2);
+    boost::uniform_int<> u2(0, max_object_len);
+    uint64_t offset = u1(*rng);
+    uint64_t len = u2(*rng);
+    if (offset > len)
+      swap(offset, len);
+
+    ghobject_t obj;
+    bufferlist expected;
+    int r;
+    {
+      std::unique_lock locker{lock};
+      EnterExit ee("read locked");
+      if (!can_unlink())
+        return ;
+
wait_for_ready(locker);
+
+      obj = get_uniform_random_object(locker);
+      // Snapshot the expected bytes while the lock is held; the store
+      // read below runs after this scope releases it.
+      expected = contents[obj].data;
+    }
+    bufferlist bl, result;
+    if (0) cout << " obj " << obj
+         << " size " << expected.length()
+         << " offset " << offset
+         << " len " << len << std::endl;
+    r = store->read(ch, obj, offset, len, result);
+    if (offset >= expected.length()) {
+      // Reading at or past the end is expected to return 0.
+      ASSERT_EQ(r, 0);
+    } else {
+      // Clamp the expected length to what the object actually holds.
+      size_t max_len = expected.length() - offset;
+      if (len > max_len)
+        len = max_len;
+      ceph_assert(len == result.length());
+      ASSERT_EQ(len, result.length());
+      expected.cbegin(offset).copy(len, bl);
+      ASSERT_EQ(r, (int)len);
+      ASSERT_TRUE(bl_eq(bl, result));
+    }
+  }
+
+  // Sets between 1 and max_attr_size attributes on a random object and
+  // records them in the model. For each attribute a draw from [0, 100]
+  // decides: below 50 (and while an existing key is still unused) an
+  // existing attribute is overwritten; otherwise a new attribute with a
+  // generated name is written. Returns -ENOENT when can_unlink() is false.
+  int setattrs() {
+    std::unique_lock locker{lock};
+    EnterExit ee("setattrs");
+    if (!can_unlink())
+      return -ENOENT;
+    wait_for_ready(locker);
+
+    ghobject_t obj = get_uniform_random_object(locker);
+    available_objects.erase(obj);
+    ObjectStore::Transaction t;
+
+    boost::uniform_int<> u0(1, max_attr_size);
+    boost::uniform_int<> u1(4, max_attr_name_len);
+    boost::uniform_int<> u2(4, max_attr_value_len);
+    boost::uniform_int<> u3(0, 100);
+    uint64_t size = u0(*rng);
+    uint64_t name_len;
+    map<string, bufferlist, less<>> attrs;
+    // Names of the attributes the object already has in the model.
+    set<string> keys;
+    for (map<string, bufferlist>::iterator it = contents[obj].attrs.begin();
+         it != contents[obj].attrs.end(); ++it)
+      keys.insert(it->first);
+
+    while (size--) {
+      bufferlist name, value;
+      uint64_t get_exist = u3(*rng);
+      uint64_t value_len = u2(*rng);
+      filled_byte_array(value, value_len);
+      if (get_exist < 50 && keys.size()) {
+        set<string>::iterator k = keys.begin();
+        attrs[*k] = value;
+        contents[obj].attrs[*k] = value;
+        // Each existing key is overwritten at most once per call.
+        keys.erase(k);
+      } else {
+        name_len = u1(*rng);
+        filled_byte_array(name, name_len);
+        attrs[name.c_str()] = value;
+        contents[obj].attrs[name.c_str()] = value;
+      }
+    }
+    t.setattrs(cid, obj, attrs);
+    ++in_flight;
+    in_flight_objects.insert(obj);
+    t.register_on_applied(new C_SyntheticOnReadable(this, obj));
+    int status = store->queue_transaction(ch,
std::move(t));
+    return status;
+  }
+
+  // Sets `entries` attributes on the object returned by get_next_object(),
+  // each with a generated name of key_size bytes and a generated value of
+  // val_size bytes, and records them in the model.
+  // Returns -ENOENT when can_unlink() is false, otherwise the
+  // queue_transaction() status.
+  // NOTE(review): the EnterExit label below is "setattrs", same as
+  // setattrs() - looks copy-pasted; confirm before relying on trace output.
+  int set_fixed_attrs(size_t entries, size_t key_size, size_t val_size) {
+    std::unique_lock locker{ lock };
+    EnterExit ee("setattrs");
+    if (!can_unlink())
+      return -ENOENT;
+    wait_for_ready(locker);
+
+    ghobject_t obj = get_next_object(locker);
+    available_objects.erase(obj);
+    ObjectStore::Transaction t;
+
+    map<string, bufferlist, less<>> attrs;
+
+    while (entries--) {
+      bufferlist name, value;
+      filled_byte_array(value, val_size);
+      filled_byte_array(name, key_size);
+      attrs[name.c_str()] = value;
+      contents[obj].attrs[name.c_str()] = value;
+    }
+    t.setattrs(cid, obj, attrs);
+    ++in_flight;
+    in_flight_objects.insert(obj);
+    t.register_on_applied(new C_SyntheticOnReadable(this, obj));
+    int status = store->queue_transaction(ch, std::move(t));
+    return status;
+  }
+
+  // Fetches all attributes of a random object that has attributes in the
+  // model and compares them with the model. Gives up silently after 10
+  // picks without finding such an object.
+  void getattrs() {
+    EnterExit ee("getattrs");
+    ghobject_t obj;
+    map<string, bufferlist> expected;
+    {
+      std::unique_lock locker{lock};
+      EnterExit ee("getattrs locked");
+      if (!can_unlink())
+        return ;
+      wait_for_ready(locker);
+
+      int retry = 10;
+      do {
+        obj = get_uniform_random_object(locker);
+        if (!--retry)
+          return ;
+      } while (contents[obj].attrs.empty());
+      expected = contents[obj].attrs;
+    }
+    map<string, bufferlist, less<>> attrs;
+    int r = store->getattrs(ch, obj, attrs);
+    // ASSERT_EQ reports both values on failure.
+    ASSERT_EQ(r, 0);
+    ASSERT_EQ(attrs.size(), expected.size());
+    for (map<string, bufferlist>::iterator it = expected.begin();
+         it != expected.end(); ++it) {
+      ASSERT_TRUE(bl_eq(attrs[it->first], it->second));
+    }
+  }
+
+  // Fetches one randomly chosen attribute of a random object that has
+  // attributes in the model and compares its value with the model.
+  void getattr() {
+    EnterExit ee("getattr");
+    ghobject_t obj;
+    int r;
+    int retry;
+    map<string, bufferlist> expected;
+    {
+      std::unique_lock locker{lock};
+      EnterExit ee("getattr locked");
+      if (!can_unlink())
+        return ;
+      wait_for_ready(locker);
+
+      retry = 10;
+      do {
+        obj = get_uniform_random_object(locker);
+        if (!--retry)
+          return ;
+      } while (contents[obj].attrs.empty());
+      expected = contents[obj].attrs;
+    }
+
boost::uniform_int<> u(0, expected.size()-1);
+    // `retry` is reused as the index of the attribute to check.
+    retry = u(*rng);
+    map<string, bufferlist>::iterator it = expected.begin();
+    while (retry) {
+      retry--;
+      ++it;
+    }
+
+    bufferlist bl;
+    r = store->getattr(ch, obj, it->first, bl);
+    ASSERT_EQ(r, 0);
+    ASSERT_TRUE(bl_eq(it->second, bl));
+  }
+
+  // Removes one randomly chosen attribute from a random object that has
+  // attributes, in both the store and the model. Returns -ENOENT when
+  // can_unlink() is false and 0 when no suitable object was found within
+  // 10 picks; otherwise the queue_transaction() status.
+  int rmattr() {
+    std::unique_lock locker{lock};
+    EnterExit ee("rmattr");
+    if (!can_unlink())
+      return -ENOENT;
+    wait_for_ready(locker);
+
+    ghobject_t obj;
+    int retry = 10;
+    do {
+      obj = get_uniform_random_object(locker);
+      if (!--retry)
+        return 0;
+    } while (contents[obj].attrs.empty());
+
+    boost::uniform_int<> u(0, contents[obj].attrs.size()-1);
+    retry = u(*rng);
+    map<string, bufferlist>::iterator it = contents[obj].attrs.begin();
+    while (retry) {
+      retry--;
+      ++it;
+    }
+
+    available_objects.erase(obj);
+    ObjectStore::Transaction t;
+    t.rmattr(cid, obj, it->first);
+
+    contents[obj].attrs.erase(it->first);
+    ++in_flight;
+    in_flight_objects.insert(obj);
+    t.register_on_applied(new C_SyntheticOnReadable(this, obj));
+    int status = store->queue_transaction(ch, std::move(t));
+    return status;
+  }
+
+  // Waits for all in-flight operations, then unmounts the store, runs
+  // fsck (deep or not) and mounts it again, reopening the collection
+  // handle. -EOPNOTSUPP from fsck is tolerated.
+  void fsck(bool deep) {
+    std::unique_lock locker{lock};
+    EnterExit ee("fsck");
+    cond.wait(locker, [this] { return in_flight == 0; });
+    ch.reset();
+    store->umount();
+    int r = store->fsck(deep);
+    ceph_assert(r == 0 || r == -EOPNOTSUPP);
+    store->mount();
+    ch = store->open_collection(cid);
+  }
+
+  // Waits for all in-flight operations, then lists the collection twice
+  // (paged by 100, and in one call) and checks that both listings match
+  // available_objects exactly.
+  void scan() {
+    std::unique_lock locker{lock};
+    EnterExit ee("scan");
+    cond.wait(locker, [this] { return in_flight == 0; });
+    vector<ghobject_t> objects;
+    set<ghobject_t> objects_set, objects_set2;
+    ghobject_t next, current;
+    while (1) {
+      //cerr << "scanning..." << std::endl;
+      int r = collection_list(store, ch, current, ghobject_t::get_max(), 100,
+                              &objects, &next);
+      ASSERT_EQ(r, 0);
+      ASSERT_TRUE(sorted(objects));
+      objects_set.insert(objects.begin(), objects.end());
+      objects.clear();
+      if (next.is_max()) break;
+      current = next;
+    }
+    if (objects_set.size() != available_objects.size()) {
+      // "+" marks an object listed by the store but unknown to the model,
+      // "-" one the model has but the store did not list.
+      for (set<ghobject_t>::iterator p = objects_set.begin();
+           p != objects_set.end();
+           ++p)
+        if (available_objects.count(*p) == 0) {
+          cerr << "+ " << *p << std::endl;
+          ceph_abort();
+        }
+      for (set<ghobject_t>::iterator p = available_objects.begin();
+           p != available_objects.end();
+           ++p)
+        if (objects_set.count(*p) == 0)
+          cerr << "- " << *p << std::endl;
+      //cerr << " objects_set: " << objects_set << std::endl;
+      //cerr << " available_set: " << available_objects << std::endl;
+      ceph_abort_msg("badness");
+    }
+
+    ASSERT_EQ(objects_set.size(), available_objects.size());
+    for (set<ghobject_t>::iterator i = objects_set.begin();
+         i != objects_set.end();
+         ++i) {
+      ASSERT_GT(available_objects.count(*i), (unsigned)0);
+    }
+
+    int r = collection_list(store, ch, ghobject_t(), ghobject_t::get_max(),
+                            INT_MAX, &objects, 0);
+    ASSERT_EQ(r, 0);
+    objects_set2.insert(objects.begin(), objects.end());
+    ASSERT_EQ(objects_set2.size(), available_objects.size());
+    for (set<ghobject_t>::iterator i = objects_set2.begin();
+         i != objects_set2.end();
+         ++i) {
+      // Print the offending object before the fatal assertion. With the
+      // assertion first, this diagnostic could never be reached.
+      if (available_objects.count(*i) == 0) {
+        cerr << "+ " << *i << std::endl;
+      }
+      ASSERT_GT(available_objects.count(*i), (unsigned)0);
+    }
+  }
+
+  // Stats a random object and compares the reported size with the model.
+  // The object is parked in in_flight_objects while the store call runs
+  // outside the lock.
+  void stat() {
+    EnterExit ee("stat");
+    ghobject_t hoid;
+    uint64_t expected;
+    {
+      std::unique_lock locker{lock};
+      EnterExit ee("stat lock1");
+      if (!can_unlink())
+        return ;
+      hoid = get_uniform_random_object(locker);
+      in_flight_objects.insert(hoid);
+      available_objects.erase(hoid);
+      ++in_flight;
+      expected = contents[hoid].data.length();
+    }
+    struct stat buf;
+    int r = store->stat(ch, hoid, &buf);
+    ASSERT_EQ(0, r);
+
ceph_assert((uint64_t)buf.st_size == expected);
+    // NOTE(review): a fatal gtest failure here (or at the status check
+    // just above) returns before the block below restores in_flight and
+    // the object sets - confirm whether that matters for wait_for_done().
+    ASSERT_TRUE((uint64_t)buf.st_size == expected);
+    {
+      std::lock_guard locker{lock};
+      EnterExit ee("stat lock2");
+      --in_flight;
+      cond.notify_all();
+      in_flight_objects.erase(hoid);
+      available_objects.insert(hoid);
+    }
+  }
+
+  // Removes a random object from the store and drops it from the model.
+  // Returns -ENOENT when can_unlink() is false, otherwise the
+  // queue_transaction() status. Unlike the other mutators it does not
+  // call wait_for_ready() first.
+  int unlink() {
+    std::unique_lock locker{lock};
+    EnterExit ee("unlink");
+    if (!can_unlink())
+      return -ENOENT;
+    ghobject_t to_remove = get_uniform_random_object(locker);
+    ObjectStore::Transaction t;
+    t.remove(cid, to_remove);
+    ++in_flight;
+    available_objects.erase(to_remove);
+    in_flight_objects.insert(to_remove);
+    contents.erase(to_remove);
+    t.register_on_applied(new C_SyntheticOnReadable(this, to_remove));
+    int status = store->queue_transaction(ch, std::move(t));
+    return status;
+  }
+
+  // Prints the sizes of the object sets and the in_flight counter to
+  // cerr, under `lock`.
+  void print_internal_state() {
+    std::lock_guard locker{lock};
+    cerr << "available_objects: " << available_objects.size()
+         << " in_flight_objects: " << in_flight_objects.size()
+         << " total objects: " << in_flight_objects.size() + available_objects.size()
+         << " in_flight " << in_flight << std::endl;
+  }
+};
+
+
+// Runs a randomized workload against the store: seeds num_ops/10 objects
+// with touch(), then performs num_ops operations chosen by a draw from
+// [0, 999]. max_obj, max_wr and align are forwarded to
+// SyntheticWorkloadState. bluestore_fsck_on_mount/umount are set to
+// "false" before the run.
+void StoreTest::doSyntheticTest(
+  int num_ops,
+  uint64_t max_obj, uint64_t max_wr, uint64_t align)
+{
+  MixedGenerator gen(555);
+  gen_type rng(time(NULL));
+  coll_t cid(spg_t(pg_t(0,555), shard_id_t::NO_SHARD));
+
+  SetVal(g_conf(), "bluestore_fsck_on_mount", "false");
+  SetVal(g_conf(), "bluestore_fsck_on_umount", "false");
+  g_ceph_context->_conf.apply_changes(nullptr);
+
+  SyntheticWorkloadState test_obj(store.get(), &gen, &rng, cid,
+                                  max_obj, max_wr, align);
+  test_obj.init();
+  for (int i = 0; i < num_ops/10; ++i) {
+    if (!(i % 500)) cerr << "seeding object " << i << std::endl;
+    test_obj.touch();
+  }
+  for (int i = 0; i < num_ops; ++i) {
+    if (!(i % 1000)) {
+      cerr << "Op " << i << std::endl;
+      test_obj.print_internal_state();
+    }
+    boost::uniform_int<> true_false(0, 999);
+    int val = true_false(rng);
+    // 999 -> deep fsck, 998 -> shallow fsck.
+    if (val > 998) {
+      test_obj.fsck(true);
+    } else if (val > 997) {
+
test_obj.fsck(false);
+    // Remaining bands of the 0..999 draw, from the top down:
+    // 971-997 scan, 951-970 stat, 851-950 zero, 801-850 unlink,
+    // 551-800 write, 501-550 clone, 451-500 clone_range,
+    // 301-450 stash, 101-300 read, 0-100 truncate.
+    } else if (val > 970) {
+      test_obj.scan();
+    } else if (val > 950) {
+      test_obj.stat();
+    } else if (val > 850) {
+      test_obj.zero();
+    } else if (val > 800) {
+      test_obj.unlink();
+    } else if (val > 550) {
+      test_obj.write();
+    } else if (val > 500) {
+      test_obj.clone();
+    } else if (val > 450) {
+      test_obj.clone_range();
+    } else if (val > 300) {
+      test_obj.stash();
+    } else if (val > 100) {
+      test_obj.read();
+    } else {
+      test_obj.truncate();
+    }
+  }
+  test_obj.wait_for_done();
+  test_obj.shutdown();
+}
+
+// 10000 operations, max object size 400 KiB, max write 40 KiB, no
+// alignment.
+TEST_P(StoreTest, Synthetic) {
+  doSyntheticTest(10000, 400*1024, 40*1024, 0);
+}
+
+#if defined(WITH_BLUESTORE)
+// Bluestore only. Hands the option table `m` to do_matrix() together
+// with doSyntheticTest as the callback. Each row is an option name
+// followed by its candidate values and a 0 terminator.
+// NOTE(review): presumably do_matrix runs the callback for every value
+// combination - confirm in do_matrix.
+TEST_P(StoreTestSpecificAUSize, SyntheticMatrixSharding) {
+  if (string(GetParam()) != "bluestore")
+    return;
+
+  const char *m[][10] = {
+    { "bluestore_min_alloc_size", "4096", 0 }, // must be the first!
+    { "num_ops", "50000", 0 },
+    { "max_write", "65536", 0 },
+    { "max_size", "262144", 0 },
+    { "alignment", "4096", 0 },
+    { "bluestore_max_blob_size", "65536", 0 },
+    { "bluestore_extent_map_shard_min_size", "60", 0 },
+    { "bluestore_extent_map_shard_max_size", "300", 0 },
+    { "bluestore_extent_map_shard_target_size", "150", 0 },
+    { "bluestore_default_buffered_read", "true", 0 },
+    { "bluestore_default_buffered_write", "true", 0 },
+    { 0 },
+  };
+  do_matrix(m, std::bind(&StoreTest::doSyntheticTest, this, _1, _2, _3, _4));
+}
+
+// Bluestore only. Writes 1000 zero-filled 4096-byte chunks at offsets
+// i*2*len (leaving a gap after each), then 1000 more at the same offsets
+// shifted by one byte, and finally removes the object and collection.
+TEST_P(StoreTestSpecificAUSize, ZipperPatternSharded) {
+  if(string(GetParam()) != "bluestore")
+    return;
+  StartDeferred(4096);
+
+  int r;
+  coll_t cid;
+  ghobject_t a(hobject_t(sobject_t("Object 1", CEPH_NOSNAP)));
+  auto ch = store->create_new_collection(cid);
+  {
+    ObjectStore::Transaction t;
+    t.create_collection(cid, 0);
+    cerr << "Creating collection " << cid << std::endl;
+    r = queue_transaction(store, ch, std::move(t));
+    ASSERT_EQ(r, 0);
+  }
+  bufferlist bl;
+  int len = 4096;
+  bufferptr bp(len);
+  bp.zero();
+  bl.append(bp);
+  for (int i=0; i<1000; ++i) {
+    ObjectStore::Transaction t;
+
t.write(cid, a, i*2*len, len, bl, 0);
+    r = queue_transaction(store, ch, std::move(t));
+    ASSERT_EQ(r, 0);
+  }
+  // Second pass: same stride, shifted by one byte, so every write
+  // straddles the end of a first-pass chunk and the following gap.
+  for (int i=0; i<1000; ++i) {
+    ObjectStore::Transaction t;
+    t.write(cid, a, i*2*len + 1, len, bl, 0);
+    r = queue_transaction(store, ch, std::move(t));
+    ASSERT_EQ(r, 0);
+  }
+  {
+    ObjectStore::Transaction t;
+    t.remove(cid, a);
+    t.remove_collection(cid);
+    cerr << "Cleaning" << std::endl;
+    r = queue_transaction(store, ch, std::move(t));
+    ASSERT_EQ(r, 0);
+  }
+}
+
+// Bluestore only. Synthetic workload over the listed
+// bluestore_csum_type values with buffered writes disabled.
+TEST_P(StoreTestSpecificAUSize, SyntheticMatrixCsumAlgorithm) {
+  if (string(GetParam()) != "bluestore")
+    return;
+
+  const char *m[][10] = {
+    { "bluestore_min_alloc_size", "65536", 0 }, // must be the first!
+    { "max_write", "65536", 0 },
+    { "max_size", "1048576", 0 },
+    { "alignment", "16", 0 },
+    { "bluestore_csum_type", "crc32c", "crc32c_16", "crc32c_8", "xxhash32",
+      "xxhash64", "none", 0 },
+    { "bluestore_default_buffered_write", "false", 0 },
+    { 0 },
+  };
+  do_matrix(m, std::bind(&StoreTest::doSyntheticTest, this, _1, _2, _3, _4));
+}
+
+// Bluestore only. Synthetic workload with forced compression and crc32c
+// checksums, varying allocation size, compression algorithm and the
+// buffered read/write defaults.
+TEST_P(StoreTestSpecificAUSize, SyntheticMatrixCsumVsCompression) {
+  if (string(GetParam()) != "bluestore")
+    return;
+
+  const char *m[][10] = {
+    { "bluestore_min_alloc_size", "4096", "16384", 0 }, //to be the first!
+    { "max_write", "131072", 0 },
+    { "max_size", "262144", 0 },
+    { "alignment", "512", 0 },
+    { "bluestore_compression_mode", "force", 0},
+    { "bluestore_compression_algorithm", "snappy", "zlib", 0 },
+    { "bluestore_csum_type", "crc32c", 0 },
+    { "bluestore_default_buffered_read", "true", "false", 0 },
+    { "bluestore_default_buffered_write", "true", "false", 0 },
+    { "bluestore_sync_submit_transaction", "false", 0 },
+    { 0 },
+  };
+  do_matrix(m, std::bind(&StoreTest::doSyntheticTest, this, _1, _2, _3, _4));
+}
+
+// Bluestore only. Synthetic workload over the listed
+// bluestore_compression_mode values and two allocation sizes.
+TEST_P(StoreTestSpecificAUSize, SyntheticMatrixCompression) {
+  if (string(GetParam()) != "bluestore")
+    return;
+
+  const char *m[][10] = {
+    { "bluestore_min_alloc_size", "4096", "65536", 0 }, // to be the first!
+    { "max_write", "1048576", 0 },
+    { "max_size", "4194304", 0 },
+    { "alignment", "65536", 0 },
+    { "bluestore_compression_mode", "force", "aggressive", "passive", "none", 0},
+    { "bluestore_default_buffered_write", "false", 0 },
+    { "bluestore_sync_submit_transaction", "true", 0 },
+    { 0 },
+  };
+  do_matrix(m, std::bind(&StoreTest::doSyntheticTest, this, _1, _2, _3, _4));
+}
+
+// Bluestore only. Synthetic workload with forced compression, varying
+// the compression algorithm (zlib, snappy) and the allocation size.
+TEST_P(StoreTestSpecificAUSize, SyntheticMatrixCompressionAlgorithm) {
+  if (string(GetParam()) != "bluestore")
+    return;
+
+  const char *m[][10] = {
+    { "bluestore_min_alloc_size", "4096", "65536", 0 }, // to be the first!
+    { "max_write", "1048576", 0 },
+    { "max_size", "4194304", 0 },
+    { "alignment", "65536", 0 },
+    { "bluestore_compression_algorithm", "zlib", "snappy", 0 },
+    { "bluestore_compression_mode", "force", 0 },
+    { "bluestore_default_buffered_write", "false", 0 },
+    { 0 },
+  };
+  do_matrix(m, std::bind(&StoreTest::doSyntheticTest, this, _1, _2, _3, _4));
+}
+
+// Bluestore only. Synthetic workload with checksums disabled
+// (bluestore_csum_type "none"), varying compression mode, buffered read
+// default and synchronous transaction submission.
+TEST_P(StoreTestSpecificAUSize, SyntheticMatrixNoCsum) {
+  if (string(GetParam()) != "bluestore")
+    return;
+
+  const char *m[][10] = {
+    { "bluestore_min_alloc_size", "4096", "65536", 0 }, // to be the first!
+    { "max_write", "65536", 0 },
+    { "max_size", "1048576", 0 },
+    { "alignment", "512", 0 },
+    { "bluestore_max_blob_size", "262144", 0 },
+    { "bluestore_compression_mode", "force", "none", 0},
+    { "bluestore_csum_type", "none", 0},
+    { "bluestore_default_buffered_read", "true", "false", 0 },
+    { "bluestore_default_buffered_write", "true", 0 },
+    { "bluestore_sync_submit_transaction", "true", "false", 0 },
+    { 0 },
+  };
+  do_matrix(m, std::bind(&StoreTest::doSyntheticTest, this, _1, _2, _3, _4));
+}
+
+// Bluestore only. Synthetic workload varying
+// bluestore_prefer_deferred_size (32768 or 0) and the compression mode.
+TEST_P(StoreTestSpecificAUSize, SyntheticMatrixPreferDeferred) {
+  if (string(GetParam()) != "bluestore")
+    return;
+
+  const char *m[][10] = {
+    { "bluestore_min_alloc_size", "4096", "65536", 0 }, // to be the first!
+    { "max_write", "65536", 0 },
+    { "max_size", "1048576", 0 },
+    { "alignment", "512", 0 },
+    { "bluestore_max_blob_size", "262144", 0 },
+    { "bluestore_compression_mode", "force", "none", 0},
+    { "bluestore_prefer_deferred_size", "32768", "0", 0},
+    { 0 },
+  };
+  do_matrix(m, std::bind(&StoreTest::doSyntheticTest, this, _1, _2, _3, _4));
+}
+#endif // WITH_BLUESTORE
+
+// Attribute-focused randomized workload: seeds 500 objects, then runs
+// 1000 operations chosen by a draw from [0, 99]:
+// 98-99 scan, 94-97 stat, 76-93 rmattr, 48-75 setattrs, 46-47 clone,
+// 38-45 stash, 31-37 getattrs, 0-30 getattr.
+TEST_P(StoreTest, AttrSynthetic) {
+  MixedGenerator gen(447);
+  gen_type rng(time(NULL));
+  coll_t cid(spg_t(pg_t(0,447),shard_id_t::NO_SHARD));
+
+  SyntheticWorkloadState test_obj(store.get(), &gen, &rng, cid, 40*1024, 4*1024, 0);
+  test_obj.init();
+  for (int i = 0; i < 500; ++i) {
+    if (!(i % 10)) cerr << "seeding object " << i << std::endl;
+    test_obj.touch();
+  }
+  for (int i = 0; i < 1000; ++i) {
+    if (!(i % 100)) {
+      cerr << "Op " << i << std::endl;
+      test_obj.print_internal_state();
+    }
+    boost::uniform_int<> true_false(0, 99);
+    int val = true_false(rng);
+    if (val > 97) {
+      test_obj.scan();
+    } else if (val > 93) {
+      test_obj.stat();
+    } else if (val > 75) {
+      test_obj.rmattr();
+    } else if (val > 47) {
+      test_obj.setattrs();
+    } else if (val > 45) {
+      test_obj.clone();
+    } else if (val > 37) {
+      test_obj.stash();
+    } else if (val > 30) {
+      test_obj.getattrs();
+    } else {
+      test_obj.getattr();
+    }
+  }
+  test_obj.wait_for_done();
+  test_obj.shutdown();
+}
+
+// Creates 10 x 1000 objects that all use hash 0 (names are "<i>" followed
+// by 500 'a' characters, namespaces "n0".."n9"), then checks that listing
+// the collection returns exactly the created set.
+TEST_P(StoreTest, HashCollisionTest) {
+  int64_t poolid = 11;
+  coll_t cid(spg_t(pg_t(0,poolid),shard_id_t::NO_SHARD));
+  int r;
+  auto ch = store->create_new_collection(cid);
+  {
+    ObjectStore::Transaction t;
+    t.create_collection(cid, 0);
+    r = queue_transaction(store, ch, std::move(t));
+    ASSERT_EQ(r, 0);
+  }
+  string base = "";
+  for (int i = 0; i < 100; ++i) base.append("aaaaa");
+  set<ghobject_t> created;
+  for (int n = 0; n < 10; ++n) {
+    char nbuf[100];
+    // Bounded formatting, consistent with the snprintf use in OMapTest.
+    snprintf(nbuf, sizeof(nbuf), "n%d", n);
+    for (int i = 0; i < 1000; ++i) {
+      char buf[100];
+      snprintf(buf, sizeof(buf), "%d", i);
+      if (!(i % 100)) {
+        cerr << "Object n" << n << " "<< i << std::endl;
+      }
+      ghobject_t hoid(hobject_t(string(buf) + base, string(), CEPH_NOSNAP, 0, poolid, string(nbuf)));
+      {
+        ObjectStore::Transaction t;
+        t.touch(cid, hoid);
+        r = queue_transaction(store, ch, std::move(t));
+        ASSERT_EQ(r, 0);
+      }
+      created.insert(hoid);
+    }
+  }
+  // First pass: list everything in one call and compare the counts.
+  vector<ghobject_t> objects;
+  r = collection_list(store, ch, ghobject_t(), ghobject_t::get_max(), INT_MAX,
+                      &objects, 0);
+  ASSERT_EQ(r, 0);
+  set<ghobject_t> listed(objects.begin(), objects.end());
+  cerr << "listed.size() is " << listed.size() << " and created.size() is " << created.size() << std::endl;
+  ASSERT_TRUE(listed.size() == created.size());
+  objects.clear();
+  listed.clear();
+  // Second pass: page through the collection 60 entries at a time.
+  ghobject_t current, next;
+  while (1) {
+    r = collection_list(store, ch, current, ghobject_t::get_max(), 60, &objects,
+                        &next);
+    ASSERT_EQ(r, 0);
+    ASSERT_TRUE(sorted(objects));
+    for (vector<ghobject_t>::iterator i = objects.begin();
+         i != objects.end();
+         ++i) {
+      if (listed.count(*i))
+        cerr << *i << " repeated" << std::endl;
+      listed.insert(*i);
+    }
+    // A page with fewer than 50 entries is treated as the last one.
+    if (objects.size() < 50) {
+      ASSERT_TRUE(next.is_max());
+      break;
+    }
+    objects.clear();
+    current = next;
+  }
+  cerr << "listed.size() is " << listed.size() << std::endl;
+  ASSERT_TRUE(listed.size() == created.size());
+  for (set<ghobject_t>::iterator i = listed.begin();
+       i != listed.end();
+       ++i) {
+    ASSERT_TRUE(created.count(*i));
+  }
+
+  // Cleanup: remove every object, then the collection.
+  for (set<ghobject_t>::iterator i = created.begin();
+       i != created.end();
+       ++i) {
+    ObjectStore::Transaction t;
+    t.remove(cid, *i);
+    r = queue_transaction(store, ch, std::move(t));
+    ASSERT_EQ(r, 0);
+  }
+  ObjectStore::Transaction t;
+  t.remove_collection(cid);
+  r = queue_transaction(store, ch, std::move(t));
+  ASSERT_EQ(r, 0);
+}
+
+// Creates two groups of five objects; within a group all objects are
+// constructed with the same hash (121664318 or 121666222) but different
+// raw-byte names. Checks that collection_list returns them in the order
+// of std::set<ghobject_t>.
+TEST_P(StoreTest, HashCollisionSorting) {
+  bool disable_legacy = (string(GetParam()) == "bluestore");
+
+  char buf121664318_1[] = {18, -119, -121, -111, 0};
+  char buf121664318_2[] = {19, 127, -121, 32, 0};
+  char buf121664318_3[] = {19, -118, 15, 19, 0};
+  char buf121664318_4[] = {28, 27, -116,
-113, 0};
+  char buf121664318_5[] = {28, 27, -115, -124, 0};
+
+  char buf121666222_1[] = {18, -119, -120, -111, 0};
+  char buf121666222_2[] = {19, 127, -120, 32, 0};
+  char buf121666222_3[] = {19, -118, 15, 30, 0};
+  char buf121666222_4[] = {29, 17, -126, -113, 0};
+  char buf121666222_5[] = {29, 17, -125, -124, 0};
+
+  // hash value -> object names created with that hash
+  std::map<uint32_t, std::vector<std::string>> object_names = {
+    {121664318, {{buf121664318_1},
+                 {buf121664318_2},
+                 {buf121664318_3},
+                 {buf121664318_4},
+                 {buf121664318_5}}},
+    {121666222, {{buf121666222_1},
+                 {buf121666222_2},
+                 {buf121666222_3},
+                 {buf121666222_4},
+                 {buf121666222_5}}}};
+
+  int64_t poolid = 111;
+  coll_t cid = coll_t(spg_t(pg_t(0, poolid), shard_id_t::NO_SHARD));
+  auto ch = store->create_new_collection(cid);
+  {
+    ObjectStore::Transaction t;
+    t.create_collection(cid, 0);
+    int r = queue_transaction(store, ch, std::move(t));
+    ASSERT_EQ(r, 0);
+  }
+
+  std::set<ghobject_t> created;
+  for (auto &[hash, names] : object_names) {
+    for (auto &name : names) {
+      ghobject_t hoid(hobject_t(sobject_t(name, CEPH_NOSNAP),
+                                string(),
+                                hash,
+                                poolid,
+                                string()));
+      // The object must carry exactly the hash it was constructed with.
+      ASSERT_EQ(hash, hoid.hobj.get_hash());
+      ObjectStore::Transaction t;
+      t.touch(cid, hoid);
+      int r = queue_transaction(store, ch, std::move(t));
+      ASSERT_EQ(r, 0);
+      created.insert(hoid);
+    }
+  }
+
+  // Full listing must match the iteration order of `created`.
+  vector<ghobject_t> objects;
+  int r = collection_list(store, ch, ghobject_t(), ghobject_t::get_max(),
+                          INT_MAX, &objects, 0, disable_legacy);
+  ASSERT_EQ(r, 0);
+  ASSERT_EQ(created.size(), objects.size());
+  auto it = objects.begin();
+  for (auto &hoid : created) {
+    ASSERT_EQ(hoid, *it);
+    it++;
+  }
+
+  // For every sub-range [i, j): list from *i limited by count.
+  for (auto i = created.begin(); i != created.end(); i++) {
+    auto j = i;
+    for (j++; j != created.end(); j++) {
+      std::set<ghobject_t> created_sub(i, j);
+      objects.clear();
+      ghobject_t next;
+      r = collection_list(store, ch, *i, ghobject_t::get_max(),
+                          created_sub.size(), &objects, &next, disable_legacy);
+      ASSERT_EQ(r, 0);
+      ASSERT_EQ(created_sub.size(), objects.size());
+      it =
objects.begin();
+      for (auto &hoid : created_sub) {
+        ASSERT_EQ(hoid, *it);
+        it++;
+      }
+      // NOTE(review): the enclosing loop stops at j == created.end(), so
+      // the is_max() branch below looks unreachable - confirm intent.
+      if (j == created.end()) {
+        ASSERT_TRUE(next.is_max());
+      } else {
+        ASSERT_EQ(*j, next);
+      }
+    }
+  }
+
+  // Same sub-ranges again, this time bounded by the end object *j
+  // instead of by count.
+  for (auto i = created.begin(); i != created.end(); i++) {
+    auto j = i;
+    for (j++; j != created.end(); j++) {
+      std::set<ghobject_t> created_sub(i, j);
+      objects.clear();
+      ghobject_t next;
+      r = collection_list(store, ch, *i, *j, INT_MAX, &objects, &next,
+                          disable_legacy);
+      ASSERT_EQ(r, 0);
+      ASSERT_EQ(created_sub.size(), objects.size());
+      it = objects.begin();
+      for (auto &hoid : created_sub) {
+        ASSERT_EQ(hoid, *it);
+        it++;
+      }
+      if (j == created.end()) {
+        ASSERT_TRUE(next.is_max());
+      } else {
+        ASSERT_EQ(*j, next);
+      }
+    }
+  }
+}
+
+// Creates 1000 objects in shard 1 with hash == index, plus three
+// generations (NO_GEN, 1, 2) of one "same-object" name, then checks that
+// listing the collection returns exactly the created set.
+TEST_P(StoreTest, ScrubTest) {
+  int64_t poolid = 111;
+  coll_t cid(spg_t(pg_t(0, poolid),shard_id_t(1)));
+  int r;
+  auto ch = store->create_new_collection(cid);
+  {
+    ObjectStore::Transaction t;
+    t.create_collection(cid, 0);
+    r = queue_transaction(store, ch, std::move(t));
+    ASSERT_EQ(r, 0);
+  }
+  string base = "aaaaa";
+  set<ghobject_t> created;
+  for (int i = 0; i < 1000; ++i) {
+    char buf[100];
+    // Bounded formatting, consistent with the snprintf use in OMapTest.
+    snprintf(buf, sizeof(buf), "%d", i);
+    if (!(i % 5)) {
+      cerr << "Object " << i << std::endl;
+    }
+    ghobject_t hoid(hobject_t(string(buf) + base, string(), CEPH_NOSNAP, i,
+                              poolid, ""),
+                    ghobject_t::NO_GEN, shard_id_t(1));
+    {
+      ObjectStore::Transaction t;
+      t.touch(cid, hoid);
+      r = queue_transaction(store, ch, std::move(t));
+      ASSERT_EQ(r, 0);
+    }
+    created.insert(hoid);
+  }
+
+  // Add same hobject_t but different generation
+  {
+    ghobject_t hoid1(hobject_t("same-object", string(), CEPH_NOSNAP, 0, poolid, ""),
+                     ghobject_t::NO_GEN, shard_id_t(1));
+    ghobject_t hoid2(hobject_t("same-object", string(), CEPH_NOSNAP, 0, poolid, ""), (gen_t)1, shard_id_t(1));
+    ghobject_t hoid3(hobject_t("same-object", string(), CEPH_NOSNAP, 0, poolid, ""), (gen_t)2, shard_id_t(1));
+    ObjectStore::Transaction t;
+    t.touch(cid, hoid1);
+    t.touch(cid, hoid2);
+
t.touch(cid, hoid3);
+    r = queue_transaction(store, ch, std::move(t));
+    created.insert(hoid1);
+    created.insert(hoid2);
+    created.insert(hoid3);
+    ASSERT_EQ(r, 0);
+  }
+
+  // First pass: list everything in one call and compare the counts.
+  vector<ghobject_t> objects;
+  r = collection_list(store, ch, ghobject_t(), ghobject_t::get_max(), INT_MAX,
+                      &objects, 0);
+  ASSERT_EQ(r, 0);
+  set<ghobject_t> listed(objects.begin(), objects.end());
+  cerr << "listed.size() is " << listed.size() << " and created.size() is " << created.size() << std::endl;
+  ASSERT_TRUE(listed.size() == created.size());
+  objects.clear();
+  listed.clear();
+  // Second pass: page through 60 entries at a time, resuming each page
+  // from next.get_boundary().
+  ghobject_t current, next;
+  while (1) {
+    r = collection_list(store, ch, current, ghobject_t::get_max(), 60, &objects,
+                        &next);
+    ASSERT_EQ(r, 0);
+    ASSERT_TRUE(sorted(objects));
+    for (vector<ghobject_t>::iterator i = objects.begin();
+         i != objects.end(); ++i) {
+      if (listed.count(*i))
+        cerr << *i << " repeated" << std::endl;
+      listed.insert(*i);
+    }
+    // A page with fewer than 50 entries is treated as the last one.
+    if (objects.size() < 50) {
+      ASSERT_TRUE(next.is_max());
+      break;
+    }
+    objects.clear();
+    current = next.get_boundary();
+  }
+  cerr << "listed.size() is " << listed.size() << std::endl;
+  ASSERT_TRUE(listed.size() == created.size());
+  for (set<ghobject_t>::iterator i = listed.begin();
+       i != listed.end();
+       ++i) {
+    ASSERT_TRUE(created.count(*i));
+  }
+
+  // Cleanup: remove every object, then the collection.
+  for (set<ghobject_t>::iterator i = created.begin();
+       i != created.end();
+       ++i) {
+    ObjectStore::Transaction t;
+    t.remove(cid, *i);
+    r = queue_transaction(store, ch, std::move(t));
+    ASSERT_EQ(r, 0);
+  }
+  ObjectStore::Transaction t;
+  t.remove_collection(cid);
+  r = queue_transaction(store, ch, std::move(t));
+  ASSERT_EQ(r, 0);
+}
+
+
+TEST_P(StoreTest, OMapTest) {
+  coll_t cid;
+  ghobject_t hoid(hobject_t("tesomap", "", CEPH_NOSNAP, 0, 0, ""));
+  auto ch = store->create_new_collection(cid);
+  int r;
+  {
+    ObjectStore::Transaction t;
+    t.create_collection(cid, 0);
+    r = queue_transaction(store, ch, std::move(t));
+    ASSERT_EQ(r, 0);
+  }
+
+  // Local model of the omap keys expected on `hoid`.
+  map<string, bufferlist> attrs;
+  {
+
ObjectStore::Transaction t; + t.touch(cid, hoid); + t.omap_clear(cid, hoid); + map<string, bufferlist> start_set; + t.omap_setkeys(cid, hoid, start_set); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + + for (int i = 0; i < 100; i++) { + if (!(i%5)) { + std::cout << "On iteration " << i << std::endl; + } + ObjectStore::Transaction t; + bufferlist bl; + map<string, bufferlist> cur_attrs; + r = store->omap_get(ch, hoid, &bl, &cur_attrs); + ASSERT_EQ(r, 0); + for (map<string, bufferlist>::iterator j = attrs.begin(); + j != attrs.end(); + ++j) { + bool correct = cur_attrs.count(j->first) && string(cur_attrs[j->first].c_str()) == string(j->second.c_str()); + if (!correct) { + std::cout << j->first << " is present in cur_attrs " << cur_attrs.count(j->first) << " times " << std::endl; + if (cur_attrs.count(j->first) > 0) { + std::cout << j->second.c_str() << " : " << cur_attrs[j->first].c_str() << std::endl; + } + } + ASSERT_EQ(correct, true); + } + ASSERT_EQ(attrs.size(), cur_attrs.size()); + + char buf[100]; + snprintf(buf, sizeof(buf), "%d", i); + bl.clear(); + bufferptr bp(buf, strlen(buf) + 1); + bl.append(bp); + map<string, bufferlist> to_add; + to_add.insert(pair<string, bufferlist>("key-" + string(buf), bl)); + attrs.insert(pair<string, bufferlist>("key-" + string(buf), bl)); + t.omap_setkeys(cid, hoid, to_add); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + + int i = 0; + while (attrs.size()) { + if (!(i%5)) { + std::cout << "removal: On iteration " << i << std::endl; + } + ObjectStore::Transaction t; + bufferlist bl; + map<string, bufferlist> cur_attrs; + r = store->omap_get(ch, hoid, &bl, &cur_attrs); + ASSERT_EQ(r, 0); + for (map<string, bufferlist>::iterator j = attrs.begin(); + j != attrs.end(); + ++j) { + bool correct = cur_attrs.count(j->first) && string(cur_attrs[j->first].c_str()) == string(j->second.c_str()); + if (!correct) { + std::cout << j->first << " is present in cur_attrs " << 
cur_attrs.count(j->first) << " times " << std::endl; + if (cur_attrs.count(j->first) > 0) { + std::cout << j->second.c_str() << " : " << cur_attrs[j->first].c_str() << std::endl; + } + } + ASSERT_EQ(correct, true); + } + + string to_remove = attrs.begin()->first; + t.omap_rmkey(cid, hoid, to_remove); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + + attrs.erase(to_remove); + + ++i; + } + + { + bufferlist bl1; + bl1.append("omap_header"); + ObjectStore::Transaction t; + t.omap_setheader(cid, hoid, bl1); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + t = ObjectStore::Transaction(); + + bufferlist bl2; + bl2.append("value"); + map<string, bufferlist> to_add; + to_add.insert(pair<string, bufferlist>("key", bl2)); + t.omap_setkeys(cid, hoid, to_add); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + + bufferlist bl3; + map<string, bufferlist> cur_attrs; + r = store->omap_get(ch, hoid, &bl3, &cur_attrs); + ASSERT_EQ(r, 0); + ASSERT_EQ(cur_attrs.size(), size_t(1)); + ASSERT_TRUE(bl_eq(bl1, bl3)); + + set<string> keys; + r = store->omap_get_keys(ch, hoid, &keys); + ASSERT_EQ(r, 0); + ASSERT_EQ(keys.size(), size_t(1)); + } + + // test omap_clear, omap_rmkey_range + { + { + map<string,bufferlist> to_set; + for (int n=0; n<10; ++n) { + to_set[stringify(n)].append("foo"); + } + bufferlist h; + h.append("header"); + ObjectStore::Transaction t; + t.remove(cid, hoid); + t.touch(cid, hoid); + t.omap_setheader(cid, hoid, h); + t.omap_setkeys(cid, hoid, to_set); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + ObjectStore::Transaction t; + t.omap_rmkeyrange(cid, hoid, "3", "7"); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + bufferlist hdr; + map<string,bufferlist> m; + store->omap_get(ch, hoid, &hdr, &m); + ASSERT_EQ(6u, hdr.length()); + ASSERT_TRUE(m.count("2")); + ASSERT_TRUE(!m.count("3")); + ASSERT_TRUE(!m.count("6")); + ASSERT_TRUE(m.count("7")); + 
ASSERT_TRUE(m.count("8")); + //cout << m << std::endl; + ASSERT_EQ(6u, m.size()); + } + { + ObjectStore::Transaction t; + t.omap_clear(cid, hoid); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + bufferlist hdr; + map<string,bufferlist> m; + store->omap_get(ch, hoid, &hdr, &m); + ASSERT_EQ(0u, hdr.length()); + ASSERT_EQ(0u, m.size()); + } + } + + ObjectStore::Transaction t; + t.remove(cid, hoid); + t.remove_collection(cid); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); +} + +TEST_P(StoreTest, OMapIterator) { + coll_t cid; + ghobject_t hoid(hobject_t("tesomap", "", CEPH_NOSNAP, 0, 0, "")); + int count = 0; + auto ch = store->create_new_collection(cid); + int r; + { + ObjectStore::Transaction t; + t.create_collection(cid, 0); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + + map<string, bufferlist> attrs; + { + ObjectStore::Transaction t; + t.touch(cid, hoid); + t.omap_clear(cid, hoid); + map<string, bufferlist> start_set; + t.omap_setkeys(cid, hoid, start_set); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + ObjectMap::ObjectMapIterator iter; + bool correct; + //basic iteration + for (int i = 0; i < 100; i++) { + if (!(i%5)) { + std::cout << "On iteration " << i << std::endl; + } + bufferlist bl; + + // FileStore may deadlock two active iterators over the same data + iter = ObjectMap::ObjectMapIterator(); + + iter = store->get_omap_iterator(ch, hoid); + for (iter->seek_to_first(), count=0; iter->valid(); iter->next(), count++) { + string key = iter->key(); + bufferlist value = iter->value(); + correct = attrs.count(key) && (string(value.c_str()) == string(attrs[key].c_str())); + if (!correct) { + if (attrs.count(key) > 0) { + std::cout << "key " << key << "in omap , " << value.c_str() << " : " << attrs[key].c_str() << std::endl; + } + else + std::cout << "key " << key << "should not exists in omap" << std::endl; + } + ASSERT_EQ(correct, true); + } + 
ASSERT_EQ((int)attrs.size(), count); + + // FileStore may deadlock an active iterator vs queue_transaction + iter = ObjectMap::ObjectMapIterator(); + + char buf[100]; + snprintf(buf, sizeof(buf), "%d", i); + bl.clear(); + bufferptr bp(buf, strlen(buf) + 1); + bl.append(bp); + map<string, bufferlist> to_add; + to_add.insert(pair<string, bufferlist>("key-" + string(buf), bl)); + attrs.insert(pair<string, bufferlist>("key-" + string(buf), bl)); + ObjectStore::Transaction t; + t.omap_setkeys(cid, hoid, to_add); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + + iter = store->get_omap_iterator(ch, hoid); + //lower bound + string bound_key = "key-5"; + iter->lower_bound(bound_key); + correct = bound_key <= iter->key(); + if (!correct) { + std::cout << "lower bound, bound key is " << bound_key << " < iter key is " << iter->key() << std::endl; + } + ASSERT_EQ(correct, true); + //upper bound + iter->upper_bound(bound_key); + correct = iter->key() > bound_key; + if (!correct) { + std::cout << "upper bound, bound key is " << bound_key << " >= iter key is " << iter->key() << std::endl; + } + ASSERT_EQ(correct, true); + + // FileStore may deadlock an active iterator vs queue_transaction + iter = ObjectMap::ObjectMapIterator(); + { + ObjectStore::Transaction t; + t.remove(cid, hoid); + t.remove_collection(cid); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } +} + +TEST_P(StoreTest, XattrTest) { + coll_t cid; + ghobject_t hoid(hobject_t("tesomap", "", CEPH_NOSNAP, 0, 0, "")); + bufferlist big; + for (unsigned i = 0; i < 10000; ++i) { + big.append('\0'); + } + bufferlist small; + for (unsigned i = 0; i < 10; ++i) { + small.append('\0'); + } + int r; + auto ch = store->create_new_collection(cid); + { + ObjectStore::Transaction t; + t.create_collection(cid, 0); + t.touch(cid, hoid); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + + map<string, bufferlist> attrs; + { + ObjectStore::Transaction t; + 
t.setattr(cid, hoid, "attr1", small); + attrs["attr1"] = small; + t.setattr(cid, hoid, "attr2", big); + attrs["attr2"] = big; + t.setattr(cid, hoid, "attr3", small); + attrs["attr3"] = small; + t.setattr(cid, hoid, "attr1", small); + attrs["attr1"] = small; + t.setattr(cid, hoid, "attr4", big); + attrs["attr4"] = big; + t.setattr(cid, hoid, "attr3", big); + attrs["attr3"] = big; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + + map<string, bufferptr, less<>> aset; + store->getattrs(ch, hoid, aset); + ASSERT_EQ(aset.size(), attrs.size()); + for (map<string, bufferptr>::iterator i = aset.begin(); + i != aset.end(); + ++i) { + bufferlist bl; + bl.push_back(i->second); + ASSERT_TRUE(attrs[i->first] == bl); + } + + { + ObjectStore::Transaction t; + t.rmattr(cid, hoid, "attr2"); + attrs.erase("attr2"); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + + aset.clear(); + store->getattrs(ch, hoid, aset); + ASSERT_EQ(aset.size(), attrs.size()); + for (map<string, bufferptr>::iterator i = aset.begin(); + i != aset.end(); + ++i) { + bufferlist bl; + bl.push_back(i->second); + ASSERT_TRUE(attrs[i->first] == bl); + } + + bufferptr bp; + r = store->getattr(ch, hoid, "attr2", bp); + ASSERT_EQ(r, -ENODATA); + + r = store->getattr(ch, hoid, "attr3", bp); + ASSERT_EQ(r, 0); + bufferlist bl2; + bl2.push_back(bp); + ASSERT_TRUE(bl2 == attrs["attr3"]); + + ObjectStore::Transaction t; + t.remove(cid, hoid); + t.remove_collection(cid); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); +} + +void colsplittest( + ObjectStore *store, + unsigned num_objects, + unsigned common_suffix_size, + bool clones + ) { + coll_t cid(spg_t(pg_t(0,52),shard_id_t::NO_SHARD)); + coll_t tid(spg_t(pg_t(1<<common_suffix_size,52),shard_id_t::NO_SHARD)); + auto ch = store->create_new_collection(cid); + auto tch = store->create_new_collection(tid); + int r = 0; + { + ObjectStore::Transaction t; + t.create_collection(cid, common_suffix_size); + 
r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + bufferlist small; + small.append("small"); + { + ObjectStore::Transaction t; + for (uint32_t i = 0; i < (2 - (int)clones)*num_objects; ++i) { + stringstream objname; + objname << "obj" << i; + ghobject_t a(hobject_t( + objname.str(), + "", + CEPH_NOSNAP, + i<<common_suffix_size, + 52, "")); + t.write(cid, a, 0, small.length(), small, + CEPH_OSD_OP_FLAG_FADVISE_WILLNEED); + if (clones) { + objname << "-clone"; + ghobject_t b(hobject_t( + objname.str(), + "", + CEPH_NOSNAP, + i<<common_suffix_size, + 52, "")); + t.clone(cid, a, b); + } + if (i % 100) { + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + t = ObjectStore::Transaction(); + } + } + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + ObjectStore::Transaction t; + t.create_collection(tid, common_suffix_size + 1); + t.split_collection(cid, common_suffix_size+1, 1<<common_suffix_size, tid); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + ch->flush(); + + // check + vector<ghobject_t> objects; + r = collection_list(store, ch, ghobject_t(), ghobject_t::get_max(), INT_MAX, + &objects, 0); + ASSERT_EQ(r, 0); + ASSERT_EQ(objects.size(), num_objects); + for (vector<ghobject_t>::iterator i = objects.begin(); + i != objects.end(); + ++i) { + ASSERT_EQ(!!(i->hobj.get_hash() & (1<<common_suffix_size)), 0u); + } + + objects.clear(); + r = collection_list(store, tch, ghobject_t(), ghobject_t::get_max(), INT_MAX, + &objects, 0); + ASSERT_EQ(r, 0); + ASSERT_EQ(objects.size(), num_objects); + for (vector<ghobject_t>::iterator i = objects.begin(); + i != objects.end(); + ++i) { + ASSERT_EQ(!(i->hobj.get_hash() & (1<<common_suffix_size)), 0u); + } + + // merge them again! 
+ { + ObjectStore::Transaction t; + t.merge_collection(tid, cid, common_suffix_size); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + + // check and clean up + ObjectStore::Transaction t; + { + vector<ghobject_t> objects; + r = collection_list(store, ch, ghobject_t(), ghobject_t::get_max(), INT_MAX, + &objects, 0); + ASSERT_EQ(r, 0); + ASSERT_EQ(objects.size(), num_objects * 2); // both halves + unsigned size = 0; + for (vector<ghobject_t>::iterator i = objects.begin(); + i != objects.end(); + ++i) { + t.remove(cid, *i); + if (++size > 100) { + size = 0; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + t = ObjectStore::Transaction(); + } + } + } + t.remove_collection(cid); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + + ch->flush(); + ASSERT_TRUE(!store->collection_exists(tid)); +} + +TEST_P(StoreTest, ColSplitTest0) { + colsplittest(store.get(), 10, 5, false); +} +TEST_P(StoreTest, ColSplitTest1) { + colsplittest(store.get(), 10000, 11, false); +} +TEST_P(StoreTest, ColSplitTest1Clones) { + colsplittest(store.get(), 10000, 11, true); +} +TEST_P(StoreTest, ColSplitTest2) { + colsplittest(store.get(), 100, 7, false); +} +TEST_P(StoreTest, ColSplitTest2Clones) { + colsplittest(store.get(), 100, 7, true); +} + +#if 0 +TEST_P(StoreTest, ColSplitTest3) { + colsplittest(store.get(), 100000, 25); +} +#endif + +void test_merge_skewed(ObjectStore *store, + unsigned base, unsigned bits, + unsigned anum, unsigned bnum) +{ + cout << __func__ << " 0x" << std::hex << base << std::dec + << " bits " << bits + << " anum " << anum << " bnum " << bnum << std::endl; + /* + make merge source pgs have radically different # of objects in them, + which should trigger different splitting in filestore, and verify that + post-merge all objects are accessible. 
+ */ + int r; + coll_t a(spg_t(pg_t(base, 0), shard_id_t::NO_SHARD)); + coll_t b(spg_t(pg_t(base | (1<<bits), 0), shard_id_t::NO_SHARD)); + + auto cha = store->create_new_collection(a); + auto chb = store->create_new_collection(b); + { + ObjectStore::Transaction t; + t.create_collection(a, bits + 1); + r = queue_transaction(store, cha, std::move(t)); + ASSERT_EQ(r, 0); + } + { + ObjectStore::Transaction t; + t.create_collection(b, bits + 1); + r = queue_transaction(store, chb, std::move(t)); + ASSERT_EQ(r, 0); + } + + bufferlist small; + small.append("small"); + string suffix = "ooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooaaaaaaaaaa"; + set<ghobject_t> aobjects, bobjects; + { + // fill a + ObjectStore::Transaction t; + for (unsigned i = 0; i < 1000; ++i) { + string objname = "a" + stringify(i) + suffix; + ghobject_t o(hobject_t( + objname, + "", + CEPH_NOSNAP, + i<<(bits+1) | base, + 52, "")); + aobjects.insert(o); + t.write(a, o, 0, small.length(), small, 0); + if (i % 100) { + r = queue_transaction(store, cha, std::move(t)); + ASSERT_EQ(r, 0); + t = ObjectStore::Transaction(); + } + } + r = queue_transaction(store, cha, std::move(t)); + ASSERT_EQ(r, 0); + } + { + // fill b + ObjectStore::Transaction t; + for (unsigned i = 0; i < 10; ++i) { + string objname = "b" + stringify(i) + suffix; + ghobject_t o(hobject_t( + objname, + "", + CEPH_NOSNAP, + (i<<(base+1)) | base | (1<<bits), + 52, "")); + bobjects.insert(o); + t.write(b, o, 0, small.length(), small, 0); + if (i % 100) { + r = queue_transaction(store, chb, std::move(t)); + ASSERT_EQ(r, 0); + t = ObjectStore::Transaction(); + } + } + r = queue_transaction(store, chb, std::move(t)); + ASSERT_EQ(r, 0); + } + + // merge b->a + { + ObjectStore::Transaction t; + t.merge_collection(b, a, bits); + r = queue_transaction(store, cha, std::move(t)); + 
ASSERT_EQ(r, 0); + } + + // verify + { + vector<ghobject_t> got; + collection_list(store, cha, ghobject_t(), ghobject_t::get_max(), INT_MAX, + &got, 0); + set<ghobject_t> gotset; + for (auto& o : got) { + ASSERT_TRUE(aobjects.count(o) || bobjects.count(o)); + gotset.insert(o); + } + // check both listing and stat-ability (different code paths!) + struct stat st; + for (auto& o : aobjects) { + ASSERT_TRUE(gotset.count(o)); + int r = store->stat(cha, o, &st, false); + ASSERT_EQ(r, 0); + } + for (auto& o : bobjects) { + ASSERT_TRUE(gotset.count(o)); + int r = store->stat(cha, o, &st, false); + ASSERT_EQ(r, 0); + } + } + + // clean up + { + ObjectStore::Transaction t; + for (auto &o : aobjects) { + t.remove(a, o); + } + r = queue_transaction(store, cha, std::move(t)); + ASSERT_EQ(r, 0); + } + { + ObjectStore::Transaction t; + for (auto &o : bobjects) { + t.remove(a, o); + } + t.remove_collection(a); + r = queue_transaction(store, cha, std::move(t)); + ASSERT_EQ(r, 0); + } +} + +TEST_P(StoreTest, MergeSkewed) { + if (string(GetParam()) != "filestore") + return; + + // this is sufficient to exercise merges with different hashing levels + test_merge_skewed(store.get(), 0xf, 4, 10, 10000); + test_merge_skewed(store.get(), 0xf, 4, 10000, 10); + + /* + // this covers a zillion variations that all boil down to the same thing + for (unsigned base = 3; base < 0x1000; base *= 5) { + unsigned bits; + unsigned t = base; + for (bits = 0; t; t >>= 1) { + ++bits; + } + for (unsigned b = bits; b < bits + 10; b += 3) { + for (auto anum : { 10, 1000, 10000 }) { + for (auto bnum : { 10, 1000, 10000 }) { + if (anum == bnum) { + continue; + } + test_merge_skewed(store.get(), base, b, anum, bnum); + } + } + } + } + */ +} + + +/** + * This test tests adding two different groups + * of objects, each with 1 common prefix and 1 + * different prefix. We then remove half + * in order to verify that the merging correctly + * stops at the common prefix subdir. 
See bug + * #5273 */ +TEST_P(StoreTest, TwoHash) { + coll_t cid; + int r; + auto ch = store->create_new_collection(cid); + { + ObjectStore::Transaction t; + t.create_collection(cid, 0); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + std::cout << "Making objects" << std::endl; + for (int i = 0; i < 360; ++i) { + ObjectStore::Transaction t; + ghobject_t o; + o.hobj.pool = -1; + if (i < 8) { + o.hobj.set_hash((i << 16) | 0xA1); + t.touch(cid, o); + } + o.hobj.set_hash((i << 16) | 0xB1); + t.touch(cid, o); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + std::cout << "Removing half" << std::endl; + for (int i = 1; i < 8; ++i) { + ObjectStore::Transaction t; + ghobject_t o; + o.hobj.pool = -1; + o.hobj.set_hash((i << 16) | 0xA1); + t.remove(cid, o); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + std::cout << "Checking" << std::endl; + for (int i = 1; i < 8; ++i) { + ObjectStore::Transaction t; + ghobject_t o; + o.hobj.set_hash((i << 16) | 0xA1); + o.hobj.pool = -1; + bool exists = store->exists(ch, o); + ASSERT_EQ(exists, false); + } + { + ghobject_t o; + o.hobj.set_hash(0xA1); + o.hobj.pool = -1; + bool exists = store->exists(ch, o); + ASSERT_EQ(exists, true); + } + std::cout << "Cleanup" << std::endl; + for (int i = 0; i < 360; ++i) { + ObjectStore::Transaction t; + ghobject_t o; + o.hobj.set_hash((i << 16) | 0xA1); + o.hobj.pool = -1; + t.remove(cid, o); + o.hobj.set_hash((i << 16) | 0xB1); + t.remove(cid, o); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + ObjectStore::Transaction t; + t.remove_collection(cid); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); +} + +TEST_P(StoreTest, Rename) { + coll_t cid(spg_t(pg_t(0, 2122),shard_id_t::NO_SHARD)); + ghobject_t srcoid(hobject_t("src_oid", "", CEPH_NOSNAP, 0, 0, "")); + ghobject_t dstoid(hobject_t("dest_oid", "", CEPH_NOSNAP, 0, 0, "")); + bufferlist a, b; + a.append("foo"); + 
b.append("bar"); + auto ch = store->create_new_collection(cid); + int r; + { + ObjectStore::Transaction t; + t.create_collection(cid, 0); + t.write(cid, srcoid, 0, a.length(), a); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + ASSERT_TRUE(store->exists(ch, srcoid)); + { + ObjectStore::Transaction t; + t.collection_move_rename(cid, srcoid, cid, dstoid); + t.write(cid, srcoid, 0, b.length(), b); + t.setattr(cid, srcoid, "attr", b); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + ASSERT_TRUE(store->exists(ch, srcoid)); + ASSERT_TRUE(store->exists(ch, dstoid)); + { + bufferlist bl; + store->read(ch, srcoid, 0, 3, bl); + ASSERT_TRUE(bl_eq(b, bl)); + store->read(ch, dstoid, 0, 3, bl); + ASSERT_TRUE(bl_eq(a, bl)); + } + { + ObjectStore::Transaction t; + t.remove(cid, dstoid); + t.collection_move_rename(cid, srcoid, cid, dstoid); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + ASSERT_TRUE(store->exists(ch, dstoid)); + ASSERT_FALSE(store->exists(ch, srcoid)); + { + bufferlist bl; + store->read(ch, dstoid, 0, 3, bl); + ASSERT_TRUE(bl_eq(b, bl)); + } + { + ObjectStore::Transaction t; + t.remove(cid, dstoid); + t.remove_collection(cid); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } +} + +TEST_P(StoreTest, MoveRename) { + coll_t cid(spg_t(pg_t(0, 212),shard_id_t::NO_SHARD)); + ghobject_t temp_oid(hobject_t("tmp_oid", "", CEPH_NOSNAP, 0, 0, "")); + ghobject_t oid(hobject_t("dest_oid", "", CEPH_NOSNAP, 0, 0, "")); + auto ch = store->create_new_collection(cid); + int r; + { + ObjectStore::Transaction t; + t.create_collection(cid, 0); + t.touch(cid, oid); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + ASSERT_TRUE(store->exists(ch, oid)); + bufferlist data, attr; + map<string, bufferlist> omap; + data.append("data payload"); + attr.append("attr value"); + omap["omap_key"].append("omap value"); + { + ObjectStore::Transaction t; + t.touch(cid, 
temp_oid); + t.write(cid, temp_oid, 0, data.length(), data); + t.setattr(cid, temp_oid, "attr", attr); + t.omap_setkeys(cid, temp_oid, omap); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + ASSERT_TRUE(store->exists(ch, temp_oid)); + { + ObjectStore::Transaction t; + t.remove(cid, oid); + t.collection_move_rename(cid, temp_oid, cid, oid); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + ASSERT_TRUE(store->exists(ch, oid)); + ASSERT_FALSE(store->exists(ch, temp_oid)); + { + bufferlist newdata; + r = store->read(ch, oid, 0, 1000, newdata); + ASSERT_GE(r, 0); + ASSERT_TRUE(bl_eq(data, newdata)); + bufferlist newattr; + r = store->getattr(ch, oid, "attr", newattr); + ASSERT_EQ(r, 0); + ASSERT_TRUE(bl_eq(attr, newattr)); + set<string> keys; + keys.insert("omap_key"); + map<string, bufferlist> newomap; + r = store->omap_get_values(ch, oid, keys, &newomap); + ASSERT_GE(r, 0); + ASSERT_EQ(1u, newomap.size()); + ASSERT_TRUE(newomap.count("omap_key")); + ASSERT_TRUE(bl_eq(omap["omap_key"], newomap["omap_key"])); + } + { + ObjectStore::Transaction t; + t.remove(cid, oid); + t.remove_collection(cid); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } +} + +TEST_P(StoreTest, BigRGWObjectName) { + coll_t cid(spg_t(pg_t(0,12),shard_id_t::NO_SHARD)); + ghobject_t oid( + hobject_t( + 
"default.4106.50_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "", + CEPH_NOSNAP, + 0x81920472, + 12, + ""), + 15, + shard_id_t::NO_SHARD); + ghobject_t oid2(oid); + oid2.generation = 17; + ghobject_t oidhead(oid); + oidhead.generation = ghobject_t::NO_GEN; + + auto ch = store->create_new_collection(cid); + + int r; + { + ObjectStore::Transaction t; + t.create_collection(cid, 0); + t.touch(cid, oidhead); + t.collection_move_rename(cid, oidhead, cid, oid); + t.touch(cid, oidhead); + t.collection_move_rename(cid, oidhead, cid, oid2); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + + { + ObjectStore::Transaction t; + t.remove(cid, oid); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + + { + vector<ghobject_t> objects; + r = collection_list(store, ch, ghobject_t(), ghobject_t::get_max(), INT_MAX, + &objects, 0); + ASSERT_EQ(r, 0); + ASSERT_EQ(objects.size(), 1u); + ASSERT_EQ(objects[0], oid2); + } + + ASSERT_FALSE(store->exists(ch, oid)); + + { + 
ObjectStore::Transaction t; + t.remove(cid, oid2); + t.remove_collection(cid); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + + } +} + +TEST_P(StoreTest, SetAllocHint) { + coll_t cid; + ghobject_t hoid(hobject_t("test_hint", "", CEPH_NOSNAP, 0, 0, "")); + auto ch = store->create_new_collection(cid); + int r; + { + ObjectStore::Transaction t; + t.create_collection(cid, 0); + t.touch(cid, hoid); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + ObjectStore::Transaction t; + t.set_alloc_hint(cid, hoid, 4*1024*1024, 1024*4, 0); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + ObjectStore::Transaction t; + t.remove(cid, hoid); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + ObjectStore::Transaction t; + t.set_alloc_hint(cid, hoid, 4*1024*1024, 1024*4, 0); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + ObjectStore::Transaction t; + t.remove_collection(cid); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } +} + +TEST_P(StoreTest, TryMoveRename) { + coll_t cid; + ghobject_t hoid(hobject_t("test_hint", "", CEPH_NOSNAP, 0, -1, "")); + ghobject_t hoid2(hobject_t("test_hint2", "", CEPH_NOSNAP, 0, -1, "")); + auto ch = store->create_new_collection(cid); + int r; + { + ObjectStore::Transaction t; + t.create_collection(cid, 0); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + ObjectStore::Transaction t; + t.try_rename(cid, hoid, hoid2); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + ObjectStore::Transaction t; + t.touch(cid, hoid); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + ObjectStore::Transaction t; + t.try_rename(cid, hoid, hoid2); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + struct stat st; + ASSERT_EQ(store->stat(ch, hoid, &st), -ENOENT); + ASSERT_EQ(store->stat(ch, hoid2, &st), 0); 
+} + +#if defined(WITH_BLUESTORE) +TEST_P(StoreTest, BluestoreOnOffCSumTest) { + if (string(GetParam()) != "bluestore") + return; + SetVal(g_conf(), "bluestore_csum_type", "crc32c"); + g_conf().apply_changes(nullptr); + + int r; + coll_t cid; + ghobject_t hoid(hobject_t(sobject_t("Object 1", CEPH_NOSNAP))); + { + auto ch = store->open_collection(cid); + ASSERT_FALSE(ch); + } + auto ch = store->create_new_collection(cid); + { + ObjectStore::Transaction t; + t.create_collection(cid, 0); + cerr << "Creating collection " << cid << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + //write with csum enabled followed by read with csum disabled + size_t block_size = 64*1024; + ObjectStore::Transaction t; + bufferlist bl, orig; + bl.append(std::string(block_size, 'a')); + orig = bl; + t.remove(cid, hoid); + t.set_alloc_hint(cid, hoid, 4*1024*1024, 1024*8, 0); + t.write(cid, hoid, 0, bl.length(), bl); + cerr << "Remove then create" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + + SetVal(g_conf(), "bluestore_csum_type", "none"); + g_conf().apply_changes(nullptr); + + bufferlist in; + r = store->read(ch, hoid, 0, block_size, in); + ASSERT_EQ((int)block_size, r); + ASSERT_TRUE(bl_eq(orig, in)); + + } + { + //write with csum disabled followed by read with csum enabled + + size_t block_size = 64*1024; + ObjectStore::Transaction t; + bufferlist bl, orig; + bl.append(std::string(block_size, 'a')); + orig = bl; + t.remove(cid, hoid); + t.set_alloc_hint(cid, hoid, 4*1024*1024, 1024*8, 0); + t.write(cid, hoid, 0, bl.length(), bl); + cerr << "Remove then create" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + + SetVal(g_conf(), "bluestore_csum_type", "crc32c"); + g_conf().apply_changes(nullptr); + + bufferlist in; + r = store->read(ch, hoid, 0, block_size, in); + ASSERT_EQ((int)block_size, r); + ASSERT_TRUE(bl_eq(orig, in)); + } + { + //'mixed' non-overlapping writes to 
the same blob + + ObjectStore::Transaction t; + bufferlist bl, orig; + size_t block_size = 8000; + bl.append(std::string(block_size, 'a')); + orig = bl; + t.remove(cid, hoid); + t.write(cid, hoid, 0, bl.length(), bl); + cerr << "Remove then create" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + + SetVal(g_conf(), "bluestore_csum_type", "none"); + g_conf().apply_changes(nullptr); + + ObjectStore::Transaction t2; + t2.write(cid, hoid, block_size*2, bl.length(), bl); + cerr << "Append 'unprotected'" << std::endl; + r = queue_transaction(store, ch, std::move(t2)); + ASSERT_EQ(r, 0); + + bufferlist in; + r = store->read(ch, hoid, 0, block_size, in); + ASSERT_EQ((int)block_size, r); + ASSERT_TRUE(bl_eq(orig, in)); + in.clear(); + r = store->read(ch, hoid, block_size*2, block_size, in); + ASSERT_EQ((int)block_size, r); + ASSERT_TRUE(bl_eq(orig, in)); + + SetVal(g_conf(), "bluestore_csum_type", "crc32c"); + g_conf().apply_changes(nullptr); + in.clear(); + r = store->read(ch, hoid, 0, block_size, in); + ASSERT_EQ((int)block_size, r); + ASSERT_TRUE(bl_eq(orig, in)); + in.clear(); + r = store->read(ch, hoid, block_size*2, block_size, in); + ASSERT_EQ((int)block_size, r); + ASSERT_TRUE(bl_eq(orig, in)); + } + { + //partially blob overwrite under a different csum enablement mode + + ObjectStore::Transaction t; + bufferlist bl, orig, orig2; + size_t block_size0 = 0x10000; + size_t block_size = 9000; + size_t block_size2 = 5000; + bl.append(std::string(block_size0, 'a')); + t.remove(cid, hoid); + t.set_alloc_hint(cid, hoid, 4*1024*1024, 1024*8, 0); + t.write(cid, hoid, 0, bl.length(), bl); + cerr << "Remove then create" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + + SetVal(g_conf(), "bluestore_csum_type", "none"); + g_conf().apply_changes(nullptr); + + ObjectStore::Transaction t2; + bl.clear(); + bl.append(std::string(block_size, 'b')); + t2.write(cid, hoid, 0, bl.length(), bl); + t2.write(cid, hoid, 
block_size0, bl.length(), bl); + cerr << "Overwrite with unprotected data" << std::endl; + r = queue_transaction(store, ch, std::move(t2)); + ASSERT_EQ(r, 0); + + orig = bl; + orig2 = bl; + orig.append( std::string(block_size0 - block_size, 'a')); + + bufferlist in; + r = store->read(ch, hoid, 0, block_size0, in); + ASSERT_EQ((int)block_size0, r); + ASSERT_TRUE(bl_eq(orig, in)); + + r = store->read(ch, hoid, block_size0, block_size, in); + ASSERT_EQ((int)block_size, r); + ASSERT_TRUE(bl_eq(orig2, in)); + + SetVal(g_conf(), "bluestore_csum_type", "crc32c"); + g_conf().apply_changes(nullptr); + + ObjectStore::Transaction t3; + bl.clear(); + bl.append(std::string(block_size2, 'c')); + t3.write(cid, hoid, block_size0, bl.length(), bl); + cerr << "Overwrite with protected data" << std::endl; + r = queue_transaction(store, ch, std::move(t3)); + ASSERT_EQ(r, 0); + + in.clear(); + orig = bl; + orig.append( std::string(block_size - block_size2, 'b')); + r = store->read(ch, hoid, block_size0, block_size, in); + ASSERT_EQ((int)block_size, r); + ASSERT_TRUE(bl_eq(orig, in)); + } + + { + ObjectStore::Transaction t; + t.remove(cid, hoid); + t.remove_collection(cid); + cerr << "Cleaning" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } +} +#endif + +INSTANTIATE_TEST_SUITE_P( + ObjectStore, + StoreTest, + ::testing::Values( + "memstore", +#if defined(WITH_BLUESTORE) + "bluestore", +#endif + "kstore")); + +// Note: instantiate all stores to preserve store numbering order only +INSTANTIATE_TEST_SUITE_P( + ObjectStore, + StoreTestSpecificAUSize, + ::testing::Values( + "memstore", +#if defined(WITH_BLUESTORE) + "bluestore", +#endif + "kstore")); + +// Note: instantiate all stores to preserve store numbering order only +INSTANTIATE_TEST_SUITE_P( + ObjectStore, + StoreTestOmapUpgrade, + ::testing::Values( + "memstore", +#if defined(WITH_BLUESTORE) + "bluestore", +#endif + "kstore")); + +#if defined(WITH_BLUESTORE) +INSTANTIATE_TEST_SUITE_P( + 
ObjectStore, + StoreTestDeferredSetup, + ::testing::Values( + "bluestore")); +#endif + + +struct deferred_test_t { + uint32_t bdev_block_size; + uint32_t min_alloc_size; + uint32_t max_blob_size; + uint32_t prefer_deferred_size; +}; + +void PrintTo(const deferred_test_t& t, ::std::ostream* os) +{ + *os << t.bdev_block_size << "/" << t.min_alloc_size << "/" + << t.max_blob_size << "/" << t.prefer_deferred_size; +} + +class DeferredWriteTest : public StoreTestFixture, + public ::testing::WithParamInterface<deferred_test_t> { +public: + DeferredWriteTest() + : StoreTestFixture("bluestore") + {} + void SetUp() override { + //do nothing + } +protected: + void DeferredSetup() { + StoreTestFixture::SetUp(); + } +public: + std::vector<uint32_t> offsets = {0, 3000, 4096, 20000, 32768, 65000, 65536, 80000, 128 * 1024}; + std::vector<uint32_t> lengths = {1, 1000, 4096, 12000, 32768, 30000, 80000, 128 * 1024}; +}; + +TEST_P(DeferredWriteTest, NewData) { + const bool print = false; + deferred_test_t t = GetParam(); + SetVal(g_conf(), "bdev_block_size", stringify(t.bdev_block_size).c_str()); + SetVal(g_conf(), "bluestore_min_alloc_size", stringify(t.min_alloc_size).c_str()); + SetVal(g_conf(), "bluestore_max_blob_size", stringify(t.max_blob_size).c_str()); + SetVal(g_conf(), "bluestore_prefer_deferred_size", stringify(t.prefer_deferred_size).c_str()); + g_conf().apply_changes(nullptr); + DeferredSetup(); + + int r; + coll_t cid; + const PerfCounters* logger = store->get_perf_counters(); + ObjectStore::CollectionHandle ch = store->create_new_collection(cid); + { + ObjectStore::Transaction t; + t.create_collection(cid, 0); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + for (auto offset:offsets) { + for (auto length:lengths) { + std::string hname = fmt::format("test-{}-{}", offset, length); + ghobject_t hoid(hobject_t(hname, "", CEPH_NOSNAP, 0, -1, "")); + { + ObjectStore::Transaction t; + t.touch(cid, hoid); + r = queue_transaction(store, ch, 
std::move(t)); + ASSERT_EQ(r, 0); + } + if (print) + std::cout << hname << std::endl; + + auto w_new = logger->get(l_bluestore_write_new); + auto w_big_deferred = logger->get(l_bluestore_write_big_deferred); + auto i_deferred_w = logger->get(l_bluestore_issued_deferred_writes); + { + ObjectStore::Transaction t; + bufferlist bl; + bl.append(std::string(length, 'x')); + t.write(cid, hoid, offset, bl.length(), bl, + CEPH_OSD_OP_FLAG_FADVISE_NOCACHE); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + uint32_t first_db = offset / t.bdev_block_size; + uint32_t last_db = (offset + length - 1) / t.bdev_block_size; + + uint32_t write_size = (last_db - first_db + 1) * t.bdev_block_size; + if (write_size < t.prefer_deferred_size) { + // expect no direct writes + ASSERT_EQ(w_new , logger->get(l_bluestore_write_new)); + } else { + // expect no deferred + ASSERT_EQ(w_big_deferred , logger->get(l_bluestore_write_big_deferred)); + ASSERT_EQ(i_deferred_w , logger->get(l_bluestore_issued_deferred_writes)); + } + } + } + } +} + +#if defined(WITH_BLUESTORE) +INSTANTIATE_TEST_SUITE_P( + BlueStore, + DeferredWriteTest, + ::testing::Values( + // bdev alloc blob deferred + deferred_test_t{4 * 1024, 4 * 1024, 16 * 1024, 32 * 1024}, + deferred_test_t{4 * 1024, 16 * 1024, 64 * 1024, 64 * 1024}, + deferred_test_t{4 * 1024, 64 * 1024, 64 * 1024, 4 * 1024}, + deferred_test_t{4 * 1024, 4 * 1024, 64 * 1024, 0 * 1024}, + deferred_test_t{4 * 1024, 16 * 1024, 32 * 1024, 32 * 1024}, + deferred_test_t{4 * 1024, 16 * 1024, 64 * 1024, 128 * 1024} + )); +#endif + +void doMany4KWritesTest(ObjectStore* store, + unsigned max_objects, + unsigned max_ops, + unsigned max_object_size, + unsigned max_write_size, + unsigned write_alignment) +{ + MixedGenerator gen(555); + gen_type rng(time(NULL)); + coll_t cid(spg_t(pg_t(0,555), shard_id_t::NO_SHARD)); + store_statfs_t res_stat; + + SyntheticWorkloadState test_obj(store, + &gen, + &rng, + cid, + max_object_size, + max_write_size, + 
write_alignment); + test_obj.init(); + for (unsigned i = 0; i < max_objects; ++i) { + if (!(i % 500)) cerr << "seeding object " << i << std::endl; + test_obj.touch(); + } + for (unsigned i = 0; i < max_ops; ++i) { + if (!(i % 200)) { + cerr << "Op " << i << std::endl; + test_obj.print_internal_state(); + } + test_obj.write(); + } + test_obj.wait_for_done(); + test_obj.statfs(res_stat); + if (!(res_stat.data_stored <= max_object_size) || + !(res_stat.allocated <= max_object_size)) { + // this will provide more insight on the mismatch and + // helps to avoid any races during stats collection + test_obj.fsck(false); + // retrieving stats once again and assert if still broken + test_obj.statfs(res_stat); + ASSERT_LE(res_stat.data_stored, max_object_size); + ASSERT_LE(res_stat.allocated, max_object_size); + } + test_obj.shutdown(); +} + +TEST_P(StoreTestSpecificAUSize, Many4KWritesTest) { + if (string(GetParam()) != "bluestore") + return; + if (smr) { + cout << "SKIP: no deferred; assertions around res_stat.allocated don't apply" + << std::endl; + return; + } + + StartDeferred(0x10000); + + const unsigned max_object = 4*1024*1024; + doMany4KWritesTest(store.get(), 1, 1000, max_object, 4*1024, 0); +} + +TEST_P(StoreTestSpecificAUSize, Many4KWritesNoCSumTest) { + if (string(GetParam()) != "bluestore") + return; + if (smr) { + cout << "SKIP: no deferred; assertions around res_stat.allocated don't apply" + << std::endl; + return; + } + StartDeferred(0x10000); + SetVal(g_conf(), "bluestore_csum_type", "none"); + g_ceph_context->_conf.apply_changes(nullptr); + const unsigned max_object = 4*1024*1024; + + doMany4KWritesTest(store.get(), 1, 1000, max_object, 4*1024, 0 ); +} + +TEST_P(StoreTestSpecificAUSize, TooManyBlobsTest) { + if (string(GetParam()) != "bluestore") + return; + if (smr) { + cout << "SKIP: no deferred; assertions around res_stat.allocated don't apply" + << std::endl; + return; + } + StartDeferred(0x10000); + const unsigned max_object = 4*1024*1024; + 
doMany4KWritesTest(store.get(), 1, 1000, max_object, 4*1024, 0); +} + +#if defined(WITH_BLUESTORE) +void get_mempool_stats(uint64_t* total_bytes, uint64_t* total_items) +{ + uint64_t meta_allocated = mempool::bluestore_cache_meta::allocated_bytes(); + uint64_t onode_allocated = mempool::bluestore_cache_onode::allocated_bytes(); + uint64_t other_allocated = mempool::bluestore_cache_other::allocated_bytes(); + + uint64_t meta_items = mempool::bluestore_cache_meta::allocated_items(); + uint64_t onode_items = mempool::bluestore_cache_onode::allocated_items(); + uint64_t other_items = mempool::bluestore_cache_other::allocated_items(); + cout << "meta(" << meta_allocated << "/" << meta_items + << ") onode(" << onode_allocated << "/" << onode_items + << ") other(" << other_allocated << "/" << other_items + << ")" << std::endl; + *total_bytes = meta_allocated + onode_allocated + other_allocated; + *total_items = onode_items; +} + +TEST_P(StoreTestSpecificAUSize, OnodeSizeTracking) { + + if (string(GetParam()) != "bluestore") + return; + + size_t block_size = 4096; + StartDeferred(block_size); + SetVal(g_conf(), "bluestore_compression_mode", "none"); + SetVal(g_conf(), "bluestore_csum_type", "none"); + SetVal(g_conf(), "bluestore_cache_size_hdd", "400000000"); + SetVal(g_conf(), "bluestore_cache_size_ssd", "400000000"); + g_conf().apply_changes(nullptr); + + int r; + coll_t cid; + ghobject_t hoid(hobject_t("test_hint", "", CEPH_NOSNAP, 0, -1, "")); + size_t obj_size = 4 * 1024 * 1024; + uint64_t total_bytes_prev; + uint64_t total_bytes, total_bytes2; + uint64_t total_onodes; + get_mempool_stats(&total_bytes, &total_onodes); + total_bytes_prev = total_bytes; + // 5u for onode_cache_shards vector + ASSERT_EQ(total_onodes, 5u); + ASSERT_EQ(total_bytes, 40u); + + auto ch = store->create_new_collection(cid); + { + ObjectStore::Transaction t; + t.create_collection(cid, 0); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + ObjectStore::Transaction t; + 
bufferlist bl, orig, orig2; + + bl.append(std::string(obj_size, 'a')); + t.write(cid, hoid, 0, bl.length(), bl); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + get_mempool_stats(&total_bytes, &total_onodes); + ASSERT_GT(total_bytes - total_bytes_prev, 0u); + ASSERT_EQ(total_onodes, 6u); + + { + ObjectStore::Transaction t; + t.truncate(cid, hoid, 0); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + + for(size_t i = 0; i < 1; ++i) { + bufferlist bl; + bl.append(std::string(block_size * (i+1), 'a')); + for( size_t j = 0; j < obj_size; j+= bl.length()) { + ObjectStore::Transaction t; + t.write(cid, hoid, j, bl.length(), bl); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + get_mempool_stats(&total_bytes2, &total_onodes); + ASSERT_NE(total_bytes2, 0u); + ASSERT_EQ(total_onodes, 6u); + } + { + cout <<" mempool dump:\n"; + JSONFormatter f(true); + f.open_object_section("transaction"); + mempool::dump(&f); + f.close_section(); + f.flush(cout); + cout << std::endl; + } + { + bufferlist bl; + for (size_t i = 0; i < obj_size; i += 0x1000) { + store->read(ch, hoid, i, 0x1000, bl); + } + } + get_mempool_stats(&total_bytes, &total_onodes); + ASSERT_NE(total_bytes, 0u); + ASSERT_EQ(total_onodes, 6u); + + { + cout <<" mempool dump:\n"; + JSONFormatter f(true); + f.open_object_section("transaction"); + mempool::dump(&f); + f.close_section(); + f.flush(cout); + cout << std::endl; + } + { + ObjectStore::Transaction t; + t.remove(cid, hoid); + t.remove_collection(cid); + cerr << "Cleaning" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } +} + +TEST_P(StoreTestSpecificAUSize, BlobReuseOnOverwrite) { + + if (string(GetParam()) != "bluestore") + return; + + size_t block_size = 4096; + StartDeferred(block_size); + SetVal(g_conf(), "bluestore_max_blob_size", "65536"); + g_conf().apply_changes(nullptr); + + int r; + coll_t cid; + ghobject_t hoid(hobject_t("test_hint", "", 
CEPH_NOSNAP, 0, -1, "")); + + const PerfCounters* logger = store->get_perf_counters(); + + auto ch = store->create_new_collection(cid); + { + ObjectStore::Transaction t; + t.create_collection(cid, 0); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + ObjectStore::Transaction t; + bufferlist bl; + + bl.append(std::string(block_size * 2, 'a')); + t.write(cid, hoid, 0, bl.length(), bl, CEPH_OSD_OP_FLAG_FADVISE_WILLNEED); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + // overwrite at the beginning + ObjectStore::Transaction t; + bufferlist bl; + + bl.append(std::string(block_size, 'b')); + t.write(cid, hoid, 0, bl.length(), bl, CEPH_OSD_OP_FLAG_FADVISE_WILLNEED); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + // append + ObjectStore::Transaction t; + bufferlist bl; + + bl.append(std::string(block_size * 2, 'c')); + t.write(cid, hoid, block_size * 2, bl.length(), bl, CEPH_OSD_OP_FLAG_FADVISE_WILLNEED); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + // append with a gap + ObjectStore::Transaction t; + bufferlist bl; + + bl.append(std::string(block_size * 2, 'd')); + t.write(cid, hoid, block_size * 5, bl.length(), bl, CEPH_OSD_OP_FLAG_FADVISE_WILLNEED); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + // We need to issue a read to trigger cache stat update that refresh + // perf counters. additionally we need to wait some time for mempool + // thread to update stats. 
+ sleep(1); + bufferlist bl, expected; + r = store->read(ch, hoid, 0, block_size, bl); + ASSERT_EQ(r, (int)block_size); + expected.append(string(block_size, 'b')); + ASSERT_TRUE(bl_eq(expected, bl)); + ASSERT_EQ(logger->get(l_bluestore_blobs), 1u); + ASSERT_EQ(logger->get(l_bluestore_extents), 2u); + } + { + // overwrite at end + ObjectStore::Transaction t; + bufferlist bl; + + bl.append(std::string(block_size * 2, 'e')); + + // Currently we are unable to reuse blob when overwriting in a single step + t.write(cid, hoid, block_size * 6, bl.length(), bl, CEPH_OSD_OP_FLAG_FADVISE_WILLNEED); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + // We need to issue a read to trigger cache stat update that refresh + // perf counters. additionally we need to wait some time for mempool + // thread to update stats. + sleep(1); + bufferlist bl, expected; + r = store->read(ch, hoid, 0, block_size, bl); + ASSERT_EQ(r, (int)block_size); + expected.append(string(block_size, 'b')); + ASSERT_TRUE(bl_eq(expected, bl)); + ASSERT_EQ(logger->get(l_bluestore_blobs), 1u); + ASSERT_EQ(logger->get(l_bluestore_extents), 2u); + } + { + // fill the gap + ObjectStore::Transaction t; + bufferlist bl; + + bl.append(std::string(block_size, 'f')); + + t.write(cid, hoid, block_size * 4, bl.length(), bl, CEPH_OSD_OP_FLAG_FADVISE_WILLNEED); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + // we need to wait some time for mempool + // thread to update stats to be able to check blob/extent numbers from + // perf counters. 
+ sleep(1); + + bufferlist bl, expected; + r = store->read(ch, hoid, 0, block_size, bl); + ASSERT_EQ(r, (int)block_size); + expected.append(string(block_size, 'b')); + ASSERT_TRUE(bl_eq(expected, bl)); + + bl.clear(); + expected.clear(); + r = store->read(ch, hoid, block_size, block_size, bl); + ASSERT_EQ(r, (int)block_size); + expected.append(string(block_size, 'a')); + ASSERT_TRUE(bl_eq(expected, bl)); + + bl.clear(); + expected.clear(); + r = store->read(ch, hoid, block_size * 2, block_size * 2, bl); + ASSERT_EQ(r, (int)block_size * 2); + expected.append(string(block_size * 2, 'c')); + ASSERT_TRUE(bl_eq(expected, bl)); + + bl.clear(); + expected.clear(); + r = store->read(ch, hoid, block_size * 4, block_size, bl); + ASSERT_EQ(r, (int)block_size); + expected.append(string(block_size, 'f')); + ASSERT_TRUE(bl_eq(expected, bl)); + + bl.clear(); + expected.clear(); + r = store->read(ch, hoid, block_size * 5, block_size, bl); + ASSERT_EQ(r, (int)block_size); + expected.append(string(block_size, 'd')); + ASSERT_TRUE(bl_eq(expected, bl)); + + bl.clear(); + expected.clear(); + r = store->read(ch, hoid, block_size * 5, block_size * 3, bl); + ASSERT_EQ(r, (int)block_size * 3); + expected.append(string(block_size, 'd')); + expected.append(string(block_size * 2, 'e')); + ASSERT_TRUE(bl_eq(expected, bl)); + } + ASSERT_EQ(logger->get(l_bluestore_blobs), 1u); + ASSERT_EQ(logger->get(l_bluestore_extents), 1u); + + + { + ObjectStore::Transaction t; + t.remove(cid, hoid); + t.remove_collection(cid); + cerr << "Cleaning" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } +} + +TEST_P(StoreTestSpecificAUSize, ZeroBlockDetectionSmallAppend) { + CephContext *cct = (new CephContext(CEPH_ENTITY_TYPE_CLIENT))->get(); + if (string(GetParam()) != "bluestore" || !cct->_conf->bluestore_zero_block_detection) { + GTEST_SKIP() << "not bluestore or bluestore_zero_block_detection=false, skipping"; + } + + size_t block_size = 65536; + StartDeferred(block_size); 
+ + int r; + coll_t cid; + ghobject_t hoid(hobject_t("test", "", CEPH_NOSNAP, 0, -1, "")); + + const PerfCounters* logger = store->get_perf_counters(); + + auto ch = store->create_new_collection(cid); + { + ObjectStore::Transaction t; + t.create_collection(cid, 0); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + // [1] append zeros + ObjectStore::Transaction t; + bufferlist bl; + + bl.append_zero(4096); + + t.write(cid, hoid, 0, bl.length(), bl); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + ASSERT_EQ(logger->get(l_bluestore_write_small), 1u); + ASSERT_EQ(logger->get(l_bluestore_write_small_bytes), 4096u); + ASSERT_EQ(logger->get(l_bluestore_write_small_skipped), 1u); + ASSERT_EQ(logger->get(l_bluestore_write_small_skipped_bytes), 4096u); + + bufferlist in; + r = store->read(ch, hoid, 0, 0x4000, in); + ASSERT_EQ(4096, r); + ASSERT_TRUE(in.is_zero()); + } + + { + // [2] append non-zeros + ObjectStore::Transaction t; + bufferlist bl; + + bl.append(std::string(4096, 'c')); + + t.write(cid, hoid, 4096, bl.length(), bl); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + ASSERT_EQ(logger->get(l_bluestore_write_small), 2u); + ASSERT_EQ(logger->get(l_bluestore_write_small_bytes), 4096u*2); + ASSERT_EQ(logger->get(l_bluestore_write_small_skipped), 1u); + ASSERT_EQ(logger->get(l_bluestore_write_small_skipped_bytes), 4096u); + + bufferlist in, _exp; + r = store->read(ch, hoid, 0, 0x4000, in); + ASSERT_EQ(4096 * 2, r); + _exp.append_zero(4096); + _exp.append(bl); + ASSERT_TRUE(bl_eq(_exp, in)); + } + + { + ObjectStore::Transaction t; + t.remove(cid, hoid); + t.remove_collection(cid); + cerr << "Cleaning" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } +} + +TEST_P(StoreTestSpecificAUSize, ZeroBlockDetectionSmallOverwrite) { + CephContext *cct = (new CephContext(CEPH_ENTITY_TYPE_CLIENT))->get(); + if (string(GetParam()) != "bluestore" || 
!cct->_conf->bluestore_zero_block_detection) { + GTEST_SKIP() << "not bluestore or bluestore_zero_block_detection=false, skipping"; + } + if (smr) { + GTEST_SKIP() << "smr, skipping"; + } + + size_t block_size = 65536; + StartDeferred(block_size); + + int r; + coll_t cid; + ghobject_t hoid(hobject_t("test", "", CEPH_NOSNAP, 0, -1, "")); + + const PerfCounters* logger = store->get_perf_counters(); + + auto ch = store->create_new_collection(cid); + { + ObjectStore::Transaction t; + t.create_collection(cid, 0); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + + { + // {setting up the scenario} append non-zeros + ObjectStore::Transaction t; + bufferlist bl; + + bl.append(std::string(4096, 'c')); + + t.write(cid, hoid, 0, bl.length(), bl); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + ASSERT_EQ(logger->get(l_bluestore_write_small), 1u); + ASSERT_EQ(logger->get(l_bluestore_write_small_bytes), 4096u); + ASSERT_EQ(logger->get(l_bluestore_write_small_skipped), 0u); + ASSERT_EQ(logger->get(l_bluestore_write_small_skipped_bytes), 0u); + + bufferlist in, _exp; + r = store->read(ch, hoid, 0, 0x4000, in); + ASSERT_EQ(4096, r); + _exp.append(bl); + ASSERT_TRUE(bl_eq(_exp, in)); + } + + { + // [1] overwrite non-zeros with zeros + ObjectStore::Transaction t; + bufferlist bl; + + bl.append_zero(4096); + + t.write(cid, hoid, 0, bl.length(), bl); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + ASSERT_EQ(logger->get(l_bluestore_write_small), 2u); + ASSERT_EQ(logger->get(l_bluestore_write_small_bytes), 4096u*2); + ASSERT_EQ(logger->get(l_bluestore_write_small_skipped), 0u); + ASSERT_EQ(logger->get(l_bluestore_write_small_skipped_bytes), 0u); + + bufferlist in; + r = store->read(ch, hoid, 0, 0x4000, in); + ASSERT_EQ(4096, r); + ASSERT_TRUE(in.is_zero()); + } + + { + // [2] overwrite zeros with non-zeros + ObjectStore::Transaction t; + bufferlist bl; + + bl.append(std::string(4096, 'c')); + + t.write(cid, hoid, 0, 
bl.length(), bl); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + ASSERT_EQ(logger->get(l_bluestore_write_small), 3u); + ASSERT_EQ(logger->get(l_bluestore_write_small_bytes), 4096u*3); + ASSERT_EQ(logger->get(l_bluestore_write_small_skipped), 0u); + ASSERT_EQ(logger->get(l_bluestore_write_small_skipped_bytes), 0u); + + bufferlist in, _exp; + r = store->read(ch, hoid, 0, 0x4000, in); + ASSERT_EQ(4096, r); + _exp.append(bl); + ASSERT_TRUE(bl_eq(_exp, in)); + } + + { + ObjectStore::Transaction t; + t.remove(cid, hoid); + t.remove_collection(cid); + cerr << "Cleaning" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } +} + +TEST_P(StoreTestSpecificAUSize, ZeroBlockDetectionBigAppend) { + CephContext *cct = (new CephContext(CEPH_ENTITY_TYPE_CLIENT))->get(); + if (string(GetParam()) != "bluestore" || !cct->_conf->bluestore_zero_block_detection) { + GTEST_SKIP() << "not bluestore or bluestore_zero_block_detection=false, skipping"; + } + + size_t block_size = 4096; + StartDeferred(block_size); + + int r; + coll_t cid; + ghobject_t hoid(hobject_t("test", "", CEPH_NOSNAP, 0, -1, "")); + + const PerfCounters* logger = store->get_perf_counters(); + + auto ch = store->create_new_collection(cid); + { + ObjectStore::Transaction t; + t.create_collection(cid, 0); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + + { + // [1] append zeros + ObjectStore::Transaction t; + bufferlist bl; + + bl.append_zero(block_size * 2); + + t.write(cid, hoid, 0, bl.length(), bl); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + ASSERT_EQ(logger->get(l_bluestore_write_big), 1u); + ASSERT_EQ(logger->get(l_bluestore_write_big_bytes), 4096u*2); + ASSERT_EQ(logger->get(l_bluestore_write_big_blobs), 0u); + ASSERT_EQ(logger->get(l_bluestore_write_big_skipped_blobs), 1u); + ASSERT_EQ(logger->get(l_bluestore_write_big_skipped_bytes), 4096u*2); + + bufferlist in; + r = store->read(ch, hoid, 0, block_size 
* 8, in); + ASSERT_EQ(block_size * 2, r); + ASSERT_TRUE(in.is_zero()); + } + + { + // [2] append non-zeros + ObjectStore::Transaction t; + bufferlist bl; + + bl.append(std::string(block_size * 2, 'c')); + + t.write(cid, hoid, block_size * 2, bl.length(), bl); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + ASSERT_EQ(logger->get(l_bluestore_write_big), 2u); + ASSERT_EQ(logger->get(l_bluestore_write_big_bytes), 4096u*4); + ASSERT_EQ(logger->get(l_bluestore_write_big_blobs), 1u); + ASSERT_EQ(logger->get(l_bluestore_write_big_skipped_blobs), 1u); + ASSERT_EQ(logger->get(l_bluestore_write_big_skipped_bytes), 4096u*2); + + bufferlist in, _exp; + r = store->read(ch, hoid, 0, block_size * 8, in); + ASSERT_EQ(block_size * 4, r); + _exp.append_zero(block_size * 2); + _exp.append(bl); + ASSERT_TRUE(bl_eq(_exp, in)); + } + + { + ObjectStore::Transaction t; + t.remove(cid, hoid); + t.remove_collection(cid); + cerr << "Cleaning" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } +} + +TEST_P(StoreTestSpecificAUSize, ZeroBlockDetectionBigOverwrite) { + CephContext *cct = (new CephContext(CEPH_ENTITY_TYPE_CLIENT))->get(); + if (string(GetParam()) != "bluestore" || !cct->_conf->bluestore_zero_block_detection) { + GTEST_SKIP() << "not bluestore or bluestore_zero_block_detection=false, skipping"; + } + if (smr) { + GTEST_SKIP() << "smr, skipping"; + } + + size_t block_size = 4096; + StartDeferred(block_size); + + int r; + coll_t cid; + ghobject_t hoid(hobject_t("test", "", CEPH_NOSNAP, 0, -1, "")); + + const PerfCounters* logger = store->get_perf_counters(); + + auto ch = store->create_new_collection(cid); + { + ObjectStore::Transaction t; + t.create_collection(cid, 0); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + + { + // {setting up the scenario} append non-zeros + ObjectStore::Transaction t; + bufferlist bl; + + bl.append(std::string(block_size * 2, 'c')); + + t.write(cid, hoid, 0, 
bl.length(), bl); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + ASSERT_EQ(logger->get(l_bluestore_write_big), 1u); + ASSERT_EQ(logger->get(l_bluestore_write_big_bytes), 4096u*2); + ASSERT_EQ(logger->get(l_bluestore_write_big_blobs), 1u); + ASSERT_EQ(logger->get(l_bluestore_write_big_skipped_blobs), 0u); + ASSERT_EQ(logger->get(l_bluestore_write_big_skipped_bytes), 0u); + + bufferlist in, _exp; + r = store->read(ch, hoid, 0, block_size * 8, in); + ASSERT_EQ(block_size * 2, r); + _exp.append(bl); + ASSERT_TRUE(bl_eq(_exp, in)); + } + + { + // [1] overwrite non-zeros with zeros + ObjectStore::Transaction t; + bufferlist bl; + + bl.append_zero(block_size * 2); + + t.write(cid, hoid, 0, bl.length(), bl); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + ASSERT_EQ(logger->get(l_bluestore_write_big), 2u); + ASSERT_EQ(logger->get(l_bluestore_write_big_bytes), 4096u*4); + ASSERT_EQ(logger->get(l_bluestore_write_big_blobs), 1u); + ASSERT_EQ(logger->get(l_bluestore_write_big_skipped_blobs), 1u); + ASSERT_EQ(logger->get(l_bluestore_write_big_skipped_bytes), 4096u*2); + + bufferlist in; + r = store->read(ch, hoid, 0, block_size * 8, in); + ASSERT_EQ(block_size * 2, r); + ASSERT_TRUE(in.is_zero()); + } + + { + // [2] overwrite zeros with non-zeros + ObjectStore::Transaction t; + bufferlist bl; + + bl.append(std::string(block_size * 2, 'c')); + + t.write(cid, hoid, 0, bl.length(), bl); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + ASSERT_EQ(logger->get(l_bluestore_write_big), 3u); + ASSERT_EQ(logger->get(l_bluestore_write_big_bytes), 4096u*6); + ASSERT_EQ(logger->get(l_bluestore_write_big_blobs), 2u); + ASSERT_EQ(logger->get(l_bluestore_write_big_skipped_blobs), 1u); + ASSERT_EQ(logger->get(l_bluestore_write_big_skipped_bytes), 4096u*2); + + bufferlist in, _exp; + r = store->read(ch, hoid, 0, block_size * 8, in); + ASSERT_EQ(block_size * 2, r); + _exp.append(bl); + ASSERT_TRUE(bl_eq(_exp, in)); + } + + { + 
ObjectStore::Transaction t; + t.remove(cid, hoid); + t.remove_collection(cid); + cerr << "Cleaning" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } +} + +TEST_P(StoreTestSpecificAUSize, DeferredOnBigOverwrite) { + + if (string(GetParam()) != "bluestore") + return; + if (smr) { + cout << "SKIP: no deferred" << std::endl; + return; + } + + size_t block_size = 4096; + StartDeferred(block_size); + SetVal(g_conf(), "bluestore_max_blob_size", "131072"); + SetVal(g_conf(), "bluestore_prefer_deferred_size", "65536"); + + g_conf().apply_changes(nullptr); + + int r; + coll_t cid; + ghobject_t hoid(hobject_t("test", "", CEPH_NOSNAP, 0, -1, "")); + ghobject_t hoid2(hobject_t("test2", "", CEPH_NOSNAP, 0, -1, "")); + + PerfCounters* logger = const_cast<PerfCounters*>(store->get_perf_counters()); + + auto ch = store->create_new_collection(cid); + { + ObjectStore::Transaction t; + t.create_collection(cid, 0); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + ObjectStore::Transaction t; + bufferlist bl, bl2; + + bl.append(std::string(block_size * 2, 'c')); + bl2.append(std::string(block_size * 3, 'd')); + + t.write(cid, hoid, 0, bl.length(), bl, CEPH_OSD_OP_FLAG_FADVISE_NOCACHE); + t.set_alloc_hint(cid, hoid2, block_size * 4, block_size * 4, + CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_READ); + t.write(cid, hoid2, 0, bl2.length(), bl2, CEPH_OSD_OP_FLAG_FADVISE_NOCACHE); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + ASSERT_EQ(logger->get(l_bluestore_write_big), 2u); + ASSERT_EQ(logger->get(l_bluestore_write_big_deferred), 0u); + + { + struct store_statfs_t statfs; + int r = store->statfs(&statfs); + ASSERT_EQ(r, 0); + ASSERT_EQ(statfs.data_stored, (unsigned)block_size * 5); + ASSERT_LE(statfs.allocated, (unsigned)block_size * 5); + } + + // overwrite at the beginning, 4K alignment + { + ObjectStore::Transaction t; + bufferlist bl; + + bl.append(std::string(block_size, 'b')); + t.write(cid, 
hoid, 0, bl.length(), bl, CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
    r = queue_transaction(store, ch, std::move(t));
    ASSERT_EQ(r, 0);
  }
  // The one-block overwrite is expected to be counted as a deferred big write.
  ASSERT_EQ(logger->get(l_bluestore_write_big), 3u);
  ASSERT_EQ(logger->get(l_bluestore_write_big_deferred), 1u);

  {
    bufferlist bl, expected;
    r = store->read(ch, hoid, 0, block_size, bl);
    ASSERT_EQ(r, (int)block_size);
    expected.append(string(block_size, 'b'));
    ASSERT_TRUE(bl_eq(expected, bl));
  }
  {
    bufferlist bl, expected;
    r = store->read(ch, hoid, block_size, block_size, bl);
    ASSERT_EQ(r, (int)block_size);
    expected.append(string(block_size, 'c'));
    ASSERT_TRUE(bl_eq(expected, bl));
  }

  // overwrite at the end, 4K alignment
  {
    ObjectStore::Transaction t;
    bufferlist bl;

    bl.append(std::string(block_size, 'g'));
    t.write(cid, hoid, block_size, bl.length(), bl, CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
    r = queue_transaction(store, ch, std::move(t));
    ASSERT_EQ(r, 0);
  }
  ASSERT_EQ(logger->get(l_bluestore_write_big), 4u);
  ASSERT_EQ(logger->get(l_bluestore_write_big_deferred), 2u);

  {
    bufferlist bl, expected;
    r = store->read(ch, hoid, 0, block_size, bl);
    ASSERT_EQ(r, (int)block_size);
    expected.append(string(block_size, 'b'));
    ASSERT_TRUE(bl_eq(expected, bl));
  }
  {
    bufferlist bl, expected;
    r = store->read(ch, hoid, block_size, block_size, bl);
    ASSERT_EQ(r, (int)block_size);
    expected.append(string(block_size, 'g'));
    ASSERT_TRUE(bl_eq(expected, bl));
  }

  // overwrite at 4K, 12K alignment
  {
    ObjectStore::Transaction t;
    bufferlist bl;

    bl.append(std::string(block_size, 'e'));
    t.write(cid, hoid2, block_size , bl.length(), bl, CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
    r = queue_transaction(store, ch, std::move(t));
    ASSERT_EQ(r, 0);
  }
  ASSERT_EQ(logger->get(l_bluestore_write_big), 5u);
  ASSERT_EQ(logger->get(l_bluestore_write_big_deferred), 3u);

  // makes sure deferred has been submitted
  // and do all the checks again
  sleep(g_conf().get_val<double>("bluestore_max_defer_interval") + 2);

  ASSERT_EQ(logger->get(l_bluestore_write_big), 5u);
  ASSERT_EQ(logger->get(l_bluestore_write_big_deferred), 3u);

  {
    bufferlist bl, expected;
    r = store->read(ch, hoid, 0, block_size, bl);
    ASSERT_EQ(r, (int)block_size);
    expected.append(string(block_size, 'b'));
    ASSERT_TRUE(bl_eq(expected, bl));
  }
  {
    bufferlist bl, expected;
    r = store->read(ch, hoid, block_size, block_size, bl);
    ASSERT_EQ(r, (int)block_size);
    expected.append(string(block_size, 'g'));
    ASSERT_TRUE(bl_eq(expected, bl));
  }
  {
    bufferlist bl, expected;
    r = store->read(ch, hoid2, 0, block_size, bl);
    ASSERT_EQ(r, (int)block_size);
    expected.append(string(block_size, 'd'));
    ASSERT_TRUE(bl_eq(expected, bl));
  }
  {
    bufferlist bl, expected;
    r = store->read(ch, hoid2, block_size, block_size, bl);
    ASSERT_EQ(r, (int)block_size);
    expected.append(string(block_size, 'e'));
    ASSERT_TRUE(bl_eq(expected, bl));
  }
  {
    bufferlist bl, expected;
    r = store->read(ch, hoid2, block_size * 2, block_size, bl);
    ASSERT_EQ(r, (int)block_size);
    expected.append(string(block_size, 'd'));
    ASSERT_TRUE(bl_eq(expected, bl));
  }

  {
    struct store_statfs_t statfs;
    int r = store->statfs(&statfs);
    ASSERT_EQ(r, 0);
    ASSERT_EQ(statfs.data_stored, (unsigned)block_size * 5);
    ASSERT_LE(statfs.allocated, (unsigned)block_size * 5);
  }
  ASSERT_EQ(logger->get(l_bluestore_blobs), 2u);
  ASSERT_EQ(logger->get(l_bluestore_extents), 2u);

  // Start over: drop both objects and recreate the first one with two blocks of 'f'.
  {
    ObjectStore::Transaction t;
    t.remove(cid, hoid);
    t.remove(cid, hoid2);
    r = queue_transaction(store, ch, std::move(t));
    ASSERT_EQ(r, 0);
  }

  {
    ObjectStore::Transaction t;
    bufferlist bl;
    bl.append(std::string(block_size * 2, 'f'));

    t.write(cid, hoid, 0, bl.length(), bl, CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
    r = queue_transaction(store, ch, std::move(t));
    ASSERT_EQ(r, 0);
  }
  ASSERT_EQ(logger->get(l_bluestore_write_big), 6u);
  ASSERT_EQ(logger->get(l_bluestore_write_big_deferred), 3u);

  // Zero the first 100 bytes; the rest of the object must keep its 'f' payload.
  {
    ObjectStore::Transaction t;
    t.zero(cid, hoid, 0, 100);
    r = queue_transaction(store, ch, std::move(t));
    ASSERT_EQ(r, 0);
  }
  {
    bufferlist bl, expected;
    r = store->read(ch, hoid, 0, 100, bl);
    ASSERT_EQ(r, (int)100);
    expected.append(string(100, 0));
    ASSERT_TRUE(bl_eq(expected, bl));
  }
  {
    bufferlist bl, expected;
    r = store->read(ch, hoid, 100, block_size * 2 - 100, bl);
    ASSERT_EQ(r, (int)block_size * 2 - 100);
    expected.append(string(block_size * 2 - 100, 'f'));
    ASSERT_TRUE(bl_eq(expected, bl));
  }
  sleep(2);
  {
    struct store_statfs_t statfs;
    int r = store->statfs(&statfs);
    ASSERT_EQ(r, 0);
    ASSERT_EQ(statfs.data_stored, (unsigned)block_size * 2 - 100);
    ASSERT_LE(statfs.allocated, (unsigned)block_size * 2);
  }
  ASSERT_EQ(logger->get(l_bluestore_blobs), 1u);
  ASSERT_EQ(logger->get(l_bluestore_extents), 1u);

  // Overwrite the first block (including the zeroed range) with 'g'.
  {
    ObjectStore::Transaction t;
    bufferlist bl;
    bl.append(std::string(block_size, 'g'));

    t.write(cid, hoid, 0, bl.length(), bl, CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
    r = queue_transaction(store, ch, std::move(t));
    ASSERT_EQ(r, 0);
  }
  ASSERT_EQ(logger->get(l_bluestore_write_big), 7u);
  ASSERT_EQ(logger->get(l_bluestore_write_big_deferred), 4u);
  {
    bufferlist bl, expected;
    r = store->read(ch, hoid, 0, block_size, bl);
    ASSERT_EQ(r, (int)block_size);
    expected.append(string(block_size, 'g'));
    ASSERT_TRUE(bl_eq(expected, bl));
  }
  {
    bufferlist bl, expected;
    r = store->read(ch, hoid, block_size, block_size, bl);
    ASSERT_EQ(r, (int)block_size);
    expected.append(string(block_size, 'f'));
    ASSERT_TRUE(bl_eq(expected, bl));
  }

  {
    struct store_statfs_t statfs;
    int r = store->statfs(&statfs);
    ASSERT_EQ(r, 0);
    ASSERT_EQ(statfs.data_stored, (unsigned)block_size * 2);
    ASSERT_LE(statfs.allocated, (unsigned)block_size * 2);
  }
  ASSERT_EQ(logger->get(l_bluestore_blobs), 1u);
  ASSERT_EQ(logger->get(l_bluestore_extents), 1u);

  // check whether full overwrite bypass deferred
  {
    ObjectStore::Transaction t;
    bufferlist bl;
    bl.append(std::string(block_size * 2, 'h'));

    t.write(cid, hoid, 0, bl.length(), bl, CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
    r = queue_transaction(store, ch, std::move(t));
    ASSERT_EQ(r, 0);
  }
  ASSERT_EQ(logger->get(l_bluestore_write_big), 8u);
  ASSERT_EQ(logger->get(l_bluestore_write_big_deferred), 4u);

  {
    bufferlist bl, expected;
    r = store->read(ch, hoid, 0, block_size * 2, bl);
    ASSERT_EQ(r, (int)block_size * 2);
    expected.append(string(block_size * 2, 'h'));
    ASSERT_TRUE(bl_eq(expected, bl));
  }

  {
    struct store_statfs_t statfs;
    int r = store->statfs(&statfs);
    ASSERT_EQ(r, 0);
    ASSERT_EQ(statfs.data_stored, (unsigned)block_size * 2);
    ASSERT_LE(statfs.allocated, (unsigned)block_size * 2);
  }

  {
    ObjectStore::Transaction t;
    t.remove(cid, hoid);
    t.remove(cid, hoid2);
    r = queue_transaction(store, ch, std::move(t));
    ASSERT_EQ(r, 0);
  }

  {
    ObjectStore::Transaction t;
    bufferlist bl;
    bl.append(std::string(block_size * 32, 'a'));

    // this will create two 128K aligned blobs
    t.write(cid, hoid, 0, bl.length(), bl, CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
    t.write(cid, hoid, bl.length(), bl.length(), bl, CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
    r = queue_transaction(store, ch, std::move(t));
    ASSERT_EQ(r, 0);
  }
  ASSERT_EQ(logger->get(l_bluestore_write_big), 10u);
  ASSERT_EQ(logger->get(l_bluestore_write_big_deferred), 4u);

  // check whether overwrite (less than prefer_deferred_size) partially overlapping two adjacent blobs goes
  // deferred
  {
    ObjectStore::Transaction t;
    bufferlist bl;
    bl.append(std::string(block_size * 3, 'b'));

    t.write(cid, hoid, 0x20000 - block_size, bl.length(), bl, CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
    r = queue_transaction(store, ch, std::move(t));
    ASSERT_EQ(r, 0);
  }
  ASSERT_EQ(logger->get(l_bluestore_write_big), 11u);
  ASSERT_EQ(logger->get(l_bluestore_write_big_deferred), 6u);

  {
    bufferlist bl, expected;
    r = store->read(ch, hoid, 0, 0x20000 - block_size, bl);
    ASSERT_EQ(r, 0x20000 - block_size);
    expected.append(string(r, 'a'));
    ASSERT_TRUE(bl_eq(expected, bl));
    expected.clear();

    r = store->read(ch, hoid, 0x20000 - block_size, block_size * 3, bl);
    ASSERT_EQ(r, 3 * block_size);
    expected.append(string(r, 'b'));
    ASSERT_TRUE(bl_eq(expected, bl));
    expected.clear();

    r = store->read(ch, hoid, 0x20000 + 2 * block_size, block_size * 30, bl);
    ASSERT_EQ(r, 30 * block_size);
    expected.append(string(r, 'a'));
    ASSERT_TRUE(bl_eq(expected, bl));
    expected.clear();
  }

  {
    struct store_statfs_t statfs;
    int r = store->statfs(&statfs);
    ASSERT_EQ(r, 0);
    ASSERT_EQ(statfs.data_stored, (unsigned)block_size * 64);
    ASSERT_LE(statfs.allocated, (unsigned)block_size * 64);
  }

  // check whether overwrite (larger than prefer_deferred_size) partially
  // overlapping two adjacent blobs goes deferred
  {
    ObjectStore::Transaction t;
    bufferlist bl;
    bl.append(std::string(block_size * 30, 'c'));

    t.write(cid, hoid, 0x10000 + block_size, bl.length(), bl, CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
    r = queue_transaction(store, ch, std::move(t));
    ASSERT_EQ(r, 0);
  }
  sleep(2);
  ASSERT_EQ(logger->get(l_bluestore_write_big), 12u);
  ASSERT_EQ(logger->get(l_bluestore_write_big_deferred), 8u);

  {
    bufferlist bl, expected;
    r = store->read(ch, hoid, 0, 0x11000, bl);
    ASSERT_EQ(r, 0x11000);
    expected.append(string(r, 'a'));
    ASSERT_TRUE(bl_eq(expected, bl));
    expected.clear();

    r = store->read(ch, hoid, 0x11000, block_size * 30, bl);
    ASSERT_EQ(r, block_size * 30);
    expected.append(string(r, 'c'));
    ASSERT_TRUE(bl_eq(expected, bl));
    expected.clear();

    r = store->read(ch, hoid, block_size * 47, 0x10000 + block_size, bl);
    ASSERT_EQ(r, 0x10000 + block_size);
    expected.append(string(r, 'a'));
    ASSERT_TRUE(bl_eq(expected, bl));
    expected.clear();
  }

  {
    struct store_statfs_t statfs;
    int r = store->statfs(&statfs);
    ASSERT_EQ(r, 0);
    ASSERT_EQ(statfs.data_stored, (unsigned)block_size * 64);
    ASSERT_LE(statfs.allocated, (unsigned)block_size * 64);
  }

  // Counters restart from zero here, so the assertions below cover only the next write.
  logger->reset();
  // check whether overwrite (prefer_deferred_size < 120K < 2 * prefer_defer_size) partially
  // overlapping two adjacent blobs goes partly deferred
  {
    ObjectStore::Transaction t;
    bufferlist bl;
    bl.append(std::string(block_size * 30, 'e'));

    t.write(cid, hoid, 0x20000 - block_size, bl.length(), bl, CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
    r = queue_transaction(store, ch, std::move(t));
    ASSERT_EQ(r, 0);
  }
  sleep(2);
  ASSERT_EQ(logger->get(l_bluestore_write_big), 1u);
  ASSERT_EQ(logger->get(l_bluestore_write_big_deferred), 1u);
  ASSERT_EQ(logger->get(l_bluestore_issued_deferred_writes), 1u);
  ASSERT_EQ(logger->get(l_bluestore_issued_deferred_write_bytes), block_size);

  {
    struct store_statfs_t statfs;
    int r = store->statfs(&statfs);
    ASSERT_EQ(r, 0);
    ASSERT_EQ(statfs.data_stored, (unsigned)block_size * 64);
    ASSERT_LE(statfs.allocated, (unsigned)block_size * 64);
  }

  {
    ObjectStore::Transaction t;
    t.remove(cid, hoid);
    t.remove(cid, hoid2);
    t.remove_collection(cid);
    cerr << "Cleaning" << std::endl;
    r = queue_transaction(store, ch, std::move(t));
    ASSERT_EQ(r, 0);
  }
}

TEST_P(StoreTestSpecificAUSize, DeferredOnBigOverwrite2) {

  if (string(GetParam()) != "bluestore")
    return;
  if (smr) {
    cout << "SKIP: no deferred" << std::endl;
    return;
  }

  size_t block_size = 4096;
  StartDeferred(block_size);
  SetVal(g_conf(), "bluestore_max_blob_size", "65536");
  SetVal(g_conf(), "bluestore_prefer_deferred_size", "65536");

  g_conf().apply_changes(nullptr);

  int r;
  coll_t cid;
  ghobject_t hoid(hobject_t("test", "", CEPH_NOSNAP, 0, -1, ""));

  PerfCounters* logger = const_cast<PerfCounters*>(store->get_perf_counters());

  auto ch = store->create_new_collection(cid);
  {
    ObjectStore::Transaction t;
    t.create_collection(cid, 0);
+ r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + ObjectStore::Transaction t; + bufferlist bl; + + bl.append(std::string(128 * 1024, 'c')); + + t.write(cid, hoid, 0x1000, bl.length(), bl, CEPH_OSD_OP_FLAG_FADVISE_NOCACHE); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + ASSERT_EQ(logger->get(l_bluestore_write_big), 1u); + ASSERT_EQ(logger->get(l_bluestore_write_big_bytes), bl.length()); + ASSERT_EQ(logger->get(l_bluestore_write_big_blobs), 3u); + ASSERT_EQ(logger->get(l_bluestore_write_big_deferred), 0u); + ASSERT_EQ(logger->get(l_bluestore_issued_deferred_writes), 0u); + ASSERT_EQ(logger->get(l_bluestore_issued_deferred_write_bytes), 0); + } + + logger->reset(); + { + ObjectStore::Transaction t; + bufferlist bl; + + bl.append(std::string(128 * 1024, 'c')); + + t.write(cid, hoid, 0x2000, bl.length(), bl, CEPH_OSD_OP_FLAG_FADVISE_NOCACHE); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + + ASSERT_EQ(logger->get(l_bluestore_write_big), 1u); + ASSERT_EQ(logger->get(l_bluestore_write_big_bytes), bl.length()); + ASSERT_EQ(logger->get(l_bluestore_write_big_blobs), 3u); + ASSERT_EQ(logger->get(l_bluestore_write_big_deferred), 1u); + ASSERT_EQ(logger->get(l_bluestore_issued_deferred_writes), 1u); + ASSERT_EQ(logger->get(l_bluestore_issued_deferred_write_bytes), 57344); + } + + { + ObjectStore::Transaction t; + t.remove(cid, hoid); + t.remove(cid, hoid); + t.remove_collection(cid); + cerr << "Cleaning" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } +} + +TEST_P(StoreTestSpecificAUSize, DeferredOnBigOverwrite3) { + + if (string(GetParam()) != "bluestore") + return; + if (smr) { + cout << "SKIP: no deferred" << std::endl; + return; + } + + size_t block_size = 4096; + StartDeferred(block_size); + SetVal(g_conf(), "bluestore_max_blob_size", "65536"); + SetVal(g_conf(), "bluestore_prefer_deferred_size", "65536"); + + g_conf().apply_changes(nullptr); + + int r; + 
coll_t cid; + ghobject_t hoid(hobject_t("test", "", CEPH_NOSNAP, 0, -1, "")); + + PerfCounters* logger = const_cast<PerfCounters*>(store->get_perf_counters()); + + auto ch = store->create_new_collection(cid); + { + ObjectStore::Transaction t; + t.create_collection(cid, 0); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + + logger->reset(); + { + ObjectStore::Transaction t; + bufferlist bl; + + bl.append(std::string(4096 * 1024, 'c')); + + t.write(cid, hoid, 0, bl.length(), bl, CEPH_OSD_OP_FLAG_FADVISE_NOCACHE); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + + ASSERT_EQ(logger->get(l_bluestore_write_big), 1u); + ASSERT_EQ(logger->get(l_bluestore_write_big_bytes), bl.length()); + ASSERT_EQ(logger->get(l_bluestore_write_big_blobs), 64u); + ASSERT_EQ(logger->get(l_bluestore_write_big_deferred), 0u); + ASSERT_EQ(logger->get(l_bluestore_issued_deferred_writes), 0u); + ASSERT_EQ(logger->get(l_bluestore_issued_deferred_write_bytes), 0u); + } + logger->reset(); + { + ObjectStore::Transaction t; + bufferlist bl; + + bl.append(std::string(4096 * 1024, 'c')); + + t.write(cid, hoid, 0x1000, bl.length(), bl, CEPH_OSD_OP_FLAG_FADVISE_NOCACHE); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + + ASSERT_EQ(logger->get(l_bluestore_write_big), 1u); + ASSERT_EQ(logger->get(l_bluestore_write_big_bytes), bl.length()); + ASSERT_EQ(logger->get(l_bluestore_write_big_blobs), 65u); + ASSERT_EQ(logger->get(l_bluestore_write_big_deferred), 1u); + ASSERT_EQ(logger->get(l_bluestore_issued_deferred_writes), 1u); + ASSERT_EQ(logger->get(l_bluestore_issued_deferred_write_bytes), 61440); + } + { + ObjectStore::Transaction t; + t.remove(cid, hoid); + t.remove_collection(cid); + cerr << "Cleaning" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } +} + +TEST_P(StoreTestSpecificAUSize, DeferredDifferentChunks) { + + if (string(GetParam()) != "bluestore") + return; + if (smr) { + cout << "SKIP: no 
deferred" << std::endl; + return; + } + + size_t alloc_size = 4096; + size_t large_object_size = 1 * 1024 * 1024; + size_t prefer_deferred_size = 65536; + StartDeferred(alloc_size); + SetVal(g_conf(), "bluestore_max_blob_size", "131072"); + SetVal(g_conf(), "bluestore_prefer_deferred_size", + stringify(prefer_deferred_size).c_str()); + g_conf().apply_changes(nullptr); + + int r; + coll_t cid; + const PerfCounters* logger = store->get_perf_counters(); + size_t exp_bluestore_write_big = 0; + size_t exp_bluestore_write_big_deferred = 0; + + ObjectStore::CollectionHandle ch = store->create_new_collection(cid); + { + ObjectStore::Transaction t; + t.create_collection(cid, 0); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + for (size_t expected_write_size = 1024; expected_write_size <= prefer_deferred_size; expected_write_size *= 2) { + //create object with hint + ghobject_t hoid(hobject_t("test-"+to_string(expected_write_size), "", CEPH_NOSNAP, 0, -1, "")); + { + ObjectStore::Transaction t; + t.touch(cid, hoid); + t.set_alloc_hint(cid, hoid, large_object_size, expected_write_size, + CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_READ | + CEPH_OSD_ALLOC_HINT_FLAG_APPEND_ONLY); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + + //fill object + { + ObjectStore::Transaction t; + bufferlist bl; + bl.append(std::string(large_object_size, 'h')); + t.write(cid, hoid, 0, bl.length(), bl, + CEPH_OSD_OP_FLAG_FADVISE_NOCACHE); + r = queue_transaction(store, ch, std::move(t)); + ++exp_bluestore_write_big; + ASSERT_EQ(r, 0); + } + ASSERT_EQ(logger->get(l_bluestore_write_big), exp_bluestore_write_big); + ASSERT_EQ(logger->get(l_bluestore_write_big_deferred), exp_bluestore_write_big_deferred); + + // check whether write will properly use deferred + { + ObjectStore::Transaction t; + bufferlist bl; + bl.append(std::string(alloc_size + 2, 'z')); + t.write(cid, hoid, large_object_size - 2 * alloc_size - 1, bl.length(), bl, + 
CEPH_OSD_OP_FLAG_FADVISE_NOCACHE); + r = queue_transaction(store, ch, std::move(t)); + ++exp_bluestore_write_big; + if (expected_write_size < prefer_deferred_size) + ++exp_bluestore_write_big_deferred; + ASSERT_EQ(r, 0); + } + ASSERT_EQ(logger->get(l_bluestore_write_big), exp_bluestore_write_big); + ASSERT_EQ(logger->get(l_bluestore_write_big_deferred), exp_bluestore_write_big_deferred); + } + ch.reset(nullptr); + CloseAndReopen(); + ch = store->open_collection(cid); + // check values + for (size_t expected_write_size = 1024; expected_write_size <= 65536; expected_write_size *= 2) { + ghobject_t hoid(hobject_t("test-"+to_string(expected_write_size), "", CEPH_NOSNAP, 0, -1, "")); + { + bufferlist bl, expected; + r = store->read(ch, hoid, 0, large_object_size, bl); + ASSERT_EQ(r, large_object_size); + expected.append(string(large_object_size - 2 * alloc_size - 1, 'h')); + expected.append(string(alloc_size + 2, 'z')); + expected.append(string(alloc_size - 1, 'h')); + ASSERT_TRUE(bl_eq(expected, bl)); + } + } + { + ObjectStore::Transaction t; + for (size_t expected_write_size = 1024; expected_write_size <= 65536; expected_write_size *= 2) { + ghobject_t hoid(hobject_t("test-"+to_string(expected_write_size), "", CEPH_NOSNAP, 0, -1, "")); + t.remove(cid, hoid); + } + t.remove_collection(cid); + cerr << "Cleaning" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } +} + +TEST_P(StoreTestSpecificAUSize, BlobReuseOnOverwriteReverse) { + + if (string(GetParam()) != "bluestore") + return; + if (smr) { + cout << "SKIP: no overwrite" << std::endl; + return; + } + + size_t block_size = 4096; + StartDeferred(block_size); + SetVal(g_conf(), "bluestore_max_blob_size", "65536"); + g_conf().apply_changes(nullptr); + + int r; + coll_t cid; + ghobject_t hoid(hobject_t("test_hint", "", CEPH_NOSNAP, 0, -1, "")); + + auto ch = store->create_new_collection(cid); + + const PerfCounters* logger = store->get_perf_counters(); + { + ObjectStore::Transaction t; 
+ t.create_collection(cid, 0); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + ObjectStore::Transaction t; + bufferlist bl; + + bl.append(std::string(block_size * 2, 'a')); + t.write(cid, hoid, block_size * 10, bl.length(), bl, + CEPH_OSD_OP_FLAG_FADVISE_WILLNEED); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + // prepend existing + ObjectStore::Transaction t; + bufferlist bl; + + bl.append(std::string(block_size, 'b')); + t.write(cid, hoid, block_size * 9, bl.length(), bl, + CEPH_OSD_OP_FLAG_FADVISE_WILLNEED); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + // We need to issue a read to trigger cache stat update that refresh + // perf counters. additionally we need to wait some time for mempool + // thread to update stats. + sleep(1); + bufferlist bl, expected; + r = store->read(ch, hoid, block_size * 9, block_size * 2, bl); + ASSERT_EQ(r, (int)block_size * 2); + expected.append(string(block_size, 'b')); + expected.append(string(block_size, 'a')); + ASSERT_TRUE(bl_eq(expected, bl)); + ASSERT_EQ(logger->get(l_bluestore_blobs), 1u); + ASSERT_EQ(logger->get(l_bluestore_extents), 1u); + } + + + { + // prepend existing with a gap + ObjectStore::Transaction t; + bufferlist bl; + + bl.append(std::string(block_size, 'c')); + t.write(cid, hoid, block_size * 7, bl.length(), bl, + CEPH_OSD_OP_FLAG_FADVISE_WILLNEED); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + // We need to issue a read to trigger cache stat update that refresh + // perf counters. additionally we need to wait some time for mempool + // thread to update stats. 
+ sleep(1); + bufferlist bl, expected; + r = store->read(ch, hoid, block_size * 7, block_size * 3, bl); + ASSERT_EQ(r, (int)block_size * 3); + expected.append(string(block_size, 'c')); + expected.append(string(block_size, 0)); + expected.append(string(block_size, 'b')); + ASSERT_TRUE(bl_eq(expected, bl)); + ASSERT_EQ(logger->get(l_bluestore_blobs), 1u); + ASSERT_EQ(logger->get(l_bluestore_extents), 2u); + } + + { + // append after existing with a gap + ObjectStore::Transaction t; + bufferlist bl; + + bl.append(std::string(block_size, 'd')); + t.write(cid, hoid, block_size * 13, bl.length(), bl, + CEPH_OSD_OP_FLAG_FADVISE_WILLNEED); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + // We need to issue a read to trigger cache stat update that refresh + // perf counters. additionally we need to wait some time for mempool + // thread to update stats. + sleep(1); + bufferlist bl, expected; + r = store->read(ch, hoid, block_size * 11, block_size * 3, bl); + ASSERT_EQ(r, (int)block_size * 3); + expected.append(string(block_size, 'a')); + expected.append(string(block_size, 0)); + expected.append(string(block_size, 'd')); + ASSERT_TRUE(bl_eq(expected, bl)); + ASSERT_EQ(logger->get(l_bluestore_blobs), 1u); + ASSERT_EQ(logger->get(l_bluestore_extents), 3u); + } + + { + // append twice to the next max_blob slot + ObjectStore::Transaction t; + bufferlist bl; + + bl.append(std::string(block_size, 'e')); + t.write(cid, hoid, block_size * 17, bl.length(), bl, + CEPH_OSD_OP_FLAG_FADVISE_WILLNEED); + t.write(cid, hoid, block_size * 19, bl.length(), bl, + CEPH_OSD_OP_FLAG_FADVISE_WILLNEED); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + // We need to issue a read to trigger cache stat update that refresh + // perf counters. additionally we need to wait some time for mempool + // thread to update stats. 
+ sleep(1); + bufferlist bl, expected; + r = store->read(ch, hoid, block_size * 17, block_size * 3, bl); + ASSERT_EQ(r, (int)block_size * 3); + expected.append(string(block_size, 'e')); + expected.append(string(block_size, 0)); + expected.append(string(block_size, 'e')); + ASSERT_TRUE(bl_eq(expected, bl)); + ASSERT_EQ(logger->get(l_bluestore_blobs), 2u); + ASSERT_EQ(logger->get(l_bluestore_extents), 5u); + } + { + // fill gaps at the second slot + ObjectStore::Transaction t; + bufferlist bl; + + bl.append(std::string(block_size, 'f')); + t.write(cid, hoid, block_size * 16, bl.length(), bl, + CEPH_OSD_OP_FLAG_FADVISE_WILLNEED); + t.write(cid, hoid, block_size * 18, bl.length(), bl, + CEPH_OSD_OP_FLAG_FADVISE_WILLNEED); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + // We need to issue a read to trigger cache stat update that refresh + // perf counters. additionally we need to wait some time for mempool + // thread to update stats. + sleep(1); + bufferlist bl, expected; + r = store->read(ch, hoid, block_size * 16, block_size * 4, bl); + ASSERT_EQ(r, (int)block_size * 4); + expected.append(string(block_size, 'f')); + expected.append(string(block_size, 'e')); + expected.append(string(block_size, 'f')); + expected.append(string(block_size, 'e')); + ASSERT_TRUE(bl_eq(expected, bl)); + ASSERT_EQ(logger->get(l_bluestore_blobs), 2u); + ASSERT_EQ(logger->get(l_bluestore_extents), 4u); + } + { + ObjectStore::Transaction t; + t.remove(cid, hoid); + t.remove_collection(cid); + cerr << "Cleaning" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } +} + +TEST_P(StoreTestSpecificAUSize, BlobReuseOnSmallOverwrite) { + + if (string(GetParam()) != "bluestore") + return; + if (smr) { + cout << "SKIP: no overwrite" << std::endl; + return; + } + + size_t block_size = 4096; + StartDeferred(block_size); + SetVal(g_conf(), "bluestore_max_blob_size", "65536"); + g_conf().apply_changes(nullptr); + + int r; + coll_t cid; + 
ghobject_t hoid(hobject_t("test_hint", "", CEPH_NOSNAP, 0, -1, "")); + + const PerfCounters* logger = store->get_perf_counters(); + auto ch = store->create_new_collection(cid); + + { + ObjectStore::Transaction t; + t.create_collection(cid, 0); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + ObjectStore::Transaction t; + bufferlist bl; + + bl.append(std::string(block_size, 'a')); + t.write(cid, hoid, 0, bl.length(), bl, CEPH_OSD_OP_FLAG_FADVISE_WILLNEED); + t.write(cid, hoid, block_size * 2, bl.length(), bl, + CEPH_OSD_OP_FLAG_FADVISE_WILLNEED); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + // write small into the gap + ObjectStore::Transaction t; + bufferlist bl; + + bl.append(std::string(3, 'b')); + t.write(cid, hoid, block_size + 1, bl.length(), bl, + CEPH_OSD_OP_FLAG_FADVISE_WILLNEED); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + // We need to issue a read to trigger cache stat update that refresh + // perf counters. additionally we need to wait some time for mempool + // thread to update stats. + sleep(1); + bufferlist bl, expected; + r = store->read(ch, hoid, 0, block_size * 3, bl); + ASSERT_EQ(r, (int)block_size * 3); + expected.append(string(block_size, 'a')); + expected.append(string(1, 0)); + expected.append(string(3, 'b')); + expected.append(string(block_size - 4, 0)); + expected.append(string(block_size, 'a')); + ASSERT_TRUE(bl_eq(expected, bl)); + + ASSERT_EQ(logger->get(l_bluestore_blobs), 1u); + ASSERT_EQ(logger->get(l_bluestore_extents), 3u); + } + { + ObjectStore::Transaction t; + t.remove(cid, hoid); + t.remove_collection(cid); + cerr << "Cleaning" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } +} + +// The test case to reproduce an issue when write happens +// to a zero space between the extents sharing the same spanning blob +// with unloaded shard map. 
+// Second extent might be filled with zeros this way due to wrong result +// returned by has_any_extents() call in do_write_small. The latter is caused +// by incompletely loaded extent map. +TEST_P(StoreTestSpecificAUSize, SmallWriteOnShardedExtents) { + if (string(GetParam()) != "bluestore") + return; + + size_t block_size = 0x10000; + StartDeferred(block_size); + + SetVal(g_conf(), "bluestore_csum_type", "xxhash64"); + SetVal(g_conf(), "bluestore_max_blob_size", "524288"); // for sure + + g_conf().apply_changes(nullptr); + + int r; + coll_t cid; + ghobject_t hoid1(hobject_t(sobject_t("Object 1", CEPH_NOSNAP))); + auto ch = store->create_new_collection(cid); + + { + ObjectStore::Transaction t; + t.create_collection(cid, 0); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + //doing some tricks to have sharded extents/spanning objects + ObjectStore::Transaction t; + bufferlist bl, bl2; + + bl.append(std::string(0x80000, 'a')); + t.write(cid, hoid1, 0, bl.length(), bl, 0); + t.zero(cid, hoid1, 0x719e0, 0x75b0 ); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + + bl2.append(std::string(0x70000, 'b')); + t.write(cid, hoid1, 0, bl2.length(), bl2, 0); + t.zero(cid, hoid1, 0, 0x50000); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + + } + ch.reset(); + store->umount(); + store->mount(); + ch = store->open_collection(cid); + + { + // do a write to zero space in between some extents sharing the same blob + ObjectStore::Transaction t; + bufferlist bl, bl2; + + bl.append(std::string(0x6520, 'c')); + t.write(cid, hoid1, 0x71c00, bl.length(), bl, 0); + + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + + { + ObjectStore::Transaction t; + bufferlist bl, expected; + + r = store->read(ch, hoid1, 0x70000, 0x9c00, bl); + ASSERT_EQ(r, (int)0x9c00); + expected.append(string(0x19e0, 'a')); + expected.append(string(0x220, 0)); + expected.append(string(0x6520, 'c')); + 
expected.append(string(0xe70, 0)); + expected.append(string(0xc70, 'a')); + ASSERT_TRUE(bl_eq(expected, bl)); + bl.clear(); + + } + + { + ObjectStore::Transaction t; + t.remove(cid, hoid1); + t.remove_collection(cid); + cerr << "Cleaning" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } +} + +TEST_P(StoreTestSpecificAUSize, ReproBug56488Test) { + + if (string(GetParam()) != "bluestore") + return; + if (smr) { + cout << "SKIP: no deferred" << std::endl; + return; + } + + size_t alloc_size = 65536; + size_t write_size = 4096; + SetVal(g_conf(), "bluestore_debug_enforce_settings", "hdd"); + SetVal(g_conf(), "bluestore_block_db_create", "true"); + SetVal(g_conf(), "bluestore_block_db_size", stringify(1 << 30).c_str()); + + g_conf().apply_changes(nullptr); + StartDeferred(alloc_size); + + int r; + coll_t cid; + const PerfCounters* logger = store->get_perf_counters(); + + ObjectStore::CollectionHandle ch = store->create_new_collection(cid); + { + ObjectStore::Transaction t; + t.create_collection(cid, 0); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + ghobject_t hoid(hobject_t("test", "", CEPH_NOSNAP, 0, -1, "")); + { + ObjectStore::Transaction t; + t.touch(cid, hoid); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + + auto issued_dw = logger->get(l_bluestore_issued_deferred_writes); + auto issued_dw_bytes = logger->get(l_bluestore_issued_deferred_write_bytes); + { + ObjectStore::Transaction t; + bufferlist bl; + bl.append(std::string(write_size, 'x')); + t.write(cid, hoid, 0, bl.length(), bl, + CEPH_OSD_OP_FLAG_FADVISE_NOCACHE); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + ASSERT_EQ(logger->get(l_bluestore_issued_deferred_writes), issued_dw + 1); + ASSERT_EQ(logger->get(l_bluestore_issued_deferred_write_bytes), + issued_dw_bytes + write_size); + } + { + ghobject_t hoid(hobject_t("test-a", "", CEPH_NOSNAP, 0, -1, "")); + { + 
ObjectStore::Transaction t; + t.touch(cid, hoid); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + + auto issued_dw = logger->get(l_bluestore_issued_deferred_writes); + auto issued_dw_bytes = logger->get(l_bluestore_issued_deferred_write_bytes); + { + ObjectStore::Transaction t; + bufferlist bl; + bl.append(std::string(write_size * 2, 'x')); + t.write(cid, hoid, alloc_size - write_size, bl.length(), bl, + CEPH_OSD_OP_FLAG_FADVISE_NOCACHE); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + ASSERT_EQ(logger->get(l_bluestore_issued_deferred_writes), issued_dw + 2); + ASSERT_EQ(logger->get(l_bluestore_issued_deferred_write_bytes), + issued_dw_bytes + write_size * 2); + } + { + ObjectStore::Transaction t; + ghobject_t hoid(hobject_t("test", "", CEPH_NOSNAP, 0, -1, "")); + t.remove(cid, hoid); + ghobject_t hoid_a(hobject_t("test-a", "", CEPH_NOSNAP, 0, -1, "")); + t.remove(cid, hoid_a); + t.remove_collection(cid); + cerr << "Cleaning" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } +} + +#endif //#if defined(WITH_BLUESTORE) + +TEST_P(StoreTest, KVDBHistogramTest) { + if (string(GetParam()) != "bluestore") + return; + + int NUM_OBJS = 200; + int r = 0; + coll_t cid; + string base("testobj."); + bufferlist a; + bufferptr ap(0x1000); + memset(ap.c_str(), 'a', 0x1000); + a.append(ap); + auto ch = store->create_new_collection(cid); + { + ObjectStore::Transaction t; + t.create_collection(cid, 0); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + for (int i = 0; i < NUM_OBJS; ++i) { + ObjectStore::Transaction t; + char buf[100]; + snprintf(buf, sizeof(buf), "%d", i); + ghobject_t hoid(hobject_t(sobject_t(base + string(buf), CEPH_NOSNAP))); + t.write(cid, hoid, 0, 0x1000, a); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + + std::unique_ptr<Formatter> f(Formatter::create("store_test", "json-pretty", "json-pretty")); + 
store->generate_db_histogram(f.get()); + f->flush(cout); + cout << std::endl; +} + +TEST_P(StoreTest, KVDBStatsTest) { + if (string(GetParam()) != "bluestore") + return; + + SetVal(g_conf(), "rocksdb_perf", "true"); + SetVal(g_conf(), "rocksdb_collect_compaction_stats", "true"); + SetVal(g_conf(), "rocksdb_collect_extended_stats","true"); + SetVal(g_conf(), "rocksdb_collect_memory_stats","true"); + g_ceph_context->_conf.apply_changes(nullptr); + int r = store->umount(); + ASSERT_EQ(r, 0); + r = store->mount(); //to force rocksdb stats + ASSERT_EQ(r, 0); + + int NUM_OBJS = 200; + coll_t cid; + string base("testobj."); + bufferlist a; + bufferptr ap(0x1000); + memset(ap.c_str(), 'a', 0x1000); + a.append(ap); + auto ch = store->create_new_collection(cid); + { + ObjectStore::Transaction t; + t.create_collection(cid, 0); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + for (int i = 0; i < NUM_OBJS; ++i) { + ObjectStore::Transaction t; + char buf[100]; + snprintf(buf, sizeof(buf), "%d", i); + ghobject_t hoid(hobject_t(sobject_t(base + string(buf), CEPH_NOSNAP))); + t.write(cid, hoid, 0, 0x1000, a); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + + std::unique_ptr<Formatter> f(Formatter::create("store_test", "json-pretty", "json-pretty")); + store->get_db_statistics(f.get()); + f->flush(cout); + cout << std::endl; +} + +#if defined(WITH_BLUESTORE) +TEST_P(StoreTestSpecificAUSize, garbageCollection) { + int r; + coll_t cid; + int buf_len = 256 * 1024; + int overlap_offset = 64 * 1024; + int write_offset = buf_len; + if (string(GetParam()) != "bluestore") + return; + if (smr) { + cout << "SKIP: assertions about allocations need to be adjusted" << std::endl; + return; + } + +#define WRITE_AT(offset, _length) {\ + ObjectStore::Transaction t;\ + if ((uint64_t)_length != bl.length()) { \ + buffer::ptr p(bl.c_str(), _length);\ + bufferlist bl_tmp;\ + bl_tmp.push_back(p);\ + t.write(cid, hoid, offset, bl_tmp.length(), bl_tmp);\ 
+ } else {\ + t.write(cid, hoid, offset, bl.length(), bl);\ + }\ + r = queue_transaction(store, ch, std::move(t));\ + ASSERT_EQ(r, 0);\ + } + + StartDeferred(65536); + + SetVal(g_conf(), "bluestore_compression_max_blob_size", "524288"); + SetVal(g_conf(), "bluestore_compression_min_blob_size", "262144"); + SetVal(g_conf(), "bluestore_max_blob_size", "524288"); + SetVal(g_conf(), "bluestore_compression_mode", "force"); + g_conf().apply_changes(nullptr); + + auto ch = store->create_new_collection(cid); + + ghobject_t hoid(hobject_t(sobject_t("Object 1", CEPH_NOSNAP))); + { + bufferlist in; + r = store->read(ch, hoid, 0, 5, in); + ASSERT_EQ(-ENOENT, r); + } + { + ObjectStore::Transaction t; + t.create_collection(cid, 0); + cerr << "Creating collection " << cid << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + + std::string data; + data.resize(buf_len); + + { + { + bool exists = store->exists(ch, hoid); + ASSERT_TRUE(!exists); + + ObjectStore::Transaction t; + t.touch(cid, hoid); + cerr << "Creating object " << hoid << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + + exists = store->exists(ch, hoid); + ASSERT_EQ(true, exists); + } + bufferlist bl; + + for(size_t i = 0; i < data.size(); i++) + data[i] = i % 256; + + bl.append(data); + + { + struct store_statfs_t statfs; + WRITE_AT(0, buf_len); + int r = store->statfs(&statfs); + ASSERT_EQ(r, 0); + ASSERT_EQ(statfs.data_compressed_allocated, 0x10000); + } + { + struct store_statfs_t statfs; + WRITE_AT(write_offset - 2 * overlap_offset, buf_len); + int r = store->statfs(&statfs); + ASSERT_EQ(r, 0); + ASSERT_EQ(statfs.data_compressed_allocated, 0x20000); + const PerfCounters* counters = store->get_perf_counters(); + ASSERT_EQ(counters->get(l_bluestore_gc_merged), 0u); + } + + { + struct store_statfs_t statfs; + WRITE_AT(write_offset - overlap_offset, buf_len); + int r = store->statfs(&statfs); + ASSERT_EQ(r, 0); + 
ASSERT_EQ(statfs.data_compressed_allocated, 0x20000); + const PerfCounters* counters = store->get_perf_counters(); + ASSERT_EQ(counters->get(l_bluestore_gc_merged), 0x10000u); + } + { + struct store_statfs_t statfs; + WRITE_AT(write_offset - 3 * overlap_offset, buf_len); + int r = store->statfs(&statfs); + ASSERT_EQ(r, 0); + ASSERT_EQ(statfs.data_compressed_allocated, 0x20000); + const PerfCounters* counters = store->get_perf_counters(); + ASSERT_EQ(counters->get(l_bluestore_gc_merged), 0x20000u); + } + { + struct store_statfs_t statfs; + WRITE_AT(write_offset + 1, overlap_offset-1); + int r = store->statfs(&statfs); + ASSERT_EQ(r, 0); + ASSERT_EQ(statfs.data_compressed_allocated, 0x20000); + const PerfCounters* counters = store->get_perf_counters(); + ASSERT_EQ(counters->get(l_bluestore_gc_merged), 0x20000u); + } + { + struct store_statfs_t statfs; + WRITE_AT(write_offset + 1, overlap_offset); + int r = store->statfs(&statfs); + ASSERT_EQ(r, 0); + ASSERT_EQ(statfs.data_compressed_allocated, 0x10000); + const PerfCounters* counters = store->get_perf_counters(); + ASSERT_EQ(counters->get(l_bluestore_gc_merged), 0x3ffffu); + } + { + struct store_statfs_t statfs; + WRITE_AT(0, buf_len-1); + int r = store->statfs(&statfs); + ASSERT_EQ(r, 0); + ASSERT_EQ(statfs.data_compressed_allocated, 0x10000); + const PerfCounters* counters = store->get_perf_counters(); + ASSERT_EQ(counters->get(l_bluestore_gc_merged), 0x40001u); + } + SetVal(g_conf(), "bluestore_gc_enable_total_threshold", "1"); //forbid GC when saving = 0 + { + struct store_statfs_t statfs; + WRITE_AT(1, overlap_offset-2); + WRITE_AT(overlap_offset * 2 + 1, overlap_offset-2); + int r = store->statfs(&statfs); + ASSERT_EQ(r, 0); + ASSERT_EQ(statfs.data_compressed_allocated, 0x10000); + const PerfCounters* counters = store->get_perf_counters(); + ASSERT_EQ(counters->get(l_bluestore_gc_merged), 0x40001u); + } + { + struct store_statfs_t statfs; + WRITE_AT(overlap_offset + 1, overlap_offset-2); + int r = 
store->statfs(&statfs); + ASSERT_EQ(r, 0); + ASSERT_EQ(statfs.data_compressed_allocated, 0x0); + const PerfCounters* counters = store->get_perf_counters(); + ASSERT_EQ(counters->get(l_bluestore_gc_merged), 0x40007u); + } + { + ObjectStore::Transaction t; + t.remove(cid, hoid); + cerr << "Cleaning" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + } +} + +TEST_P(StoreTestSpecificAUSize, fsckOnUnalignedDevice) { + if (string(GetParam()) != "bluestore") + return; + + SetVal(g_conf(), "bluestore_block_size", + stringify(0x280005000).c_str()); //10 Gb + 4K + SetVal(g_conf(), "bluestore_fsck_on_mount", "false"); + SetVal(g_conf(), "bluestore_fsck_on_umount", "false"); + StartDeferred(0x4000); + store->umount(); + ASSERT_EQ(store->fsck(false), 0); // do fsck explicitly + store->mount(); + +} + +TEST_P(StoreTestSpecificAUSize, fsckOnUnalignedDevice2) { + if (string(GetParam()) != "bluestore") + return; + + SetVal(g_conf(), "bluestore_block_size", + stringify(0x280005000).c_str()); //10 Gb + 20K + SetVal(g_conf(), "bluestore_fsck_on_mount", "false"); + SetVal(g_conf(), "bluestore_fsck_on_umount", "false"); + StartDeferred(0x1000); + store->umount(); + ASSERT_EQ(store->fsck(false), 0); // do fsck explicitly + store->mount(); +} + +namespace { + ghobject_t make_object(const char* name, int64_t pool) { + sobject_t soid{name, CEPH_NOSNAP}; + uint32_t hash = std::hash<sobject_t>{}(soid); + return ghobject_t{hobject_t{soid, "", hash, pool, ""}}; + } +} + +TEST_P(StoreTestSpecificAUSize, BluestoreRepairTest) { + if (string(GetParam()) != "bluestore") + return; + if (smr) { + cout << "TODO: repair mismatched write pointer (+ dead bytes mismatch)" << std::endl; + return; + } + const size_t offs_base = 65536 / 2; + + + // Now we need standalone db to pass "false free fix" section below + // Due to new BlueFS allocation model (single allocator for main device) + // it might cause "false free" blob overwrite by BlueFS/DB stuff + // and hence fail 
the test case and corrupt data. + // + + SetVal(g_conf(), "bluestore_block_db_create", "true"); + SetVal(g_conf(), "bluestore_block_db_size", "4294967296"); + + SetVal(g_conf(), "bluestore_fsck_on_mount", "false"); + SetVal(g_conf(), "bluestore_fsck_on_umount", "false"); + SetVal(g_conf(), "bluestore_max_blob_size", + stringify(2 * offs_base).c_str()); + SetVal(g_conf(), "bluestore_extent_map_shard_max_size", "12000"); + + StartDeferred(0x10000); + + BlueStore* bstore = dynamic_cast<BlueStore*> (store.get()); + + // fill the store with some data + const uint64_t pool = 555; + coll_t cid(spg_t(pg_t(0, pool), shard_id_t::NO_SHARD)); + auto ch = store->create_new_collection(cid); + + ghobject_t hoid = make_object("Object 1", pool); + ghobject_t hoid_dup = make_object("Object 1(dup)", pool); + ghobject_t hoid2 = make_object("Object 2", pool); + ghobject_t hoid_cloned = hoid2; + hoid_cloned.hobj.snap = 1; + ghobject_t hoid3 = make_object("Object 3", pool); + ghobject_t hoid3_cloned = hoid3; + hoid3_cloned.hobj.snap = 1; + bufferlist bl; + bl.append("1234512345"); + int r; + const size_t repeats = 16; + { + auto ch = store->create_new_collection(cid); + cerr << "create collection + write" << std::endl; + ObjectStore::Transaction t; + t.create_collection(cid, 0); + for( auto i = 0ul; i < repeats; ++i ) { + t.write(cid, hoid, i * offs_base, bl.length(), bl); + t.write(cid, hoid_dup, i * offs_base, bl.length(), bl); + } + for( auto i = 0ul; i < repeats; ++i ) { + t.write(cid, hoid2, i * offs_base, bl.length(), bl); + } + t.clone(cid, hoid2, hoid_cloned); + + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + + bstore->umount(); + bool err_was_injected = false; + //////////// leaked pextent fix //////////// + cerr << "fix leaked pextents" << std::endl; + ASSERT_EQ(bstore->fsck(false), 0); + ASSERT_EQ(bstore->repair(false), 0); + bstore->mount(); + if (!bstore->has_null_manager()) { + bstore->inject_leaked(0x30000); + err_was_injected = true; + } + + 
bstore->umount(); + if (err_was_injected) { + ASSERT_EQ(bstore->fsck(false), 1); + } + ASSERT_EQ(bstore->repair(false), 0); + ASSERT_EQ(bstore->fsck(false), 0); + + //////////// false free fix //////////// + cerr << "fix false free pextents" << std::endl; + bstore->mount(); + if (!bstore->has_null_manager()) { + bstore->inject_false_free(cid, hoid); + err_was_injected = true; + } + bstore->umount(); + if (err_was_injected) { + ASSERT_EQ(bstore->fsck(false), 2); + ASSERT_EQ(bstore->repair(false), 0); + } + ASSERT_EQ(bstore->fsck(false), 0); + + + ///////// undecodable shared blob key / stray shared blob records /////// + bstore->mount(); + cerr << "undecodable shared blob key" << std::endl; + bstore->inject_broken_shared_blob_key("undec1", + bufferlist()); + bstore->inject_broken_shared_blob_key("undecodable key 2", + bufferlist()); + bstore->inject_broken_shared_blob_key("undecodable key 3", + bufferlist()); + bstore->umount(); + ASSERT_EQ(bstore->fsck(false), 3); + ASSERT_EQ(bstore->repair(false), 0); + ASSERT_EQ(bstore->fsck(false), 0); + + cerr << "misreferencing" << std::endl; + bstore->mount(); + bstore->inject_misreference(cid, hoid, cid, hoid_dup, 0); + bstore->inject_misreference(cid, hoid, cid, hoid_dup, (offs_base * repeats) / 2); + bstore->inject_misreference(cid, hoid, cid, hoid_dup, offs_base * (repeats -1) ); + int expected_errors = bstore->has_null_manager() ? 
3 : 6; + bstore->umount(); + ASSERT_EQ(bstore->fsck(false), expected_errors); + ASSERT_EQ(bstore->repair(false), 0); + + ASSERT_EQ(bstore->fsck(true), 0); + + // reproducing issues #21040 & 20983 + SetVal(g_conf(), "bluestore_debug_inject_bug21040", "true"); + g_ceph_context->_conf.apply_changes(nullptr); + bstore->mount(); + + cerr << "repro bug #21040" << std::endl; + { + auto ch = store->open_collection(cid); + { + ObjectStore::Transaction t; + bl.append("0123456789012345"); + t.write(cid, hoid3, offs_base, bl.length(), bl); + bl.clear(); + bl.append('!'); + t.write(cid, hoid3, 0, bl.length(), bl); + + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + ObjectStore::Transaction t; + t.clone(cid, hoid3, hoid3_cloned); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + + bstore->umount(); + // depending on statfs tracking we might meet or miss relevant error + // hence error count >= 3 + ASSERT_GE(bstore->fsck(false), 3); + ASSERT_LE(bstore->repair(false), 0); + ASSERT_EQ(bstore->fsck(false), 0); + } + + cerr << "Zombie spanning blob" << std::endl; + { + bstore->mount(); + ghobject_t hoid4 = make_object("Object 4", pool); + auto ch = store->open_collection(cid); + { + bufferlist bl; + string s(0x1000, 'a'); + bl.append(s); + ObjectStore::Transaction t; + for(size_t i = 0; i < 0x10; i++) { + t.write(cid, hoid4, i * bl.length(), bl.length(), bl); + } + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + sleep(5); + { + bstore->inject_zombie_spanning_blob(cid, hoid4, 12345); + bstore->inject_zombie_spanning_blob(cid, hoid4, 23456); + bstore->inject_zombie_spanning_blob(cid, hoid4, 23457); + } + + bstore->umount(); + ASSERT_EQ(bstore->fsck(false), 1); + ASSERT_LE(bstore->repair(false), 0); + ASSERT_EQ(bstore->fsck(false), 0); + } + + //////////// verify invalid statfs /////////// + cerr << "fix invalid statfs" << std::endl; + SetVal(g_conf(), "bluestore_fsck_error_on_no_per_pool_stats", "true"); + 
SetVal(g_conf(), + "bluestore_debug_inject_allocation_from_file_failure", "1"); + store_statfs_t statfs0; + store_statfs_t statfs; + bstore->mount(); + ASSERT_EQ(bstore->statfs(&statfs0), 0); + statfs = statfs0; + statfs.allocated += 0x10000; + statfs.data_stored += 0x10000; + ASSERT_FALSE(statfs0 == statfs); + // this enforces global stats usage + bstore->inject_statfs("bluestore_statfs", statfs); + bstore->umount(); + + ASSERT_GE(bstore->fsck(false), 1); // global stats mismatch might omitted when + // NCB restore is applied. Hence using >= for + // error count + ASSERT_EQ(bstore->repair(false), 0); + ASSERT_EQ(bstore->fsck(false), 0); + ASSERT_EQ(bstore->mount(), 0); + ASSERT_EQ(bstore->statfs(&statfs), 0); + // adjust free/internal meta space to success in comparison + statfs0.available = statfs.available; + statfs0.internal_metadata = statfs.internal_metadata; + ASSERT_EQ(statfs0, statfs); + + SetVal(g_conf(), + "bluestore_debug_inject_allocation_from_file_failure", "0"); + cerr << "fix invalid statfs2" << std::endl; + ASSERT_EQ(bstore->statfs(&statfs0), 0); + statfs = statfs0; + statfs.allocated += 0x20000; + statfs.data_stored += 0x20000; + ASSERT_FALSE(statfs0 == statfs); + // this enforces global stats usage + bstore->inject_statfs("bluestore_statfs", statfs); + bstore->umount(); + + ASSERT_EQ(bstore->fsck(false), 2); + ASSERT_EQ(bstore->repair(false), 0); + ASSERT_EQ(bstore->fsck(false), 0); + ASSERT_EQ(bstore->mount(), 0); + ASSERT_EQ(bstore->statfs(&statfs), 0); + // adjust free/internal meta space to success in comparison + statfs0.available = statfs.available; + statfs0.internal_metadata = statfs.internal_metadata; + ASSERT_EQ(statfs0, statfs); + + cerr << "Completing" << std::endl; +} + +TEST_P(StoreTestSpecificAUSize, BluestoreBrokenZombieRepairTest) { + if (string(GetParam()) != "bluestore") + return; + if (smr) { + cout << "SKIP: smr repair is different" << std::endl; + return; + } + SetVal(g_conf(), "bluestore_fsck_on_mount", "false"); + 
SetVal(g_conf(), "bluestore_fsck_on_umount", "false"); + + StartDeferred(0x10000); + + BlueStore* bstore = dynamic_cast<BlueStore*> (store.get()); + + int r; + + cerr << "initializing" << std::endl; + { + const size_t col_count = 16; + const size_t obj_count = 1024; + ObjectStore::CollectionHandle ch[col_count]; + ghobject_t hoid[col_count][obj_count]; + + unique_ptr<coll_t> cid[col_count]; + + for (size_t i = 0; i < col_count; i++) { + cid[i].reset(new coll_t(spg_t(pg_t(0, i), shard_id_t::NO_SHARD))); + ch[i] = store->create_new_collection(*cid[i]); + for (size_t j = 0; j < obj_count; j++) { + hoid[i][j] = make_object(stringify(j).c_str(), i); + } + } + + for (size_t i = 0; i < col_count; i++) { + ObjectStore::Transaction t; + t.create_collection(*cid[i], 0); + r = queue_transaction(store, ch[i], std::move(t)); + ASSERT_EQ(r, 0); + } + cerr << "onode preparing" << std::endl; + bufferlist bl; + string s(0x1000, 'a'); + bl.append(s); + + for (size_t i = 0; i < col_count; i++) { + for (size_t j = 0; j < obj_count; j++) { + ObjectStore::Transaction t; + t.write(*cid[i], hoid[i][j], bl.length(), bl.length(), bl); + r = queue_transaction(store, ch[i], std::move(t)); + ASSERT_EQ(r, 0); + } + } + cerr << "Zombie spanning blob injection" << std::endl; + + sleep(5); + + for (size_t i = 0; i < col_count; i++) { + for (size_t j = 0; j < obj_count; j++) { + bstore->inject_zombie_spanning_blob(*cid[i], hoid[i][j], 12345); + } + } + + cerr << "fscking/fixing" << std::endl; + bstore->umount(); + ASSERT_EQ(bstore->fsck(false), col_count * obj_count); + ASSERT_LE(bstore->quick_fix(), 0); + ASSERT_EQ(bstore->fsck(false), 0); + } + + cerr << "Completing" << std::endl; + bstore->mount(); +} + +TEST_P(StoreTestSpecificAUSize, BluestoreRepairSharedBlobTest) { + if (string(GetParam()) != "bluestore") + return; + if (smr) { + cout << "TODO: repair mismatched write pointer (+ dead bytes mismatch)" << std::endl; + return; + } + + SetVal(g_conf(), "bluestore_fsck_on_mount", "false"); + 
SetVal(g_conf(), "bluestore_fsck_on_umount", "false"); + + const size_t block_size = 0x1000; + StartDeferred(block_size); + + BlueStore* bstore = dynamic_cast<BlueStore*> (store.get()); + + // fill the store with some data + const uint64_t pool = 555; + coll_t cid(spg_t(pg_t(0, pool), shard_id_t::NO_SHARD)); + auto ch = store->create_new_collection(cid); + + ghobject_t hoid = make_object("Object 1", pool); + ghobject_t hoid_cloned = hoid; + hoid_cloned.hobj.snap = 1; + ghobject_t hoid2 = make_object("Object 2", pool); + + string s(block_size, 1); + bufferlist bl; + bl.append(s); + int r; + { + ObjectStore::Transaction t; + t.create_collection(cid, 0); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + + // check the scenario when shared blob contains + // references to extents from two objects which don't overlapp + // o1 -> 0x2000~1K + // o2 -> 0x4000~1k + cerr << "introduce 2 non-overlapped extents in a shared blob" + << std::endl; + { + ObjectStore::Transaction t; + t.write(cid, hoid, 0, bl.length(), bl); + t.write(cid, hoid2, 0, bl.length(), bl); // to make a gap in allocations + t.write(cid, hoid, block_size * 2 , bl.length(), bl); + t.clone(cid, hoid, hoid_cloned); + t.zero(cid, hoid, 0, bl.length()); + t.zero(cid, hoid_cloned, block_size * 2, bl.length()); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + bstore->umount(); + bstore->mount(); + { + string key; + _key_encode_u64(1, &key); + bluestore_shared_blob_t sb(1); + sb.ref_map.get(0x822000, block_size); + sb.ref_map.get(0x824000, block_size); + sb.ref_map.get(0x824000, block_size); + bufferlist bl; + encode(sb, bl); + bstore->inject_broken_shared_blob_key(key, bl); + } + bstore->umount(); + ASSERT_EQ(bstore->fsck(false), 2); + ASSERT_EQ(bstore->repair(false), 0); + ASSERT_EQ(bstore->fsck(false), 0); + + cerr << "Completing" << std::endl; + bstore->mount(); +} + +TEST_P(StoreTestSpecificAUSize, BluestoreBrokenNoSharedBlobRepairTest) { + if 
(string(GetParam()) != "bluestore") + return; + if (smr) { + cout << "SKIP: smr repair is different" << std::endl; + return; + } + + SetVal(g_conf(), "bluestore_fsck_on_mount", "false"); + SetVal(g_conf(), "bluestore_fsck_on_umount", "false"); + + StartDeferred(0x10000); + + BlueStore* bstore = dynamic_cast<BlueStore*> (store.get()); + + int r; + + // initializing + cerr << "initializing" << std::endl; + { + const uint64_t pool = 555; + coll_t cid(spg_t(pg_t(0, pool), shard_id_t::NO_SHARD)); + auto ch = store->create_new_collection(cid); + + ghobject_t hoid = make_object("Object", pool); + ghobject_t hoid_cloned = hoid; + hoid_cloned.hobj.snap = 1; + + { + ObjectStore::Transaction t; + t.create_collection(cid, 0); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + ObjectStore::Transaction t; + bufferlist bl; + bl.append("0123456789012345"); + t.write(cid, hoid, 0, bl.length(), bl); + + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + ObjectStore::Transaction t; + t.clone(cid, hoid, hoid_cloned); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + } + // injecting an error and checking + cerr << "injecting" << std::endl; + sleep(3); // need some time for the previous write to land + bstore->inject_no_shared_blob_key(); + bstore->inject_stray_shared_blob_key(12345678); + + { + cerr << "fscking/fixing" << std::endl; + // we need to check for null-manager before umount() + bool has_null_manager = bstore->has_null_manager(); + bstore->umount(); + // depending on the allocation map's source we can + // either observe or don't observe an additional + // extent leak detection. Hence adjusting the expected + // value + size_t expected_error_count = + has_null_manager ? 
+ 4: // 4 sb ref mismatch errors [+ 1 optional statfs, hence ASSERT_GE] + 7; // 4 sb ref mismatch errors + 1 statfs + 1 block leak + 1 non-free + ASSERT_GE(bstore->fsck(false), expected_error_count); + // repair might report less errors than fsck above showed + // as some errors, e.g. statfs mismatch, are implicitly fixed + // before the detection during the previous repair steps... + ASSERT_LE(bstore->repair(false), expected_error_count); + ASSERT_EQ(bstore->fsck(false), 0); + } + + cerr << "Completing" << std::endl; + bstore->mount(); +} + +TEST_P(StoreTest, BluestoreRepairGlobalStats) { + if (string(GetParam()) != "bluestore") + return; + const size_t offs_base = 65536 / 2; + + BlueStore* bstore = dynamic_cast<BlueStore*> (store.get()); + + // start with global stats + bstore->inject_global_statfs({}); + bstore->umount(); + SetVal(g_conf(), "bluestore_fsck_quick_fix_on_mount", "false"); + bstore->mount(); + + // fill the store with some data + const uint64_t pool = 555; + coll_t cid(spg_t(pg_t(0, pool), shard_id_t::NO_SHARD)); + auto ch = store->create_new_collection(cid); + + ghobject_t hoid = make_object("Object 1", pool); + ghobject_t hoid_dup = make_object("Object 1(dup)", pool); + ghobject_t hoid2 = make_object("Object 2", pool); + ghobject_t hoid_cloned = hoid2; + hoid_cloned.hobj.snap = 1; + ghobject_t hoid3 = make_object("Object 3", pool); + ghobject_t hoid3_cloned = hoid3; + hoid3_cloned.hobj.snap = 1; + bufferlist bl; + bl.append("1234512345"); + int r; + const size_t repeats = 16; + { + auto ch = store->create_new_collection(cid); + cerr << "create collection + write" << std::endl; + ObjectStore::Transaction t; + t.create_collection(cid, 0); + for( auto i = 0ul; i < repeats; ++i ) { + t.write(cid, hoid, i * offs_base, bl.length(), bl); + t.write(cid, hoid_dup, i * offs_base, bl.length(), bl); + } + for( auto i = 0ul; i < repeats; ++i ) { + t.write(cid, hoid2, i * offs_base, bl.length(), bl); + } + t.clone(cid, hoid2, hoid_cloned); + + r = 
queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + + bstore->umount(); + + // enable per-pool stats collection hence causing fsck to fail + cerr << "per-pool statfs" << std::endl; + SetVal(g_conf(), "bluestore_fsck_error_on_no_per_pool_stats", "true"); + g_ceph_context->_conf.apply_changes(nullptr); + + ASSERT_EQ(bstore->fsck(false), 1); + ASSERT_EQ(bstore->repair(false), 0); + ASSERT_EQ(bstore->fsck(false), 0); + + bstore->mount(); +} + +TEST_P(StoreTest, BluestoreRepairGlobalStatsFixOnMount) { + if (string(GetParam()) != "bluestore") + return; + const size_t offs_base = 65536 / 2; + + BlueStore* bstore = dynamic_cast<BlueStore*> (store.get()); + + // start with global stats + bstore->inject_global_statfs({}); + bstore->umount(); + SetVal(g_conf(), "bluestore_fsck_quick_fix_on_mount", "false"); + bstore->mount(); + + // fill the store with some data + const uint64_t pool = 555; + coll_t cid(spg_t(pg_t(0, pool), shard_id_t::NO_SHARD)); + auto ch = store->create_new_collection(cid); + + ghobject_t hoid = make_object("Object 1", pool); + ghobject_t hoid_dup = make_object("Object 1(dup)", pool); + ghobject_t hoid2 = make_object("Object 2", pool); + ghobject_t hoid_cloned = hoid2; + hoid_cloned.hobj.snap = 1; + ghobject_t hoid3 = make_object("Object 3", pool); + ghobject_t hoid3_cloned = hoid3; + hoid3_cloned.hobj.snap = 1; + bufferlist bl; + bl.append("1234512345"); + int r; + const size_t repeats = 16; + { + auto ch = store->create_new_collection(cid); + cerr << "create collection + write" << std::endl; + ObjectStore::Transaction t; + t.create_collection(cid, 0); + for( auto i = 0ul; i < repeats; ++i ) { + t.write(cid, hoid, i * offs_base, bl.length(), bl); + t.write(cid, hoid_dup, i * offs_base, bl.length(), bl); + } + for( auto i = 0ul; i < repeats; ++i ) { + t.write(cid, hoid2, i * offs_base, bl.length(), bl); + } + t.clone(cid, hoid2, hoid_cloned); + + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + + bstore->umount(); + 
+ // enable per-pool stats collection hence causing fsck to fail + cerr << "per-pool statfs" << std::endl; + SetVal(g_conf(), "bluestore_fsck_error_on_no_per_pool_stats", "true"); + g_ceph_context->_conf.apply_changes(nullptr); + + ASSERT_EQ(bstore->fsck(false), 1); + + SetVal(g_conf(), "bluestore_fsck_quick_fix_on_mount", "true"); + bstore->mount(); + bstore->umount(); + ASSERT_EQ(bstore->fsck(false), 0); + + bstore->mount(); +} + +TEST_P(StoreTest, BluestoreStatistics) { + if (string(GetParam()) != "bluestore") + return; + + SetVal(g_conf(), "rocksdb_perf", "true"); + SetVal(g_conf(), "rocksdb_collect_compaction_stats", "true"); + SetVal(g_conf(), "rocksdb_collect_extended_stats","true"); + SetVal(g_conf(), "rocksdb_collect_memory_stats","true"); + + // disable cache + SetVal(g_conf(), "bluestore_cache_size_ssd", "0"); + SetVal(g_conf(), "bluestore_cache_size_hdd", "0"); + SetVal(g_conf(), "bluestore_cache_size", "0"); + g_ceph_context->_conf.apply_changes(nullptr); + + int r = store->umount(); + ASSERT_EQ(r, 0); + r = store->mount(); + ASSERT_EQ(r, 0); + + BlueStore* bstore = NULL; + EXPECT_NO_THROW(bstore = dynamic_cast<BlueStore*> (store.get())); + + coll_t cid; + ghobject_t hoid(hobject_t("test_db_statistics", "", CEPH_NOSNAP, 0, 0, "")); + auto ch = bstore->create_new_collection(cid); + bufferlist bl; + bl.append("0123456789abcdefghi"); + { + ObjectStore::Transaction t; + t.create_collection(cid, 0); + t.touch(cid, hoid); + t.write(cid, hoid, 0, bl.length(), bl); + cerr << "Write object" << std::endl; + r = queue_transaction(bstore, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + bufferlist readback; + r = store->read(ch, hoid, 0, bl.length(), readback); + ASSERT_EQ(static_cast<int>(bl.length()), r); + ASSERT_TRUE(bl_eq(bl, readback)); + } + std::unique_ptr<Formatter> f(Formatter::create("store_test", "json-pretty", "json-pretty")); + EXPECT_NO_THROW(store->get_db_statistics(f.get())); + f->flush(cout); + cout << std::endl; +} + +TEST_P(StoreTest, 
BluestoreStrayOmapDetection) +{ + if (string(GetParam()) != "bluestore") + return; + + BlueStore* bstore = dynamic_cast<BlueStore*> (store.get()); + const uint64_t pool = 555; + coll_t cid(spg_t(pg_t(0, pool), shard_id_t::NO_SHARD)); + ghobject_t oid = make_object("Object 1", pool); + ghobject_t oid2 = make_object("Object 2", pool); + // fill the store with some data + auto ch = store->create_new_collection(cid); + bufferlist h; + h.append("header"); + { + ObjectStore::Transaction t; + t.create_collection(cid, 0); + t.touch(cid, oid); + t.omap_setheader(cid, oid, h); + t.touch(cid, oid2); + t.omap_setheader(cid, oid2, h); + int r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + + // inject stray omap + bstore->inject_stray_omap(123456, "somename"); + + bstore->umount(); + // check we detect injected stray omap.. + + ASSERT_EQ(bstore->fsck(false), 1); + SetVal(g_conf(), "bluestore_fsck_on_mount", "false"); + bstore->mount(); +} + +TEST_P(StoreTest, BluestorePerPoolOmapFixOnMount) +{ + if (string(GetParam()) != "bluestore") + return; + + BlueStore* bstore = dynamic_cast<BlueStore*> (store.get()); + const uint64_t pool = 555; + coll_t cid(spg_t(pg_t(0, pool), shard_id_t::NO_SHARD)); + ghobject_t oid = make_object("Object 1", pool); + ghobject_t oid2 = make_object("Object 2", pool); + // fill the store with some data + auto ch = store->create_new_collection(cid); + map<string, bufferlist> omap; + bufferlist h; + h.append("header"); + { + omap["omap_key"].append("omap value"); + ObjectStore::Transaction t; + t.create_collection(cid, 0); + t.touch(cid, oid); + t.omap_setheader(cid, oid, h); + t.touch(cid, oid2); + t.omap_setheader(cid, oid2, h); + int r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + + // inject legacy omaps + bstore->inject_legacy_omap(); + bstore->inject_legacy_omap(cid, oid); + bstore->inject_legacy_omap(cid, oid2); + + bstore->umount(); + + // check we injected an issue + SetVal(g_conf(), 
"bluestore_fsck_quick_fix_on_mount", "false"); + SetVal(g_conf(), "bluestore_fsck_error_on_no_per_pool_omap", "true"); + g_ceph_context->_conf.apply_changes(nullptr); + ASSERT_EQ(bstore->fsck(false), 3); + + // set autofix and mount + SetVal(g_conf(), "bluestore_fsck_quick_fix_on_mount", "true"); + g_ceph_context->_conf.apply_changes(nullptr); + bstore->mount(); + bstore->umount(); + + // check we fixed it.. + ASSERT_EQ(bstore->fsck(false), 0); + bstore->mount(); + + // + // Now repro https://tracker.ceph.com/issues/43824 + // + // inject legacy omaps again + bstore->inject_legacy_omap(); + bstore->inject_legacy_omap(cid, oid); + bstore->inject_legacy_omap(cid, oid2); + bstore->umount(); + + // check we injected an issue + SetVal(g_conf(), "bluestore_fsck_quick_fix_on_mount", "true"); + SetVal(g_conf(), "bluestore_fsck_error_on_no_per_pool_omap", "true"); + g_ceph_context->_conf.apply_changes(nullptr); + bstore->mount(); + ch = store->open_collection(cid); + + { + // write to onode which will partiall revert per-pool + // omap repair done on mount due to #43824. + // And object removal will leave stray per-pool omap recs + // + ObjectStore::Transaction t; + bufferlist bl; + bl.append("data"); + //this triggers onode rec update and hence legacy omap + t.write(cid, oid, 0, bl.length(), bl); + t.remove(cid, oid2); // this will trigger stray per-pool omap + int r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + bstore->umount(); + // check omap's been fixed. + ASSERT_EQ(bstore->fsck(false), 0); // this will fail without fix for #43824 + + bstore->mount(); +} + +class hugepaged_raw; + +static bool is_hugepaged(const bufferptr& bp) +{ + const auto& ibp = + static_cast<const ceph::buffer_instrumentation::instrumented_bptr&>(bp); + return ibp.is_raw_marked<BlockDevice::hugepaged_raw_marker_t>(); +} + +// disabled by default b/c of the dependency on huge page ssome test +// environments might not offer without extra configuration. 
+TEST_P(StoreTestDeferredSetup, DISABLED_BluestoreHugeReads) +{ + if (string(GetParam()) != "bluestore") { + return; + } + + constexpr static size_t HUGE_BUFFER_SIZE{2_M}; + cout << "Configuring huge page pools" << std::endl; + { + SetVal(g_conf(), "bdev_read_preallocated_huge_buffers", + fmt::format("{}=2", HUGE_BUFFER_SIZE).c_str()); + SetVal(g_conf(), "bluestore_max_blob_size", + std::to_string(HUGE_BUFFER_SIZE).c_str()); + // let's verify the per-IOContext no-cache override + SetVal(g_conf(), "bluestore_default_buffered_read", "true"); + g_ceph_context->_conf.apply_changes(nullptr); + } + DeferredSetup(); + + coll_t cid; + ghobject_t hoid(hobject_t("test_huge_buffers", "", CEPH_NOSNAP, 0, 0, "")); + auto ch = store->create_new_collection(cid); + + bufferlist bl; + { + bufferptr bp{HUGE_BUFFER_SIZE}; + // non-zeros! Otherwise the deduplication will take place. + ::memset(bp.c_str(), 0x42, HUGE_BUFFER_SIZE); + bl.push_back(std::move(bp)); + ASSERT_EQ(bl.get_num_buffers(), 1); + ASSERT_EQ(bl.length(), HUGE_BUFFER_SIZE); + } + + cout << "Write object" << std::endl; + { + ObjectStore::Transaction t; + t.create_collection(cid, 0); + t.touch(cid, hoid); + t.write(cid, hoid, 0, bl.length(), bl); + const auto r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + + // force cache clear + { + EXPECT_EQ(store->umount(), 0); + EXPECT_EQ(store->mount(), 0); + ch = store->open_collection(cid); + } + + // we want to extend the life-time of all huge paged-backed + // bufferlists to validate the behaviour on pool exhaustion. 
+ bufferlist bl_1_huge, bl_2_huge, bl_3_plain; + + cout << "Read object 1st time" << std::endl; + { + const auto r = store->read(ch, hoid, 0, HUGE_BUFFER_SIZE, bl_1_huge); + ASSERT_EQ(static_cast<int>(HUGE_BUFFER_SIZE), r); + ASSERT_TRUE(bl_eq(bl, bl_1_huge)); + ASSERT_EQ(bl_1_huge.get_num_buffers(), 1); + ASSERT_TRUE(is_hugepaged(bl_1_huge.front())); + } + + cout << "Read object 2nd time" << std::endl; + { + const auto r = store->read(ch, hoid, 0, HUGE_BUFFER_SIZE, bl_2_huge); + ASSERT_EQ(static_cast<int>(HUGE_BUFFER_SIZE), r); + ASSERT_TRUE(bl_eq(bl, bl_2_huge)); + ASSERT_EQ(bl_2_huge.get_num_buffers(), 1); + ASSERT_TRUE(is_hugepaged(bl_2_huge.front())); + } + + cout << "Read object 3rd time" << std::endl; + { + const auto r = store->read(ch, hoid, 0, HUGE_BUFFER_SIZE, bl_3_plain); + ASSERT_EQ(static_cast<int>(HUGE_BUFFER_SIZE), r); + ASSERT_TRUE(bl_eq(bl, bl_3_plain)); + ASSERT_EQ(bl_3_plain.get_num_buffers(), 1); + ASSERT_FALSE(is_hugepaged(bl_3_plain.front())); + } +} + +TEST_P(StoreTest, SpuriousReadErrorTest) { + if (string(GetParam()) != "bluestore") + return; + + int r; + auto logger = store->get_perf_counters(); + coll_t cid; + auto ch = store->create_new_collection(cid); + ghobject_t hoid(hobject_t(sobject_t("foo", CEPH_NOSNAP))); + { + ObjectStore::Transaction t; + t.create_collection(cid, 0); + cerr << "Creating collection " << cid << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + bufferlist test_data; + bufferptr ap(0x2000); + memset(ap.c_str(), 'a', 0x2000); + test_data.append(ap); + { + ObjectStore::Transaction t; + t.write(cid, hoid, 0, 0x2000, test_data); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + // force cache clear + EXPECT_EQ(store->umount(), 0); + EXPECT_EQ(store->mount(), 0); + } + ch = store->open_collection(cid); + + cerr << "Injecting CRC error with no retry, expecting EIO" << std::endl; + SetVal(g_conf(), "bluestore_retry_disk_reads", "0"); + SetVal(g_conf(), 
"bluestore_debug_inject_csum_err_probability", "1"); + g_ceph_context->_conf.apply_changes(nullptr); + { + bufferlist in; + r = store->read(ch, hoid, 0, 0x2000, in, CEPH_OSD_OP_FLAG_FADVISE_NOCACHE); + ASSERT_EQ(-EIO, r); + ASSERT_EQ(logger->get(l_bluestore_read_eio), 1u); + ASSERT_EQ(logger->get(l_bluestore_reads_with_retries), 0u); + } + + cerr << "Injecting CRC error with retries, expecting success after several retries" << std::endl; + SetVal(g_conf(), "bluestore_retry_disk_reads", "255"); + SetVal(g_conf(), "bluestore_debug_inject_csum_err_probability", "0.8"); + /** + * Probabilistic test: 25 reads, each has a 80% chance of failing with 255 retries + * Probability of at least one retried read: 1 - (0.2 ** 25) = 100% - 3e-18 + * Probability of a random test failure: 1 - ((1 - (0.8 ** 255)) ** 25) ~= 5e-24 + */ + g_ceph_context->_conf.apply_changes(nullptr); + { + for (int i = 0; i < 25; ++i) { + bufferlist in; + r = store->read(ch, hoid, 0, 0x2000, in, CEPH_OSD_OP_FLAG_FADVISE_NOCACHE); + ASSERT_EQ(0x2000, r); + ASSERT_TRUE(bl_eq(test_data, in)); + } + ASSERT_GE(logger->get(l_bluestore_reads_with_retries), 1u); + } +} + +TEST_P(StoreTest, mergeRegionTest) { + if (string(GetParam()) != "bluestore") + return; + + SetVal(g_conf(), "bluestore_fsck_on_mount", "true"); + SetVal(g_conf(), "bluestore_fsck_on_umount", "true"); + SetVal(g_conf(), "bdev_debug_inflight_ios", "true"); + g_ceph_context->_conf.apply_changes(nullptr); + + uint32_t chunk_size = g_ceph_context->_conf->bdev_block_size; + int r = -1; + coll_t cid; + ghobject_t hoid(hobject_t(sobject_t("Object", CEPH_NOSNAP))); + auto ch = store->create_new_collection(cid); + { + ObjectStore::Transaction t; + t.create_collection(cid, 0); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + ObjectStore::Transaction t; + t.touch(cid, hoid); + cerr << "Creating object " << hoid << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + bufferlist bl5; + 
bl5.append("abcde"); + uint64_t offset = 0; + { // 1. same region + ObjectStore::Transaction t; + t.write(cid, hoid, offset, 5, bl5); + t.write(cid, hoid, 0xa + offset, 5, bl5); + t.write(cid, hoid, 0x14 + offset, 5, bl5); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { // 2. adjacent regions + ObjectStore::Transaction t; + offset = chunk_size; + t.write(cid, hoid, offset, 5, bl5); + t.write(cid, hoid, offset + chunk_size + 3, 5, bl5); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { // 3. front merge + ObjectStore::Transaction t; + offset = chunk_size * 2; + t.write(cid, hoid, offset, 5, bl5); + t.write(cid, hoid, offset + chunk_size - 2, 5, bl5); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { // 4. back merge + ObjectStore::Transaction t; + bufferlist blc2; + blc2.append_zero(chunk_size + 2); + + offset = chunk_size * 3; + t.write(cid, hoid, offset, chunk_size + 2, blc2); + t.write(cid, hoid, offset + chunk_size + 3, 5, bl5); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { // 5. 
overlapping + ObjectStore::Transaction t; + uint64_t final_len = 0; + offset = chunk_size * 10; + bufferlist bl2c2; + bl2c2.append_zero(chunk_size * 2); + t.write(cid, hoid, offset + chunk_size * 3 - 3, chunk_size * 2, bl2c2); + bl2c2.append_zero(2); + t.write(cid, hoid, offset + chunk_size - 2, chunk_size * 2 + 2, bl2c2); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + + final_len = (offset + chunk_size * 3 - 3) + (chunk_size * 2); + bufferlist bl; + r = store->read(ch, hoid, 0, final_len, bl); + ASSERT_EQ(final_len, static_cast<uint64_t>(r)); + } +} + +TEST_P(StoreTest, FixSMRWritePointer) { + if(string(GetParam()) != "bluestore") + return; + if (!smr) + return; + int r = store->umount(); + ASSERT_EQ(0, r); + + // copied from StoreTestFixture + std::string path = GetParam() + ".test_temp_dir"s; + + std::string p = path + "/block"; + BlockDevice* bdev = BlockDevice::create(g_ceph_context, p, nullptr, nullptr, nullptr, nullptr); + r = bdev->open(p); + ASSERT_EQ(0, r); + ASSERT_EQ(true, bdev->is_smr()); + + std::vector<uint64_t> wp = bdev->get_zones(); + uint64_t first_seq_zone = bdev->get_conventional_region_size() / bdev->get_zone_size(); + + IOContext ioc(g_ceph_context, NULL, true); + bufferlist bl; + bl.append(std::string(1024 * 1024, 'x')); + r = bdev->aio_write(wp[first_seq_zone], bl, &ioc, false); + ASSERT_EQ(0, r); + bdev->aio_submit(&ioc); + ioc.aio_wait(); + bdev->close(); + delete bdev; + + r = store->mount(); + ASSERT_EQ(0, r); +} + + +TEST_P(StoreTestSpecificAUSize, BluestoreEnforceHWSettingsHdd) { + if (string(GetParam()) != "bluestore") + return; + + SetVal(g_conf(), "bluestore_debug_enforce_settings", "hdd"); + StartDeferred(0x1000); + + int r; + coll_t cid; + ghobject_t hoid(hobject_t(sobject_t("Object", CEPH_NOSNAP))); + auto ch = store->create_new_collection(cid); + { + ObjectStore::Transaction t; + t.create_collection(cid, 0); + cerr << "Creating collection " << cid << std::endl; + r = queue_transaction(store, ch, 
std::move(t)); + ASSERT_EQ(r, 0); + } + { + ObjectStore::Transaction t; + bufferlist bl, orig; + string s(g_ceph_context->_conf->bluestore_max_blob_size_hdd, '0'); + bl.append(s); + t.write(cid, hoid, 0, bl.length(), bl); + cerr << "write" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + + const PerfCounters* logger = store->get_perf_counters(); + ASSERT_EQ(logger->get(l_bluestore_write_big_blobs), 1u); + } +} + +TEST_P(StoreTestSpecificAUSize, BluestoreEnforceHWSettingsSsd) { + if (string(GetParam()) != "bluestore") + return; + + SetVal(g_conf(), "bluestore_debug_enforce_settings", "ssd"); + StartDeferred(0x1000); + + int r; + coll_t cid; + ghobject_t hoid(hobject_t(sobject_t("Object", CEPH_NOSNAP))); + auto ch = store->create_new_collection(cid); + { + ObjectStore::Transaction t; + t.create_collection(cid, 0); + cerr << "Creating collection " << cid << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + ObjectStore::Transaction t; + bufferlist bl, orig; + string s(g_ceph_context->_conf->bluestore_max_blob_size_ssd * 8, '0'); + bl.append(s); + t.write(cid, hoid, 0, bl.length(), bl); + cerr << "write" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + + const PerfCounters* logger = store->get_perf_counters(); + ASSERT_EQ(logger->get(l_bluestore_write_big_blobs), 8u); + } +} + +TEST_P(StoreTestSpecificAUSize, ReproNoBlobMultiTest) { + + if(string(GetParam()) != "bluestore") + return; + if (smr) { + cout << "SKIP (FIXME): bluestore gc does not seem to do the trick here" << std::endl; + return; + } + + SetVal(g_conf(), "bluestore_block_db_create", "true"); + SetVal(g_conf(), "bluestore_block_db_size", "4294967296"); + SetVal(g_conf(), "bluestore_block_size", "12884901888"); + SetVal(g_conf(), "bluestore_max_blob_size", "524288"); + + g_conf().apply_changes(nullptr); + + StartDeferred(65536); + + int r; + coll_t cid; + ghobject_t hoid(hobject_t(sobject_t("Object 
1", CEPH_NOSNAP))); + ghobject_t hoid2 = hoid; + hoid2.hobj.snap = 1; + + auto ch = store->create_new_collection(cid); + { + ObjectStore::Transaction t; + t.create_collection(cid, 0); + cerr << "Creating collection " << cid << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + bool exists = store->exists(ch, hoid); + ASSERT_TRUE(!exists); + + ObjectStore::Transaction t; + t.touch(cid, hoid); + cerr << "Creating object " << hoid << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + + exists = store->exists(ch, hoid); + ASSERT_EQ(true, exists); + } + { + uint64_t offs = 0; + bufferlist bl; + const int size = 0x100; + bufferptr ap(size); + memset(ap.c_str(), 'a', size); + bl.append(ap); + int i = 0; + uint64_t blob_size = 524288; + uint64_t total = 0; + for (i = 0; i <= 512; i++) { + offs = 0 + i * size; + ObjectStore::Transaction t; + ghobject_t hoid2 = hoid; + hoid2.hobj.snap = i + 1; + while (offs < 128 * 1024 * 1024) { + + t.write(cid, hoid, offs, ap.length(), bl); + offs += blob_size; + total += ap.length(); + } + t.clone(cid, hoid, hoid2); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + cerr << "Total written = " << total << std::endl; + } + { + cerr << "Finalizing" << std::endl; + const PerfCounters* logger = store->get_perf_counters(); + ASSERT_GE(logger->get(l_bluestore_gc_merged), 1024*1024*1024); + } +} + +void doManySetAttr(ObjectStore* store, + std::function<void(ObjectStore*)> do_check_fn) +{ + MixedGenerator gen(447); + gen_type rng(time(NULL)); + coll_t cid(spg_t(pg_t(0, 447), shard_id_t::NO_SHARD)); + + SyntheticWorkloadState test_obj(store, &gen, &rng, cid, 0, 0, 0); + test_obj.init(); + size_t object_count = 256; + for (size_t i = 0; i < object_count; ++i) { + if (!(i % 10)) cerr << "seeding object " << i << std::endl; + test_obj.touch(); + } + for (size_t i = 0; i < object_count; ++i) { + if (!(i % 100)) { + cerr << "Op " << i << std::endl; + 
test_obj.print_internal_state(); + } + test_obj.set_fixed_attrs(1024, 64, 4096); // 1024 attributes, 64 bytes name and 4K value + } + test_obj.wait_for_done(); + + std::cout << "done" << std::endl; + do_check_fn(store); + AdminSocket* admin_socket = g_ceph_context->get_admin_socket(); + ceph_assert(admin_socket); + + ceph::bufferlist in, out; + ostringstream err; + + auto r = admin_socket->execute_command( + { "{\"prefix\": \"bluefs stats\"}" }, + in, err, &out); + if (r != 0) { + cerr << "failure querying: " << cpp_strerror(r) << std::endl; + } else { + std::cout << std::string(out.c_str(), out.length()) << std::endl; + } + test_obj.shutdown(); +} + +TEST_P(StoreTestSpecificAUSize, SpilloverTest) { + if (string(GetParam()) != "bluestore") + return; + if (smr) { + cout << "SKIP: (FIXME?) adjust me for smr at some point?" << std::endl; + return; + } + + SetVal(g_conf(), "bluestore_block_db_create", "true"); + SetVal(g_conf(), "bluestore_block_db_size", "3221225472"); + SetVal(g_conf(), "bluestore_volume_selection_policy", "rocksdb_original"); + // original RocksDB settings used before https://github.com/ceph/ceph/pull/47221/ + // which enable BlueFS spillover. 
+ SetVal(g_conf(), "bluestore_rocksdb_options", + "compression=kNoCompression,max_write_buffer_number=4," + "min_write_buffer_number_to_merge=1,recycle_log_file_num=4," + "write_buffer_size=268435456,writable_file_max_buffer_size=0," + "compaction_readahead_size=2097152,max_background_compactions=2," + "max_total_wal_size=1073741824"); + + g_conf().apply_changes(nullptr); + + StartDeferred(65536); + doManySetAttr(store.get(), + [&](ObjectStore* _store) { + + BlueStore* bstore = dynamic_cast<BlueStore*> (_store); + ceph_assert(bstore); + bstore->compact(); + const PerfCounters* logger = bstore->get_bluefs_perf_counters(); + //experimentally it was discovered that this case results in 400+MB spillover + //using lower 300MB threshold just to be safe enough + std::cout << "DB used:" << logger->get(l_bluefs_db_used_bytes) << std::endl; + std::cout << "SLOW used:" << logger->get(l_bluefs_slow_used_bytes) << std::endl; + ASSERT_GE(logger->get(l_bluefs_slow_used_bytes), 16 * 1024 * 1024); + + struct store_statfs_t statfs; + osd_alert_list_t alerts; + int r = store->statfs(&statfs, &alerts); + ASSERT_EQ(r, 0); + ASSERT_EQ(alerts.count("BLUEFS_SPILLOVER"), 1); + std::cout << "spillover_alert:" << alerts.find("BLUEFS_SPILLOVER")->second + << std::endl; + } + ); +} + +TEST_P(StoreTestSpecificAUSize, SpilloverFixedTest) { + if (string(GetParam()) != "bluestore") + return; + if (smr) { + cout << "SKIP: (FIXME?) adjust me for smr at some point?" 
<< std::endl; + return; + } + + SetVal(g_conf(), "bluestore_block_db_create", "true"); + SetVal(g_conf(), "bluestore_block_db_size", "3221225472"); + SetVal(g_conf(), "bluestore_volume_selection_policy", "use_some_extra"); + SetVal(g_conf(), "bluestore_volume_selection_reserved", "1"); // just use non-zero to enable + + g_conf().apply_changes(nullptr); + + StartDeferred(65536); + doManySetAttr(store.get(), + [&](ObjectStore* _store) { + + BlueStore* bstore = dynamic_cast<BlueStore*> (_store); + ceph_assert(bstore); + bstore->compact(); + const PerfCounters* logger = bstore->get_bluefs_perf_counters(); + ASSERT_EQ(0, logger->get(l_bluefs_slow_used_bytes)); + } + ); +} + +TEST_P(StoreTestSpecificAUSize, SpilloverFixed2Test) { + if (string(GetParam()) != "bluestore") + return; + if (smr) { + cout << "SKIP: (FIXME?) adjust me for smr at some point?" << std::endl; + return; + } + + SetVal(g_conf(), "bluestore_block_db_create", "true"); + SetVal(g_conf(), "bluestore_block_db_size", "3221225472"); + SetVal(g_conf(), "bluestore_volume_selection_policy", "use_some_extra"); + //default 2.0 factor results in too high threshold, using less value + // that results in less but still present spillover. + SetVal(g_conf(), "bluestore_volume_selection_reserved_factor", "0.5"); + + g_conf().apply_changes(nullptr); + + StartDeferred(65536); + doManySetAttr(store.get(), + [&](ObjectStore* _store) { + + BlueStore* bstore = dynamic_cast<BlueStore*> (_store); + ceph_assert(bstore); + bstore->compact(); + const PerfCounters* logger = bstore->get_bluefs_perf_counters(); + ASSERT_LE(logger->get(l_bluefs_slow_used_bytes), 300 * 1024 * 1024); // see SpilloverTest for 300MB choice rationale + } + ); +} + +TEST_P(StoreTestSpecificAUSize, SpilloverFixed3Test) { + if (string(GetParam()) != "bluestore") + return; + if (smr) { + cout << "SKIP: (FIXME?) adjust me for smr at some point?" 
<< std::endl; + return; + } + + SetVal(g_conf(), "bluestore_block_db_create", "true"); + SetVal(g_conf(), "bluestore_block_db_size", "3221225472"); + SetVal(g_conf(), "bluestore_volume_selection_policy", "fit_to_fast"); + + g_conf().apply_changes(nullptr); + + StartDeferred(65536); + doManySetAttr(store.get(), + [&](ObjectStore* _store) { + + BlueStore* bstore = dynamic_cast<BlueStore*> (_store); + ceph_assert(bstore); + bstore->compact(); + const PerfCounters* logger = bstore->get_bluefs_perf_counters(); + ASSERT_EQ(logger->get(l_bluefs_slow_used_bytes), 0); // reffering to SpilloverFixedTest + } + ); +} + +TEST_P(StoreTestSpecificAUSize, Ticket45195Repro) { + if (string(GetParam()) != "bluestore") + return; + if (smr) { + return; + } + + SetVal(g_conf(), "bluestore_default_buffered_write", "true"); + SetVal(g_conf(), "bluestore_max_blob_size", "65536"); + SetVal(g_conf(), "bluestore_debug_enforce_settings", "hdd"); + SetVal(g_conf(), "bluestore_fsck_on_mount", "false"); + g_conf().apply_changes(nullptr); + + StartDeferred(0x1000); + + int r; + coll_t cid; + ghobject_t hoid(hobject_t(sobject_t("Object", CEPH_NOSNAP))); + auto ch = store->create_new_collection(cid); + { + ObjectStore::Transaction t; + t.create_collection(cid, 0); + cerr << "Creating collection " << cid << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + size_t large_object_size = 1 * 1024 * 1024; + size_t expected_write_size = 0x8000; + ObjectStore::Transaction t; + t.touch(cid, hoid); + t.set_alloc_hint(cid, hoid, large_object_size, expected_write_size, + CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_READ | + CEPH_OSD_ALLOC_HINT_FLAG_APPEND_ONLY); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + ObjectStore::Transaction t; + bufferlist bl, orig; + string s(0xc000, '0'); + bl.append(s); + t.write(cid, hoid, 0xb000, bl.length(), bl); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + ObjectStore::Transaction t; 
+ bufferlist bl, orig; + string s(0x10000, '1'); + bl.append(s); + t.write(cid, hoid, 0x16000, bl.length(), bl); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + ObjectStore::Transaction t; + bufferlist bl, orig; + string s(0x4000, '1'); + bl.append(s); + t.write(cid, hoid, 0x1b000, bl.length(), bl); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + bufferlist bl; + r = store->read(ch, hoid, 0xb000, 0xb000, bl); + ASSERT_EQ(r, 0xb000); + + store->umount(); + store->mount(); + + ch = store->open_collection(cid); + { + ObjectStore::Transaction t; + bufferlist bl, orig; + string s(0xf000, '3'); + bl.append(s); + t.write(cid, hoid, 0xf000, bl.length(), bl); + cerr << "write4" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + + r = store->read(ch, hoid, 0xb000, 0x10000, bl); + ASSERT_EQ(r, 0x10000); +} + +TEST_P(StoreTestOmapUpgrade, WithOmapHeader) { + if (string(GetParam()) != "bluestore") + return; + + SetVal(g_conf(), "bluestore_debug_legacy_omap", "true"); + g_conf().apply_changes(nullptr); + + StartDeferred(); + int64_t poolid = 11; + coll_t cid(spg_t(pg_t(1, poolid), shard_id_t::NO_SHARD)); + ghobject_t hoid(hobject_t("tesomap", "", CEPH_NOSNAP, 0, poolid, "")); + auto ch = store->create_new_collection(cid); + int r; + { + ObjectStore::Transaction t; + t.create_collection(cid, 0); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + + map<string, bufferlist> attrs; + bufferlist expected_header; + expected_header.append("this is a header"); + { + ObjectStore::Transaction t; + t.touch(cid, hoid); + bufferlist header; + header.append(expected_header); + t.omap_setheader(cid, hoid, header); + map<string, bufferlist> start_set; + bufferlist bl; + bl.append(string("value")); + start_set.emplace(string("key1"), bl); + t.omap_setkeys(cid, hoid, start_set); + r = queue_transaction(store, ch, std::move(t)); + } + { + map<string,bufferlist> res; + bufferlist 
h; + r = store->omap_get(ch, hoid, &h, &res); + ASSERT_EQ(r, 0); + ASSERT_TRUE(bl_eq(h, expected_header)); + ASSERT_EQ(res.size(), 1); + ASSERT_EQ(res.begin()->first, "key1"); + } + store->umount(); + ASSERT_EQ(store->fsck(false), 0); + SetVal(g_conf(), "bluestore_debug_legacy_omap", "false"); + SetVal(g_conf(), "bluestore_fsck_error_on_no_per_pool_omap", "true"); + g_conf().apply_changes(nullptr); + ASSERT_EQ(store->fsck(false), 2); + ASSERT_EQ(store->quick_fix(), 0); + store->mount(); + ch = store->open_collection(cid); + { + map<string,bufferlist> res; + bufferlist h; + r = store->omap_get(ch, hoid, &h, &res); + ASSERT_EQ(r, 0); + ASSERT_EQ(res.size(), 1); + ASSERT_EQ(res.begin()->first, "key1"); + } + { + ObjectStore::Transaction t; + t.remove(cid, hoid); + t.remove_collection(cid); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } +} + +TEST_P(StoreTestSpecificAUSize, BluefsWriteInSingleDiskEnvTest) { + if (string(GetParam()) != "bluestore") + return; + + g_conf().apply_changes(nullptr); + + StartDeferred(0x1000); + + BlueStore* bstore = dynamic_cast<BlueStore*> (store.get()); + ceph_assert(bstore); + bstore->inject_bluefs_file("db.slow", "store_test_injection_slow", 1 << 20ul); + bstore->inject_bluefs_file("db.wal", "store_test_injection_wal", 1 << 20ul); + bstore->inject_bluefs_file("db", "store_test_injection_wal", 1 << 20ul); + + AdminSocket* admin_socket = g_ceph_context->get_admin_socket(); + ceph_assert(admin_socket); + + ceph::bufferlist in, out; + ostringstream err; + auto r = admin_socket->execute_command( + { "{\"prefix\": \"bluefs stats\"}" }, + in, err, &out); + if (r != 0) { + cerr << "failure querying: " << cpp_strerror(r) << std::endl; + } else { + std::cout << std::string(out.c_str(), out.length()) << std::endl; + } +} + +TEST_P(StoreTestSpecificAUSize, BluefsWriteInNoWalDiskEnvTest) { + if (string(GetParam()) != "bluestore") + return; + + SetVal(g_conf(), "bluestore_block_db_path", "db"); + SetVal(g_conf(), 
"bluestore_block_db_size", stringify(1ull << 31).c_str()); + SetVal(g_conf(), "bluestore_block_db_create", "true"); + + g_conf().apply_changes(nullptr); + + StartDeferred(0x1000); + + BlueStore* bstore = dynamic_cast<BlueStore*> (store.get()); + ceph_assert(bstore); + bstore->inject_bluefs_file("db.slow", "store_test_injection_slow", 1 << 20ul); + bstore->inject_bluefs_file("db.wal", "store_test_injection_wal", 1 << 20ul); + bstore->inject_bluefs_file("db", "store_test_injection_wal", 1 << 20ul); + + AdminSocket* admin_socket = g_ceph_context->get_admin_socket(); + ceph_assert(admin_socket); + + ceph::bufferlist in, out; + ostringstream err; + auto r = admin_socket->execute_command( + { "{\"prefix\": \"bluefs stats\"}" }, + in, err, &out); + if (r != 0) { + cerr << "failure querying: " << cpp_strerror(r) << std::endl; + } + else { + std::cout << std::string(out.c_str(), out.length()) << std::endl; + } +} + +TEST_P(StoreTestOmapUpgrade, NoOmapHeader) { + if (string(GetParam()) != "bluestore") + return; + + SetVal(g_conf(), "bluestore_debug_legacy_omap", "true"); + g_conf().apply_changes(nullptr); + + StartDeferred(); + int64_t poolid = 11; + coll_t cid(spg_t(pg_t(1, poolid), shard_id_t::NO_SHARD)); + ghobject_t hoid(hobject_t("tesomap", "", CEPH_NOSNAP, 0, poolid, "")); + auto ch = store->create_new_collection(cid); + int r; + { + ObjectStore::Transaction t; + t.create_collection(cid, 0); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + + map<string, bufferlist> attrs; + { + ObjectStore::Transaction t; + t.touch(cid, hoid); + map<string, bufferlist> start_set; + bufferlist bl; + bl.append(string("value")); + start_set.emplace(string("key1"), bl); + t.omap_setkeys(cid, hoid, start_set); + r = queue_transaction(store, ch, std::move(t)); + } + { + map<string,bufferlist> res; + bufferlist h; + r = store->omap_get(ch, hoid, &h, &res); + ASSERT_EQ(r, 0); + ASSERT_EQ(h.length(), 0); + ASSERT_EQ(res.size(), 1); + ASSERT_EQ(res.begin()->first, 
"key1"); + } + store->umount(); + ASSERT_EQ(store->fsck(false), 0); + SetVal(g_conf(), "bluestore_debug_legacy_omap", "false"); + SetVal(g_conf(), "bluestore_fsck_error_on_no_per_pool_omap", "true"); + g_conf().apply_changes(nullptr); + ASSERT_EQ(store->fsck(false), 2); + ASSERT_EQ(store->quick_fix(), 0); + store->mount(); + ch = store->open_collection(cid); + { + map<string,bufferlist> res; + bufferlist h; + r = store->omap_get(ch, hoid, &h, &res); + ASSERT_EQ(r, 0); + ASSERT_EQ(res.size(), 1); + ASSERT_EQ(res.begin()->first, "key1"); + } + { + ObjectStore::Transaction t; + t.remove(cid, hoid); + t.remove_collection(cid); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } +} + +TEST_P(StoreTestOmapUpgrade, LargeLegacyToPG) { + if (string(GetParam()) != "bluestore") + return; + + SetVal(g_conf(), "bluestore_debug_legacy_omap", "true"); + g_conf().apply_changes(nullptr); + + int64_t poolid; + coll_t cid; + ghobject_t hoid; + ObjectStore::CollectionHandle ch; + StartDeferred(); + poolid = 11; + cid = coll_t(spg_t(pg_t(1, poolid), shard_id_t::NO_SHARD)); + ch = store->create_new_collection(cid); + int r; + { + ObjectStore::Transaction t; + t.create_collection(cid, 0); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + //ASSERT_EQ(false, g_conf().get_val<bool>("bluestore_debug_inject_upgrade_bug53062")); + map<string, bufferlist> attrs; + bufferlist expected_header; + expected_header.append("this is a header"); + + size_t object_count = 1000; + make_omap_data(object_count, poolid, cid); + //checking just written data + check_omap_data(object_count, poolid, cid); + + store->umount(); + ASSERT_EQ(store->fsck(false), 0); + SetVal(g_conf(), "bluestore_debug_legacy_omap", "false"); + SetVal(g_conf(), "bluestore_fsck_error_on_no_per_pool_omap", "true"); + g_conf().apply_changes(nullptr); + ASSERT_EQ(store->fsck(false), 1001); + ASSERT_EQ(store->quick_fix(), 0); + store->mount(); + ch = store->open_collection(cid); + + //checking 
quick_fix() data + check_omap_data(object_count, poolid, cid); + + { + ObjectStore::Transaction t; + for (size_t o = 0; o < object_count; o++) + { + std::string oid = generate_monotonic_name(object_count, o, 3.71, 0.5); + ghobject_t hoid(hobject_t(oid, "", CEPH_NOSNAP, 0, poolid, "")); + t.remove(cid, hoid); + } + t.remove_collection(cid); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } +} + +#endif // WITH_BLUESTORE + +int main(int argc, char **argv) { + auto args = argv_to_vec(argc, argv); + auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, + CODE_ENVIRONMENT_UTILITY, + CINIT_FLAG_NO_DEFAULT_CONFIG_FILE); + common_init_finish(g_ceph_context); + + for (auto& i : args) { + if (i == "--smr"s) { +#if defined(HAVE_LIBZBD) + derr << "Adjusting tests for smr mode." << dendl; + smr = true; +#else + derr << "smr mode selected, but support not compiled in" << dendl; + return 1; +#endif + } + } + + // make sure we can adjust any config settings + g_ceph_context->_conf._clear_safe_to_start_threads(); + + g_ceph_context->_conf.set_val_or_die("osd_journal_size", "400"); + g_ceph_context->_conf.set_val_or_die("filestore_index_retry_probability", "0.5"); + g_ceph_context->_conf.set_val_or_die("filestore_op_thread_timeout", "1000"); + g_ceph_context->_conf.set_val_or_die("filestore_op_thread_suicide_timeout", "10000"); + //g_ceph_context->_conf.set_val_or_die("filestore_fiemap", "true"); + g_ceph_context->_conf.set_val_or_die("bluestore_fsck_on_mkfs", "false"); + g_ceph_context->_conf.set_val_or_die("bluestore_fsck_on_mount", "false"); + g_ceph_context->_conf.set_val_or_die("bluestore_fsck_on_umount", "false"); + g_ceph_context->_conf.set_val_or_die("bluestore_debug_small_allocations", "4"); + g_ceph_context->_conf.set_val_or_die("bluestore_debug_freelist", "true"); + g_ceph_context->_conf.set_val_or_die("bluestore_clone_cow", "true"); + g_ceph_context->_conf.set_val_or_die("bluestore_max_alloc_size", "196608"); + // set small cache sizes so we 
see trimming during Synthetic tests + g_ceph_context->_conf.set_val_or_die("bluestore_cache_size_hdd", "4000000"); + g_ceph_context->_conf.set_val_or_die("bluestore_cache_size_ssd", "4000000"); + g_ceph_context->_conf.set_val_or_die( + "bluestore_debug_inject_allocation_from_file_failure", "0.66"); + + // very short *_max prealloc so that we fall back to async submits + g_ceph_context->_conf.set_val_or_die("bluestore_blobid_prealloc", "10"); + g_ceph_context->_conf.set_val_or_die("bluestore_nid_prealloc", "10"); + g_ceph_context->_conf.set_val_or_die("bluestore_debug_randomize_serial_transaction", + "10"); + + g_ceph_context->_conf.set_val_or_die("bdev_debug_aio", "true"); + + // specify device size + g_ceph_context->_conf.set_val_or_die("bluestore_block_size", + stringify(DEF_STORE_TEST_BLOCKDEV_SIZE)); + + g_ceph_context->_conf.set_val_or_die( + "enable_experimental_unrecoverable_data_corrupting_features", "*"); + g_ceph_context->_conf.apply_changes(nullptr); + + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} + +/* + * Local Variables: + * compile-command: "cd ../.. 
; make ceph_test_objectstore && + * ./ceph_test_objectstore \ + * --gtest_filter=*.collect_metadata* --log-to-stderr=true --debug-filestore=20 + * " + * End: + */ diff --git a/src/test/objectstore/store_test_fixture.cc b/src/test/objectstore/store_test_fixture.cc new file mode 100644 index 000000000..a3bdc7a36 --- /dev/null +++ b/src/test/objectstore/store_test_fixture.cc @@ -0,0 +1,135 @@ +#include <stdlib.h> +#include <string> +#include <iostream> +#include <assert.h> +#include <gtest/gtest.h> + +#include "common/errno.h" +#include "common/config.h" +#include "os/ObjectStore.h" + +#if defined(WITH_BLUESTORE) +#include "os/bluestore/BlueStore.h" +#endif +#include "store_test_fixture.h" + +using namespace std; + +static void rm_r(const string& path) +{ + string cmd = string("rm -r ") + path; + cout << "==> " << cmd << std::endl; + int r = ::system(cmd.c_str()); + if (r) { + if (r == -1) { + r = errno; + cerr << "system() failed to fork() " << cpp_strerror(r) + << ", continuing anyway" << std::endl; + } else { + cerr << "failed with exit code " << r + << ", continuing anyway" << std::endl; + } + } +} + +void StoreTestFixture::SetUp() +{ + + int r = ::mkdir(data_dir.c_str(), 0777); + if (r < 0) { + r = -errno; + cerr << __func__ << ": unable to create " << data_dir << ": " << cpp_strerror(r) << std::endl; + } + ASSERT_EQ(0, r); + + store = ObjectStore::create(g_ceph_context, + type, + data_dir, + "store_test_temp_journal"); + if (!store) { + cerr << __func__ << ": objectstore type " << type << " doesn't exist yet!" << std::endl; + } + ASSERT_TRUE(store); +#if defined(WITH_BLUESTORE) + if (type == "bluestore") { + BlueStore *s = static_cast<BlueStore*>(store.get()); + // better test coverage! + s->set_cache_shards(5); + } +#endif + ASSERT_EQ(0, store->mkfs()); + ASSERT_EQ(0, store->mount()); + + // we keep this stuff 'unsafe' out of test case scope to be able to update ANY + // config settings. 
Hence setting it to 'safe' here to proceed with the test + // case + g_conf().set_safe_to_start_threads(); +} + +void StoreTestFixture::TearDown() +{ + if (store) { + int r = store->umount(); + EXPECT_EQ(0, r); + rm_r(data_dir); + } + // we keep this stuff 'unsafe' out of test case scope to be able to update ANY + // config settings. Hence setting it to 'unsafe' here as test case is closing. + g_conf()._clear_safe_to_start_threads(); + PopSettings(0); + if (!orig_death_test_style.empty()) { + ::testing::FLAGS_gtest_death_test_style = orig_death_test_style; + orig_death_test_style.clear(); + } +} + +void StoreTestFixture::SetVal(ConfigProxy& _conf, const char* key, const char* val) +{ + ceph_assert(!conf || conf == &_conf); + conf = &_conf; + std::string skey(key); + std::string prev_val; + conf->get_val(skey, &prev_val); + conf->set_val_or_die(key, val); + saved_settings.emplace(skey, prev_val); +} + +void StoreTestFixture::PopSettings(size_t pos) +{ + if (conf) { + ceph_assert(pos == 0 || pos <= saved_settings.size()); // for sanity + while(pos < saved_settings.size()) + { + auto& e = saved_settings.top(); + conf->set_val_or_die(e.first, e.second); + saved_settings.pop(); + } + conf->apply_changes(NULL); + } +} + +void StoreTestFixture::CloseAndReopen() { + ceph_assert(store != nullptr); + g_conf()._clear_safe_to_start_threads(); + int r = store->umount(); + EXPECT_EQ(0, r); + ch.reset(nullptr); + store.reset(nullptr); + store = ObjectStore::create(g_ceph_context, + type, + data_dir, + "store_test_temp_journal"); + if (!store) { + cerr << __func__ << ": objectstore type " << type << " failed to reopen!" << std::endl; + } + ASSERT_TRUE(store); +#if defined(WITH_BLUESTORE) + if (type == "bluestore") { + BlueStore *s = static_cast<BlueStore*>(store.get()); + // better test coverage! 
+ s->set_cache_shards(5); + } +#endif + ASSERT_EQ(0, store->mount()); + g_conf().set_safe_to_start_threads(); +} diff --git a/src/test/objectstore/store_test_fixture.h b/src/test/objectstore/store_test_fixture.h new file mode 100644 index 000000000..3f25fd493 --- /dev/null +++ b/src/test/objectstore/store_test_fixture.h @@ -0,0 +1,52 @@ +#include <string> +#include <stack> +#include <memory> +#include <gtest/gtest.h> +#include "common/config_fwd.h" + +class ObjectStore; + +class StoreTestFixture : virtual public ::testing::Test { + const std::string type; + const std::string data_dir; + + std::stack<std::pair<std::string, std::string>> saved_settings; + ConfigProxy* conf = nullptr; + + std::string orig_death_test_style; + +public: + std::unique_ptr<ObjectStore> store; + ObjectStore::CollectionHandle ch; + + explicit StoreTestFixture(const std::string& type) + : type(type), data_dir(type + ".test_temp_dir") + {} + + void SetUp() override; + void TearDown() override; + void SetDeathTestStyle(const char* new_style) { + if (orig_death_test_style.empty()) { + orig_death_test_style = ::testing::FLAGS_gtest_death_test_style; + } + ::testing::FLAGS_gtest_death_test_style = new_style; + } + + void SetVal(ConfigProxy& conf, const char* key, const char* val); + struct SettingsBookmark { + StoreTestFixture& s; + size_t pos; + + SettingsBookmark(StoreTestFixture& _s, size_t p) : s(_s), pos(p) + {} + + ~SettingsBookmark() { + s.PopSettings(pos); + } + }; + SettingsBookmark BookmarkSettings() { + return SettingsBookmark(*this, saved_settings.size()); + } + void PopSettings(size_t); + void CloseAndReopen(); +}; diff --git a/src/test/objectstore/test_bdev.cc b/src/test/objectstore/test_bdev.cc new file mode 100755 index 000000000..628b586bc --- /dev/null +++ b/src/test/objectstore/test_bdev.cc @@ -0,0 +1,111 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <stdio.h> +#include <string.h> +#include <iostream> +#include 
<gtest/gtest.h> +#include "global/global_init.h" +#include "global/global_context.h" +#include "common/ceph_context.h" +#include "common/ceph_argparse.h" +#include "include/stringify.h" +#include "common/errno.h" + +#include "blk/BlockDevice.h" + +using namespace std; + +class TempBdev { +public: + TempBdev(uint64_t size) + : path{get_temp_bdev(size)} + {} + ~TempBdev() { + rm_temp_bdev(path); + } + const std::string path; +private: + static string get_temp_bdev(uint64_t size) + { + static int n = 0; + string fn = "ceph_test_bluefs.tmp.block." + stringify(getpid()) + + "." + stringify(++n); + int fd = ::open(fn.c_str(), O_CREAT|O_RDWR|O_TRUNC, 0644); + ceph_assert(fd >= 0); + int r = ::ftruncate(fd, size); + ceph_assert(r >= 0); + ::close(fd); + return fn; + } + static void rm_temp_bdev(string f) + { + ::unlink(f.c_str()); + } +}; + +TEST(KernelDevice, Ticket45337) { + // Large (>=2 GB) writes are incomplete when bluefs_buffered_io = true + + uint64_t size = 1048576ull * 8192; + TempBdev bdev{ size }; + + const bool buffered = true; + + std::unique_ptr<BlockDevice> b( + BlockDevice::create(g_ceph_context, bdev.path, NULL, NULL, + [](void* handle, void* aio) {}, NULL)); + bufferlist bl; + // writing a bit less than 4GB + for (auto i = 0; i < 4000; i++) { + string s(1048576, 'a' + (i % 28)); + bl.append(s); + } + uint64_t magic_offs = bl.length(); + string s(4086, 'z'); + s += "0123456789"; + bl.append(s); + + { + int r = b->open(bdev.path); + if (r < 0) { + std::cerr << "open " << bdev.path << " failed" << std::endl; + return; + } + } + std::unique_ptr<IOContext> ioc(new IOContext(g_ceph_context, NULL)); + + auto r = b->aio_write(0, bl, ioc.get(), buffered); + ASSERT_EQ(r, 0); + + if (ioc->has_pending_aios()) { + b->aio_submit(ioc.get()); + ioc->aio_wait(); + } + + char outbuf[0x1000]; + r = b->read_random(magic_offs, sizeof(outbuf), outbuf, buffered); + ASSERT_EQ(r, 0); + ASSERT_EQ(memcmp(s.c_str(), outbuf, sizeof(outbuf)), 0); + + b->close(); +} + +int main(int 
/*
 * Returns a freshly allocated buffer of `size` bytes filled with
 * pseudo-random data. The engine is default-constructed, so every call
 * yields the same byte sequence.
 */
std::unique_ptr<char[]> gen_buffer(uint64_t size)
{
  std::unique_ptr<char[]> buffer = std::make_unique<char[]>(size);
  // The standard only permits unsigned short/int/long/long long as the
  // engine's result type; `unsigned char` is not allowed. Produce CHAR_BIT
  // random bits in an unsigned short and narrow per element instead.
  std::independent_bits_engine<std::default_random_engine, CHAR_BIT, unsigned short> e;
  std::generate(buffer.get(), buffer.get() + size,
                [&e] { return static_cast<char>(e()); });
  return buffer;
}
+ stringify(++n); + int fd = ::open(fn.c_str(), O_CREAT|O_RDWR|O_TRUNC, 0644); + ceph_assert(fd >= 0); + int r = ::ftruncate(fd, size); + ceph_assert(r >= 0); + ::close(fd); + return fn; + } + static void rm_temp_bdev(string f) + { + ::unlink(f.c_str()); + } +}; + +class ConfSaver { + std::stack<std::pair<std::string, std::string>> saved_settings; + ConfigProxy& conf; +public: + ConfSaver(ConfigProxy& conf) : conf(conf) { + conf._clear_safe_to_start_threads(); + }; + ~ConfSaver() { + conf._clear_safe_to_start_threads(); + while(saved_settings.size() > 0) { + auto& e = saved_settings.top(); + conf.set_val_or_die(e.first, e.second); + saved_settings.pop(); + } + conf.set_safe_to_start_threads(); + conf.apply_changes(nullptr); + } + void SetVal(const char* key, const char* val) { + std::string skey(key); + std::string prev_val; + conf.get_val(skey, &prev_val); + conf.set_val_or_die(skey, val); + saved_settings.emplace(skey, prev_val); + } + void ApplyChanges() { + conf.set_safe_to_start_threads(); + conf.apply_changes(nullptr); + } +}; + +TEST(BlueFS, mkfs) { + uint64_t size = 1048576 * 128; + TempBdev bdev{size}; + uuid_d fsid; + BlueFS fs(g_ceph_context); + ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false, 1048576)); + ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false })); +} + +TEST(BlueFS, mkfs_mount) { + uint64_t size = 1048576 * 128; + TempBdev bdev{size}; + BlueFS fs(g_ceph_context); + ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false, 1048576)); + uuid_d fsid; + ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false })); + ASSERT_EQ(0, fs.mount()); + ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, false, false })); + ASSERT_EQ(fs.get_total(BlueFS::BDEV_DB), size - 1048576); + ASSERT_LT(fs.get_free(BlueFS::BDEV_DB), size - 1048576); + fs.umount(); +} + +TEST(BlueFS, write_read) { + uint64_t size = 1048576 * 128; + TempBdev bdev{size}; + BlueFS fs(g_ceph_context); + ASSERT_EQ(0, 
fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false, 1048576)); + uuid_d fsid; + ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false })); + ASSERT_EQ(0, fs.mount()); + ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, false, false })); + { + BlueFS::FileWriter *h; + ASSERT_EQ(0, fs.mkdir("dir")); + ASSERT_EQ(0, fs.open_for_write("dir", "file", &h, false)); + h->append("foo", 3); + h->append("bar", 3); + h->append("baz", 3); + fs.fsync(h); + fs.close_writer(h); + } + { + BlueFS::FileReader *h; + ASSERT_EQ(0, fs.open_for_read("dir", "file", &h)); + bufferlist bl; + ASSERT_EQ(9, fs.read(h, 0, 1024, &bl, NULL)); + ASSERT_EQ(0, strncmp("foobarbaz", bl.c_str(), 9)); + delete h; + } + fs.umount(); +} + +TEST(BlueFS, small_appends) { + uint64_t size = 1048576 * 128; + TempBdev bdev{size}; + BlueFS fs(g_ceph_context); + ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false, 1048576)); + uuid_d fsid; + ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false })); + ASSERT_EQ(0, fs.mount()); + ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, false, false })); + { + BlueFS::FileWriter *h; + ASSERT_EQ(0, fs.mkdir("dir")); + ASSERT_EQ(0, fs.open_for_write("dir", "file", &h, false)); + for (unsigned i = 0; i < 10000; ++i) { + h->append("abcdeabcdeabcdeabcdeabcdeabc", 23); + } + fs.fsync(h); + fs.close_writer(h); + } + { + BlueFS::FileWriter *h; + ASSERT_EQ(0, fs.open_for_write("dir", "file_sync", &h, false)); + for (unsigned i = 0; i < 1000; ++i) { + h->append("abcdeabcdeabcdeabcdeabcdeabc", 23); + ASSERT_EQ(0, fs.fsync(h)); + } + fs.close_writer(h); + } + fs.umount(); +} + +TEST(BlueFS, very_large_write) { + // we'll write a ~5G file, so allocate more than that for the whole fs + uint64_t size = 1048576 * 1024 * 6ull; + TempBdev bdev{size}; + BlueFS fs(g_ceph_context); + + bool old = g_ceph_context->_conf.get_val<bool>("bluefs_buffered_io"); + g_ceph_context->_conf.set_val("bluefs_buffered_io", "false"); + uint64_t total_written = 0; + + 
ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false, 1048576)); + uuid_d fsid; + ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false })); + ASSERT_EQ(0, fs.mount()); + ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, false, false })); + char buf[1048571]; // this is biggish, but intentionally not evenly aligned + for (unsigned i = 0; i < sizeof(buf); ++i) { + buf[i] = i; + } + { + BlueFS::FileWriter *h; + ASSERT_EQ(0, fs.mkdir("dir")); + ASSERT_EQ(0, fs.open_for_write("dir", "bigfile", &h, false)); + for (unsigned i = 0; i < 3*1024*1048576ull / sizeof(buf); ++i) { + h->append(buf, sizeof(buf)); + total_written += sizeof(buf); + } + fs.fsync(h); + for (unsigned i = 0; i < 2*1024*1048576ull / sizeof(buf); ++i) { + h->append(buf, sizeof(buf)); + total_written += sizeof(buf); + } + fs.fsync(h); + fs.close_writer(h); + } + { + BlueFS::FileReader *h; + ASSERT_EQ(0, fs.open_for_read("dir", "bigfile", &h)); + bufferlist bl; + ASSERT_EQ(h->file->fnode.size, total_written); + for (unsigned i = 0; i < 3*1024*1048576ull / sizeof(buf); ++i) { + bl.clear(); + fs.read(h, i * sizeof(buf), sizeof(buf), &bl, NULL); + int r = memcmp(buf, bl.c_str(), sizeof(buf)); + if (r) { + cerr << "read got mismatch at offset " << i*sizeof(buf) << " r " << r + << std::endl; + } + ASSERT_EQ(0, r); + } + for (unsigned i = 0; i < 2*1024*1048576ull / sizeof(buf); ++i) { + bl.clear(); + fs.read(h, i * sizeof(buf), sizeof(buf), &bl, NULL); + int r = memcmp(buf, bl.c_str(), sizeof(buf)); + if (r) { + cerr << "read got mismatch at offset " << i*sizeof(buf) << " r " << r + << std::endl; + } + ASSERT_EQ(0, r); + } + delete h; + ASSERT_EQ(0, fs.open_for_read("dir", "bigfile", &h)); + ASSERT_EQ(h->file->fnode.size, total_written); + auto huge_buf = std::make_unique<char[]>(h->file->fnode.size); + auto l = h->file->fnode.size; + int64_t r = fs.read(h, 0, l, NULL, huge_buf.get()); + ASSERT_EQ(r, l); + delete h; + } + fs.umount(); + + 
g_ceph_context->_conf.set_val("bluefs_buffered_io", stringify((int)old)); +} + +TEST(BlueFS, very_large_write2) { + // we'll write a ~5G file, so allocate more than that for the whole fs + uint64_t size_full = 1048576 * 1024 * 6ull; + uint64_t size = 1048576 * 1024 * 5ull; + TempBdev bdev{ size_full }; + BlueFS fs(g_ceph_context); + + bool old = g_ceph_context->_conf.get_val<bool>("bluefs_buffered_io"); + g_ceph_context->_conf.set_val("bluefs_buffered_io", "false"); + uint64_t total_written = 0; + + ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false, 1048576)); + uuid_d fsid; + ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false })); + ASSERT_EQ(0, fs.mount()); + ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, false, false })); + + char fill_arr[1 << 20]; // 1M + for (size_t i = 0; i < sizeof(fill_arr); ++i) { + fill_arr[i] = (char)i; + } + std::unique_ptr<char[]> buf; + buf.reset(new char[size]); + for (size_t i = 0; i < size; i += sizeof(fill_arr)) { + memcpy(buf.get() + i, fill_arr, sizeof(fill_arr)); + } + { + BlueFS::FileWriter* h; + ASSERT_EQ(0, fs.mkdir("dir")); + ASSERT_EQ(0, fs.open_for_write("dir", "bigfile", &h, false)); + fs.append_try_flush(h, buf.get(), size); + total_written = size; + fs.fsync(h); + fs.close_writer(h); + } + memset(buf.get(), 0, size); + { + BlueFS::FileReader* h; + ASSERT_EQ(0, fs.open_for_read("dir", "bigfile", &h)); + ASSERT_EQ(h->file->fnode.size, total_written); + auto l = h->file->fnode.size; + int64_t r = fs.read(h, 0, l, NULL, buf.get()); + ASSERT_EQ(r, l); + for (size_t i = 0; i < size; i += sizeof(fill_arr)) { + ceph_assert(memcmp(buf.get() + i, fill_arr, sizeof(fill_arr)) == 0); + } + delete h; + } + fs.umount(); + + g_ceph_context->_conf.set_val("bluefs_buffered_io", stringify((int)old)); +} + +#define ALLOC_SIZE 4096 + +void write_data(BlueFS &fs, uint64_t rationed_bytes) +{ + int j=0, r=0; + uint64_t written_bytes = 0; + rationed_bytes -= ALLOC_SIZE; + stringstream ss; + string dir = 
"dir."; + ss << std::this_thread::get_id(); + dir.append(ss.str()); + dir.append("."); + dir.append(to_string(j)); + ASSERT_EQ(0, fs.mkdir(dir)); + while (1) { + string file = "file."; + file.append(to_string(j)); + BlueFS::FileWriter *h; + ASSERT_EQ(0, fs.open_for_write(dir, file, &h, false)); + ASSERT_NE(nullptr, h); + auto sg = make_scope_guard([&fs, h] { fs.close_writer(h); }); + bufferlist bl; + std::unique_ptr<char[]> buf = gen_buffer(ALLOC_SIZE); + bufferptr bp = buffer::claim_char(ALLOC_SIZE, buf.get()); + bl.push_back(bp); + h->append(bl.c_str(), bl.length()); + r = fs.fsync(h); + if (r < 0) { + break; + } + written_bytes += g_conf()->bluefs_alloc_size; + j++; + if ((rationed_bytes - written_bytes) <= g_conf()->bluefs_alloc_size) { + break; + } + } +} + +void create_single_file(BlueFS &fs) +{ + BlueFS::FileWriter *h; + stringstream ss; + string dir = "dir.test"; + ASSERT_EQ(0, fs.mkdir(dir)); + string file = "testfile"; + ASSERT_EQ(0, fs.open_for_write(dir, file, &h, false)); + bufferlist bl; + std::unique_ptr<char[]> buf = gen_buffer(ALLOC_SIZE); + bufferptr bp = buffer::claim_char(ALLOC_SIZE, buf.get()); + bl.push_back(bp); + h->append(bl.c_str(), bl.length()); + fs.fsync(h); + fs.close_writer(h); +} + +void write_single_file(BlueFS &fs, uint64_t rationed_bytes) +{ + stringstream ss; + const string dir = "dir.test"; + const string file = "testfile"; + uint64_t written_bytes = 0; + rationed_bytes -= ALLOC_SIZE; + while (1) { + BlueFS::FileWriter *h; + ASSERT_EQ(0, fs.open_for_write(dir, file, &h, false)); + ASSERT_NE(nullptr, h); + auto sg = make_scope_guard([&fs, h] { fs.close_writer(h); }); + bufferlist bl; + std::unique_ptr<char[]> buf = gen_buffer(ALLOC_SIZE); + bufferptr bp = buffer::claim_char(ALLOC_SIZE, buf.get()); + bl.push_back(bp); + h->append(bl.c_str(), bl.length()); + int r = fs.fsync(h); + if (r < 0) { + break; + } + written_bytes += g_conf()->bluefs_alloc_size; + if ((rationed_bytes - written_bytes) <= g_conf()->bluefs_alloc_size) { + 
break; + } + } +} + +bool writes_done = false; + +void sync_fs(BlueFS &fs) +{ + while (1) { + if (writes_done == true) + break; + fs.sync_metadata(false); + sleep(1); + } +} + + +void do_join(std::thread& t) +{ + t.join(); +} + +void join_all(std::vector<std::thread>& v) +{ + std::for_each(v.begin(),v.end(),do_join); +} + +#define NUM_WRITERS 3 +#define NUM_SYNC_THREADS 1 + +#define NUM_SINGLE_FILE_WRITERS 1 +#define NUM_MULTIPLE_FILE_WRITERS 2 + +TEST(BlueFS, test_flush_1) { + uint64_t size = 1048576 * 128; + TempBdev bdev{size}; + g_ceph_context->_conf.set_val( + "bluefs_alloc_size", + "65536"); + g_ceph_context->_conf.apply_changes(nullptr); + + BlueFS fs(g_ceph_context); + ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false, 1048576)); + uuid_d fsid; + ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false })); + ASSERT_EQ(0, fs.mount()); + ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, false, false })); + { + std::vector<std::thread> write_thread_multiple; + uint64_t effective_size = size - (32 * 1048576); // leaving the last 32 MB for log compaction + uint64_t per_thread_bytes = (effective_size/(NUM_MULTIPLE_FILE_WRITERS + NUM_SINGLE_FILE_WRITERS)); + for (int i=0; i<NUM_MULTIPLE_FILE_WRITERS ; i++) { + write_thread_multiple.push_back(std::thread(write_data, std::ref(fs), per_thread_bytes)); + } + + create_single_file(fs); + std::vector<std::thread> write_thread_single; + for (int i=0; i<NUM_SINGLE_FILE_WRITERS; i++) { + write_thread_single.push_back(std::thread(write_single_file, std::ref(fs), per_thread_bytes)); + } + + join_all(write_thread_single); + join_all(write_thread_multiple); + } + fs.umount(); +} + +TEST(BlueFS, test_flush_2) { + uint64_t size = 1048576 * 256; + TempBdev bdev{size}; + g_ceph_context->_conf.set_val( + "bluefs_alloc_size", + "65536"); + g_ceph_context->_conf.apply_changes(nullptr); + + BlueFS fs(g_ceph_context); + ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false, 1048576)); + uuid_d fsid; 
+ ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false })); + ASSERT_EQ(0, fs.mount()); + ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, false, false })); + { + uint64_t effective_size = size - (128 * 1048576); // leaving the last 32 MB for log compaction + uint64_t per_thread_bytes = (effective_size/(NUM_WRITERS)); + std::vector<std::thread> write_thread_multiple; + for (int i=0; i<NUM_WRITERS; i++) { + write_thread_multiple.push_back(std::thread(write_data, std::ref(fs), per_thread_bytes)); + } + + join_all(write_thread_multiple); + } + fs.umount(); +} + +TEST(BlueFS, test_flush_3) { + uint64_t size = 1048576 * 256; + TempBdev bdev{size}; + g_ceph_context->_conf.set_val( + "bluefs_alloc_size", + "65536"); + g_ceph_context->_conf.apply_changes(nullptr); + + BlueFS fs(g_ceph_context); + ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false, 1048576)); + uuid_d fsid; + ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false })); + ASSERT_EQ(0, fs.mount()); + ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, false, false })); + { + std::vector<std::thread> write_threads; + uint64_t effective_size = size - (64 * 1048576); // leaving the last 11 MB for log compaction + uint64_t per_thread_bytes = (effective_size/(NUM_WRITERS)); + for (int i=0; i<NUM_WRITERS; i++) { + write_threads.push_back(std::thread(write_data, std::ref(fs), per_thread_bytes)); + } + + std::vector<std::thread> sync_threads; + for (int i=0; i<NUM_SYNC_THREADS; i++) { + sync_threads.push_back(std::thread(sync_fs, std::ref(fs))); + } + + join_all(write_threads); + writes_done = true; + join_all(sync_threads); + } + fs.umount(); +} + +TEST(BlueFS, test_simple_compaction_sync) { + g_ceph_context->_conf.set_val( + "bluefs_compact_log_sync", + "true"); + uint64_t size = 1048576 * 128; + TempBdev bdev{size}; + + BlueFS fs(g_ceph_context); + ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false, 1048576)); + uuid_d fsid; + ASSERT_EQ(0, fs.mkfs(fsid, { 
BlueFS::BDEV_DB, false, false })); + ASSERT_EQ(0, fs.mount()); + ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, false, false })); + { + for (int i=0; i<10; i++) { + string dir = "dir."; + dir.append(to_string(i)); + ASSERT_EQ(0, fs.mkdir(dir)); + for (int j=0; j<10; j++) { + string file = "file."; + file.append(to_string(j)); + BlueFS::FileWriter *h; + ASSERT_EQ(0, fs.open_for_write(dir, file, &h, false)); + ASSERT_NE(nullptr, h); + auto sg = make_scope_guard([&fs, h] { fs.close_writer(h); }); + bufferlist bl; + std::unique_ptr<char[]> buf = gen_buffer(4096); + bufferptr bp = buffer::claim_char(4096, buf.get()); + bl.push_back(bp); + h->append(bl.c_str(), bl.length()); + fs.fsync(h); + } + } + } + { + for (int i=0; i<10; i+=2) { + string dir = "dir."; + dir.append(to_string(i)); + for (int j=0; j<10; j++) { + string file = "file."; + file.append(to_string(j)); + fs.unlink(dir, file); + fs.sync_metadata(false); + } + ASSERT_EQ(0, fs.rmdir(dir)); + fs.sync_metadata(false); + } + } + fs.compact_log(); + fs.umount(); +} + +TEST(BlueFS, test_simple_compaction_async) { + g_ceph_context->_conf.set_val( + "bluefs_compact_log_sync", + "false"); + uint64_t size = 1048576 * 128; + TempBdev bdev{size}; + + BlueFS fs(g_ceph_context); + ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false, 1048576)); + uuid_d fsid; + ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false })); + ASSERT_EQ(0, fs.mount()); + ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, false, false })); + { + for (int i=0; i<10; i++) { + string dir = "dir."; + dir.append(to_string(i)); + ASSERT_EQ(0, fs.mkdir(dir)); + for (int j=0; j<10; j++) { + string file = "file."; + file.append(to_string(j)); + BlueFS::FileWriter *h; + ASSERT_EQ(0, fs.open_for_write(dir, file, &h, false)); + ASSERT_NE(nullptr, h); + auto sg = make_scope_guard([&fs, h] { fs.close_writer(h); }); + bufferlist bl; + std::unique_ptr<char[]> buf = gen_buffer(4096); + bufferptr bp = buffer::claim_char(4096, 
buf.get()); + bl.push_back(bp); + h->append(bl.c_str(), bl.length()); + fs.fsync(h); + } + } + } + { + for (int i=0; i<10; i+=2) { + string dir = "dir."; + dir.append(to_string(i)); + for (int j=0; j<10; j++) { + string file = "file."; + file.append(to_string(j)); + fs.unlink(dir, file); + fs.sync_metadata(false); + } + ASSERT_EQ(0, fs.rmdir(dir)); + fs.sync_metadata(false); + } + } + fs.compact_log(); + fs.umount(); +} + +TEST(BlueFS, test_compaction_sync) { + uint64_t size = 1048576 * 128; + TempBdev bdev{size}; + g_ceph_context->_conf.set_val( + "bluefs_alloc_size", + "65536"); + g_ceph_context->_conf.set_val( + "bluefs_compact_log_sync", + "true"); + const char* canary_dir = "dir.after_compact_test"; + const char* canary_file = "file.after_compact_test"; + const char* canary_data = "some random data"; + + BlueFS fs(g_ceph_context); + ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false, 1048576)); + uuid_d fsid; + ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false })); + ASSERT_EQ(0, fs.mount()); + ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, false, false })); + { + std::vector<std::thread> write_threads; + uint64_t effective_size = size - (32 * 1048576); // leaving the last 32 MB for log compaction + uint64_t per_thread_bytes = (effective_size/(NUM_WRITERS)); + for (int i=0; i<NUM_WRITERS; i++) { + write_threads.push_back(std::thread(write_data, std::ref(fs), per_thread_bytes)); + } + + std::vector<std::thread> sync_threads; + for (int i=0; i<NUM_SYNC_THREADS; i++) { + sync_threads.push_back(std::thread(sync_fs, std::ref(fs))); + } + + join_all(write_threads); + writes_done = true; + join_all(sync_threads); + fs.compact_log(); + + { + ASSERT_EQ(0, fs.mkdir(canary_dir)); + BlueFS::FileWriter *h; + ASSERT_EQ(0, fs.open_for_write(canary_dir, canary_file, &h, false)); + ASSERT_NE(nullptr, h); + auto sg = make_scope_guard([&fs, h] { fs.close_writer(h); }); + h->append(canary_data, strlen(canary_data)); + int r = fs.fsync(h); + 
ASSERT_EQ(r, 0); + } + } + fs.umount(); + + fs.mount(); + { + BlueFS::FileReader *h; + ASSERT_EQ(0, fs.open_for_read(canary_dir, canary_file, &h)); + ASSERT_NE(nullptr, h); + bufferlist bl; + ASSERT_EQ(strlen(canary_data), fs.read(h, 0, 1024, &bl, NULL)); + std::cout << bl.c_str() << std::endl; + ASSERT_EQ(0, strncmp(canary_data, bl.c_str(), strlen(canary_data))); + delete h; + } + fs.umount(); +} + +TEST(BlueFS, test_compaction_async) { + uint64_t size = 1048576 * 128; + TempBdev bdev{size}; + g_ceph_context->_conf.set_val( + "bluefs_alloc_size", + "65536"); + g_ceph_context->_conf.set_val( + "bluefs_compact_log_sync", + "false"); + const char* canary_dir = "dir.after_compact_test"; + const char* canary_file = "file.after_compact_test"; + const char* canary_data = "some random data"; + + BlueFS fs(g_ceph_context); + ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false, 1048576)); + uuid_d fsid; + ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false })); + ASSERT_EQ(0, fs.mount()); + ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, false, false })); + { + std::vector<std::thread> write_threads; + uint64_t effective_size = size - (32 * 1048576); // leaving the last 32 MB for log compaction + uint64_t per_thread_bytes = (effective_size/(NUM_WRITERS)); + for (int i=0; i<NUM_WRITERS; i++) { + write_threads.push_back(std::thread(write_data, std::ref(fs), per_thread_bytes)); + } + + std::vector<std::thread> sync_threads; + for (int i=0; i<NUM_SYNC_THREADS; i++) { + sync_threads.push_back(std::thread(sync_fs, std::ref(fs))); + } + + join_all(write_threads); + writes_done = true; + join_all(sync_threads); + fs.compact_log(); + + { + ASSERT_EQ(0, fs.mkdir(canary_dir)); + BlueFS::FileWriter *h; + ASSERT_EQ(0, fs.open_for_write(canary_dir, canary_file, &h, false)); + ASSERT_NE(nullptr, h); + auto sg = make_scope_guard([&fs, h] { fs.close_writer(h); }); + h->append(canary_data, strlen(canary_data)); + int r = fs.fsync(h); + ASSERT_EQ(r, 0); + } + } + 
fs.umount(); + + fs.mount(); + { + BlueFS::FileReader *h; + ASSERT_EQ(0, fs.open_for_read(canary_dir, canary_file, &h)); + ASSERT_NE(nullptr, h); + bufferlist bl; + ASSERT_EQ(strlen(canary_data), fs.read(h, 0, 1024, &bl, NULL)); + std::cout << bl.c_str() << std::endl; + ASSERT_EQ(0, strncmp(canary_data, bl.c_str(), strlen(canary_data))); + delete h; + } + fs.umount(); +} + +TEST(BlueFS, test_replay) { + uint64_t size = 1048576 * 128; + TempBdev bdev{size}; + g_ceph_context->_conf.set_val( + "bluefs_alloc_size", + "65536"); + g_ceph_context->_conf.set_val( + "bluefs_compact_log_sync", + "false"); + + BlueFS fs(g_ceph_context); + ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false, 1048576)); + uuid_d fsid; + ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false })); + ASSERT_EQ(0, fs.mount()); + ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, false, false })); + { + std::vector<std::thread> write_threads; + uint64_t effective_size = size - (32 * 1048576); // leaving the last 32 MB for log compaction + uint64_t per_thread_bytes = (effective_size/(NUM_WRITERS)); + for (int i=0; i<NUM_WRITERS; i++) { + write_threads.push_back(std::thread(write_data, std::ref(fs), per_thread_bytes)); + } + + std::vector<std::thread> sync_threads; + for (int i=0; i<NUM_SYNC_THREADS; i++) { + sync_threads.push_back(std::thread(sync_fs, std::ref(fs))); + } + + join_all(write_threads); + writes_done = true; + join_all(sync_threads); + fs.compact_log(); + } + fs.umount(); + // remount and check log can replay safe? 
+ ASSERT_EQ(0, fs.mount()); + ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, false, false })); + fs.umount(); +} + +TEST(BlueFS, test_replay_growth) { + uint64_t size = 1048576LL * (2 * 1024 + 128); + TempBdev bdev{size}; + + ConfSaver conf(g_ceph_context->_conf); + conf.SetVal("bluefs_alloc_size", "4096"); + conf.SetVal("bluefs_shared_alloc_size", "4096"); + conf.SetVal("bluefs_compact_log_sync", "false"); + conf.SetVal("bluefs_min_log_runway", "32768"); + conf.SetVal("bluefs_max_log_runway", "65536"); + conf.SetVal("bluefs_allocator", "stupid"); + conf.SetVal("bluefs_sync_write", "true"); + conf.ApplyChanges(); + + BlueFS fs(g_ceph_context); + ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false, 1048576)); + uuid_d fsid; + ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false })); + ASSERT_EQ(0, fs.mount()); + ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, false, false })); + ASSERT_EQ(0, fs.mkdir("dir")); + + char data[2000]; + BlueFS::FileWriter *h; + ASSERT_EQ(0, fs.open_for_write("dir", "file", &h, false)); + for (size_t i = 0; i < 10000; i++) { + h->append(data, 2000); + fs.fsync(h); + } + fs.close_writer(h); + fs.umount(true); //do not compact on exit! + + // remount and check log can replay safe? 
+ ASSERT_EQ(0, fs.mount()); + ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, false, false })); + fs.umount(); +} + +TEST(BlueFS, test_tracker_50965) { + uint64_t size_wal = 1048576 * 64; + TempBdev bdev_wal{size_wal}; + uint64_t size_db = 1048576 * 128; + TempBdev bdev_db{size_db}; + uint64_t size_slow = 1048576 * 256; + TempBdev bdev_slow{size_slow}; + + ConfSaver conf(g_ceph_context->_conf); + conf.SetVal("bluefs_min_flush_size", "65536"); + conf.ApplyChanges(); + + BlueFS fs(g_ceph_context); + ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_WAL, bdev_wal.path, false, 0)); + ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev_db.path, false, 0)); + ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_SLOW, bdev_slow.path, false, 0)); + uuid_d fsid; + ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, true, true })); + ASSERT_EQ(0, fs.mount()); + ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, true, true })); + + string dir_slow = "dir.slow"; + ASSERT_EQ(0, fs.mkdir(dir_slow)); + string dir_db = "dir_db"; + ASSERT_EQ(0, fs.mkdir(dir_db)); + + string file_slow = "file"; + BlueFS::FileWriter *h_slow; + ASSERT_EQ(0, fs.open_for_write(dir_slow, file_slow, &h_slow, false)); + ASSERT_NE(nullptr, h_slow); + + string file_db = "file"; + BlueFS::FileWriter *h_db; + ASSERT_EQ(0, fs.open_for_write(dir_db, file_db, &h_db, false)); + ASSERT_NE(nullptr, h_db); + + bufferlist bl1; + std::unique_ptr<char[]> buf1 = gen_buffer(70000); + bufferptr bp1 = buffer::claim_char(70000, buf1.get()); + bl1.push_back(bp1); + h_slow->append(bl1.c_str(), bl1.length()); + fs.flush(h_slow); + + uint64_t h_slow_dirty_seq_1 = fs.debug_get_dirty_seq(h_slow); + + bufferlist bl2; + std::unique_ptr<char[]> buf2 = gen_buffer(1000); + bufferptr bp2 = buffer::claim_char(1000, buf2.get()); + bl2.push_back(bp2); + h_db->append(bl2.c_str(), bl2.length()); + fs.fsync(h_db); + + uint64_t h_slow_dirty_seq_2 = fs.debug_get_dirty_seq(h_slow); + bool h_slow_dev_dirty = fs.debug_get_is_dev_dirty(h_slow, 
BlueFS::BDEV_SLOW); + + //problem if allocations are stable in log but slow device is not flushed yet + ASSERT_FALSE(h_slow_dirty_seq_1 != 0 && + h_slow_dirty_seq_2 == 0 && + h_slow_dev_dirty == true); + + fs.close_writer(h_slow); + fs.close_writer(h_db); + + fs.umount(); +} + +TEST(BlueFS, test_truncate_stable_53129) { + + ConfSaver conf(g_ceph_context->_conf); + conf.SetVal("bluefs_min_flush_size", "65536"); + conf.ApplyChanges(); + + uint64_t size_wal = 1048576 * 64; + TempBdev bdev_wal{size_wal}; + uint64_t size_db = 1048576 * 128; + TempBdev bdev_db{size_db}; + uint64_t size_slow = 1048576 * 256; + TempBdev bdev_slow{size_slow}; + + BlueFS fs(g_ceph_context); + ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_WAL, bdev_wal.path, false, 0)); + ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev_db.path, false, 0)); + ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_SLOW, bdev_slow.path, false, 0)); + uuid_d fsid; + ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, true, true })); + ASSERT_EQ(0, fs.mount()); + ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, true, true })); + + string dir_slow = "dir.slow"; + ASSERT_EQ(0, fs.mkdir(dir_slow)); + string dir_db = "dir_db"; + ASSERT_EQ(0, fs.mkdir(dir_db)); + + string file_slow = "file"; + BlueFS::FileWriter *h_slow; + ASSERT_EQ(0, fs.open_for_write(dir_slow, file_slow, &h_slow, false)); + ASSERT_NE(nullptr, h_slow); + + string file_db = "file"; + BlueFS::FileWriter *h_db; + ASSERT_EQ(0, fs.open_for_write(dir_db, file_db, &h_db, false)); + ASSERT_NE(nullptr, h_db); + + bufferlist bl1; + std::unique_ptr<char[]> buf1 = gen_buffer(70000); + bufferptr bp1 = buffer::claim_char(70000, buf1.get()); + bl1.push_back(bp1); + // add 70000 bytes + h_slow->append(bl1.c_str(), bl1.length()); + fs.flush(h_slow); + // and truncate to 60000 bytes + fs.truncate(h_slow, 60000); + + // write something to file on DB device + bufferlist bl2; + std::unique_ptr<char[]> buf2 = gen_buffer(1000); + bufferptr bp2 = buffer::claim_char(1000, 
buf2.get()); + bl2.push_back(bp2); + h_db->append(bl2.c_str(), bl2.length()); + // and force bluefs log to flush + fs.fsync(h_db); + + // This is the actual test point. + // We completed truncate, and we expect + // - size to be 60000 + // - data to be stable on slow device + // OR + // - size = 0 or file does not exist + // - dev_dirty is irrelevant + bool h_slow_dev_dirty = fs.debug_get_is_dev_dirty(h_slow, BlueFS::BDEV_SLOW); + // Imagine power goes down here. + + fs.close_writer(h_slow); + fs.close_writer(h_db); + + fs.umount(); + + ASSERT_EQ(0, fs.mount()); + ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, true, true })); + + uint64_t size; + utime_t mtime; + ASSERT_EQ(0, fs.stat("dir.slow", "file", &size, &mtime)); + // check file size 60000 + ASSERT_EQ(size, 60000); + // check that dev_dirty was false (data stable on media) + ASSERT_EQ(h_slow_dev_dirty, false); + + fs.umount(); +} + +TEST(BlueFS, test_update_ino1_delta_after_replay) { + uint64_t size = 1048576LL * (2 * 1024 + 128); + TempBdev bdev{size}; + + ConfSaver conf(g_ceph_context->_conf); + conf.SetVal("bluefs_alloc_size", "4096"); + conf.SetVal("bluefs_shared_alloc_size", "4096"); + conf.SetVal("bluefs_compact_log_sync", "false"); + conf.SetVal("bluefs_min_log_runway", "32768"); + conf.SetVal("bluefs_max_log_runway", "65536"); + conf.SetVal("bluefs_allocator", "stupid"); + conf.ApplyChanges(); + + BlueFS fs(g_ceph_context); + ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false, 1048576)); + uuid_d fsid; + ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false })); + ASSERT_EQ(0, fs.mount()); + ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, false, false })); + ASSERT_EQ(0, fs.mkdir("dir")); + + char data[2000]; + BlueFS::FileWriter *h; + ASSERT_EQ(0, fs.open_for_write("dir", "file", &h, false)); + for (size_t i = 0; i < 100; i++) { + h->append(data, 2000); + fs.fsync(h); + } + fs.close_writer(h); + fs.umount(true); //do not compact on exit! 
+ + ASSERT_EQ(0, fs.mount()); + ASSERT_EQ(0, fs.open_for_write("dir", "file2", &h, false)); + for (size_t i = 0; i < 100; i++) { + h->append(data, 2000); + fs.fsync(h); + } + fs.close_writer(h); + fs.umount(); + + // remount and check log can replay safe? + ASSERT_EQ(0, fs.mount()); + ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, false, false })); + fs.umount(); +} + +TEST(BlueFS, broken_unlink_fsync_seq) { + uint64_t size = 1048576 * 128; + TempBdev bdev{size}; + BlueFS fs(g_ceph_context); + ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false, 1048576)); + uuid_d fsid; + ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false })); + ASSERT_EQ(0, fs.mount()); + ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, false, false })); + { + /* + * This reproduces a weird file op sequence (unlink+fsync) that Octopus + * RocksDB might issue to BlueFS when recycle_log_file_num setting is 0 + * See https://tracker.ceph.com/issues/55636 for more details + * + */ + char buf[1048571]; // this is biggish, but intentionally not evenly aligned + for (unsigned i = 0; i < sizeof(buf); ++i) { + buf[i] = i; + } + BlueFS::FileWriter *h; + ASSERT_EQ(0, fs.mkdir("dir")); + ASSERT_EQ(0, fs.open_for_write("dir", "file", &h, false)); + + h->append(buf, sizeof(buf)); + fs.flush(h); + h->append(buf, sizeof(buf)); + fs.unlink("dir", "file"); + fs.fsync(h); + fs.close_writer(h); + } + fs.umount(); + + // remount and check log can replay safe? 
+ ASSERT_EQ(0, fs.mount()); + ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, false, false })); + fs.umount(); +} + +TEST(BlueFS, truncate_fsync) { + uint64_t bdev_size = 128 * 1048576; + uint64_t block_size = 4096; + uint64_t reserved = 1048576; + TempBdev bdev{bdev_size}; + uuid_d fsid; + const char* DIR_NAME="dir"; + const char* FILE_NAME="file1"; + + size_t sizes[] = {3, 1024, 4096, 1024 * 4096}; + for (size_t i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) { + const size_t content_size= sizes[i]; + const size_t read_size = p2roundup(content_size, size_t(block_size)); + const std::string content(content_size, 'x'); + { + BlueFS fs(g_ceph_context); + ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false, reserved)); + ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false })); + ASSERT_EQ(0, fs.mount()); + ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, false, false })); + { + BlueFS::FileWriter *h; + ASSERT_EQ(0, fs.mkdir("dir")); + ASSERT_EQ(0, fs.open_for_write(DIR_NAME, FILE_NAME, &h, false)); + h->append(content.c_str(), content.length()); + fs.fsync(h); + fs.close_writer(h); + } + { + BlueFS::FileReader *h; + ASSERT_EQ(0, fs.open_for_read(DIR_NAME, FILE_NAME, &h)); + bufferlist bl; + ASSERT_EQ(content.length(), fs.read(h, 0, read_size, &bl, NULL)); + ASSERT_EQ(0, strncmp(content.c_str(), bl.c_str(), content.length())); + delete h; + } + { + BlueFS::FileWriter *h; + ASSERT_EQ(0, fs.open_for_write(DIR_NAME, FILE_NAME, &h, true)); + fs.truncate(h, 0); + fs.fsync(h); + fs.close_writer(h); + } + } + { + //this was broken due to https://tracker.ceph.com/issues/55307 + BlueFS fs(g_ceph_context); + ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false, reserved)); + ASSERT_EQ(0, fs.mount()); + BlueFS::FileReader *h; + ASSERT_EQ(0, fs.open_for_read(DIR_NAME, FILE_NAME, &h)); + bufferlist bl; + ASSERT_EQ(0, fs.read(h, 0, read_size, &bl, NULL)); + delete h; + fs.umount(); + } + } +} + +TEST(BlueFS, test_shared_alloc) { + 
uint64_t size = 1048576 * 128; + TempBdev bdev_slow{size}; + uint64_t size_db = 1048576 * 8; + TempBdev bdev_db{size_db}; + + ConfSaver conf(g_ceph_context->_conf); + conf.SetVal("bluefs_shared_alloc_size", "1048576"); + + bluefs_shared_alloc_context_t shared_alloc; + uint64_t shared_alloc_unit = 4096; + shared_alloc.set( + Allocator::create(g_ceph_context, g_ceph_context->_conf->bluefs_allocator, + size, shared_alloc_unit, 0, 0, "test shared allocator"), + shared_alloc_unit); + shared_alloc.a->init_add_free(0, size); + + BlueFS fs(g_ceph_context); + // DB device is fully utilized + ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev_db.path, false, size_db - 0x1000)); + ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_SLOW, bdev_slow.path, false, 0, + &shared_alloc)); + uuid_d fsid; + ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false })); + ASSERT_EQ(0, fs.mount()); + ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, false, false })); + { + for (int i=0; i<10; i++) { + string dir = "dir."; + dir.append(to_string(i)); + ASSERT_EQ(0, fs.mkdir(dir)); + for (int j=0; j<10; j++) { + string file = "file."; + file.append(to_string(j)); + BlueFS::FileWriter *h; + ASSERT_EQ(0, fs.open_for_write(dir, file, &h, false)); + ASSERT_NE(nullptr, h); + auto sg = make_scope_guard([&fs, h] { fs.close_writer(h); }); + bufferlist bl; + std::unique_ptr<char[]> buf = gen_buffer(4096); + bufferptr bp = buffer::claim_char(4096, buf.get()); + bl.push_back(bp); + h->append(bl.c_str(), bl.length()); + fs.fsync(h); + } + } + } + { + for (int i=0; i<10; i+=2) { + string dir = "dir."; + dir.append(to_string(i)); + for (int j=0; j<10; j++) { + string file = "file."; + file.append(to_string(j)); + fs.unlink(dir, file); + fs.sync_metadata(false); + } + ASSERT_EQ(0, fs.rmdir(dir)); + fs.sync_metadata(false); + } + } + fs.compact_log(); + auto *logger = fs.get_perf_counters(); + ASSERT_NE(logger->get(l_bluefs_alloc_shared_dev_fallbacks), 0); + auto num_files = 
logger->get(l_bluefs_num_files); + fs.umount(); + fs.mount(); + ASSERT_EQ(num_files, logger->get(l_bluefs_num_files)); + fs.umount(); +} + +TEST(BlueFS, test_shared_alloc_sparse) { + uint64_t size = 1048576 * 128 * 2; + uint64_t main_unit = 4096; + uint64_t bluefs_alloc_unit = 1048576; + TempBdev bdev_slow{size}; + + ConfSaver conf(g_ceph_context->_conf); + conf.SetVal("bluefs_shared_alloc_size", + stringify(bluefs_alloc_unit).c_str()); + + bluefs_shared_alloc_context_t shared_alloc; + shared_alloc.set( + Allocator::create(g_ceph_context, g_ceph_context->_conf->bluefs_allocator, + size, main_unit, 0, 0, "test shared allocator"), + main_unit); + // prepare sparse free space but let's have a continuous chunk at + // the beginning to fit initial log's fnode into superblock, + // we don't have any tricks to deal with sparse allocations + // (and hence long fnode) at mkfs + shared_alloc.a->init_add_free(bluefs_alloc_unit, 4 * bluefs_alloc_unit); + for(uint64_t i = 5 * bluefs_alloc_unit; i < size; i += 2 * main_unit) { + shared_alloc.a->init_add_free(i, main_unit); + } + + BlueFS fs(g_ceph_context); + ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev_slow.path, false, 0, + &shared_alloc)); + uuid_d fsid; + ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false })); + ASSERT_EQ(0, fs.mount()); + ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, false, false })); + { + for (int i=0; i<10; i++) { + string dir = "dir."; + dir.append(to_string(i)); + ASSERT_EQ(0, fs.mkdir(dir)); + for (int j=0; j<10; j++) { + string file = "file."; + file.append(to_string(j)); + BlueFS::FileWriter *h; + ASSERT_EQ(0, fs.open_for_write(dir, file, &h, false)); + ASSERT_NE(nullptr, h); + auto sg = make_scope_guard([&fs, h] { fs.close_writer(h); }); + bufferlist bl; + std::unique_ptr<char[]> buf = gen_buffer(4096); + bufferptr bp = buffer::claim_char(4096, buf.get()); + bl.push_back(bp); + h->append(bl.c_str(), bl.length()); + fs.fsync(h); + } + } + } + { + for (int i=0; i<10; i+=2) 
{ + string dir = "dir."; + dir.append(to_string(i)); + for (int j=0; j<10; j++) { + string file = "file."; + file.append(to_string(j)); + fs.unlink(dir, file); + fs.sync_metadata(false); + } + ASSERT_EQ(0, fs.rmdir(dir)); + fs.sync_metadata(false); + } + } + fs.compact_log(); + auto *logger = fs.get_perf_counters(); + ASSERT_NE(logger->get(l_bluefs_alloc_shared_size_fallbacks), 0); + auto num_files = logger->get(l_bluefs_num_files); + fs.umount(); + + fs.mount(); + ASSERT_EQ(num_files, logger->get(l_bluefs_num_files)); + fs.umount(); +} + +TEST(BlueFS, test_4k_shared_alloc) { + uint64_t size = 1048576 * 128 * 2; + uint64_t main_unit = 4096; + uint64_t bluefs_alloc_unit = main_unit; + TempBdev bdev_slow{size}; + + ConfSaver conf(g_ceph_context->_conf); + conf.SetVal("bluefs_shared_alloc_size", + stringify(bluefs_alloc_unit).c_str()); + + bluefs_shared_alloc_context_t shared_alloc; + shared_alloc.set( + Allocator::create(g_ceph_context, g_ceph_context->_conf->bluefs_allocator, + size, main_unit, 0, 0, "test shared allocator"), + main_unit); + shared_alloc.a->init_add_free(bluefs_alloc_unit, size - bluefs_alloc_unit); + + BlueFS fs(g_ceph_context); + ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev_slow.path, false, 0, + &shared_alloc)); + uuid_d fsid; + ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false })); + ASSERT_EQ(0, fs.mount()); + ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, false, false })); + { + for (int i=0; i<10; i++) { + string dir = "dir."; + dir.append(to_string(i)); + ASSERT_EQ(0, fs.mkdir(dir)); + for (int j=0; j<10; j++) { + string file = "file."; + file.append(to_string(j)); + BlueFS::FileWriter *h; + ASSERT_EQ(0, fs.open_for_write(dir, file, &h, false)); + ASSERT_NE(nullptr, h); + auto sg = make_scope_guard([&fs, h] { fs.close_writer(h); }); + bufferlist bl; + std::unique_ptr<char[]> buf = gen_buffer(4096); + bufferptr bp = buffer::claim_char(4096, buf.get()); + bl.push_back(bp); + h->append(bl.c_str(), bl.length()); + 
fs.fsync(h); + } + } + } + { + for (int i=0; i<10; i+=2) { + string dir = "dir."; + dir.append(to_string(i)); + for (int j=0; j<10; j++) { + string file = "file."; + file.append(to_string(j)); + fs.unlink(dir, file); + fs.sync_metadata(false); + } + ASSERT_EQ(0, fs.rmdir(dir)); + fs.sync_metadata(false); + } + } + fs.compact_log(); + auto *logger = fs.get_perf_counters(); + ASSERT_EQ(logger->get(l_bluefs_alloc_shared_dev_fallbacks), 0); + ASSERT_EQ(logger->get(l_bluefs_alloc_shared_size_fallbacks), 0); + auto num_files = logger->get(l_bluefs_num_files); + fs.umount(); + + fs.mount(); + ASSERT_EQ(num_files, logger->get(l_bluefs_num_files)); + fs.umount(); +} + +void create_files(BlueFS &fs, + atomic_bool& stop_creating, + atomic_bool& started_creating) +{ + uint32_t i = 0; + stringstream ss; + string dir = "dir."; + ss << std::this_thread::get_id(); + dir.append(ss.str()); + dir.append("."); + dir.append(to_string(i)); + ASSERT_EQ(0, fs.mkdir(dir)); + while (!stop_creating.load()) { + string file = "file."; + file.append(to_string(i)); + BlueFS::FileWriter *h; + ASSERT_EQ(0, fs.open_for_write(dir, file, &h, false)); + ASSERT_NE(nullptr, h); + fs.close_writer(h); + i++; + started_creating = true; + } +} + + +TEST(BlueFS, test_concurrent_dir_link_and_compact_log_56210) { + uint64_t size = 1048576 * 128; + TempBdev bdev{size}; + ConfSaver conf(g_ceph_context->_conf); + + conf.SetVal("bluefs_alloc_size", "65536"); + conf.SetVal("bluefs_compact_log_sync", "false"); + // make sure fsync always trigger log compact + conf.SetVal("bluefs_log_compact_min_ratio", "0"); + conf.SetVal("bluefs_log_compact_min_size", "0"); + conf.ApplyChanges(); + + for (int i=0; i<10; ++i) { + BlueFS fs(g_ceph_context); + ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false, 1048576)); + uuid_d fsid; + ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false })); + ASSERT_EQ(0, fs.mount()); + ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, false, false })); + { + 
atomic_bool stop_creating{false}; + atomic_bool started_creating{false}; + std::thread create_thread; + create_thread = std::thread(create_files, + std::ref(fs), + std::ref(stop_creating), + std::ref(started_creating)); + while (!started_creating.load()) { + } + BlueFS::FileWriter *h; + ASSERT_EQ(0, fs.mkdir("foo")); + ASSERT_EQ(0, fs.open_for_write("foo", "bar", &h, false)); + fs.fsync(h); + fs.close_writer(h); + + stop_creating = true; + do_join(create_thread); + + fs.umount(true); //do not compact on exit! + ASSERT_EQ(0, fs.mount()); + fs.umount(); + } + } +} + +int main(int argc, char **argv) { + auto args = argv_to_vec(argc, argv); + map<string,string> defaults = { + { "debug_bluefs", "1/20" }, + { "debug_bdev", "1/20" } + }; + + auto cct = global_init(&defaults, args, CEPH_ENTITY_TYPE_CLIENT, + CODE_ENVIRONMENT_UTILITY, + CINIT_FLAG_NO_DEFAULT_CONFIG_FILE); + common_init_finish(g_ceph_context); + g_ceph_context->_conf.set_val( + "enable_experimental_unrecoverable_data_corrupting_features", + "*"); + g_ceph_context->_conf.apply_changes(nullptr); + + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/test/objectstore/test_bluestore_types.cc b/src/test/objectstore/test_bluestore_types.cc new file mode 100644 index 000000000..18ccaff91 --- /dev/null +++ b/src/test/objectstore/test_bluestore_types.cc @@ -0,0 +1,2346 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "include/types.h" +#include "os/bluestore/bluestore_types.h" +#include "gtest/gtest.h" +#include "include/stringify.h" +#include "common/ceph_time.h" +#include "os/bluestore/BlueStore.h" +#include "os/bluestore/simple_bitmap.h" +#include "os/bluestore/AvlAllocator.h" +#include "common/ceph_argparse.h" +#include "global/global_init.h" +#include "global/global_context.h" +#include "perfglue/heap_profiler.h" + +#include <sstream> + +#define _STR(x) #x +#define STRINGIFY(x) _STR(x) + +using namespace std; + 
+TEST(bluestore, sizeof) { +#define P(t) cout << STRINGIFY(t) << "\t" << sizeof(t) << std::endl + P(BlueStore::Onode); + P(BlueStore::Extent); + P(BlueStore::Blob); + P(BlueStore::SharedBlob); + P(BlueStore::ExtentMap); + P(BlueStore::extent_map_t); + P(BlueStore::blob_map_t); + P(BlueStore::BufferSpace); + P(BlueStore::Buffer); + P(bluestore_onode_t); + P(bluestore_blob_t); + P(PExtentVector); + P(ghobject_t); + P(bluestore_shared_blob_t); + P(bluestore_extent_ref_map_t); + P(bluestore_extent_ref_map_t::record_t); + P(bluestore_blob_use_tracker_t); + P(std::atomic_int); + P(BlueStore::SharedBlobRef); + P(boost::intrusive::set_base_hook<>); + P(boost::intrusive::unordered_set_base_hook<>); + P(bufferlist); + P(bufferptr); + P(range_seg_t); + P(sb_info_t); + P(SimpleBitmap); + cout << "map<uint64_t,uint64_t>\t" << sizeof(map<uint64_t,uint64_t>) << std::endl; + cout << "map<char,char>\t" << sizeof(map<char,char>) << std::endl; +} + +void dump_mempools() +{ + ostringstream ostr; + auto f = Formatter::create_unique("json-pretty", "json-pretty", "json-pretty"); + ostr << "Mempools: "; + f->open_object_section("mempools"); + mempool::dump(f.get()); + f->close_section(); + f->flush(ostr); + cout << ostr.str() << std::endl; +} +/*void get_mempool_stats(uint64_t* total_bytes, uint64_t* total_items) +{ + uint64_t meta_allocated = mempool::bluestore_cache_meta::allocated_bytes(); + uint64_t onode_allocated = mempool::bluestore_cache_onode::allocated_bytes(); + uint64_t other_allocated = mempool::bluestore_cache_other::allocated_bytes(); + + uint64_t meta_items = mempool::bluestore_cache_meta::allocated_items(); + uint64_t onode_items = mempool::bluestore_cache_onode::allocated_items(); + uint64_t other_items = mempool::bluestore_cache_other::allocated_items(); + cout << "meta(" << meta_allocated << "/" << meta_items + << ") onode(" << onode_allocated << "/" << onode_items + << ") other(" << other_allocated << "/" << other_items + << ")" << std::endl; + *total_bytes = 
meta_allocated + onode_allocated + other_allocated; + *total_items = onode_items; +}*/ + +TEST(sb_info_space_efficient_map_t, basic) { + sb_info_space_efficient_map_t sb_info; + const size_t num_shared = 1000; + for (size_t i = 0; i < num_shared; i += 2) { + auto& sbi = sb_info.add_maybe_stray(i); + sbi.pool_id = i; + } + ASSERT_TRUE(sb_info.find(0) != sb_info.end()); + ASSERT_TRUE(sb_info.find(1) == sb_info.end()); + ASSERT_TRUE(sb_info.find(2) != sb_info.end()); + ASSERT_TRUE(sb_info.find(4)->pool_id == 4); + ASSERT_TRUE(sb_info.find(num_shared) == sb_info.end()); + + // ordered insertion + sb_info.add_or_adopt(num_shared).pool_id = num_shared; + ASSERT_TRUE(sb_info.find(num_shared) != sb_info.end()); + ASSERT_TRUE(sb_info.find(num_shared)->pool_id == num_shared); + + // out of order insertion + sb_info.add_or_adopt(1).pool_id = 1; + ASSERT_TRUE(sb_info.find(1) != sb_info.end()); + ASSERT_TRUE(sb_info.find(1)->pool_id == 1); + + // ordered insertion + sb_info.add_maybe_stray(num_shared + 1).pool_id = num_shared + 1; + ASSERT_TRUE(sb_info.find(num_shared + 1) != sb_info.end()); + ASSERT_TRUE(sb_info.find(num_shared + 1)->pool_id == num_shared + 1); + + // out of order insertion + sb_info.add_maybe_stray(105).pool_id = 105; + ASSERT_TRUE(sb_info.find(105) != sb_info.end()); + ASSERT_TRUE(sb_info.find(105)->pool_id == 105); +} + +TEST(sb_info_space_efficient_map_t, size) { + const size_t num_shared = 10000000; + sb_info_space_efficient_map_t sb_info; + + BlueStore store(g_ceph_context, "", 4096); + BlueStore::OnodeCacheShard* oc = BlueStore::OnodeCacheShard::create( + g_ceph_context, "lru", NULL); + BlueStore::BufferCacheShard* bc = BlueStore::BufferCacheShard::create( + g_ceph_context, "lru", NULL); + + auto coll = ceph::make_ref<BlueStore::Collection>(&store, oc, bc, coll_t()); + + for (size_t i = 0; i < num_shared; i++) { + auto& sbi = sb_info.add_or_adopt(i); + // primarily to silent the 'unused' warning + ceph_assert(sbi.pool_id == sb_info_t::INVALID_POOL_ID); 
+ } + dump_mempools(); +} + +TEST(bluestore_extent_ref_map_t, add) +{ + bluestore_extent_ref_map_t m; + m.get(10, 10); + ASSERT_EQ(1u, m.ref_map.size()); + cout << m << std::endl; + m.get(20, 10); + cout << m << std::endl; + ASSERT_EQ(1u, m.ref_map.size()); + ASSERT_EQ(20u, m.ref_map[10].length); + ASSERT_EQ(1u, m.ref_map[10].refs); + m.get(40, 10); + cout << m << std::endl; + ASSERT_EQ(2u, m.ref_map.size()); + m.get(30, 10); + cout << m << std::endl; + ASSERT_EQ(1u, m.ref_map.size()); + m.get(50, 10); + cout << m << std::endl; + ASSERT_EQ(1u, m.ref_map.size()); + m.get(5, 5); + cout << m << std::endl; + ASSERT_EQ(1u, m.ref_map.size()); +} + +TEST(bluestore_extent_ref_map_t, get) +{ + bluestore_extent_ref_map_t m; + m.get(00, 30); + cout << m << std::endl; + m.get(10, 10); + cout << m << std::endl; + ASSERT_EQ(3u, m.ref_map.size()); + ASSERT_EQ(10u, m.ref_map[0].length); + ASSERT_EQ(1u, m.ref_map[0].refs); + ASSERT_EQ(10u, m.ref_map[10].length); + ASSERT_EQ(2u, m.ref_map[10].refs); + ASSERT_EQ(10u, m.ref_map[20].length); + ASSERT_EQ(1u, m.ref_map[20].refs); + m.get(20, 5); + cout << m << std::endl; + ASSERT_EQ(3u, m.ref_map.size()); + ASSERT_EQ(15u, m.ref_map[10].length); + ASSERT_EQ(2u, m.ref_map[10].refs); + ASSERT_EQ(5u, m.ref_map[25].length); + ASSERT_EQ(1u, m.ref_map[25].refs); + m.get(5, 20); + cout << m << std::endl; + ASSERT_EQ(4u, m.ref_map.size()); + ASSERT_EQ(5u, m.ref_map[0].length); + ASSERT_EQ(1u, m.ref_map[0].refs); + ASSERT_EQ(5u, m.ref_map[5].length); + ASSERT_EQ(2u, m.ref_map[5].refs); + ASSERT_EQ(15u, m.ref_map[10].length); + ASSERT_EQ(3u, m.ref_map[10].refs); + ASSERT_EQ(5u, m.ref_map[25].length); + ASSERT_EQ(1u, m.ref_map[25].refs); + m.get(25, 3); + cout << m << std::endl; + ASSERT_EQ(5u, m.ref_map.size()); + ASSERT_EQ(5u, m.ref_map[0].length); + ASSERT_EQ(1u, m.ref_map[0].refs); + ASSERT_EQ(5u, m.ref_map[5].length); + ASSERT_EQ(2u, m.ref_map[5].refs); + ASSERT_EQ(15u, m.ref_map[10].length); + ASSERT_EQ(3u, m.ref_map[10].refs); + ASSERT_EQ(3u, 
m.ref_map[25].length); + ASSERT_EQ(2u, m.ref_map[25].refs); + ASSERT_EQ(2u, m.ref_map[28].length); + ASSERT_EQ(1u, m.ref_map[28].refs); +} + +TEST(bluestore_extent_ref_map_t, put) +{ + bluestore_extent_ref_map_t m; + PExtentVector r; + bool maybe_unshared = false; + m.get(10, 30); + maybe_unshared = true; + m.put(10, 30, &r, &maybe_unshared); + cout << m << " " << r << " " << (int)maybe_unshared << std::endl; + ASSERT_EQ(0u, m.ref_map.size()); + ASSERT_EQ(1u, r.size()); + ASSERT_EQ(10u, r[0].offset); + ASSERT_EQ(30u, r[0].length); + ASSERT_TRUE(maybe_unshared); + r.clear(); + m.get(10, 30); + m.get(20, 10); + maybe_unshared = true; + m.put(10, 30, &r, &maybe_unshared); + cout << m << " " << r << " " << (int)maybe_unshared << std::endl; + ASSERT_EQ(1u, m.ref_map.size()); + ASSERT_EQ(10u, m.ref_map[20].length); + ASSERT_EQ(1u, m.ref_map[20].refs); + ASSERT_EQ(2u, r.size()); + ASSERT_EQ(10u, r[0].offset); + ASSERT_EQ(10u, r[0].length); + ASSERT_EQ(30u, r[1].offset); + ASSERT_EQ(10u, r[1].length); + ASSERT_TRUE(maybe_unshared); + r.clear(); + m.get(30, 10); + m.get(30, 10); + maybe_unshared = true; + m.put(20, 15, &r, &maybe_unshared); + cout << m << " " << r << " " << (int)maybe_unshared << std::endl; + ASSERT_EQ(2u, m.ref_map.size()); + ASSERT_EQ(5u, m.ref_map[30].length); + ASSERT_EQ(1u, m.ref_map[30].refs); + ASSERT_EQ(5u, m.ref_map[35].length); + ASSERT_EQ(2u, m.ref_map[35].refs); + ASSERT_EQ(1u, r.size()); + ASSERT_EQ(20u, r[0].offset); + ASSERT_EQ(10u, r[0].length); + ASSERT_FALSE(maybe_unshared); + r.clear(); + maybe_unshared = true; + m.put(33, 5, &r, &maybe_unshared); + cout << m << " " << r << " " << (int)maybe_unshared << std::endl; + ASSERT_EQ(3u, m.ref_map.size()); + ASSERT_EQ(3u, m.ref_map[30].length); + ASSERT_EQ(1u, m.ref_map[30].refs); + ASSERT_EQ(3u, m.ref_map[35].length); + ASSERT_EQ(1u, m.ref_map[35].refs); + ASSERT_EQ(2u, m.ref_map[38].length); + ASSERT_EQ(2u, m.ref_map[38].refs); + ASSERT_EQ(1u, r.size()); + ASSERT_EQ(33u, r[0].offset); + 
ASSERT_EQ(2u, r[0].length); + ASSERT_FALSE(maybe_unshared); + r.clear(); + maybe_unshared = true; + m.put(38, 2, &r, &maybe_unshared); + cout << m << " " << r << " " << (int)maybe_unshared << std::endl; + ASSERT_TRUE(maybe_unshared); +} + +TEST(bluestore_extent_ref_map_t, contains) +{ + bluestore_extent_ref_map_t m; + m.get(10, 30); + ASSERT_TRUE(m.contains(10, 30)); + ASSERT_TRUE(m.contains(10, 10)); + ASSERT_TRUE(m.contains(30, 10)); + ASSERT_FALSE(m.contains(0, 10)); + ASSERT_FALSE(m.contains(0, 20)); + ASSERT_FALSE(m.contains(0, 100)); + ASSERT_FALSE(m.contains(40, 10)); + ASSERT_FALSE(m.contains(30, 11)); + m.get(40, 10); + m.get(40, 10); + ASSERT_TRUE(m.contains(30, 11)); + ASSERT_TRUE(m.contains(30, 20)); + ASSERT_TRUE(m.contains(10, 40)); + ASSERT_FALSE(m.contains(0, 50)); + ASSERT_FALSE(m.contains(40, 20)); + m.get(60, 100); + ASSERT_TRUE(m.contains(60, 10)); + ASSERT_TRUE(m.contains(40, 10)); + ASSERT_FALSE(m.contains(40, 11)); + ASSERT_FALSE(m.contains(40, 20)); + ASSERT_FALSE(m.contains(40, 30)); + ASSERT_FALSE(m.contains(40, 3000)); + ASSERT_FALSE(m.contains(4000, 30)); +} + +TEST(bluestore_extent_ref_map_t, intersects) +{ + bluestore_extent_ref_map_t m; + m.get(10, 30); + ASSERT_TRUE(m.intersects(10, 30)); + ASSERT_TRUE(m.intersects(0, 11)); + ASSERT_TRUE(m.intersects(10, 40)); + ASSERT_TRUE(m.intersects(15, 40)); + ASSERT_FALSE(m.intersects(0, 10)); + ASSERT_FALSE(m.intersects(0, 5)); + ASSERT_FALSE(m.intersects(40, 20)); + ASSERT_FALSE(m.intersects(41, 20)); + m.get(40, 10); + m.get(40, 10); + ASSERT_TRUE(m.intersects(0, 100)); + ASSERT_TRUE(m.intersects(10, 35)); + ASSERT_TRUE(m.intersects(45, 10)); + ASSERT_FALSE(m.intersects(50, 5)); + m.get(60, 100); + ASSERT_TRUE(m.intersects(45, 10)); + ASSERT_TRUE(m.intersects(55, 10)); + ASSERT_TRUE(m.intersects(50, 11)); + ASSERT_FALSE(m.intersects(50, 10)); + ASSERT_FALSE(m.intersects(51, 9)); + ASSERT_FALSE(m.intersects(55, 1)); +} + +TEST(bluestore_blob_t, calc_csum) +{ + bufferlist bl; + 
bl.append("asdfghjkqwertyuizxcvbnm,"); + bufferlist bl2; + bl2.append("xxxxXXXXyyyyYYYYzzzzZZZZ"); + bufferlist f; + f.substr_of(bl, 0, 8); + bufferlist m; + m.substr_of(bl, 8, 8); + bufferlist e; + e.substr_of(bl, 16, 8); + bufferlist n; + n.append("12345678"); + + for (unsigned csum_type = Checksummer::CSUM_NONE + 1; + csum_type < Checksummer::CSUM_MAX; + ++csum_type) { + cout << "csum_type " << Checksummer::get_csum_type_string(csum_type) + << std::endl; + + bluestore_blob_t b; + int bad_off; + uint64_t bad_csum; + ASSERT_EQ(0, b.verify_csum(0, bl, &bad_off, &bad_csum)); + ASSERT_EQ(-1, bad_off); + + b.init_csum(csum_type, 3, 24); + cout << " value size " << b.get_csum_value_size() << std::endl; + b.calc_csum(0, bl); + ASSERT_EQ(0, b.verify_csum(0, bl, &bad_off, &bad_csum)); + ASSERT_EQ(-1, bad_off); + ASSERT_EQ(-1, b.verify_csum(0, bl2, &bad_off, &bad_csum)); + ASSERT_EQ(0, bad_off); + + ASSERT_EQ(0, b.verify_csum(0, f, &bad_off, &bad_csum)); + ASSERT_EQ(-1, bad_off); + ASSERT_EQ(-1, b.verify_csum(8, f, &bad_off, &bad_csum)); + ASSERT_EQ(8, bad_off); + ASSERT_EQ(-1, b.verify_csum(16, f, &bad_off, &bad_csum)); + ASSERT_EQ(16, bad_off); + + ASSERT_EQ(-1, b.verify_csum(0, m, &bad_off, &bad_csum)); + ASSERT_EQ(0, bad_off); + ASSERT_EQ(0, b.verify_csum(8, m, &bad_off, &bad_csum)); + ASSERT_EQ(-1, bad_off); + ASSERT_EQ(-1, b.verify_csum(16, m, &bad_off, &bad_csum)); + ASSERT_EQ(16, bad_off); + + ASSERT_EQ(-1, b.verify_csum(0, e, &bad_off, &bad_csum)); + ASSERT_EQ(0, bad_off); + ASSERT_EQ(-1, b.verify_csum(8, e, &bad_off, &bad_csum)); + ASSERT_EQ(8, bad_off); + ASSERT_EQ(0, b.verify_csum(16, e, &bad_off, &bad_csum)); + ASSERT_EQ(-1, bad_off); + + b.calc_csum(8, n); + ASSERT_EQ(0, b.verify_csum(0, f, &bad_off, &bad_csum)); + ASSERT_EQ(-1, bad_off); + ASSERT_EQ(0, b.verify_csum(8, n, &bad_off, &bad_csum)); + ASSERT_EQ(-1, bad_off); + ASSERT_EQ(0, b.verify_csum(16, e, &bad_off, &bad_csum)); + ASSERT_EQ(-1, bad_off); + ASSERT_EQ(-1, b.verify_csum(0, bl, &bad_off, 
&bad_csum)); + ASSERT_EQ(8, bad_off); + } +} + +TEST(bluestore_blob_t, csum_bench) +{ + bufferlist bl; + bufferptr bp(10485760); + for (char *a = bp.c_str(); a < bp.c_str() + bp.length(); ++a) + *a = (unsigned long)a & 0xff; + bl.append(bp); + int count = 256; + for (unsigned csum_type = 1; + csum_type < Checksummer::CSUM_MAX; + ++csum_type) { + bluestore_blob_t b; + b.init_csum(csum_type, 12, bl.length()); + ceph::mono_clock::time_point start = ceph::mono_clock::now(); + for (int i = 0; i<count; ++i) { + b.calc_csum(0, bl); + } + ceph::mono_clock::time_point end = ceph::mono_clock::now(); + auto dur = std::chrono::duration_cast<ceph::timespan>(end - start); + double mbsec = (double)count * (double)bl.length() / 1000000.0 / (double)dur.count() * 1000000000.0; + cout << "csum_type " << Checksummer::get_csum_type_string(csum_type) + << ", " << dur << " seconds, " + << mbsec << " MB/sec" << std::endl; + } +} + +TEST(Blob, put_ref) +{ + { + BlueStore store(g_ceph_context, "", 4096); + BlueStore::OnodeCacheShard *oc = BlueStore::OnodeCacheShard::create( + g_ceph_context, "lru", NULL); + BlueStore::BufferCacheShard *bc = BlueStore::BufferCacheShard::create( + g_ceph_context, "lru", NULL); + + auto coll = ceph::make_ref<BlueStore::Collection>(&store, oc, bc, coll_t()); + BlueStore::Blob b; + b.shared_blob = new BlueStore::SharedBlob(coll.get()); + b.dirty_blob().allocated_test(bluestore_pextent_t(0x40715000, 0x2000)); + b.dirty_blob().allocated_test( + bluestore_pextent_t(bluestore_pextent_t::INVALID_OFFSET, 0x8000)); + b.dirty_blob().allocated_test(bluestore_pextent_t(0x4071f000, 0x5000)); + b.get_ref(coll.get(), 0, 0x1200); + b.get_ref(coll.get(), 0xae00, 0x4200); + ASSERT_EQ(0x5400u, b.get_referenced_bytes()); + cout << b << std::endl; + PExtentVector r; + + ASSERT_FALSE(b.put_ref(coll.get(), 0, 0x1200, &r)); + ASSERT_EQ(0x4200u, b.get_referenced_bytes()); + cout << " r " << r << std::endl; + cout << b << std::endl; + + r.clear(); + ASSERT_TRUE(b.put_ref(coll.get(), 
0xae00, 0x4200, &r)); + ASSERT_EQ(0u, b.get_referenced_bytes()); + cout << " r " << r << std::endl; + cout << b << std::endl; + } + + unsigned mas = 4096; + BlueStore store(g_ceph_context, "", 8192); + BlueStore::OnodeCacheShard *oc = BlueStore::OnodeCacheShard::create( + g_ceph_context, "lru", NULL); + BlueStore::BufferCacheShard *bc = BlueStore::BufferCacheShard::create( + g_ceph_context, "lru", NULL); + auto coll = ceph::make_ref<BlueStore::Collection>(&store, oc, bc, coll_t()); + + { + BlueStore::Blob B; + B.shared_blob = new BlueStore::SharedBlob(coll.get()); + bluestore_blob_t& b = B.dirty_blob(); + PExtentVector r; + b.allocated_test(bluestore_pextent_t(0, mas * 2)); + B.get_ref(coll.get(), 0, mas*2); + ASSERT_EQ(mas * 2, B.get_referenced_bytes()); + ASSERT_TRUE(b.is_allocated(0, mas*2)); + ASSERT_TRUE(B.put_ref(coll.get(), 0, mas*2, &r)); + ASSERT_EQ(0u, B.get_referenced_bytes()); + cout << "r " << r << " " << b << std::endl; + ASSERT_EQ(1u, r.size()); + ASSERT_EQ(0u, r[0].offset); + ASSERT_EQ(mas*2, r[0].length); + ASSERT_FALSE(b.is_allocated(0, mas*2)); + ASSERT_FALSE(b.is_allocated(0, mas)); + ASSERT_FALSE(b.is_allocated(mas, 0)); + ASSERT_FALSE(b.get_extents()[0].is_valid()); + ASSERT_EQ(mas*2, b.get_extents()[0].length); + } + { + BlueStore::Blob B; + B.shared_blob = new BlueStore::SharedBlob(coll.get()); + bluestore_blob_t& b = B.dirty_blob(); + PExtentVector r; + b.allocated_test(bluestore_pextent_t(123, mas * 2)); + B.get_ref(coll.get(), 0, mas*2); + ASSERT_EQ(mas * 2, B.get_referenced_bytes()); + ASSERT_FALSE(B.put_ref(coll.get(), 0, mas, &r)); + ASSERT_EQ(mas, B.get_referenced_bytes()); + cout << "r " << r << " " << b << std::endl; + ASSERT_EQ(0u, r.size()); + ASSERT_TRUE(b.is_allocated(0, mas*2)); + ASSERT_TRUE(B.put_ref(coll.get(), mas, mas, &r)); + ASSERT_EQ(0u, B.get_referenced_bytes()); + ASSERT_EQ(0u, B.get_referenced_bytes()); + cout << "r " << r << " " << b << std::endl; + ASSERT_EQ(1u, r.size()); + ASSERT_EQ(123u, r[0].offset); + 
ASSERT_EQ(mas*2, r[0].length); + ASSERT_FALSE(b.is_allocated(0, mas*2)); + ASSERT_FALSE(b.get_extents()[0].is_valid()); + ASSERT_EQ(mas*2, b.get_extents()[0].length); + } + { + BlueStore::Blob B; + B.shared_blob = new BlueStore::SharedBlob(coll.get()); + bluestore_blob_t& b = B.dirty_blob(); + PExtentVector r; + b.allocated_test(bluestore_pextent_t(1, mas)); + b.allocated_test(bluestore_pextent_t(2, mas)); + b.allocated_test(bluestore_pextent_t(3, mas)); + b.allocated_test(bluestore_pextent_t(4, mas)); + B.get_ref(coll.get(), 0, mas*4); + ASSERT_EQ(mas * 4, B.get_referenced_bytes()); + ASSERT_FALSE(B.put_ref(coll.get(), mas, mas, &r)); + ASSERT_EQ(mas * 3, B.get_referenced_bytes()); + cout << "r " << r << " " << b << std::endl; + ASSERT_EQ(0u, r.size()); + ASSERT_TRUE(b.is_allocated(0, mas*4)); + ASSERT_TRUE(b.is_allocated(mas, mas)); + ASSERT_FALSE(B.put_ref(coll.get(), mas*2, mas, &r)); + ASSERT_EQ(mas * 2, B.get_referenced_bytes()); + cout << "r " << r << " " << b << std::endl; + ASSERT_EQ(0u, r.size()); + ASSERT_TRUE(b.is_allocated(mas*2, mas)); + ASSERT_TRUE(b.is_allocated(0, mas*4)); + ASSERT_FALSE(B.put_ref(coll.get(), mas*3, mas, &r)); + ASSERT_EQ(mas, B.get_referenced_bytes()); + cout << "r " << r << " " << b << std::endl; + ASSERT_EQ(2u, r.size()); + ASSERT_EQ(3u, r[0].offset); + ASSERT_EQ(mas, r[0].length); + ASSERT_EQ(4u, r[1].offset); + ASSERT_EQ(mas, r[1].length); + ASSERT_TRUE(b.is_allocated(0, mas*2)); + ASSERT_FALSE(b.is_allocated(mas*2, mas*2)); + ASSERT_TRUE(b.get_extents()[0].is_valid()); + ASSERT_TRUE(b.get_extents()[1].is_valid()); + ASSERT_FALSE(b.get_extents()[2].is_valid()); + ASSERT_EQ(3u, b.get_extents().size()); + } + { + BlueStore::Blob B; + B.shared_blob = new BlueStore::SharedBlob(coll.get()); + bluestore_blob_t& b = B.dirty_blob(); + PExtentVector r; + b.allocated_test(bluestore_pextent_t(1, mas)); + b.allocated_test(bluestore_pextent_t(2, mas)); + b.allocated_test(bluestore_pextent_t(3, mas)); + 
b.allocated_test(bluestore_pextent_t(4, mas)); + b.allocated_test(bluestore_pextent_t(5, mas)); + b.allocated_test(bluestore_pextent_t(6, mas)); + B.get_ref(coll.get(), 0, mas*6); + ASSERT_EQ(mas * 6, B.get_referenced_bytes()); + ASSERT_FALSE(B.put_ref(coll.get(), mas, mas, &r)); + ASSERT_EQ(mas * 5, B.get_referenced_bytes()); + cout << "r " << r << " " << b << std::endl; + ASSERT_EQ(0u, r.size()); + ASSERT_TRUE(b.is_allocated(0, mas*6)); + ASSERT_FALSE(B.put_ref(coll.get(), mas*2, mas, &r)); + ASSERT_EQ(mas * 4, B.get_referenced_bytes()); + cout << "r " << r << " " << b << std::endl; + ASSERT_EQ(0u, r.size()); + ASSERT_TRUE(b.is_allocated(0, mas*6)); + ASSERT_FALSE(B.put_ref(coll.get(), mas*3, mas, &r)); + ASSERT_EQ(mas * 3, B.get_referenced_bytes()); + cout << "r " << r << " " << b << std::endl; + ASSERT_EQ(2u, r.size()); + ASSERT_EQ(3u, r[0].offset); + ASSERT_EQ(mas, r[0].length); + ASSERT_EQ(4u, r[1].offset); + ASSERT_EQ(mas, r[1].length); + ASSERT_TRUE(b.is_allocated(0, mas*2)); + ASSERT_FALSE(b.is_allocated(mas*2, mas*2)); + ASSERT_TRUE(b.is_allocated(mas*4, mas*2)); + ASSERT_EQ(5u, b.get_extents().size()); + ASSERT_TRUE(b.get_extents()[0].is_valid()); + ASSERT_TRUE(b.get_extents()[1].is_valid()); + ASSERT_FALSE(b.get_extents()[2].is_valid()); + ASSERT_TRUE(b.get_extents()[3].is_valid()); + ASSERT_TRUE(b.get_extents()[4].is_valid()); + } + { + BlueStore::Blob B; + B.shared_blob = new BlueStore::SharedBlob(coll.get()); + bluestore_blob_t& b = B.dirty_blob(); + PExtentVector r; + b.allocated_test(bluestore_pextent_t(1, mas * 6)); + B.get_ref(coll.get(), 0, mas*6); + ASSERT_EQ(mas * 6, B.get_referenced_bytes()); + ASSERT_FALSE(B.put_ref(coll.get(), mas, mas, &r)); + ASSERT_EQ(mas * 5, B.get_referenced_bytes()); + cout << "r " << r << " " << b << std::endl; + ASSERT_EQ(0u, r.size()); + ASSERT_TRUE(b.is_allocated(0, mas*6)); + ASSERT_FALSE(B.put_ref(coll.get(), mas*2, mas, &r)); + ASSERT_EQ(mas * 4, B.get_referenced_bytes()); + cout << "r " << r << " " << b << 
std::endl; + ASSERT_EQ(0u, r.size()); + ASSERT_TRUE(b.is_allocated(0, mas*6)); + ASSERT_FALSE(B.put_ref(coll.get(), mas*3, mas, &r)); + ASSERT_EQ(mas * 3, B.get_referenced_bytes()); + cout << "r " << r << " " << b << std::endl; + ASSERT_EQ(1u, r.size()); + ASSERT_EQ(0x2001u, r[0].offset); + ASSERT_EQ(mas*2, r[0].length); + ASSERT_TRUE(b.is_allocated(0, mas*2)); + ASSERT_FALSE(b.is_allocated(mas*2, mas*2)); + ASSERT_TRUE(b.is_allocated(mas*4, mas*2)); + ASSERT_EQ(3u, b.get_extents().size()); + ASSERT_TRUE(b.get_extents()[0].is_valid()); + ASSERT_FALSE(b.get_extents()[1].is_valid()); + ASSERT_TRUE(b.get_extents()[2].is_valid()); + } + { + BlueStore::Blob B; + B.shared_blob = new BlueStore::SharedBlob(coll.get()); + bluestore_blob_t& b = B.dirty_blob(); + PExtentVector r; + b.allocated_test(bluestore_pextent_t(1, mas * 4)); + b.allocated_test(bluestore_pextent_t(2, mas * 4)); + b.allocated_test(bluestore_pextent_t(3, mas * 4)); + B.get_ref(coll.get(), 0, mas*12); + ASSERT_EQ(mas * 12, B.get_referenced_bytes()); + ASSERT_FALSE(B.put_ref(coll.get(), mas, mas, &r)); + ASSERT_EQ(mas * 11, B.get_referenced_bytes()); + cout << "r " << r << " " << b << std::endl; + ASSERT_EQ(0u, r.size()); + ASSERT_TRUE(b.is_allocated(0, mas*12)); + ASSERT_FALSE(B.put_ref(coll.get(), mas*9, mas, &r)); + ASSERT_EQ(mas * 10, B.get_referenced_bytes()); + cout << "r " << r << " " << b << std::endl; + ASSERT_EQ(0u, r.size()); + ASSERT_TRUE(b.is_allocated(0, mas*12)); + ASSERT_FALSE(B.put_ref(coll.get(), mas*2, mas*7, &r)); + ASSERT_EQ(mas * 3, B.get_referenced_bytes()); + cout << "r " << r << " " << b << std::endl; + ASSERT_EQ(3u, r.size()); + ASSERT_EQ(0x2001u, r[0].offset); + ASSERT_EQ(mas*2, r[0].length); + ASSERT_EQ(0x2u, r[1].offset); + ASSERT_EQ(mas*4, r[1].length); + ASSERT_EQ(0x3u, r[2].offset); + ASSERT_EQ(mas*2, r[2].length); + ASSERT_TRUE(b.is_allocated(0, mas*2)); + ASSERT_FALSE(b.is_allocated(mas*2, mas*8)); + ASSERT_TRUE(b.is_allocated(mas*10, mas*2)); + ASSERT_EQ(3u, 
b.get_extents().size()); + ASSERT_TRUE(b.get_extents()[0].is_valid()); + ASSERT_FALSE(b.get_extents()[1].is_valid()); + ASSERT_TRUE(b.get_extents()[2].is_valid()); + } + { + BlueStore::Blob B; + B.shared_blob = new BlueStore::SharedBlob(coll.get()); + bluestore_blob_t& b = B.dirty_blob(); + PExtentVector r; + b.allocated_test(bluestore_pextent_t(1, mas * 4)); + b.allocated_test(bluestore_pextent_t(2, mas * 4)); + b.allocated_test(bluestore_pextent_t(3, mas * 4)); + B.get_ref(coll.get(), 0, mas*12); + ASSERT_EQ(mas * 12, B.get_referenced_bytes()); + ASSERT_FALSE(B.put_ref(coll.get(), mas, mas, &r)); + ASSERT_EQ(mas * 11, B.get_referenced_bytes()); + cout << "r " << r << " " << b << std::endl; + ASSERT_EQ(0u, r.size()); + ASSERT_TRUE(b.is_allocated(0, mas*12)); + ASSERT_FALSE(B.put_ref(coll.get(), mas*9, mas, &r)); + ASSERT_EQ(mas * 10, B.get_referenced_bytes()); + cout << "r " << r << " " << b << std::endl; + ASSERT_EQ(0u, r.size()); + ASSERT_TRUE(b.is_allocated(0, mas*12)); + ASSERT_FALSE(B.put_ref(coll.get(), mas*2, mas*7, &r)); + ASSERT_EQ(mas * 3, B.get_referenced_bytes()); + cout << "r " << r << " " << b << std::endl; + ASSERT_EQ(3u, r.size()); + ASSERT_EQ(0x2001u, r[0].offset); + ASSERT_EQ(mas*2, r[0].length); + ASSERT_EQ(0x2u, r[1].offset); + ASSERT_EQ(mas*4, r[1].length); + ASSERT_EQ(0x3u, r[2].offset); + ASSERT_EQ(mas*2, r[2].length); + ASSERT_TRUE(b.is_allocated(0, mas*2)); + ASSERT_FALSE(b.is_allocated(mas*2, mas*8)); + ASSERT_TRUE(b.is_allocated(mas*10, mas*2)); + ASSERT_EQ(3u, b.get_extents().size()); + ASSERT_TRUE(b.get_extents()[0].is_valid()); + ASSERT_FALSE(b.get_extents()[1].is_valid()); + ASSERT_TRUE(b.get_extents()[2].is_valid()); + ASSERT_FALSE(B.put_ref(coll.get(), 0, mas, &r)); + ASSERT_EQ(mas * 2, B.get_referenced_bytes()); + cout << "r " << r << " " << b << std::endl; + ASSERT_EQ(1u, r.size()); + ASSERT_EQ(0x1u, r[0].offset); + ASSERT_EQ(mas*2, r[0].length); + ASSERT_EQ(2u, b.get_extents().size()); + 
ASSERT_FALSE(b.get_extents()[0].is_valid()); + ASSERT_TRUE(b.get_extents()[1].is_valid()); + ASSERT_TRUE(B.put_ref(coll.get(), mas*10, mas*2, &r)); + ASSERT_EQ(mas * 0, B.get_referenced_bytes()); + cout << "r " << r << " " << b << std::endl; + ASSERT_EQ(1u, r.size()); + ASSERT_EQ(0x2003u, r[0].offset); + ASSERT_EQ(mas*2, r[0].length); + ASSERT_EQ(1u, b.get_extents().size()); + ASSERT_FALSE(b.get_extents()[0].is_valid()); + } + { + BlueStore::Blob B; + B.shared_blob = new BlueStore::SharedBlob(coll.get()); + bluestore_blob_t& b = B.dirty_blob(); + PExtentVector r; + b.allocated_test(bluestore_pextent_t(1, mas * 4)); + b.allocated_test(bluestore_pextent_t(2, mas * 4)); + b.allocated_test(bluestore_pextent_t(3, mas * 4)); + B.get_ref(coll.get(), 0, mas*12); + ASSERT_EQ(mas * 12, B.get_referenced_bytes()); + ASSERT_FALSE(B.put_ref(coll.get(), mas, mas, &r)); + ASSERT_EQ(mas * 11, B.get_referenced_bytes()); + cout << "r " << r << " " << b << std::endl; + ASSERT_EQ(0u, r.size()); + ASSERT_TRUE(b.is_allocated(0, mas*12)); + ASSERT_FALSE(B.put_ref(coll.get(), mas*9, mas, &r)); + ASSERT_EQ(mas * 10, B.get_referenced_bytes()); + cout << "r " << r << " " << b << std::endl; + ASSERT_EQ(0u, r.size()); + ASSERT_TRUE(b.is_allocated(0, mas*12)); + ASSERT_FALSE(B.put_ref(coll.get(), mas*2, mas*7, &r)); + ASSERT_EQ(mas * 3, B.get_referenced_bytes()); + cout << "r " << r << " " << b << std::endl; + ASSERT_EQ(3u, r.size()); + ASSERT_EQ(0x2001u, r[0].offset); + ASSERT_EQ(mas*2, r[0].length); + ASSERT_EQ(0x2u, r[1].offset); + ASSERT_EQ(mas*4, r[1].length); + ASSERT_EQ(0x3u, r[2].offset); + ASSERT_EQ(mas*2, r[2].length); + ASSERT_TRUE(b.is_allocated(0, mas*2)); + ASSERT_FALSE(b.is_allocated(mas*2, mas*8)); + ASSERT_TRUE(b.is_allocated(mas*10, mas*2)); + ASSERT_EQ(3u, b.get_extents().size()); + ASSERT_TRUE(b.get_extents()[0].is_valid()); + ASSERT_FALSE(b.get_extents()[1].is_valid()); + ASSERT_TRUE(b.get_extents()[2].is_valid()); + ASSERT_FALSE(B.put_ref(coll.get(), mas*10, mas*2, &r)); + 
ASSERT_EQ(mas * 1, B.get_referenced_bytes()); + cout << "r " << r << " " << b << std::endl; + ASSERT_EQ(1u, r.size()); + ASSERT_EQ(0x2003u, r[0].offset); + ASSERT_EQ(mas*2, r[0].length); + ASSERT_EQ(2u, b.get_extents().size()); + ASSERT_TRUE(b.get_extents()[0].is_valid()); + ASSERT_FALSE(b.get_extents()[1].is_valid()); + ASSERT_TRUE(B.put_ref(coll.get(), 0, mas, &r)); + ASSERT_EQ(mas * 0, B.get_referenced_bytes()); + cout << "r " << r << " " << b << std::endl; + ASSERT_EQ(1u, r.size()); + ASSERT_EQ(0x1u, r[0].offset); + ASSERT_EQ(mas*2, r[0].length); + ASSERT_EQ(1u, b.get_extents().size()); + ASSERT_FALSE(b.get_extents()[0].is_valid()); + } + { + BlueStore::Blob B; + B.shared_blob = new BlueStore::SharedBlob(coll.get()); + bluestore_blob_t& b = B.dirty_blob(); + PExtentVector r; + b.allocated_test(bluestore_pextent_t(1, mas * 8)); + B.get_ref(coll.get(), 0, mas*8); + ASSERT_EQ(mas * 8, B.get_referenced_bytes()); + ASSERT_FALSE(B.put_ref(coll.get(), 0, mas, &r)); + ASSERT_EQ(mas * 7, B.get_referenced_bytes()); + cout << "r " << r << " " << b << std::endl; + ASSERT_EQ(0u, r.size()); + ASSERT_TRUE(b.is_allocated(0, mas*8)); + ASSERT_FALSE(B.put_ref(coll.get(), mas*7, mas, &r)); + ASSERT_EQ(mas * 6, B.get_referenced_bytes()); + cout << "r " << r << " " << b << std::endl; + ASSERT_EQ(0u, r.size()); + ASSERT_TRUE(b.is_allocated(0, mas*8)); + ASSERT_FALSE(B.put_ref(coll.get(), mas*2, mas, &r)); + ASSERT_EQ(mas * 5, B.get_referenced_bytes()); + cout << "r " << r << " " << b << std::endl; + ASSERT_EQ(0u, r.size()); + ASSERT_TRUE(b.is_allocated(0, 8)); + ASSERT_FALSE(B.put_ref(coll.get(), mas*3, mas*4, &r)); + ASSERT_EQ(mas * 1, B.get_referenced_bytes()); + ASSERT_EQ(1u, r.size()); + ASSERT_EQ(0x2001u, r[0].offset); + ASSERT_EQ(mas*6, r[0].length); + ASSERT_TRUE(b.is_allocated(0, mas*2)); + ASSERT_FALSE(b.is_allocated(mas*2, mas*6)); + ASSERT_EQ(2u, b.get_extents().size()); + ASSERT_TRUE(b.get_extents()[0].is_valid()); + ASSERT_FALSE(b.get_extents()[1].is_valid()); + 
ASSERT_TRUE(B.put_ref(coll.get(), mas, mas, &r)); + ASSERT_EQ(mas * 0, B.get_referenced_bytes()); + cout << "r " << r << " " << b << std::endl; + ASSERT_EQ(1u, r.size()); + ASSERT_EQ(0x1u, r[0].offset); + ASSERT_EQ(mas*2, r[0].length); + ASSERT_EQ(1u, b.get_extents().size()); + ASSERT_FALSE(b.get_extents()[0].is_valid()); + } + // verify csum chunk size if factored in properly + { + BlueStore::Blob B; + B.shared_blob = new BlueStore::SharedBlob(coll.get()); + bluestore_blob_t& b = B.dirty_blob(); + PExtentVector r; + b.allocated_test(bluestore_pextent_t(0, mas*4)); + b.init_csum(Checksummer::CSUM_CRC32C, 14, mas * 4); + B.get_ref(coll.get(), 0, mas*4); + ASSERT_EQ(mas * 4, B.get_referenced_bytes()); + ASSERT_TRUE(b.is_allocated(0, mas*4)); + ASSERT_FALSE(B.put_ref(coll.get(), 0, mas*3, &r)); + ASSERT_EQ(mas * 1, B.get_referenced_bytes()); + cout << "r " << r << " " << b << std::endl; + ASSERT_EQ(0u, r.size()); + ASSERT_TRUE(b.is_allocated(0, mas*4)); + ASSERT_TRUE(b.get_extents()[0].is_valid()); + ASSERT_EQ(mas*4, b.get_extents()[0].length); + } + { + BlueStore::Blob B; + B.shared_blob = new BlueStore::SharedBlob(coll.get()); + bluestore_blob_t& b = B.dirty_blob(); + b.allocated_test(bluestore_pextent_t(0x40101000, 0x4000)); + b.allocated_test(bluestore_pextent_t(bluestore_pextent_t::INVALID_OFFSET, + 0x13000)); + + b.allocated_test(bluestore_pextent_t(0x40118000, 0x7000)); + B.get_ref(coll.get(), 0x0, 0x3800); + B.get_ref(coll.get(), 0x17c00, 0x6400); + ASSERT_EQ(0x3800u + 0x6400u, B.get_referenced_bytes()); + b.set_flag(bluestore_blob_t::FLAG_SHARED); + b.init_csum(Checksummer::CSUM_CRC32C, 12, 0x1e000); + + cout << "before: " << B << std::endl; + PExtentVector r; + ASSERT_FALSE(B.put_ref(coll.get(), 0x1800, 0x2000, &r)); + ASSERT_EQ(0x3800u + 0x6400u - 0x2000u, B.get_referenced_bytes()); + cout << "after: " << B << std::endl; + cout << "r " << r << std::endl; + } + { + BlueStore::Blob B; + B.shared_blob = new BlueStore::SharedBlob(coll.get()); + 
bluestore_blob_t& b = B.dirty_blob(); + b.allocated_test(bluestore_pextent_t(1, 0x5000)); + b.allocated_test(bluestore_pextent_t(2, 0x5000)); + B.get_ref(coll.get(), 0x0, 0xa000); + ASSERT_EQ(0xa000u, B.get_referenced_bytes()); + cout << "before: " << B << std::endl; + PExtentVector r; + ASSERT_FALSE(B.put_ref(coll.get(), 0x8000, 0x2000, &r)); + cout << "after: " << B << std::endl; + cout << "r " << r << std::endl; + ASSERT_EQ(0x8000u, B.get_referenced_bytes()); + ASSERT_EQ(1u, r.size()); + ASSERT_EQ(0x3002u, r[0].offset); + ASSERT_EQ(0x2000u, r[0].length); + } + { + BlueStore::Blob B; + B.shared_blob = new BlueStore::SharedBlob(coll.get()); + bluestore_blob_t& b = B.dirty_blob(); + b.allocated_test(bluestore_pextent_t(1, 0x7000)); + b.allocated_test(bluestore_pextent_t(2, 0x7000)); + B.get_ref(coll.get(), 0x0, 0xe000); + ASSERT_EQ(0xe000u, B.get_referenced_bytes()); + cout << "before: " << B << std::endl; + PExtentVector r; + ASSERT_FALSE(B.put_ref(coll.get(), 0, 0xb000, &r)); + ASSERT_EQ(0x3000u, B.get_referenced_bytes()); + cout << "after: " << B << std::endl; + cout << "r " << r << std::endl; + ASSERT_EQ(0x3000u, B.get_referenced_bytes()); + ASSERT_EQ(2u, r.size()); + ASSERT_EQ(1u, r[0].offset); + ASSERT_EQ(0x7000u, r[0].length); + ASSERT_EQ(2u, r[1].offset); + ASSERT_EQ(0x3000u, r[1].length); // we have 0x1000 bytes less due to + // alignment caused by min_alloc_size = 0x2000 + } + { + BlueStore store(g_ceph_context, "", 0x4000); + BlueStore::OnodeCacheShard *oc = BlueStore::OnodeCacheShard::create( + g_ceph_context, "lru", NULL); + BlueStore::BufferCacheShard *bc = BlueStore::BufferCacheShard::create( + g_ceph_context, "lru", NULL); + + auto coll = ceph::make_ref<BlueStore::Collection>(&store, oc, bc, coll_t()); + BlueStore::Blob B; + B.shared_blob = new BlueStore::SharedBlob(coll.get()); + bluestore_blob_t& b = B.dirty_blob(); + b.allocated_test(bluestore_pextent_t(1, 0x5000)); + b.allocated_test(bluestore_pextent_t(2, 0x7000)); + B.get_ref(coll.get(), 0x0, 
0xc000); + ASSERT_EQ(0xc000u, B.get_referenced_bytes()); + cout << "before: " << B << std::endl; + PExtentVector r; + ASSERT_FALSE(B.put_ref(coll.get(), 0x2000, 0xa000, &r)); + cout << "after: " << B << std::endl; + cout << "r " << r << std::endl; + ASSERT_EQ(0x2000u, B.get_referenced_bytes()); + ASSERT_EQ(2u, r.size()); + ASSERT_EQ(0x4001u, r[0].offset); + ASSERT_EQ(0x1000u, r[0].length); + ASSERT_EQ(2u, r[1].offset); + ASSERT_EQ(0x7000u, r[1].length); + ASSERT_EQ(1u, b.get_extents()[0].offset); + ASSERT_EQ(0x4000u, b.get_extents()[0].length); + } +} + +/* can_split(): a default-constructed blob reports true; with flags set to FLAG_SHARED, FLAG_COMPRESSED or FLAG_HAS_UNUSED it must report false. */ +TEST(bluestore_blob_t, can_split) +{ + bluestore_blob_t a; + ASSERT_TRUE(a.can_split()); + a.flags = bluestore_blob_t::FLAG_SHARED; + ASSERT_FALSE(a.can_split()); + a.flags = bluestore_blob_t::FLAG_COMPRESSED; + ASSERT_FALSE(a.can_split()); + a.flags = bluestore_blob_t::FLAG_HAS_UNUSED; + ASSERT_FALSE(a.can_split()); +} + +/* can_split_at(): on a blob with two 0x2000 extents and no checksums, 0x1000 and 0x1800 are both accepted. After init_csum(CSUM_CRC32C, 12, 0x4000), 0x1000/0x2000/0x3000 are still accepted while 0x2800 is rejected (presumably because it is not a multiple of the csum chunk size; confirm against can_split_at()). */ +TEST(bluestore_blob_t, can_split_at) +{ + bluestore_blob_t a; + a.allocated_test(bluestore_pextent_t(0x10000, 0x2000)); + a.allocated_test(bluestore_pextent_t(0x20000, 0x2000)); + ASSERT_TRUE(a.can_split_at(0x1000)); + ASSERT_TRUE(a.can_split_at(0x1800)); + a.init_csum(Checksummer::CSUM_CRC32C, 12, 0x4000); + ASSERT_TRUE(a.can_split_at(0x1000)); + ASSERT_TRUE(a.can_split_at(0x2000)); + ASSERT_TRUE(a.can_split_at(0x3000)); + ASSERT_FALSE(a.can_split_at(0x2800)); +} + +/* prune_tail(): with only valid extents there is nothing to prune; appending an INVALID_OFFSET extent makes the tail prunable, and prune_tail() is expected to drop it again (2 extents, logical length 0x4000). The same is then repeated with checksums enabled. */ +TEST(bluestore_blob_t, prune_tail) +{ + bluestore_blob_t a; + a.allocated_test(bluestore_pextent_t(0x10000, 0x2000)); + a.allocated_test(bluestore_pextent_t(0x20000, 0x2000)); + ASSERT_FALSE(a.can_prune_tail()); + a.allocated_test( + bluestore_pextent_t(bluestore_pextent_t::INVALID_OFFSET, 0x2000)); + ASSERT_TRUE(a.can_prune_tail()); + a.prune_tail(); + ASSERT_FALSE(a.can_prune_tail()); + ASSERT_EQ(2u, a.get_extents().size()); + ASSERT_EQ(0x4000u, a.get_logical_length()); + + a.allocated_test( + bluestore_pextent_t(bluestore_pextent_t::INVALID_OFFSET, 0x2000)); + a.init_csum(Checksummer::CSUM_CRC32C_8, 12, 0x6000); + ASSERT_EQ(6u, 
a.csum_data.length()); + ASSERT_TRUE(a.can_prune_tail()); + a.prune_tail(); + ASSERT_FALSE(a.can_prune_tail()); + ASSERT_EQ(2u, a.get_extents().size()); + ASSERT_EQ(0x4000u, a.get_logical_length()); + /* pruning the tail is expected to shrink the csum data as well: 6 bytes before, 4 bytes after */ + ASSERT_EQ(4u, a.csum_data.length()); + + /* a blob whose only extent is invalid must not be reported as prunable; assert on the freshly built blob b (the check previously re-tested a, leaving b unverified) */ + bluestore_blob_t b; + b.allocated_test( + bluestore_pextent_t(bluestore_pextent_t::INVALID_OFFSET, 0x2000)); + ASSERT_FALSE(b.can_prune_tail()); +} + +TEST(Blob, split) +{ + BlueStore store(g_ceph_context, "", 4096); + BlueStore::OnodeCacheShard *oc = BlueStore::OnodeCacheShard::create( + g_ceph_context, "lru", NULL); + BlueStore::BufferCacheShard *bc = BlueStore::BufferCacheShard::create( + g_ceph_context, "lru", NULL); + auto coll = ceph::make_ref<BlueStore::Collection>(&store, oc, bc, coll_t()); + { + BlueStore::Blob L, R; + L.shared_blob = new BlueStore::SharedBlob(coll.get()); + R.shared_blob = new BlueStore::SharedBlob(coll.get()); + L.dirty_blob().allocated_test(bluestore_pextent_t(0x2000, 0x2000)); + L.dirty_blob().init_csum(Checksummer::CSUM_CRC32C, 12, 0x2000); + L.get_ref(coll.get(), 0, 0x2000); + L.split(coll.get(), 0x1000, &R); + ASSERT_EQ(0x1000u, L.get_blob().get_logical_length()); + ASSERT_EQ(4u, L.get_blob().csum_data.length()); + ASSERT_EQ(1u, L.get_blob().get_extents().size()); + ASSERT_EQ(0x2000u, L.get_blob().get_extents().front().offset); + ASSERT_EQ(0x1000u, L.get_blob().get_extents().front().length); + ASSERT_EQ(0x1000u, L.get_referenced_bytes()); + ASSERT_EQ(0x1000u, R.get_blob().get_logical_length()); + ASSERT_EQ(4u, R.get_blob().csum_data.length()); + ASSERT_EQ(1u, R.get_blob().get_extents().size()); + ASSERT_EQ(0x3000u, R.get_blob().get_extents().front().offset); + ASSERT_EQ(0x1000u, R.get_blob().get_extents().front().length); + ASSERT_EQ(0x1000u, R.get_referenced_bytes()); + } + { + BlueStore::Blob L, R; + L.shared_blob = new BlueStore::SharedBlob(coll.get()); + R.shared_blob = new BlueStore::SharedBlob(coll.get()); + L.dirty_blob().allocated_test(bluestore_pextent_t(0x2000, 0x1000)); + 
L.dirty_blob().allocated_test(bluestore_pextent_t(0x12000, 0x1000)); + L.dirty_blob().init_csum(Checksummer::CSUM_CRC32C, 12, 0x2000); + L.get_ref(coll.get(), 0, 0x1000); + L.get_ref(coll.get(), 0x1000, 0x1000); + L.split(coll.get(), 0x1000, &R); + ASSERT_EQ(0x1000u, L.get_blob().get_logical_length()); + ASSERT_EQ(4u, L.get_blob().csum_data.length()); + ASSERT_EQ(1u, L.get_blob().get_extents().size()); + ASSERT_EQ(0x2000u, L.get_blob().get_extents().front().offset); + ASSERT_EQ(0x1000u, L.get_blob().get_extents().front().length); + ASSERT_EQ(0x1000u, L.get_referenced_bytes()); + ASSERT_EQ(0x1000u, R.get_blob().get_logical_length()); + ASSERT_EQ(4u, R.get_blob().csum_data.length()); + ASSERT_EQ(1u, R.get_blob().get_extents().size()); + ASSERT_EQ(0x12000u, R.get_blob().get_extents().front().offset); + ASSERT_EQ(0x1000u, R.get_blob().get_extents().front().length); + ASSERT_EQ(0x1000u, R.get_referenced_bytes()); + } +} + +TEST(Blob, legacy_decode) +{ + BlueStore store(g_ceph_context, "", 4096); + BlueStore::OnodeCacheShard *oc = BlueStore::OnodeCacheShard::create( + g_ceph_context, "lru", NULL); + BlueStore::BufferCacheShard *bc = BlueStore::BufferCacheShard::create( + g_ceph_context, "lru", NULL); + auto coll = ceph::make_ref<BlueStore::Collection>(&store, oc, bc, coll_t()); + bufferlist bl, bl2; + { + BlueStore::Blob B; + + B.shared_blob = new BlueStore::SharedBlob(coll.get()); + B.dirty_blob().allocated_test(bluestore_pextent_t(0x1, 0x2000)); + B.dirty_blob().init_csum(Checksummer::CSUM_CRC32C, 12, 0x2000); + B.get_ref(coll.get(), 0, 0xff0); + B.get_ref(coll.get(), 0x1fff, 1); + + bluestore_extent_ref_map_t fake_ref_map; + fake_ref_map.get(0, 0xff0); + fake_ref_map.get(0x1fff, 1); + + size_t bound = 0, bound2 = 0; + + B.bound_encode( + bound, + 1, /*struct_v*/ + 0, /*sbid*/ + false); + fake_ref_map.bound_encode(bound); + + B.bound_encode( + bound2, + 2, /*struct_v*/ + 0, /*sbid*/ + true); + + { + auto app = bl.get_contiguous_appender(bound); + auto app2 = 
bl2.get_contiguous_appender(bound2); + B.encode( + app, + 1, /*struct_v*/ + 0, /*sbid*/ + false); + fake_ref_map.encode(app); + + B.encode( + app2, + 2, /*struct_v*/ + 0, /*sbid*/ + true); + } + + auto p = bl.front().begin_deep(); + auto p2 = bl2.front().begin_deep(); + BlueStore::Blob Bres, Bres2; + Bres.shared_blob = new BlueStore::SharedBlob(coll.get()); + Bres2.shared_blob = new BlueStore::SharedBlob(coll.get()); + + uint64_t sbid, sbid2; + Bres.decode( + p, + 1, /*struct_v*/ + &sbid, + true, + coll.get()); + Bres2.decode( + p2, + 2, /*struct_v*/ + &sbid2, + true, + coll.get()); + + ASSERT_EQ(0xff0u + 1u, Bres.get_blob_use_tracker().get_referenced_bytes()); + ASSERT_EQ(0xff0u + 1u, Bres2.get_blob_use_tracker().get_referenced_bytes()); + ASSERT_TRUE(Bres.get_blob_use_tracker().equal(Bres2.get_blob_use_tracker())); + } +} + +TEST(ExtentMap, seek_lextent) +{ + BlueStore store(g_ceph_context, "", 4096); + BlueStore::OnodeCacheShard *oc = BlueStore::OnodeCacheShard::create( + g_ceph_context, "lru", NULL); + BlueStore::BufferCacheShard *bc = BlueStore::BufferCacheShard::create( + g_ceph_context, "lru", NULL); + + auto coll = ceph::make_ref<BlueStore::Collection>(&store, oc, bc, coll_t()); + BlueStore::Onode onode(coll.get(), ghobject_t(), ""); + BlueStore::ExtentMap em(&onode, + g_ceph_context->_conf->bluestore_extent_map_inline_shard_prealloc_size); + BlueStore::BlobRef br(new BlueStore::Blob); + br->shared_blob = new BlueStore::SharedBlob(coll.get()); + + ASSERT_EQ(em.extent_map.end(), em.seek_lextent(0)); + ASSERT_EQ(em.extent_map.end(), em.seek_lextent(100)); + + em.extent_map.insert(*new BlueStore::Extent(100, 0, 100, br)); + auto a = em.find(100); + ASSERT_EQ(a, em.seek_lextent(0)); + ASSERT_EQ(a, em.seek_lextent(99)); + ASSERT_EQ(a, em.seek_lextent(100)); + ASSERT_EQ(a, em.seek_lextent(101)); + ASSERT_EQ(a, em.seek_lextent(199)); + ASSERT_EQ(em.extent_map.end(), em.seek_lextent(200)); + + em.extent_map.insert(*new BlueStore::Extent(200, 0, 100, br)); + auto b 
= em.find(200); + ASSERT_EQ(a, em.seek_lextent(0)); + ASSERT_EQ(a, em.seek_lextent(99)); + ASSERT_EQ(a, em.seek_lextent(100)); + ASSERT_EQ(a, em.seek_lextent(101)); + ASSERT_EQ(a, em.seek_lextent(199)); + ASSERT_EQ(b, em.seek_lextent(200)); + ASSERT_EQ(b, em.seek_lextent(299)); + ASSERT_EQ(em.extent_map.end(), em.seek_lextent(300)); + + em.extent_map.insert(*new BlueStore::Extent(400, 0, 100, br)); + auto d = em.find(400); + ASSERT_EQ(a, em.seek_lextent(0)); + ASSERT_EQ(a, em.seek_lextent(99)); + ASSERT_EQ(a, em.seek_lextent(100)); + ASSERT_EQ(a, em.seek_lextent(101)); + ASSERT_EQ(a, em.seek_lextent(199)); + ASSERT_EQ(b, em.seek_lextent(200)); + ASSERT_EQ(b, em.seek_lextent(299)); + ASSERT_EQ(d, em.seek_lextent(300)); + ASSERT_EQ(d, em.seek_lextent(399)); + ASSERT_EQ(d, em.seek_lextent(400)); + ASSERT_EQ(d, em.seek_lextent(499)); + ASSERT_EQ(em.extent_map.end(), em.seek_lextent(500)); +} + +TEST(ExtentMap, has_any_lextents) +{ + BlueStore store(g_ceph_context, "", 4096); + BlueStore::OnodeCacheShard *oc = BlueStore::OnodeCacheShard::create( + g_ceph_context, "lru", NULL); + BlueStore::BufferCacheShard *bc = BlueStore::BufferCacheShard::create( + g_ceph_context, "lru", NULL); + auto coll = ceph::make_ref<BlueStore::Collection>(&store, oc, bc, coll_t()); + BlueStore::Onode onode(coll.get(), ghobject_t(), ""); + BlueStore::ExtentMap em(&onode, + g_ceph_context->_conf->bluestore_extent_map_inline_shard_prealloc_size); + BlueStore::BlobRef b(new BlueStore::Blob); + b->shared_blob = new BlueStore::SharedBlob(coll.get()); + + ASSERT_FALSE(em.has_any_lextents(0, 0)); + ASSERT_FALSE(em.has_any_lextents(0, 1000)); + ASSERT_FALSE(em.has_any_lextents(1000, 1000)); + + em.extent_map.insert(*new BlueStore::Extent(100, 0, 100, b)); + ASSERT_FALSE(em.has_any_lextents(0, 50)); + ASSERT_FALSE(em.has_any_lextents(0, 100)); + ASSERT_FALSE(em.has_any_lextents(50, 50)); + ASSERT_TRUE(em.has_any_lextents(50, 51)); + ASSERT_TRUE(em.has_any_lextents(50, 100051)); + 
ASSERT_TRUE(em.has_any_lextents(100, 100)); + ASSERT_TRUE(em.has_any_lextents(100, 1)); + ASSERT_TRUE(em.has_any_lextents(199, 1)); + ASSERT_TRUE(em.has_any_lextents(199, 2)); + ASSERT_FALSE(em.has_any_lextents(200, 2)); + + em.extent_map.insert(*new BlueStore::Extent(200, 0, 100, b)); + ASSERT_TRUE(em.has_any_lextents(199, 1)); + ASSERT_TRUE(em.has_any_lextents(199, 2)); + ASSERT_TRUE(em.has_any_lextents(200, 2)); + ASSERT_TRUE(em.has_any_lextents(200, 200)); + ASSERT_TRUE(em.has_any_lextents(299, 1)); + ASSERT_FALSE(em.has_any_lextents(300, 1)); + + em.extent_map.insert(*new BlueStore::Extent(400, 0, 100, b)); + ASSERT_TRUE(em.has_any_lextents(0, 10000)); + ASSERT_TRUE(em.has_any_lextents(199, 1)); + ASSERT_FALSE(em.has_any_lextents(300, 1)); + ASSERT_FALSE(em.has_any_lextents(300, 100)); + ASSERT_FALSE(em.has_any_lextents(399, 1)); + ASSERT_TRUE(em.has_any_lextents(400, 1)); + ASSERT_TRUE(em.has_any_lextents(400, 100)); + ASSERT_TRUE(em.has_any_lextents(400, 1000)); + ASSERT_TRUE(em.has_any_lextents(499, 1000)); + ASSERT_FALSE(em.has_any_lextents(500, 1000)); +} + +/* Test helper: looks up the extent that em.find(v) returns, removes it from em.extent_map and frees it. Fails the calling test (ASSERT_NE) when no such extent exists. */ +void erase_and_delete(BlueStore::ExtentMap& em, size_t v) +{ + auto d = em.find(v); + ASSERT_NE(d, em.extent_map.end()); + /* take the element's address before erase(): the iterator is invalidated by the erase and must not be dereferenced afterwards */ + auto *extent = &*d; + em.extent_map.erase(d); + delete extent; +} + +TEST(ExtentMap, compress_extent_map) +{ + BlueStore store(g_ceph_context, "", 4096); + BlueStore::OnodeCacheShard *oc = BlueStore::OnodeCacheShard::create( + g_ceph_context, "lru", NULL); + BlueStore::BufferCacheShard *bc = BlueStore::BufferCacheShard::create( + g_ceph_context, "lru", NULL); + + auto coll = ceph::make_ref<BlueStore::Collection>(&store, oc, bc, coll_t()); + BlueStore::Onode onode(coll.get(), ghobject_t(), ""); + BlueStore::ExtentMap em(&onode, + g_ceph_context->_conf->bluestore_extent_map_inline_shard_prealloc_size); + BlueStore::BlobRef b1(new BlueStore::Blob); + BlueStore::BlobRef b2(new BlueStore::Blob); + BlueStore::BlobRef b3(new BlueStore::Blob); + b1->shared_blob = new BlueStore::SharedBlob(coll.get()); + 
b2->shared_blob = new BlueStore::SharedBlob(coll.get()); + b3->shared_blob = new BlueStore::SharedBlob(coll.get()); + + em.extent_map.insert(*new BlueStore::Extent(0, 0, 100, b1)); + em.extent_map.insert(*new BlueStore::Extent(100, 0, 100, b2)); + ASSERT_EQ(0, em.compress_extent_map(0, 10000)); + ASSERT_EQ(2u, em.extent_map.size()); + + em.extent_map.insert(*new BlueStore::Extent(200, 100, 100, b2)); + em.extent_map.insert(*new BlueStore::Extent(300, 200, 100, b2)); + ASSERT_EQ(0, em.compress_extent_map(0, 0)); + ASSERT_EQ(0, em.compress_extent_map(100000, 1000)); + ASSERT_EQ(2, em.compress_extent_map(0, 100000)); + ASSERT_EQ(2u, em.extent_map.size()); + erase_and_delete(em, 100); + em.extent_map.insert(*new BlueStore::Extent(100, 0, 100, b2)); + em.extent_map.insert(*new BlueStore::Extent(200, 100, 100, b3)); + em.extent_map.insert(*new BlueStore::Extent(300, 200, 100, b2)); + ASSERT_EQ(0, em.compress_extent_map(0, 1)); + ASSERT_EQ(0, em.compress_extent_map(0, 100000)); + ASSERT_EQ(4u, em.extent_map.size()); + + em.extent_map.insert(*new BlueStore::Extent(400, 300, 100, b2)); + em.extent_map.insert(*new BlueStore::Extent(500, 500, 100, b2)); + em.extent_map.insert(*new BlueStore::Extent(600, 600, 100, b2)); + em.extent_map.insert(*new BlueStore::Extent(700, 0, 100, b1)); + em.extent_map.insert(*new BlueStore::Extent(800, 0, 100, b3)); + ASSERT_EQ(0, em.compress_extent_map(0, 99)); + ASSERT_EQ(0, em.compress_extent_map(800, 1000)); + ASSERT_EQ(2, em.compress_extent_map(100, 500)); + ASSERT_EQ(7u, em.extent_map.size()); + erase_and_delete(em, 300); + erase_and_delete(em, 500); + erase_and_delete(em, 700); + em.extent_map.insert(*new BlueStore::Extent(400, 300, 100, b2)); + em.extent_map.insert(*new BlueStore::Extent(500, 400, 100, b2)); + em.extent_map.insert(*new BlueStore::Extent(700, 500, 100, b2)); + ASSERT_EQ(1, em.compress_extent_map(0, 1000)); + ASSERT_EQ(6u, em.extent_map.size()); +} + + +void clear_and_dispose(BlueStore::old_extent_map_t& old_em) +{ + auto 
oep = old_em.begin(); + while (oep != old_em.end()) { + auto &lo = *oep; + oep = old_em.erase(oep); + delete &lo; + } +} + +TEST(GarbageCollector, BasicTest) +{ + BlueStore::OnodeCacheShard *oc = BlueStore::OnodeCacheShard::create( + g_ceph_context, "lru", NULL); + BlueStore::BufferCacheShard *bc = BlueStore::BufferCacheShard::create( + g_ceph_context, "lru", NULL); + + BlueStore store(g_ceph_context, "", 4096); + auto coll = ceph::make_ref<BlueStore::Collection>(&store, oc, bc, coll_t()); + BlueStore::Onode onode(coll.get(), ghobject_t(), ""); + BlueStore::ExtentMap em(&onode, + g_ceph_context->_conf->bluestore_extent_map_inline_shard_prealloc_size); + + BlueStore::old_extent_map_t old_extents; + + + /* + min_alloc_size = 4096 + original disposition + extent1 <loffs = 100, boffs = 100, len = 10> + -> blob1<compressed, len_on_disk=4096, logical_len=8192> + extent2 <loffs = 200, boffs = 200, len = 10> + -> blob2<raw, len_on_disk=4096, llen=4096> + extent3 <loffs = 300, boffs = 300, len = 10> + -> blob1<compressed, len_on_disk=4096, llen=8192> + extent4 <loffs = 4096, boffs = 0, len = 10> + -> blob3<raw, len_on_disk=4096, llen=4096> + on write(300~100) resulted in + extent1 <loffs = 100, boffs = 100, len = 10> + -> blob1<compressed, len_on_disk=4096, logical_len=8192> + extent2 <loffs = 200, boffs = 200, len = 10> + -> blob2<raw, len_on_disk=4096, llen=4096> + extent3 <loffs = 300, boffs = 300, len = 100> + -> blob4<raw, len_on_disk=4096, llen=4096> + extent4 <loffs = 4096, boffs = 0, len = 10> + -> blob3<raw, len_on_disk=4096, llen=4096> + */ + { + BlueStore::GarbageCollector gc(g_ceph_context); + int64_t saving; + BlueStore::BlobRef b1(new BlueStore::Blob); + BlueStore::BlobRef b2(new BlueStore::Blob); + BlueStore::BlobRef b3(new BlueStore::Blob); + BlueStore::BlobRef b4(new BlueStore::Blob); + b1->shared_blob = new BlueStore::SharedBlob(coll.get()); + b2->shared_blob = new BlueStore::SharedBlob(coll.get()); + b3->shared_blob = new 
BlueStore::SharedBlob(coll.get()); + b4->shared_blob = new BlueStore::SharedBlob(coll.get()); + b1->dirty_blob().set_compressed(0x2000, 0x1000); + b1->dirty_blob().allocated_test(bluestore_pextent_t(0, 0x1000)); + b2->dirty_blob().allocated_test(bluestore_pextent_t(1, 0x1000)); + b3->dirty_blob().allocated_test(bluestore_pextent_t(2, 0x1000)); + b4->dirty_blob().allocated_test(bluestore_pextent_t(3, 0x1000)); + em.extent_map.insert(*new BlueStore::Extent(100, 100, 10, b1)); + b1->get_ref(coll.get(), 100, 10); + em.extent_map.insert(*new BlueStore::Extent(200, 200, 10, b2)); + b2->get_ref(coll.get(), 200, 10); + em.extent_map.insert(*new BlueStore::Extent(300, 300, 100, b4)); + b4->get_ref(coll.get(), 300, 100); + em.extent_map.insert(*new BlueStore::Extent(4096, 0, 10, b3)); + b3->get_ref(coll.get(), 0, 10); + + old_extents.push_back(*new BlueStore::OldExtent(300, 300, 10, b1)); + + saving = gc.estimate(300, 100, em, old_extents, 4096); + ASSERT_EQ(saving, 1); + auto& to_collect = gc.get_extents_to_collect(); + ASSERT_EQ(to_collect.num_intervals(), 1u); + { + auto it = to_collect.begin(); + using p = decltype(*it); + auto v = p{100ul, 10ul}; + ASSERT_EQ(*it, v); + } + em.clear(); + clear_and_dispose(old_extents); + } + /* + original disposition + min_alloc_size = 0x10000 + extent1 <loffs = 0, boffs = 0, len = 0x40000> + -> blob1<compressed, len_on_disk=0x20000, logical_len=0x40000> + Write 0x8000~37000 resulted in the following extent map prior to GC + for the last write_small(0x30000~0xf000): + + extent1 <loffs = 0, boffs = 0, len = 0x8000> + -> blob1<compressed, len_on_disk=0x20000, logical_len=0x40000> + extent2 <loffs = 0x8000, boffs = 0x8000, len = 0x8000> + -> blob2<raw, len_on_disk=0x10000, llen=0x10000> + extent3 <loffs = 0x10000, boffs = 0, len = 0x20000> + -> blob3<raw, len_on_disk=0x20000, llen=0x20000> + extent4 <loffs = 0x30000, boffs = 0, len = 0xf000> + -> blob4<raw, len_on_disk=0x10000, llen=0x10000> + extent5 <loffs = 0x3f000, boffs = 0x3f000, len 
= 0x1000> + -> blob1<compressed, len_on_disk=0x20000, llen=0x40000> + */ + { + BlueStore store(g_ceph_context, "", 0x10000); + auto coll = ceph::make_ref<BlueStore::Collection>(&store, oc, bc, coll_t()); + BlueStore::Onode onode(coll.get(), ghobject_t(), ""); + BlueStore::ExtentMap em(&onode, + g_ceph_context->_conf->bluestore_extent_map_inline_shard_prealloc_size); + + BlueStore::old_extent_map_t old_extents; + BlueStore::GarbageCollector gc(g_ceph_context); + int64_t saving; + BlueStore::BlobRef b1(new BlueStore::Blob); + BlueStore::BlobRef b2(new BlueStore::Blob); + BlueStore::BlobRef b3(new BlueStore::Blob); + BlueStore::BlobRef b4(new BlueStore::Blob); + b1->shared_blob = new BlueStore::SharedBlob(coll.get()); + b2->shared_blob = new BlueStore::SharedBlob(coll.get()); + b3->shared_blob = new BlueStore::SharedBlob(coll.get()); + b4->shared_blob = new BlueStore::SharedBlob(coll.get()); + b1->dirty_blob().set_compressed(0x40000, 0x20000); + b1->dirty_blob().allocated_test(bluestore_pextent_t(0, 0x20000)); + b2->dirty_blob().allocated_test(bluestore_pextent_t(1, 0x10000)); + b3->dirty_blob().allocated_test(bluestore_pextent_t(2, 0x20000)); + b4->dirty_blob().allocated_test(bluestore_pextent_t(3, 0x10000)); + + em.extent_map.insert(*new BlueStore::Extent(0, 0, 0x8000, b1)); + b1->get_ref(coll.get(), 0, 0x8000); + em.extent_map.insert( + *new BlueStore::Extent(0x8000, 0x8000, 0x8000, b2)); // new extent + b2->get_ref(coll.get(), 0x8000, 0x8000); + em.extent_map.insert( + *new BlueStore::Extent(0x10000, 0, 0x20000, b3)); // new extent + b3->get_ref(coll.get(), 0, 0x20000); + em.extent_map.insert( + *new BlueStore::Extent(0x30000, 0, 0xf000, b4)); // new extent + b4->get_ref(coll.get(), 0, 0xf000); + em.extent_map.insert(*new BlueStore::Extent(0x3f000, 0x3f000, 0x1000, b1)); + b1->get_ref(coll.get(), 0x3f000, 0x1000); + + old_extents.push_back(*new BlueStore::OldExtent(0x8000, 0x8000, 0x8000, b1)); + old_extents.push_back( + *new BlueStore::OldExtent(0x10000, 0x10000, 
0x20000, b1)); + old_extents.push_back(*new BlueStore::OldExtent(0x30000, 0x30000, 0xf000, b1)); + + saving = gc.estimate(0x30000, 0xf000, em, old_extents, 0x10000); + ASSERT_EQ(saving, 2); + auto& to_collect = gc.get_extents_to_collect(); + ASSERT_EQ(to_collect.num_intervals(), 2u); + { + auto it1 = to_collect.begin(); + auto it2 = ++to_collect.begin(); + using p = decltype(*it1); + { + auto v1 = p{0x0ul ,0x8000ul}; + auto v2 = p{0x0ul, 0x8000ul}; + ASSERT_TRUE(*it1 == v1 || *it2 == v2); + } + { + auto v1 = p{0x3f000ul, 0x1000ul}; + auto v2 = p{0x3f000ul, 0x1000ul}; + ASSERT_TRUE(*it1 == v1 || *it2 == v2); + } + } + + em.clear(); + clear_and_dispose(old_extents); + } + /* + original disposition + min_alloc_size = 0x1000 + extent1 <loffs = 0, boffs = 0, len = 0x4000> + -> blob1<compressed, len_on_disk=0x2000, logical_len=0x4000> + write 0x3000~4000 resulted in the following extent map + (future feature - suppose we can compress incoming write prior to + GC invocation) + + extent1 <loffs = 0, boffs = 0, len = 0x4000> + -> blob1<compressed, len_on_disk=0x2000, logical_len=0x4000> + extent2 <loffs = 0x3000, boffs = 0, len = 0x4000> + -> blob2<compressed, len_on_disk=0x2000, llen=0x4000> + */ + { + BlueStore::GarbageCollector gc(g_ceph_context); + int64_t saving; + BlueStore::BlobRef b1(new BlueStore::Blob); + BlueStore::BlobRef b2(new BlueStore::Blob); + b1->shared_blob = new BlueStore::SharedBlob(coll.get()); + b2->shared_blob = new BlueStore::SharedBlob(coll.get()); + b1->dirty_blob().set_compressed(0x4000, 0x2000); + b1->dirty_blob().allocated_test(bluestore_pextent_t(0, 0x2000)); + b2->dirty_blob().set_compressed(0x4000, 0x2000); + b2->dirty_blob().allocated_test(bluestore_pextent_t(0, 0x2000)); + + em.extent_map.insert(*new BlueStore::Extent(0, 0, 0x3000, b1)); + b1->get_ref(coll.get(), 0, 0x3000); + em.extent_map.insert( + *new BlueStore::Extent(0x3000, 0, 0x4000, b2)); // new extent + b2->get_ref(coll.get(), 0, 0x4000); + + old_extents.push_back(*new 
BlueStore::OldExtent(0x3000, 0x3000, 0x1000, b1)); + + saving = gc.estimate(0x3000, 0x4000, em, old_extents, 0x1000); + ASSERT_EQ(saving, 0); + auto& to_collect = gc.get_extents_to_collect(); + ASSERT_EQ(to_collect.num_intervals(), 0u); + em.clear(); + clear_and_dispose(old_extents); + } + /* + original disposition + min_alloc_size = 0x10000 + extent0 <loffs = 0, boffs = 0, len = 0x20000> + -> blob0<compressed, len_on_disk=0x10000, logical_len=0x20000> + extent1 <loffs = 0x20000, boffs = 0, len = 0x20000> + -> blob1<compressed, len_on_disk=0x10000, logical_len=0x20000> + write 0x8000~37000 resulted in the following extent map prior + to GC for the last write_small(0x30000~0xf000) + + extent0 <loffs = 0, boffs = 0, len = 0x8000> + -> blob0<compressed, len_on_disk=0x10000, logical_len=0x20000> + extent2 <loffs = 0x8000, boffs = 0x8000, len = 0x8000> + -> blob2<raw, len_on_disk=0x10000, llen=0x10000> + extent3 <loffs = 0x10000, boffs = 0, len = 0x20000> + -> blob3<raw, len_on_disk=0x20000, llen=0x20000> + extent4 <loffs = 0x30000, boffs = 0, len = 0xf000> + -> blob4<raw, len_on_disk=0x1000, llen=0x1000> + extent5 <loffs = 0x3f000, boffs = 0x1f000, len = 0x1000> + -> blob1<compressed, len_on_disk=0x10000, llen=0x20000> + */ + { + BlueStore store(g_ceph_context, "", 0x10000); + auto coll = ceph::make_ref<BlueStore::Collection>(&store, oc, bc, coll_t()); + BlueStore::Onode onode(coll.get(), ghobject_t(), ""); + BlueStore::ExtentMap em(&onode, + g_ceph_context->_conf->bluestore_extent_map_inline_shard_prealloc_size); + + BlueStore::old_extent_map_t old_extents; + BlueStore::GarbageCollector gc(g_ceph_context); + int64_t saving; + BlueStore::BlobRef b0(new BlueStore::Blob); + BlueStore::BlobRef b1(new BlueStore::Blob); + BlueStore::BlobRef b2(new BlueStore::Blob); + BlueStore::BlobRef b3(new BlueStore::Blob); + BlueStore::BlobRef b4(new BlueStore::Blob); + b0->shared_blob = new BlueStore::SharedBlob(coll.get()); + b1->shared_blob = new BlueStore::SharedBlob(coll.get()); + 
b2->shared_blob = new BlueStore::SharedBlob(coll.get()); + b3->shared_blob = new BlueStore::SharedBlob(coll.get()); + b4->shared_blob = new BlueStore::SharedBlob(coll.get()); + b0->dirty_blob().set_compressed(0x2000, 0x1000); + b0->dirty_blob().allocated_test(bluestore_pextent_t(0, 0x10000)); + b1->dirty_blob().set_compressed(0x20000, 0x10000); + b1->dirty_blob().allocated_test(bluestore_pextent_t(0, 0x10000)); + b2->dirty_blob().allocated_test(bluestore_pextent_t(1, 0x10000)); + b3->dirty_blob().allocated_test(bluestore_pextent_t(2, 0x20000)); + b4->dirty_blob().allocated_test(bluestore_pextent_t(3, 0x1000)); + + em.extent_map.insert(*new BlueStore::Extent(0, 0, 0x8000, b0)); + b0->get_ref(coll.get(), 0, 0x8000); + em.extent_map.insert( + *new BlueStore::Extent(0x8000, 0x8000, 0x8000, b2)); // new extent + b2->get_ref(coll.get(), 0x8000, 0x8000); + em.extent_map.insert( + *new BlueStore::Extent(0x10000, 0, 0x20000, b3)); // new extent + b3->get_ref(coll.get(), 0, 0x20000); + em.extent_map.insert( + *new BlueStore::Extent(0x30000, 0, 0xf000, b4)); // new extent + b4->get_ref(coll.get(), 0, 0xf000); + em.extent_map.insert(*new BlueStore::Extent(0x3f000, 0x1f000, 0x1000, b1)); + b1->get_ref(coll.get(), 0x1f000, 0x1000); + + old_extents.push_back(*new BlueStore::OldExtent(0x8000, 0x8000, 0x8000, b0)); + old_extents.push_back( + *new BlueStore::OldExtent(0x10000, 0x10000, 0x10000, b0)); + old_extents.push_back( + *new BlueStore::OldExtent(0x20000, 0x00000, 0x1f000, b1)); + + saving = gc.estimate(0x30000, 0xf000, em, old_extents, 0x10000); + ASSERT_EQ(saving, 2); + auto& to_collect = gc.get_extents_to_collect(); + ASSERT_EQ(to_collect.num_intervals(), 2u); + { + auto it1 = to_collect.begin(); + auto it2 = ++to_collect.begin(); + using p = decltype(*it1); + { + auto v1 = p{0x0ul, 0x8000ul}; + auto v2 = p{0x0ul, 0x8000ul}; + ASSERT_TRUE(*it1 == v1 || *it2 == v2); + } + { + auto v1 = p{0x3f000ul, 0x1000ul}; + auto v2 = p{0x3f000ul, 0x1000ul}; + ASSERT_TRUE(*it1 == v1 || 
*it2 == v2); + } + } + + em.clear(); + clear_and_dispose(old_extents); + } +} + +TEST(BlueStoreRepairer, StoreSpaceTracker) +{ + BlueStoreRepairer::StoreSpaceTracker bmap0; + bmap0.init((uint64_t)4096 * 1024 * 1024 * 1024, 0x1000); + ASSERT_EQ(bmap0.granularity, 2 * 1024 * 1024U); + ASSERT_EQ(bmap0.collections_bfs.size(), 2048u * 1024u); + ASSERT_EQ(bmap0.objects_bfs.size(), 2048u * 1024u); + + BlueStoreRepairer::StoreSpaceTracker bmap; + bmap.init(0x2000 * 0x1000 - 1, 0x1000, 512 * 1024); + ASSERT_EQ(bmap.granularity, 0x1000u); + ASSERT_EQ(bmap.collections_bfs.size(), 0x2000u); + ASSERT_EQ(bmap.objects_bfs.size(), 0x2000u); + + coll_t cid; + ghobject_t hoid; + + ASSERT_FALSE(bmap.is_used(cid, 0)); + ASSERT_FALSE(bmap.is_used(hoid, 0)); + bmap.set_used(0, 1, cid, hoid); + ASSERT_TRUE(bmap.is_used(cid, 0)); + ASSERT_TRUE(bmap.is_used(hoid, 0)); + + ASSERT_FALSE(bmap.is_used(cid, 0x1023)); + ASSERT_FALSE(bmap.is_used(hoid, 0x1023)); + ASSERT_FALSE(bmap.is_used(cid, 0x2023)); + ASSERT_FALSE(bmap.is_used(hoid, 0x2023)); + ASSERT_FALSE(bmap.is_used(cid, 0x3023)); + ASSERT_FALSE(bmap.is_used(hoid, 0x3023)); + bmap.set_used(0x1023, 0x3000, cid, hoid); + ASSERT_TRUE(bmap.is_used(cid, 0x1023)); + ASSERT_TRUE(bmap.is_used(hoid, 0x1023)); + ASSERT_TRUE(bmap.is_used(cid, 0x2023)); + ASSERT_TRUE(bmap.is_used(hoid, 0x2023)); + ASSERT_TRUE(bmap.is_used(cid, 0x3023)); + ASSERT_TRUE(bmap.is_used(hoid, 0x3023)); + + ASSERT_FALSE(bmap.is_used(cid, 0x9001)); + ASSERT_FALSE(bmap.is_used(hoid, 0x9001)); + ASSERT_FALSE(bmap.is_used(cid, 0xa001)); + ASSERT_FALSE(bmap.is_used(hoid, 0xa001)); + ASSERT_FALSE(bmap.is_used(cid, 0xb000)); + ASSERT_FALSE(bmap.is_used(hoid, 0xb000)); + ASSERT_FALSE(bmap.is_used(cid, 0xc000)); + ASSERT_FALSE(bmap.is_used(hoid, 0xc000)); + bmap.set_used(0x9001, 0x2fff, cid, hoid); + ASSERT_TRUE(bmap.is_used(cid, 0x9001)); + ASSERT_TRUE(bmap.is_used(hoid, 0x9001)); + ASSERT_TRUE(bmap.is_used(cid, 0xa001)); + ASSERT_TRUE(bmap.is_used(hoid, 0xa001)); + 
// Walks bluestore_blob_t's "unused" tracking through three write sequences.
// The offsets mirror the _do_write_small traces noted in each section.
TEST(bluestore_blob_t, unused)
{
  // Section 1: three writes into a blob allocated at blob offset 0.
  {
    const uint64_t alloc_unit = 64 << 10; // 64 kB
    bluestore_blob_t blob;

    // _do_write_small 0x0~1000
    const uint64_t w1_off = 0x0;
    const uint64_t w1_len = 0x1000; // 4kB
    const uint64_t blob_off_hint = 0;
    PExtentVector pextents;
    pextents.emplace_back(0x1a560000, alloc_unit);
    blob.allocated(p2align(blob_off_hint, alloc_unit), 0 /*no matter*/, pextents);
    blob.mark_used(w1_off, w1_len);
    ASSERT_FALSE(blob.is_unused(w1_off, w1_len));

    // _do_write_small 0x2000~1000
    // The whole 0x10000 range is flagged unused first, so the target range
    // must read as unused until it is marked used.
    const uint64_t w2_off = 0x2000;
    const uint64_t w2_len = 0x1000;
    blob.add_unused(0, 0x10000);
    ASSERT_TRUE(blob.is_unused(w2_off, w2_len));
    blob.mark_used(w2_off, w2_len);
    ASSERT_FALSE(blob.is_unused(w2_off, w2_len));

    // _do_write_small 0xc000~2000
    const uint64_t w3_off = 0xc000;
    const uint64_t w3_len = 0x2000;
    ASSERT_TRUE(blob.is_unused(w3_off, w3_len));
    blob.mark_used(w3_off, w3_len);
    ASSERT_FALSE(blob.is_unused(w3_off, w3_len));
  }

  // Section 2: the first write lands at 0x11000; everything around it is
  // explicitly flagged unused before the write is marked used.
  {
    const uint64_t alloc_unit = 64 << 10; // 64 kB
    bluestore_blob_t blob;

    // _do_write_small 0x11000~1000
    const uint64_t w1_off = 0x11000;
    const uint64_t w1_len = 0x1000; // 4kB
    const uint64_t blob_off_hint = 0x11000;
    PExtentVector pextents;
    pextents.emplace_back(0x1a560000, alloc_unit);
    blob.allocated(p2align(blob_off_hint, alloc_unit), 0 /*no matter*/, pextents);
    blob.add_unused(0, w1_off);
    blob.add_unused(w1_off + w1_len, alloc_unit * 2 - w1_off - w1_len);
    blob.mark_used(w1_off, w1_len);
    ASSERT_FALSE(blob.is_unused(w1_off, w1_len));

    // _do_write_small 0x15000~3000
    const uint64_t w2_off = 0x15000;
    const uint64_t w2_len = 0x3000;
    ASSERT_TRUE(blob.is_unused(w2_off, w2_len));
    blob.mark_used(w2_off, w2_len);
    ASSERT_FALSE(blob.is_unused(w2_off, w2_len));
  }

  // Section 3: blob reuse - a second allocation is added to a blob that
  // already carries unused information.
  {
    const uint64_t alloc_unit = 64 << 10; // 64 kB
    bluestore_blob_t blob;

    // _do_write_small 0x2a000~1000
    // and 0x1d000~1000
    // Granularity the assertions below round to; the offsets and length were
    // picked with this granularity in mind.
    const uint64_t unused_granularity = 0x3000;
    const uint64_t first_off = 0x2a000;
    const uint64_t second_off = 0x1d000;
    const uint64_t write_len = 0x1000; // 4kB

    PExtentVector first_pextents;
    first_pextents.emplace_back(0x410000, alloc_unit);
    blob.allocated(p2align(first_off, alloc_unit), alloc_unit, first_pextents);
    blob.add_unused(0, alloc_unit * 3);
    blob.mark_used(first_off, write_len);
    ASSERT_FALSE(blob.is_unused(first_off, write_len));
    ASSERT_TRUE(blob.is_unused(second_off, write_len));

    PExtentVector second_pextents;
    second_pextents.emplace_back(0x430000, alloc_unit);
    blob.allocated(p2align(second_off, alloc_unit), alloc_unit, second_pextents);
    blob.mark_used(second_off, write_len);
    ASSERT_FALSE(blob.is_unused(first_off, write_len));
    ASSERT_FALSE(blob.is_unused(second_off, write_len));
    ASSERT_FALSE(blob.is_unused(second_off, unused_granularity));

    // Everything outside the two written ranges must still read as unused:
    // the head (rounded down to the granularity), the gap between the two
    // writes, and the tail (rounded up to the granularity).
    ASSERT_TRUE(blob.is_unused(0, second_off / unused_granularity * unused_granularity));
    ASSERT_TRUE(blob.is_unused(second_off + write_len, first_off - second_off - write_len));
    const auto first_end_aligned = round_up_to(first_off + write_len, unused_granularity);
    ASSERT_TRUE(blob.is_unused(first_end_aligned, alloc_unit * 3 - first_end_aligned));
  }
}
when doing a repair + b.map_bl(0x5000, bl, + [&](uint64_t o, bufferlist& bl) { + ASSERT_EQ(o, expected[expected_pos][0]); + ASSERT_EQ(bl.length(), expected[expected_pos][1]); + ++expected_pos; + }); + ASSERT_EQ(expected_pos, num_expected_entries); + } +} + +//--------------------------------------------------------------------------------- +static int verify_extent(const extent_t & ext, const extent_t *ext_arr, uint64_t ext_arr_size, uint64_t idx) +{ + const extent_t & ext_ref = ext_arr[idx]; + if (ext.offset == ext_ref.offset && ext.length == ext_ref.length) { + return 0; + } else { + std::cerr << "mismatch was found at index " << idx << std::endl; + if (ext.length == 0) { + std::cerr << "Null extent was returned at idx = " << idx << std::endl; + } + unsigned start = std::max(((int32_t)(idx)-3), 0); + unsigned end = std::min(idx+3, ext_arr_size); + for (unsigned j = start; j < end; j++) { + const extent_t & ext_ref = ext_arr[j]; + std::cerr << j << ") ref_ext = [" << ext_ref.offset << ", " << ext_ref.length << "]" << std::endl; + } + std::cerr << idx << ") ext = [" << ext.offset << ", " << ext.length << "]" << std::endl; + return -1; + } +} + +//--------------------------------------------------------------------------------- +static int test_extents(uint64_t index, extent_t *ext_arr, uint64_t ext_arr_size, SimpleBitmap& sbmap, bool set) +{ + const uint64_t MAX_JUMP_BIG = 1523; + const uint64_t MAX_JUMP_SMALL = 19; + const uint64_t MAX_LEN_BIG = 523; + const uint64_t MAX_LEN_SMALL = 23; + + uint64_t n = sbmap.get_size(); + uint64_t offset = 0; + unsigned length, jump, i; + for (i = 0; i < ext_arr_size; i++) { + if (i & 3) { + jump = std::rand() % MAX_JUMP_BIG; + } else { + jump = std::rand() % MAX_JUMP_SMALL; + } + offset += jump; + if (i & 1) { + length = std::rand() % MAX_LEN_BIG; + } else { + length = std::rand() % MAX_LEN_SMALL; + } + // make sure no zero length will be used + length++; + if (offset + length >= n) { + break; + } + + bool success; + if (set) { 
+ success = sbmap.set(offset, length); + } else { + success = sbmap.clr(offset, length); + } + if (!success) { + std::cerr << "Failed sbmap." << (set ? "set(" : "clr(") << offset << ", " << length << ")"<< std::endl; + return -1; + } + + // if this is not the first entry and no jump -> merge extents + if ( (i==0) || (jump > 0) ) { + ext_arr[i] = {offset, length}; + } else { + // merge 2 extents + i --; + ext_arr[i].length += length; + } + offset += length; + } + unsigned arr_size = std::min((uint64_t)i, ext_arr_size); + std::cout << std::hex << std::right; + std::cout << "[" << index << "] " << (set ? "Set::" : "Clr::") << " extents count = 0x" << arr_size; + std::cout << std::dec << std::endl; + + offset = 0; + extent_t ext; + for(unsigned i = 0; i < arr_size; i++) { + if (set) { + ext = sbmap.get_next_set_extent(offset); + } else { + ext = sbmap.get_next_clr_extent(offset); + } + + if (verify_extent(ext, ext_arr, ext_arr_size, i) != 0) { + return -1; + } + offset = ext.offset + ext.length; + } + + if (set) { + ext = sbmap.get_next_set_extent(offset); + } else { + ext = sbmap.get_next_clr_extent(offset); + } + if (ext.length == 0) { + return 0; + } else { + std::cerr << "sbmap.get_next_" << (set ? 
"set" : "clr") << "_extent(" << offset << ") return length = " << ext.length << std::endl; + return -1; + } +} + +//--------------------------------------------------------------------------------- +TEST(SimpleBitmap, basic) +{ + const uint64_t MAX_EXTENTS_COUNT = 7131177; + std::unique_ptr<extent_t[]> ext_arr = std::make_unique<extent_t[]>(MAX_EXTENTS_COUNT); + ASSERT_TRUE(ext_arr != nullptr); + const uint64_t BIT_COUNT = 4ULL << 30; // 4Gb = 512MB + SimpleBitmap sbmap(g_ceph_context, BIT_COUNT); + + // use current time as seed for random generator + std::srand(std::time(nullptr)); + for (unsigned i = 0; i < 3; i++ ) { + memset(ext_arr.get(), 0, sizeof(extent_t)*MAX_EXTENTS_COUNT); + sbmap.clear_all(); + ASSERT_TRUE(test_extents(i, ext_arr.get(), MAX_EXTENTS_COUNT, sbmap, true) == 0); + + memset(ext_arr.get(), 0, sizeof(extent_t)*MAX_EXTENTS_COUNT); + sbmap.set_all(); + ASSERT_TRUE(test_extents(i, ext_arr.get(), MAX_EXTENTS_COUNT, sbmap, false) == 0); + } +} + +//--------------------------------------------------------------------------------- +static int test_intersections(unsigned test_idx, SimpleBitmap &sbmap, uint8_t map[], uint64_t map_size) +{ + const uint64_t MAX_LEN_BIG = 523; + const uint64_t MAX_LEN_SMALL = 23; + + bool success; + uint64_t set_op_count = 0, clr_op_count = 0; + unsigned length, i; + for (i = 0; i < map_size / (MAX_LEN_BIG*2); i++) { + uint64_t offset = (std::rand() % (map_size - 1)); + if (i & 1) { + length = std::rand() % MAX_LEN_BIG; + } else { + length = std::rand() % MAX_LEN_SMALL; + } + // make sure no zero length will be used + length++; + if (offset + length >= map_size) { + continue; + } + // 2:1 set/clr + bool set = (std::rand() % 3); + if (set) { + success = sbmap.set(offset, length); + memset(map+offset, 0xFF, length); + set_op_count++; + } else { + success = sbmap.clr(offset, length); + memset(map+offset, 0x0, length); + clr_op_count++; + } + if (!success) { + std::cerr << "Failed sbmap." << (set ? 
"set(" : "clr(") << offset << ", " << length << ")"<< std::endl; + return -1; + } + } + + uint64_t set_bit_count = 0; + uint64_t clr_bit_count = 0; + for(uint64_t idx = 0; idx < map_size; idx++) { + if (map[idx]) { + set_bit_count++; + success = sbmap.bit_is_set(idx); + } else { + clr_bit_count++; + success = sbmap.bit_is_clr(idx); + } + if (!success) { + std::cerr << "expected: sbmap.bit_is_" << (map[idx] ? "set(" : "clr(") << idx << ")"<< std::endl; + return -1; + } + + } + std::cout << std::hex << std::right << __func__ ; + std::cout << " [" << test_idx << "] set_bit_count = 0x" << std::setfill('0') << std::setw(8) << set_bit_count + << ", clr_bit_count = 0x" << std::setfill('0') << std::setw(8) << clr_bit_count + << ", sum = 0x" << set_bit_count + clr_bit_count << std::endl; + std::cout << std::dec; + uint64_t offset = 0; + for(uint64_t i = 0; i < (set_op_count + clr_op_count); i++) { + extent_t ext = sbmap.get_next_set_extent(offset); + //std::cout << "set_ext:: " << i << ") [" << ext.offset << ", " << ext.length << "]" << std::endl; + for (uint64_t idx = ext.offset; idx < ext.offset + ext.length; idx++) { + if (map[idx] != 0xFF) { + std::cerr << "map[" << idx << "] is clear, but extent [" << ext.offset << ", " << ext.length << "] is set" << std::endl; + return -1; + } + } + offset = ext.offset + ext.length; + } + + offset = 0; + for(uint64_t i = 0; i < (set_op_count + clr_op_count); i++) { + extent_t ext = sbmap.get_next_clr_extent(offset); + //std::cout << "clr_ext:: " << i << ") [" << ext.offset << ", " << ext.length << "]" << std::endl; + for (uint64_t idx = ext.offset; idx < ext.offset + ext.length; idx++) { + if (map[idx] ) { + std::cerr << "map[" << idx << "] is set, but extent [" << ext.offset << ", " << ext.length << "] is free" << std::endl; + return -1; + } + } + offset = ext.offset + ext.length; + } + + return 0; +} + +//--------------------------------------------------------------------------------- +TEST(SimpleBitmap, intersection) +{ + const 
uint64_t MAP_SIZE = 1ULL << 30; // 1G + SimpleBitmap sbmap(g_ceph_context, MAP_SIZE); + + // use current time as seed for random generator + std::srand(std::time(nullptr)); + + std::unique_ptr<uint8_t[]> map = std::make_unique<uint8_t[]> (MAP_SIZE); + ASSERT_TRUE(map != nullptr); + + for (unsigned i = 0; i < 1; i++ ) { + sbmap.clear_all(); + memset(map.get(), 0, MAP_SIZE); + ASSERT_TRUE(test_intersections(i, sbmap, map.get(), MAP_SIZE) == 0); + + sbmap.set_all(); + memset(map.get(), 0xFF, MAP_SIZE); + ASSERT_TRUE(test_intersections(i, sbmap, map.get(), MAP_SIZE) == 0); + } +} + + +//--------------------------------------------------------------------------------- +static int test_extents_boundaries(uint64_t index, extent_t *ext_arr, uint64_t ext_arr_size, SimpleBitmap& sbmap, bool set) +{ + uint64_t n = sbmap.get_size(); + uint64_t offset = 0, k = 0; + for(unsigned i = 0; i < 64; i++) { + offset += i; + if (offset >= n) { + break; + } + + for(unsigned length = 1; length <= 128; length++) { + if (offset + length >= n) { + break; + } + + if (k >= ext_arr_size) { + break; + } + bool success; + if (set) { + success = sbmap.set(offset, length); + } else { + success = sbmap.clr(offset, length); + } + if (!success) { + std::cerr << "Failed sbmap." << (set ? "set(" : "clr(") << offset << ", " << length << ")"<< std::endl; + return -1; + } + ext_arr[k++] = {offset, length}; + if (length < 64) { + offset += 64; + } else { + offset += 128; + } + } + if (k >= ext_arr_size) { + break; + } + } + + unsigned arr_size = std::min((uint64_t)k, ext_arr_size); + std::cout << std::hex << std::right << __func__ ; + std::cout << " [" << index << "] " << (set ? 
"Set::" : "Clr::") << " extents count = 0x" << arr_size; + std::cout << std::dec << std::endl; + + offset = 0; + extent_t ext; + for(unsigned i = 0; i < arr_size; i++) { + if (set) { + ext = sbmap.get_next_set_extent(offset); + } else { + ext = sbmap.get_next_clr_extent(offset); + } + + if (verify_extent(ext, ext_arr, ext_arr_size, i) != 0) { + return -1; + } + offset = ext.offset + ext.length; + } + + if (set) { + ext = sbmap.get_next_set_extent(offset); + } else { + ext = sbmap.get_next_clr_extent(offset); + } + if (ext.length == 0) { + return 0; + } else { + std::cerr << "sbmap.get_next_" << (set ? "set" : "clr") << "_extent(" << offset << ") return length = " << ext.length << std::endl; + return -1; + } + +} + +//--------------------------------------------------------------------------------- +TEST(SimpleBitmap, boundaries) +{ + const uint64_t MAX_EXTENTS_COUNT = 64 << 10; + std::unique_ptr<extent_t[]> ext_arr = std::make_unique<extent_t[]>(MAX_EXTENTS_COUNT); + ASSERT_TRUE(ext_arr != nullptr); + + // use current time as seed for random generator + std::srand(std::time(nullptr)); + + uint64_t bit_count = 32 << 20; // 32Mb = 4MB + unsigned count = 0; + for (unsigned i = 0; i < 64; i++) { + SimpleBitmap sbmap(g_ceph_context, bit_count+i); + memset(ext_arr.get(), 0, sizeof(extent_t)*MAX_EXTENTS_COUNT); + sbmap.clear_all(); + ASSERT_TRUE(test_extents_boundaries(count, ext_arr.get(), MAX_EXTENTS_COUNT, sbmap, true) == 0); + + memset(ext_arr.get(), 0, sizeof(extent_t)*MAX_EXTENTS_COUNT); + sbmap.set_all(); + ASSERT_TRUE(test_extents_boundaries(count++, ext_arr.get(), MAX_EXTENTS_COUNT, sbmap, false) == 0); + } +} + +//--------------------------------------------------------------------------------- +TEST(SimpleBitmap, boundaries2) +{ + const uint64_t bit_count_base = 64 << 10; // 64Kb = 8MB + const extent_t null_extent = {0, 0}; + + for (unsigned i = 0; i < 64; i++) { + uint64_t bit_count = bit_count_base + i; + extent_t full_extent = {0, bit_count}; + SimpleBitmap 
sbmap(g_ceph_context, bit_count); + + sbmap.set(0, bit_count); + ASSERT_TRUE(sbmap.get_next_set_extent(0) == full_extent); + ASSERT_TRUE(sbmap.get_next_clr_extent(0) == null_extent); + + for (uint64_t bit = 0; bit < bit_count; bit++) { + sbmap.clr(bit, 1); + } + ASSERT_TRUE(sbmap.get_next_set_extent(0) == null_extent); + ASSERT_TRUE(sbmap.get_next_clr_extent(0) == full_extent); + + for (uint64_t bit = 0; bit < bit_count; bit++) { + sbmap.set(bit, 1); + } + ASSERT_TRUE(sbmap.get_next_set_extent(0) == full_extent); + ASSERT_TRUE(sbmap.get_next_clr_extent(0) == null_extent); + + sbmap.clr(0, bit_count); + ASSERT_TRUE(sbmap.get_next_set_extent(0) == null_extent); + ASSERT_TRUE(sbmap.get_next_clr_extent(0) == full_extent); + } +} + +TEST(shared_blob_2hash_tracker_t, basic_test) +{ + shared_blob_2hash_tracker_t t1(1024 * 1024, 4096); + + ASSERT_TRUE(t1.count_non_zero() == 0); + + t1.inc(0, 0, 1); + ASSERT_TRUE(t1.count_non_zero() != 0); + t1.inc(0, 0, -1); + ASSERT_TRUE(t1.count_non_zero() == 0); + + t1.inc(3, 0x1000, 2); + ASSERT_TRUE(t1.count_non_zero() != 0); + t1.inc(3, 0x1000, -1); + ASSERT_TRUE(t1.count_non_zero() != 0); + t1.inc(3, 0x1000, -1); + ASSERT_TRUE(t1.count_non_zero() == 0); + + t1.inc(2, 0x2000, 5); + ASSERT_TRUE(t1.count_non_zero() != 0); + t1.inc(18, 0x2000, -5); + ASSERT_TRUE(t1.count_non_zero() != 0); + t1.inc(18, 0x2000, 1); + ASSERT_TRUE(t1.count_non_zero() != 0); + t1.inc(2, 0x2000, -1); + ASSERT_TRUE(t1.count_non_zero() != 0); + t1.inc(18, 0x2000, 4); + ASSERT_TRUE(t1.count_non_zero() != 0); + t1.inc(2, 0x2000, -4); + ASSERT_TRUE(t1.count_non_zero() == 0); + + t1.inc(3, 0x3000, 2); + ASSERT_TRUE(t1.count_non_zero() != 0); + t1.inc(4, 0x3000, -1); + ASSERT_TRUE(t1.count_non_zero() != 0); + t1.inc(4, 0x3000, -1); + ASSERT_TRUE(t1.count_non_zero() != 0); + t1.inc(3, 0x3000, -2); + ASSERT_TRUE(t1.count_non_zero() != 0); + t1.inc(4, 0x3000, 1); + ASSERT_TRUE(t1.count_non_zero() != 0); + t1.inc(4, 0x3000, 1); + ASSERT_TRUE(t1.count_non_zero() == 0); + 
+ t1.inc(5, 0x1000, 1); + t1.inc(5, 0x2000, 3); + t1.inc(5, 0x3000, 2); + t1.inc(5, 0x8000, 1); + + ASSERT_TRUE(t1.count_non_zero() != 0); + + ASSERT_TRUE(!t1.test_all_zero(5,0x1000)); + ASSERT_TRUE(!t1.test_all_zero(5, 0x2000)); + ASSERT_TRUE(!t1.test_all_zero(5, 0x3000)); + ASSERT_TRUE(t1.test_all_zero(5, 0x4000)); + ASSERT_TRUE(!t1.test_all_zero(5, 0x8000)); + + ASSERT_TRUE(t1.test_all_zero_range(5, 0, 0x1000)); + ASSERT_TRUE(t1.test_all_zero_range(5, 0x500, 0x500)); + ASSERT_TRUE(!t1.test_all_zero_range(5, 0x500, 0x1500)); + ASSERT_TRUE(!t1.test_all_zero_range(5, 0x1500, 0x3200)); + ASSERT_TRUE(t1.test_all_zero_range(5, 0x4500, 0x1500)); + ASSERT_TRUE(t1.test_all_zero_range(5, 0x4500, 0x3b00)); + ASSERT_TRUE(!t1.test_all_zero_range(5, 0, 0x9000)); +} + +TEST(bluestore_blob_use_tracker_t, mempool_stats_test) +{ + using mempool::bluestore_cache_other::allocated_items; + using mempool::bluestore_cache_other::allocated_bytes; + uint64_t other_items0 = allocated_items(); + uint64_t other_bytes0 = allocated_bytes(); + { + bluestore_blob_use_tracker_t* t1 = new bluestore_blob_use_tracker_t; + + t1->init(1024 * 1024, 4096); + ASSERT_EQ(256, allocated_items() - other_items0); // = 1M / 4K + ASSERT_EQ(1024, allocated_bytes() - other_bytes0); // = 1M / 4K * 4 + + delete t1; + ASSERT_EQ(allocated_items(), other_items0); + ASSERT_EQ(allocated_bytes(), other_bytes0); + } + { + bluestore_blob_use_tracker_t* t1 = new bluestore_blob_use_tracker_t; + + t1->init(1024 * 1024, 4096); + t1->add_tail(2048 * 1024, 4096); + // proper stats update after tail add + ASSERT_EQ(512, allocated_items() - other_items0); // = 2M / 4K + ASSERT_EQ(2048, allocated_bytes() - other_bytes0); // = 2M / 4K * 4 + + delete t1; + ASSERT_EQ(allocated_items(), other_items0); + ASSERT_EQ(allocated_bytes(), other_bytes0); + } + { + bluestore_blob_use_tracker_t* t1 = new bluestore_blob_use_tracker_t; + + t1->init(1024 * 1024, 4096); + t1->prune_tail(512 * 1024); + // no changes in stats after pruning + 
ASSERT_EQ(256, allocated_items() - other_items0); // = 1M / 4K + ASSERT_EQ(1024, allocated_bytes() - other_bytes0); // = 1M / 4K * 4 + + delete t1; + ASSERT_EQ(allocated_items(), other_items0); + ASSERT_EQ(allocated_bytes(), other_bytes0); + } + { + bluestore_blob_use_tracker_t* t1 = new bluestore_blob_use_tracker_t; + bluestore_blob_use_tracker_t* t2 = new bluestore_blob_use_tracker_t; + + t1->init(1024 * 1024, 4096); + + // t1 keeps the same amount of entries + t2 has got half of them + t1->split(512 * 1024, t2); + ASSERT_EQ(256 + 128, allocated_items() - other_items0); //= 1M / 4K*1.5 + ASSERT_EQ(1024 + 512, allocated_bytes() - other_bytes0); //= 1M / 4K*4*1.5 + + // t1 & t2 release everything, then t2 get one less entry than t2 had had + // before + t1->split(4096, t2); + ASSERT_EQ(127, allocated_items() - other_items0); // = 512K / 4K - 1 + ASSERT_EQ(127 * 4, allocated_bytes() - other_bytes0); // = 512L / 4K * 4 - 4 + delete t1; + delete t2; + ASSERT_EQ(allocated_items(), other_items0); + ASSERT_EQ(allocated_bytes(), other_bytes0); + } +} + +int main(int argc, char **argv) { + auto args = argv_to_vec(argc, argv); + auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, + CODE_ENVIRONMENT_UTILITY, + CINIT_FLAG_NO_DEFAULT_CONFIG_FILE); + common_init_finish(g_ceph_context); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/test/objectstore/test_deferred.cc b/src/test/objectstore/test_deferred.cc new file mode 100644 index 000000000..1b5608101 --- /dev/null +++ b/src/test/objectstore/test_deferred.cc @@ -0,0 +1,146 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <stdio.h> +#include <string.h> +#include <iostream> +#include <memory> +#include <time.h> + +#include "os/ObjectStore.h" +#include "os/bluestore/BlueStore.h" +#include "include/Context.h" +#include "common/ceph_argparse.h" +#include "global/global_init.h" +#include "common/ceph_mutex.h" +#include 
"common/Cond.h" +#include "common/errno.h" +#include "common/options.h" // for the size literals +#include <semaphore.h> + + + +class C_do_action : public Context { +public: + std::function<void()> action; + C_do_action(std::function<void()> action) + : action(action) {} + + void finish(int r) override { + action(); + } +}; + +void create_deferred_and_terminate() { + std::unique_ptr<ObjectStore> store; + + g_ceph_context->_conf._clear_safe_to_start_threads(); + g_ceph_context->_conf.set_val_or_die("bluestore_prefer_deferred_size", "4096"); + g_ceph_context->_conf.set_val_or_die("bluestore_allocator", "bitmap"); + g_ceph_context->_conf.set_val_or_die("bluestore_block_size", "10240000000"); + g_ceph_context->_conf.apply_changes(nullptr); + + int64_t poolid; + coll_t cid; + ghobject_t hoid; + ObjectStore::CollectionHandle ch; + ceph_assert(::mkdir("bluestore.test_temp_dir", 0777) == 0); + store = ObjectStore::create(g_ceph_context, + "bluestore", + "bluestore.test_temp_dir", + "store_test_temp_journal"); + ceph_assert(store->mkfs() == 0); + ceph_assert(store->mount() == 0); + + poolid = 11; + cid = coll_t(spg_t(pg_t(1, poolid), shard_id_t::NO_SHARD)); + ch = store->create_new_collection(cid); + int r; + { + ObjectStore::Transaction t; + t.create_collection(cid, 0); + r = store->queue_transaction(ch, std::move(t)); + ceph_assert(r == 0); + } + + { + ObjectStore::Transaction t; + std::string oid = "zapchajdziura"; + ghobject_t hoid(hobject_t(oid, "", CEPH_NOSNAP, 1, poolid, "")); + bufferlist bl; + bl.append(std::string(0xe000, '-')); + t.write(cid, hoid, 0, 0xe000, bl); + r = store->queue_transaction(ch, std::move(t)); + ceph_assert(r == 0); + } + + size_t object_count = 10; + + // initial fill + bufferlist bl_64K; + bl_64K.append(std::string(64 * 1024, '-')); + + std::atomic<size_t> prefill_counter{0}; + sem_t prefill_mutex; + sem_init(&prefill_mutex, 0, 0); + + for (size_t o = 0; o < object_count; o++) { + ObjectStore::Transaction t; + std::string oid = "object-" + 
std::to_string(o); + ghobject_t hoid(hobject_t(oid, "", CEPH_NOSNAP, 1, poolid, "")); + + t.write(cid, hoid, 0, bl_64K.length(), bl_64K); + t.register_on_commit(new C_do_action([&] { + if (++prefill_counter == object_count) { + sem_post(&prefill_mutex); + } + })); + + r = store->queue_transaction(ch, std::move(t)); + ceph_assert(r == 0); + } + sem_wait(&prefill_mutex); + + // small deferred writes over object + // and complete overwrite of previous one + bufferlist bl_8_bytes; + bl_8_bytes.append("abcdefgh"); + std::atomic<size_t> deferred_counter{0}; + for (size_t o = 0; o < object_count - 1; o++) { + ObjectStore::Transaction t; + + // sprinkle deferred writes + std::string oid_d = "object-" + std::to_string(o + 1); + ghobject_t hoid_d(hobject_t(oid_d, "", CEPH_NOSNAP, 1, poolid, "")); + + for(int i = 0; i < 16; i++) { + t.write(cid, hoid_d, 4096 * i, bl_8_bytes.length(), bl_8_bytes); + } + + // overwrite previous object + std::string oid_m = "object-" + std::to_string(o); + ghobject_t hoid_m(hobject_t(oid_m, "", CEPH_NOSNAP, 1, poolid, "")); + t.write(cid, hoid_m, 0, bl_64K.length(), bl_64K); + + t.register_on_commit(new C_do_action([&] { + if (++deferred_counter == object_count - 1) { + exit(0); + } + })); + r = store->queue_transaction(ch, std::move(t)); + ceph_assert(r == 0); + } + sleep(10); + ceph_assert(0 && "should not reach here"); +} + +int main(int argc, char **argv) { + auto args = argv_to_vec(argc, argv); + auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, + CODE_ENVIRONMENT_UTILITY, + CINIT_FLAG_NO_DEFAULT_CONFIG_FILE); + common_init_finish(g_ceph_context); + + create_deferred_and_terminate(); + return 0; +} diff --git a/src/test/objectstore/test_kv.cc b/src/test/objectstore/test_kv.cc new file mode 100644 index 000000000..33ffd6ab3 --- /dev/null +++ b/src/test/objectstore/test_kv.cc @@ -0,0 +1,1304 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file 
system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include <stdio.h> +#include <string.h> +#include <iostream> +#include <time.h> +#include <sys/mount.h> +#include "kv/KeyValueDB.h" +#include "kv/RocksDBStore.h" +#include "include/Context.h" +#include "common/ceph_argparse.h" +#include "global/global_init.h" +#include "common/Cond.h" +#include "common/errno.h" +#include "include/stringify.h" +#include <gtest/gtest.h> + +using namespace std; + +class KVTest : public ::testing::TestWithParam<const char*> { +public: + boost::scoped_ptr<KeyValueDB> db; + + KVTest() : db(0) {} + + string _bl_to_str(bufferlist val) { + string str(val.c_str(), val.length()); + return str; + } + + void rm_r(string path) { + string cmd = string("rm -r ") + path; + cout << "==> " << cmd << std::endl; + int r = ::system(cmd.c_str()); + if (r) { + cerr << "failed with exit code " << r + << ", continuing anyway" << std::endl; + } + } + + void init() { + cout << "Creating " << string(GetParam()) << "\n"; + db.reset(KeyValueDB::create(g_ceph_context, string(GetParam()), + "kv_test_temp_dir")); + } + void fini() { + db.reset(NULL); + } + + void SetUp() override { + int r = ::mkdir("kv_test_temp_dir", 0777); + if (r < 0 && errno != EEXIST) { + r = -errno; + cerr << __func__ << ": unable to create kv_test_temp_dir: " + << cpp_strerror(r) << std::endl; + return; + } + init(); + } + void TearDown() override { + fini(); + rm_r("kv_test_temp_dir"); + } +}; + +TEST_P(KVTest, OpenClose) { + ASSERT_EQ(0, db->create_and_open(cout)); + db->close(); + db->open(cout); + fini(); +} + +TEST_P(KVTest, OpenCloseReopenClose) { + ASSERT_EQ(0, db->create_and_open(cout)); + fini(); + init(); + ASSERT_EQ(0, db->open(cout)); + fini(); +} + +/* + * Basic write and read test 
case in same database session. + */ +TEST_P(KVTest, OpenWriteRead) { + ASSERT_EQ(0, db->create_and_open(cout)); + { + KeyValueDB::Transaction t = db->get_transaction(); + bufferlist value; + value.append("value"); + t->set("prefix", "key", value); + value.clear(); + value.append("value2"); + t->set("prefix", "key2", value); + value.clear(); + value.append("value3"); + t->set("prefix", "key3", value); + db->submit_transaction_sync(t); + + bufferlist v1, v2; + ASSERT_EQ(0, db->get("prefix", "key", &v1)); + ASSERT_EQ(v1.length(), 5u); + (v1.c_str())[v1.length()] = 0x0; + ASSERT_EQ(std::string(v1.c_str()), std::string("value")); + ASSERT_EQ(0, db->get("prefix", "key2", &v2)); + ASSERT_EQ(v2.length(), 6u); + (v2.c_str())[v2.length()] = 0x0; + ASSERT_EQ(std::string(v2.c_str()), std::string("value2")); + } + fini(); +} + +TEST_P(KVTest, PutReopen) { + ASSERT_EQ(0, db->create_and_open(cout)); + { + KeyValueDB::Transaction t = db->get_transaction(); + bufferlist value; + value.append("value"); + t->set("prefix", "key", value); + t->set("prefix", "key2", value); + t->set("prefix", "key3", value); + db->submit_transaction_sync(t); + } + fini(); + + init(); + ASSERT_EQ(0, db->open(cout)); + { + bufferlist v1, v2; + ASSERT_EQ(0, db->get("prefix", "key", &v1)); + ASSERT_EQ(v1.length(), 5u); + ASSERT_EQ(0, db->get("prefix", "key2", &v2)); + ASSERT_EQ(v2.length(), 5u); + } + { + KeyValueDB::Transaction t = db->get_transaction(); + t->rmkey("prefix", "key"); + t->rmkey("prefix", "key3"); + db->submit_transaction_sync(t); + } + fini(); + + init(); + ASSERT_EQ(0, db->open(cout)); + { + bufferlist v1, v2, v3; + ASSERT_EQ(-ENOENT, db->get("prefix", "key", &v1)); + ASSERT_EQ(0, db->get("prefix", "key2", &v2)); + ASSERT_EQ(v2.length(), 5u); + ASSERT_EQ(-ENOENT, db->get("prefix", "key3", &v3)); + } + fini(); +} + +TEST_P(KVTest, BenchCommit) { + int n = 1024; + ASSERT_EQ(0, db->create_and_open(cout)); + utime_t start = ceph_clock_now(); + { + cout << "priming" << std::endl; + // prime + 
bufferlist big; + bufferptr bp(1048576); + bp.zero(); + big.append(bp); + for (int i=0; i<30; ++i) { + KeyValueDB::Transaction t = db->get_transaction(); + t->set("prefix", "big" + stringify(i), big); + db->submit_transaction_sync(t); + } + } + cout << "now doing small writes" << std::endl; + bufferlist data; + bufferptr bp(1024); + bp.zero(); + data.append(bp); + for (int i=0; i<n; ++i) { + KeyValueDB::Transaction t = db->get_transaction(); + t->set("prefix", "key" + stringify(i), data); + db->submit_transaction_sync(t); + } + utime_t end = ceph_clock_now(); + utime_t dur = end - start; + cout << n << " commits in " << dur << ", avg latency " << (dur / (double)n) + << std::endl; + fini(); +} + +struct AppendMOP : public KeyValueDB::MergeOperator { + void merge_nonexistent( + const char *rdata, size_t rlen, std::string *new_value) override { + *new_value = "?" + std::string(rdata, rlen); + } + void merge( + const char *ldata, size_t llen, + const char *rdata, size_t rlen, + std::string *new_value) override { + *new_value = std::string(ldata, llen) + std::string(rdata, rlen); + } + // We use each operator name and each prefix to construct the + // overall RocksDB operator name for consistency check at open time. 
+ const char *name() const override { + return "Append"; + } +}; + +string tostr(bufferlist& b) { + return string(b.c_str(),b.length()); +} + +TEST_P(KVTest, Merge) { + shared_ptr<KeyValueDB::MergeOperator> p(new AppendMOP); + int r = db->set_merge_operator("A",p); + if (r < 0) + return; // No merge operators for this database type + ASSERT_EQ(0, db->create_and_open(cout)); + { + KeyValueDB::Transaction t = db->get_transaction(); + bufferlist v1, v2, v3; + v1.append(string("1")); + v2.append(string("2")); + v3.append(string("3")); + t->set("P", "K1", v1); + t->set("A", "A1", v2); + t->rmkey("A", "A2"); + t->merge("A", "A2", v3); + db->submit_transaction_sync(t); + } + { + bufferlist v1, v2, v3; + ASSERT_EQ(0, db->get("P", "K1", &v1)); + ASSERT_EQ(tostr(v1), "1"); + ASSERT_EQ(0, db->get("A", "A1", &v2)); + ASSERT_EQ(tostr(v2), "2"); + ASSERT_EQ(0, db->get("A", "A2", &v3)); + ASSERT_EQ(tostr(v3), "?3"); + } + { + KeyValueDB::Transaction t = db->get_transaction(); + bufferlist v1; + v1.append(string("1")); + t->merge("A", "A2", v1); + db->submit_transaction_sync(t); + } + { + bufferlist v; + ASSERT_EQ(0, db->get("A", "A2", &v)); + ASSERT_EQ(tostr(v), "?31"); + } + fini(); +} + +TEST_P(KVTest, RMRange) { + ASSERT_EQ(0, db->create_and_open(cout)); + bufferlist value; + value.append("value"); + { + KeyValueDB::Transaction t = db->get_transaction(); + t->set("prefix", "key1", value); + t->set("prefix", "key2", value); + t->set("prefix", "key3", value); + t->set("prefix", "key4", value); + t->set("prefix", "key45", value); + t->set("prefix", "key5", value); + t->set("prefix", "key6", value); + db->submit_transaction_sync(t); + } + + { + KeyValueDB::Transaction t = db->get_transaction(); + t->set("prefix", "key7", value); + t->set("prefix", "key8", value); + t->rm_range_keys("prefix", "key2", "key7"); + db->submit_transaction_sync(t); + bufferlist v1, v2; + ASSERT_EQ(0, db->get("prefix", "key1", &v1)); + v1.clear(); + ASSERT_EQ(-ENOENT, db->get("prefix", "key45", &v1)); + 
ASSERT_EQ(0, db->get("prefix", "key8", &v1)); + v1.clear(); + ASSERT_EQ(-ENOENT, db->get("prefix", "key2", &v1)); + ASSERT_EQ(0, db->get("prefix", "key7", &v2)); + } + + { + KeyValueDB::Transaction t = db->get_transaction(); + t->rm_range_keys("prefix", "key", "key"); + db->submit_transaction_sync(t); + bufferlist v1, v2; + ASSERT_EQ(0, db->get("prefix", "key1", &v1)); + ASSERT_EQ(0, db->get("prefix", "key8", &v2)); + } + + { + KeyValueDB::Transaction t = db->get_transaction(); + t->rm_range_keys("prefix", "key-", "key~"); + db->submit_transaction_sync(t); + bufferlist v1, v2; + ASSERT_EQ(-ENOENT, db->get("prefix", "key1", &v1)); + ASSERT_EQ(-ENOENT, db->get("prefix", "key8", &v2)); + } + + fini(); +} + +TEST_P(KVTest, ShardingRMRange) { + if(string(GetParam()) != "rocksdb") + return; + std::string cfs("O(7)="); + ASSERT_EQ(0, db->create_and_open(cout, cfs)); + { + KeyValueDB::Transaction t = db->get_transaction(); + for (size_t i = 0; i < 1000; i++) { + bufferlist value; + char* a; + ASSERT_EQ(asprintf(&a, "key%3.3ld", i), 6); + value.append(a); + t->set("O", a, value); + free(a); + } + db->submit_transaction_sync(t); + } + + { + KeyValueDB::Transaction t = db->get_transaction(); + t->rm_range_keys("O", "key277", "key467"); + db->submit_transaction_sync(t); + } + + for (size_t i = 0; i < 1000; i++) { + char* key; + ASSERT_EQ(asprintf(&key, "key%3.3ld", i), 6); + bufferlist value; + int r = db->get("O", key, &value); + ASSERT_EQ(r, (i >= 277 && i < 467 ? 
-ENOENT : 0)); + free(key); + } + + fini(); +} + + +TEST_P(KVTest, RocksDBColumnFamilyTest) { + if(string(GetParam()) != "rocksdb") + return; + + std::string cfs("cf1 cf2"); + ASSERT_EQ(0, db->init(g_conf()->bluestore_rocksdb_options)); + cout << "creating two column families and opening them" << std::endl; + ASSERT_EQ(0, db->create_and_open(cout, cfs)); + { + KeyValueDB::Transaction t = db->get_transaction(); + bufferlist value; + value.append("value"); + cout << "write a transaction includes three keys in different CFs" << std::endl; + t->set("prefix", "key", value); + t->set("cf1", "key", value); + t->set("cf2", "key2", value); + ASSERT_EQ(0, db->submit_transaction_sync(t)); + } + fini(); + + init(); + ASSERT_EQ(0, db->open(cout, cfs)); + { + bufferlist v1, v2, v3; + cout << "reopen db and read those keys" << std::endl; + ASSERT_EQ(0, db->get("prefix", "key", &v1)); + ASSERT_EQ(0, _bl_to_str(v1) != "value"); + ASSERT_EQ(0, db->get("cf1", "key", &v2)); + ASSERT_EQ(0, _bl_to_str(v2) != "value"); + ASSERT_EQ(0, db->get("cf2", "key2", &v3)); + ASSERT_EQ(0, _bl_to_str(v2) != "value"); + } + { + cout << "delete two keys in CFs" << std::endl; + KeyValueDB::Transaction t = db->get_transaction(); + t->rmkey("prefix", "key"); + t->rmkey("cf2", "key2"); + ASSERT_EQ(0, db->submit_transaction_sync(t)); + } + fini(); + + init(); + ASSERT_EQ(0, db->open(cout, cfs)); + { + cout << "reopen db and read keys again." 
<< std::endl; + bufferlist v1, v2, v3; + ASSERT_EQ(-ENOENT, db->get("prefix", "key", &v1)); + ASSERT_EQ(0, db->get("cf1", "key", &v2)); + ASSERT_EQ(0, _bl_to_str(v2) != "value"); + ASSERT_EQ(-ENOENT, db->get("cf2", "key2", &v3)); + } + fini(); +} + +TEST_P(KVTest, RocksDBIteratorTest) { + if(string(GetParam()) != "rocksdb") + return; + + std::string cfs("cf1"); + ASSERT_EQ(0, db->init(g_conf()->bluestore_rocksdb_options)); + cout << "creating one column family and opening it" << std::endl; + ASSERT_EQ(0, db->create_and_open(cout, cfs)); + { + KeyValueDB::Transaction t = db->get_transaction(); + bufferlist bl1; + bl1.append("hello"); + bufferlist bl2; + bl2.append("world"); + cout << "write some kv pairs into default and new CFs" << std::endl; + t->set("prefix", "key1", bl1); + t->set("prefix", "key2", bl2); + t->set("cf1", "key1", bl1); + t->set("cf1", "key2", bl2); + ASSERT_EQ(0, db->submit_transaction_sync(t)); + } + { + cout << "iterating the default CF" << std::endl; + KeyValueDB::Iterator iter = db->get_iterator("prefix"); + iter->seek_to_first(); + ASSERT_EQ(1, iter->valid()); + ASSERT_EQ("key1", iter->key()); + ASSERT_EQ("hello", _bl_to_str(iter->value())); + ASSERT_EQ(0, iter->next()); + ASSERT_EQ(1, iter->valid()); + ASSERT_EQ("key2", iter->key()); + ASSERT_EQ("world", _bl_to_str(iter->value())); + } + { + cout << "iterating the new CF" << std::endl; + KeyValueDB::Iterator iter = db->get_iterator("cf1"); + iter->seek_to_first(); + ASSERT_EQ(1, iter->valid()); + ASSERT_EQ("key1", iter->key()); + ASSERT_EQ("hello", _bl_to_str(iter->value())); + ASSERT_EQ(0, iter->next()); + ASSERT_EQ(1, iter->valid()); + ASSERT_EQ("key2", iter->key()); + ASSERT_EQ("world", _bl_to_str(iter->value())); + } + fini(); +} + +TEST_P(KVTest, RocksDBShardingIteratorTest) { + if(string(GetParam()) != "rocksdb") + return; + + std::string cfs("A(6)"); + ASSERT_EQ(0, db->init(g_conf()->bluestore_rocksdb_options)); + cout << "creating one column family and opening it" << std::endl; + 
ASSERT_EQ(0, db->create_and_open(cout, cfs)); + { + KeyValueDB::Transaction t = db->get_transaction(); + for (int v = 100; v <= 999; v++) { + std::string str = to_string(v); + bufferlist val; + val.append(str); + t->set("A", str, val); + } + ASSERT_EQ(0, db->submit_transaction_sync(t)); + } + { + KeyValueDB::Iterator it = db->get_iterator("A"); + int pos = 0; + ASSERT_EQ(it->lower_bound(to_string(pos)), 0); + for (pos = 100; pos <= 999; pos++) { + ASSERT_EQ(it->valid(), true); + ASSERT_EQ(it->key(), to_string(pos)); + ASSERT_EQ(it->value().to_str(), to_string(pos)); + it->next(); + } + ASSERT_EQ(it->valid(), false); + pos = 999; + ASSERT_EQ(it->lower_bound(to_string(pos)), 0); + for (pos = 999; pos >= 100; pos--) { + ASSERT_EQ(it->valid(), true); + ASSERT_EQ(it->key(), to_string(pos)); + ASSERT_EQ(it->value().to_str(), to_string(pos)); + it->prev(); + } + ASSERT_EQ(it->valid(), false); + } + fini(); +} + +TEST_P(KVTest, RocksDBCFMerge) { + if(string(GetParam()) != "rocksdb") + return; + + shared_ptr<KeyValueDB::MergeOperator> p(new AppendMOP); + int r = db->set_merge_operator("cf1",p); + if (r < 0) + return; // No merge operators for this database type + std::string cfs("cf1"); + ASSERT_EQ(0, db->init(g_conf()->bluestore_rocksdb_options)); + cout << "creating one column family and opening it" << std::endl; + ASSERT_EQ(0, db->create_and_open(cout, cfs)); + + { + KeyValueDB::Transaction t = db->get_transaction(); + bufferlist v1, v2, v3; + v1.append(string("1")); + v2.append(string("2")); + v3.append(string("3")); + t->set("P", "K1", v1); + t->set("cf1", "A1", v2); + t->rmkey("cf1", "A2"); + t->merge("cf1", "A2", v3); + db->submit_transaction_sync(t); + } + { + bufferlist v1, v2, v3; + ASSERT_EQ(0, db->get("P", "K1", &v1)); + ASSERT_EQ(tostr(v1), "1"); + ASSERT_EQ(0, db->get("cf1", "A1", &v2)); + ASSERT_EQ(tostr(v2), "2"); + ASSERT_EQ(0, db->get("cf1", "A2", &v3)); + ASSERT_EQ(tostr(v3), "?3"); + } + { + KeyValueDB::Transaction t = db->get_transaction(); + bufferlist 
v1; + v1.append(string("1")); + t->merge("cf1", "A2", v1); + db->submit_transaction_sync(t); + } + { + bufferlist v; + ASSERT_EQ(0, db->get("cf1", "A2", &v)); + ASSERT_EQ(tostr(v), "?31"); + } + fini(); +} + +TEST_P(KVTest, RocksDB_estimate_size) { + if(string(GetParam()) != "rocksdb") + GTEST_SKIP(); + + std::string cfs("cf1"); + ASSERT_EQ(0, db->init(g_conf()->bluestore_rocksdb_options)); + cout << "creating one column family and opening it" << std::endl; + ASSERT_EQ(0, db->create_and_open(cout)); + + for(int test = 0; test < 20; test++) + { + KeyValueDB::Transaction t = db->get_transaction(); + bufferlist v1; + v1.append(string(1000, '1')); + for (int i = 0; i < 100; i++) + t->set("A", to_string(rand()%100000), v1); + db->submit_transaction_sync(t); + db->compact(); + + int64_t size_a = db->estimate_prefix_size("A",""); + ASSERT_GT(size_a, (test + 1) * 1000 * 100 * 0.5); + ASSERT_LT(size_a, (test + 1) * 1000 * 100 * 1.5); + int64_t size_a1 = db->estimate_prefix_size("A","1"); + ASSERT_GT(size_a1, (test + 1) * 1000 * 100 * 0.1 * 0.5); + ASSERT_LT(size_a1, (test + 1) * 1000 * 100 * 0.1 * 1.5); + int64_t size_b = db->estimate_prefix_size("B",""); + ASSERT_EQ(size_b, 0); + } + + fini(); +} + +TEST_P(KVTest, RocksDB_estimate_size_column_family) { + if(string(GetParam()) != "rocksdb") + GTEST_SKIP(); + + std::string cfs("cf1"); + ASSERT_EQ(0, db->init(g_conf()->bluestore_rocksdb_options)); + cout << "creating one column family and opening it" << std::endl; + ASSERT_EQ(0, db->create_and_open(cout, cfs)); + + for(int test = 0; test < 20; test++) + { + KeyValueDB::Transaction t = db->get_transaction(); + bufferlist v1; + v1.append(string(1000, '1')); + for (int i = 0; i < 100; i++) + t->set("cf1", to_string(rand()%100000), v1); + db->submit_transaction_sync(t); + db->compact(); + + int64_t size_a = db->estimate_prefix_size("cf1",""); + ASSERT_GT(size_a, (test + 1) * 1000 * 100 * 0.5); + ASSERT_LT(size_a, (test + 1) * 1000 * 100 * 1.5); + int64_t size_a1 = 
db->estimate_prefix_size("cf1","1"); + ASSERT_GT(size_a1, (test + 1) * 1000 * 100 * 0.1 * 0.5); + ASSERT_LT(size_a1, (test + 1) * 1000 * 100 * 0.1 * 1.5); + int64_t size_b = db->estimate_prefix_size("B",""); + ASSERT_EQ(size_b, 0); + } + + fini(); +} + +TEST_P(KVTest, RocksDB_parse_sharding_def) { + if(string(GetParam()) != "rocksdb") + GTEST_SKIP(); + + bool result; + std::vector<RocksDBStore::ColumnFamily> sharding_def; + char const* error_position = nullptr; + std::string error_msg; + + std::string_view text_def = "A(10,0-30) B(6)=option1,option2=aaaa C"; + result = RocksDBStore::parse_sharding_def(text_def, + sharding_def, + &error_position, + &error_msg); + + ASSERT_EQ(result, true); + ASSERT_EQ(error_position, nullptr); + ASSERT_EQ(error_msg, ""); + std::cout << text_def << std::endl; + if (error_position) std::cout << std::string(error_position - text_def.begin(), ' ') << "^" << error_msg << std::endl; + + ASSERT_EQ(sharding_def.size(), 3); + ASSERT_EQ(sharding_def[0].name, "A"); + ASSERT_EQ(sharding_def[0].shard_cnt, 10); + ASSERT_EQ(sharding_def[0].hash_l, 0); + ASSERT_EQ(sharding_def[0].hash_h, 30); + + ASSERT_EQ(sharding_def[1].name, "B"); + ASSERT_EQ(sharding_def[1].shard_cnt, 6); + ASSERT_EQ(sharding_def[1].options, "option1,option2=aaaa"); + ASSERT_EQ(sharding_def[2].name, "C"); + ASSERT_EQ(sharding_def[2].shard_cnt, 1); + + + text_def = "A(10 B(6)=option C"; + result = RocksDBStore::parse_sharding_def(text_def, + sharding_def, + &error_position, + &error_msg); + std::cout << text_def << std::endl; + if (error_position) + std::cout << std::string(error_position - text_def.begin(), ' ') << "^" << error_msg << std::endl; + ASSERT_EQ(result, false); + ASSERT_NE(error_position, nullptr); + ASSERT_NE(error_msg, ""); + + text_def = "A(10,1) B(6)=option C"; + result = RocksDBStore::parse_sharding_def(text_def, + sharding_def, + &error_position, + &error_msg); + std::cout << text_def << std::endl; + std::cout << std::string(error_position - text_def.begin(), 
' ') << "^" << error_msg << std::endl; + ASSERT_EQ(result, false); + ASSERT_NE(error_position, nullptr); + ASSERT_NE(error_msg, ""); +} + + + +class RocksDBShardingTest : public ::testing::TestWithParam<const char*> { +public: + boost::scoped_ptr<KeyValueDB> db; + + RocksDBShardingTest() : db(0) {} + + string _bl_to_str(bufferlist val) { + string str(val.c_str(), val.length()); + return str; + } + + void rm_r(string path) { + string cmd = string("rm -r ") + path; + if (verbose) + cout << "==> " << cmd << std::endl; + int r = ::system(cmd.c_str()); + if (r) { + cerr << "failed with exit code " << r + << ", continuing anyway" << std::endl; + } + } + + void SetUp() override { + verbose = getenv("VERBOSE") && strcmp(getenv("VERBOSE"), "1") == 0; + + int r = ::mkdir("kv_test_temp_dir", 0777); + if (r < 0 && errno != EEXIST) { + r = -errno; + cerr << __func__ << ": unable to create kv_test_temp_dir: " + << cpp_strerror(r) << std::endl; + return; + } + db.reset(KeyValueDB::create(g_ceph_context, "rocksdb", + "kv_test_temp_dir")); + ASSERT_EQ(0, db->init(g_conf()->bluestore_rocksdb_options)); + if (verbose) + cout << "Creating database with sharding: " << GetParam() << std::endl; + ASSERT_EQ(0, db->create_and_open(cout, GetParam())); + } + void TearDown() override { + db.reset(nullptr); + rm_r("kv_test_temp_dir"); + } + + /* + A - main 0/1/20 + B - shard 1/3 x 0/1/20 + C - main 0/1/20 + D - shard 1/3 x 0/1/20 + E - main 0/1/20 + */ + bool verbose; + std::vector<std::string> sharding_defs = { + "Betelgeuse D", + "Betelgeuse(3) D", + "Betelgeuse D(3)", + "Betelgeuse(3) D(3)"}; + std::vector<std::string> prefixes = {"Ad", "Betelgeuse", "C", "D", "Evade"}; + std::vector<std::string> randoms = {"0", "1", "2", "3", "4", "5", + "found", "brain", "fully", "pen", "worth", "race", + "stand", "nodded", "whenever", "surrounded", "industrial", "skin", + "this", "direction", "family", "beginning", "whenever", "held", + "metal", "year", "like", "valuable", "softly", "whistle", + 
"perfectly", "broken", "idea", "also", "coffee", "branch", + "tongue", "immediately", "bent", "partly", "burn", "include", + "certain", "burst", "final", "smoke", "positive", "perfectly" + }; + int R = randoms.size(); + + typedef int test_id[6]; + void zero(test_id& x) { + k = 0; + v = 0; + for (auto& i:x) + i = 0; + } + bool end(const test_id& x) { + return x[5] != 0; + } + void next(test_id& x) { + x[0]++; + for (int i = 0; i < 5; i++) { + if (x[i] == 3) { + x[i] = 0; + ++x[i + 1]; + } + } + } + + std::map<std::string, std::string> data; + int k = 0; + int v = 0; + + void generate_data(const test_id& x) { + data.clear(); + for (int i = 0; i < 5; i++) { + if (verbose) + std::cout << x[i] << "-"; + switch (x[i]) { + case 0: + break; + case 1: + data[RocksDBStore::combine_strings(prefixes[i], randoms[k++ % R])] = randoms[v++ % R]; + break; + case 2: + std::string base = randoms[k++ % R]; + for (int j = 0; j < 10; j++) { + data[RocksDBStore::combine_strings(prefixes[i], base + "." + randoms[k++ % R])] = randoms[v++ % R]; + } + break; + } + } + } + + void data_to_db() { + KeyValueDB::Transaction t = db->get_transaction(); + for (auto &d : data) { + bufferlist v1; + v1.append(d.second); + string prefix; + string key; + RocksDBStore::split_key(d.first, &prefix, &key); + t->set(prefix, key, v1); + if (verbose) + std::cout << "SET " << prefix << " " << key << std::endl; + } + ASSERT_EQ(db->submit_transaction_sync(t), 0); + } + + void clear_db() { + KeyValueDB::Transaction t = db->get_transaction(); + for (auto &d : data) { + string prefix; + string key; + RocksDBStore::split_key(d.first, &prefix, &key); + t->rmkey(prefix, key); + } + ASSERT_EQ(db->submit_transaction_sync(t), 0); + //paranoid, check if db empty + KeyValueDB::WholeSpaceIterator it = db->get_wholespace_iterator(); + ASSERT_EQ(it->seek_to_first(), 0); + ASSERT_EQ(it->valid(), false); + } +}; + +TEST_P(RocksDBShardingTest, wholespace_next) { + test_id X; + zero(X); + do { + generate_data(X); + data_to_db(); + 
+ KeyValueDB::WholeSpaceIterator it = db->get_wholespace_iterator(); + //move forward + auto dit = data.begin(); + int r = it->seek_to_first(); + ASSERT_EQ(r, 0); + ASSERT_EQ(it->valid(), (dit != data.end())); + + while (dit != data.end()) { + ASSERT_EQ(it->valid(), true); + string prefix; + string key; + RocksDBStore::split_key(dit->first, &prefix, &key); + auto raw_key = it->raw_key(); + ASSERT_EQ(raw_key.first, prefix); + ASSERT_EQ(raw_key.second, key); + ASSERT_EQ(it->value().to_str(), dit->second); + if (verbose) + std::cout << "next " << prefix << " " << key << std::endl; + ASSERT_EQ(it->next(), 0); + ++dit; + } + ASSERT_EQ(it->valid(), false); + + clear_db(); + next(X); + } while (!end(X)); +} + +TEST_P(RocksDBShardingTest, wholespace_prev) { + test_id X; + zero(X); + do { + generate_data(X); + data_to_db(); + + KeyValueDB::WholeSpaceIterator it = db->get_wholespace_iterator(); + auto dit = data.rbegin(); + int r = it->seek_to_last(); + ASSERT_EQ(r, 0); + ASSERT_EQ(it->valid(), (dit != data.rend())); + + while (dit != data.rend()) { + ASSERT_EQ(it->valid(), true); + string prefix; + string key; + RocksDBStore::split_key(dit->first, &prefix, &key); + auto raw_key = it->raw_key(); + ASSERT_EQ(raw_key.first, prefix); + ASSERT_EQ(raw_key.second, key); + ASSERT_EQ(it->value().to_str(), dit->second); + if (verbose) + std::cout << "prev " << prefix << " " << key << std::endl; + ASSERT_EQ(it->prev(), 0); + ++dit; + } + ASSERT_EQ(it->valid(), false); + + clear_db(); + next(X); + } while (!end(X)); +} + +TEST_P(RocksDBShardingTest, wholespace_lower_bound) { + test_id X; + zero(X); + do { + generate_data(X); + data_to_db(); + + KeyValueDB::WholeSpaceIterator it = db->get_wholespace_iterator(); + auto dit = data.begin(); + int r = it->seek_to_first(); + ASSERT_EQ(r, 0); + ASSERT_EQ(it->valid(), (dit != data.end())); + + while (dit != data.end()) { + ASSERT_EQ(it->valid(), true); + string prefix; + string key; + RocksDBStore::split_key(dit->first, &prefix, &key); + 
KeyValueDB::WholeSpaceIterator it1 = db->get_wholespace_iterator(); + ASSERT_EQ(it1->lower_bound(prefix, key), 0); + ASSERT_EQ(it1->valid(), true); + auto raw_key = it1->raw_key(); + ASSERT_EQ(raw_key.first, prefix); + ASSERT_EQ(raw_key.second, key); + if (verbose) + std::cout << "lower_bound " << prefix << " " << key << std::endl; + ASSERT_EQ(it->next(), 0); + ++dit; + } + ASSERT_EQ(it->valid(), false); + + clear_db(); + next(X); + } while (!end(X)); +} + +TEST_P(RocksDBShardingTest, wholespace_upper_bound) { + test_id X; + zero(X); + do { + generate_data(X); + data_to_db(); + + KeyValueDB::WholeSpaceIterator it = db->get_wholespace_iterator(); + auto dit = data.begin(); + int r = it->seek_to_first(); + ASSERT_EQ(r, 0); + ASSERT_EQ(it->valid(), (dit != data.end())); + + while (dit != data.end()) { + ASSERT_EQ(it->valid(), true); + string prefix; + string key; + string key_minus_1; + RocksDBStore::split_key(dit->first, &prefix, &key); + //decrement key minimally + key_minus_1 = key.substr(0, key.length() - 1) + std::string(1, key[key.length() - 1] - 1); + KeyValueDB::WholeSpaceIterator it1 = db->get_wholespace_iterator(); + ASSERT_EQ(it1->upper_bound(prefix, key_minus_1), 0); + ASSERT_EQ(it1->valid(), true); + auto raw_key = it1->raw_key(); + ASSERT_EQ(raw_key.first, prefix); + ASSERT_EQ(raw_key.second, key); + if (verbose) + std::cout << "upper_bound " << prefix << " " << key_minus_1 << std::endl; + ASSERT_EQ(it->next(), 0); + ++dit; + } + ASSERT_EQ(it->valid(), false); + + clear_db(); + next(X); + } while (!end(X)); +} + +TEST_P(RocksDBShardingTest, wholespace_lookup_limits) { + test_id X; + zero(X); + do { + generate_data(X); + data_to_db(); + + //lookup before first + if (data.size() > 0) { + auto dit = data.begin(); + string prefix; + string key; + RocksDBStore::split_key(dit->first, &prefix, &key); + KeyValueDB::WholeSpaceIterator it1 = db->get_wholespace_iterator(); + ASSERT_EQ(it1->lower_bound(" ", " "), 0); + ASSERT_EQ(it1->valid(), true); + auto raw_key = 
it1->raw_key(); + ASSERT_EQ(raw_key.first, prefix); + ASSERT_EQ(raw_key.second, key); + } + //lookup after last + KeyValueDB::WholeSpaceIterator it1 = db->get_wholespace_iterator(); + ASSERT_EQ(it1->lower_bound("~", "~"), 0); + ASSERT_EQ(it1->valid(), false); + + clear_db(); + next(X); + } while (!end(X)); +} + + + +class RocksDBResharding : public ::testing::Test { +public: + boost::scoped_ptr<RocksDBStore> db; + + RocksDBResharding() : db(0) {} + + string _bl_to_str(bufferlist val) { + string str(val.c_str(), val.length()); + return str; + } + + void rm_r(string path) { + string cmd = string("rm -r ") + path; + if (verbose) + cout << "==> " << cmd << std::endl; + int r = ::system(cmd.c_str()); + if (r) { + cerr << "failed with exit code " << r + << ", continuing anyway" << std::endl; + } + } + + void SetUp() override { + verbose = getenv("VERBOSE") && strcmp(getenv("VERBOSE"), "1") == 0; + + int r = ::mkdir("kv_test_temp_dir", 0777); + if (r < 0 && errno != EEXIST) { + r = -errno; + cerr << __func__ << ": unable to create kv_test_temp_dir: " + << cpp_strerror(r) << std::endl; + return; + } + + KeyValueDB* db_kv = KeyValueDB::create(g_ceph_context, "rocksdb", + "kv_test_temp_dir"); + RocksDBStore* db_rocks = dynamic_cast<RocksDBStore*>(db_kv); + ceph_assert(db_rocks); + db.reset(db_rocks); + ASSERT_EQ(0, db->init(g_conf()->bluestore_rocksdb_options)); + } + void TearDown() override { + db.reset(nullptr); + rm_r("kv_test_temp_dir"); + } + + bool verbose; + std::vector<std::string> prefixes = {"Ad", "Betelgeuse", "C", "D", "Evade"}; + std::vector<std::string> randoms = {"0", "1", "2", "3", "4", "5", + "found", "brain", "fully", "pen", "worth", "race", + "stand", "nodded", "whenever", "surrounded", "industrial", "skin", + "this", "direction", "family", "beginning", "whenever", "held", + "metal", "year", "like", "valuable", "softly", "whistle", + "perfectly", "broken", "idea", "also", "coffee", "branch", + "tongue", "immediately", "bent", "partly", "burn", "include", 
+ "certain", "burst", "final", "smoke", "positive", "perfectly" + }; + int R = randoms.size(); + int k = 0; + std::map<std::string, std::string> data; + + void generate_data() { + data.clear(); + for (size_t p = 0; p < prefixes.size(); p++) { + size_t elem_count = 1 << (( p * 3 ) + 3); + for (size_t i = 0; i < elem_count; i++) { + std::string key; + for (int x = 0; x < 5; x++) { + key = key + randoms[rand() % R]; + } + std::string value; + for (int x = 0; x < 3; x++) { + value = value + randoms[rand() % R]; + } + data[RocksDBStore::combine_strings(prefixes[p], key)] = value; + } + } + } + + void data_to_db() { + KeyValueDB::Transaction t = db->get_transaction(); + size_t i = 0; + for (auto& d: data) { + bufferlist v1; + v1.append(d.second); + string prefix; + string key; + RocksDBStore::split_key(d.first, &prefix, &key); + t->set(prefix, key, v1); + if (verbose) + std::cout << "SET " << prefix << " " << key << std::endl; + i++; + if ((i % 1000) == 0) { + ASSERT_EQ(db->submit_transaction_sync(t), 0); + t.reset(); + if (verbose) + std::cout << "writing key to DB" << std::endl; + t = db->get_transaction(); + } + } + if (verbose) + std::cout << "writing keys to DB" << std::endl; + ASSERT_EQ(db->submit_transaction_sync(t), 0); + } + + void clear_db() { + KeyValueDB::Transaction t = db->get_transaction(); + for (auto &d : data) { + string prefix; + string key; + RocksDBStore::split_key(d.first, &prefix, &key); + t->rmkey(prefix, key); + } + ASSERT_EQ(db->submit_transaction_sync(t), 0); + //paranoid, check if db empty + KeyValueDB::WholeSpaceIterator it = db->get_wholespace_iterator(); + ASSERT_EQ(it->seek_to_first(), 0); + ASSERT_EQ(it->valid(), false); + } + + void check_db() { + KeyValueDB::WholeSpaceIterator it = db->get_wholespace_iterator(); + //move forward + auto dit = data.begin(); + int r = it->seek_to_first(); + ASSERT_EQ(r, 0); + ASSERT_EQ(it->valid(), (dit != data.end())); + + while (dit != data.end()) { + ASSERT_EQ(it->valid(), true); + string prefix; + 
string key; + RocksDBStore::split_key(dit->first, &prefix, &key); + auto raw_key = it->raw_key(); + ASSERT_EQ(raw_key.first, prefix); + ASSERT_EQ(raw_key.second, key); + ASSERT_EQ(it->value().to_str(), dit->second); + if (verbose) + std::cout << "next " << prefix << " " << key << std::endl; + ASSERT_EQ(it->next(), 0); + ++dit; + } + ASSERT_EQ(it->valid(), false); + } +}; + +TEST_F(RocksDBResharding, basic) { + ASSERT_EQ(0, db->create_and_open(cout, "")); + generate_data(); + data_to_db(); + check_db(); + db->close(); + ASSERT_EQ(db->reshard("Evade(4)"), 0); + ASSERT_EQ(db->open(cout), 0); + check_db(); + db->close(); +} + +TEST_F(RocksDBResharding, all_to_shards) { + ASSERT_EQ(0, db->create_and_open(cout, "")); + generate_data(); + data_to_db(); + check_db(); + db->close(); + ASSERT_EQ(db->reshard("Ad(1) Betelgeuse(1) C(1) D(1) Evade(1)"), 0); + ASSERT_EQ(db->open(cout), 0); + check_db(); + db->close(); +} + +TEST_F(RocksDBResharding, all_to_shards_and_back_again) { + ASSERT_EQ(0, db->create_and_open(cout, "")); + generate_data(); + data_to_db(); + check_db(); + db->close(); + ASSERT_EQ(db->reshard("Ad(1) Betelgeuse(1) C(1) D(1) Evade(1)"), 0); + ASSERT_EQ(db->open(cout), 0); + check_db(); + db->close(); + ASSERT_EQ(db->reshard(""), 0); + ASSERT_EQ(db->open(cout), 0); + check_db(); + db->close(); +} + +TEST_F(RocksDBResharding, resume_interrupted_at_batch) { + ASSERT_EQ(0, db->create_and_open(cout, "")); + generate_data(); + data_to_db(); + check_db(); + db->close(); + RocksDBStore::resharding_ctrl ctrl; + ctrl.unittest_fail_after_first_batch = true; + ASSERT_EQ(db->reshard("Evade(4)", &ctrl), -1000); + ASSERT_NE(db->open(cout), 0); + ASSERT_EQ(db->reshard("Evade(4)"), 0); + ASSERT_EQ(db->open(cout), 0); + check_db(); + db->close(); +} + +TEST_F(RocksDBResharding, resume_interrupted_at_column) { + ASSERT_EQ(0, db->create_and_open(cout, "")); + generate_data(); + data_to_db(); + check_db(); + db->close(); + RocksDBStore::resharding_ctrl ctrl; + 
ctrl.unittest_fail_after_processing_column = true; + ASSERT_EQ(db->reshard("Evade(4)", &ctrl), -1001); + ASSERT_NE(db->open(cout), 0); + ASSERT_EQ(db->reshard("Evade(4)"), 0); + ASSERT_EQ(db->open(cout), 0); + check_db(); + db->close(); +} + +TEST_F(RocksDBResharding, resume_interrupted_before_commit) { + ASSERT_EQ(0, db->create_and_open(cout, "")); + generate_data(); + data_to_db(); + check_db(); + db->close(); + RocksDBStore::resharding_ctrl ctrl; + ctrl.unittest_fail_after_successful_processing = true; + ASSERT_EQ(db->reshard("Evade(4)", &ctrl), -1002); + ASSERT_NE(db->open(cout), 0); + ASSERT_EQ(db->reshard("Evade(4)"), 0); + ASSERT_EQ(db->open(cout), 0); + check_db(); + db->close(); +} + +TEST_F(RocksDBResharding, prevent_incomplete_hash_change) { + ASSERT_EQ(0, db->create_and_open(cout, "Evade(4,0-3)")); + generate_data(); + data_to_db(); + check_db(); + db->close(); + RocksDBStore::resharding_ctrl ctrl; + ctrl.unittest_fail_after_successful_processing = true; + ASSERT_EQ(db->reshard("Evade(4,0-8)", &ctrl), -1002); + ASSERT_NE(db->open(cout), 0); + ASSERT_EQ(db->reshard("Evade(4,0-8)"), 0); + ASSERT_EQ(db->open(cout), 0); + check_db(); + db->close(); +} + +TEST_F(RocksDBResharding, change_reshard) { + ASSERT_EQ(0, db->create_and_open(cout, "Ad(4)")); + generate_data(); + data_to_db(); + check_db(); + db->close(); + { + RocksDBStore::resharding_ctrl ctrl; + ctrl.unittest_fail_after_first_batch = true; + ASSERT_EQ(db->reshard("C(5) D(3)", &ctrl), -1000); + } + { + RocksDBStore::resharding_ctrl ctrl; + ASSERT_NE(db->open(cout), 0); + ctrl.unittest_fail_after_first_batch = false; + ctrl.unittest_fail_after_processing_column = true; + ASSERT_EQ(db->reshard("C(5) Evade(2)", &ctrl), -1001); + } + { + RocksDBStore::resharding_ctrl ctrl; + ASSERT_NE(db->open(cout), 0); + ctrl.unittest_fail_after_processing_column = false; + ctrl.unittest_fail_after_successful_processing = true; + ASSERT_EQ(db->reshard("Evade(2) D(3)", &ctrl), -1002); + } + { + 
ASSERT_NE(db->open(cout), 0); + ASSERT_EQ(db->reshard("Ad(1) Evade(5)"), 0); + } + { + ASSERT_EQ(db->open(cout), 0); + check_db(); + db->close(); + } +} + + +INSTANTIATE_TEST_SUITE_P( + KeyValueDB, + KVTest, + ::testing::Values("rocksdb")); + +INSTANTIATE_TEST_SUITE_P( + KeyValueDB, + RocksDBShardingTest, + ::testing::Values("Betelgeuse D", + "Betelgeuse(3) D", + "Betelgeuse D(3)", + "Betelgeuse(3) D(3)")); + +int main(int argc, char **argv) { + auto args = argv_to_vec(argc, argv); + auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, + CODE_ENVIRONMENT_UTILITY, + CINIT_FLAG_NO_DEFAULT_CONFIG_FILE); + common_init_finish(g_ceph_context); + g_ceph_context->_conf.set_val( + "enable_experimental_unrecoverable_data_corrupting_features", + "rocksdb"); + g_ceph_context->_conf.apply_changes(nullptr); + + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/test/objectstore/test_memstore_clone.cc b/src/test/objectstore/test_memstore_clone.cc new file mode 100644 index 000000000..507f74d22 --- /dev/null +++ b/src/test/objectstore/test_memstore_clone.cc @@ -0,0 +1,202 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 Red Hat + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ +#include <boost/intrusive_ptr.hpp> +#include "global/global_init.h" +#include "common/ceph_argparse.h" +#include "os/ObjectStore.h" +#include <gtest/gtest.h> +#include "include/ceph_assert.h" +#include "common/errno.h" +#include "store_test_fixture.h" + +#define dout_context g_ceph_context + +using namespace std; + +namespace { + +const coll_t cid; + +ghobject_t make_ghobject(const char *oid) +{ + return ghobject_t{hobject_t{oid, "", CEPH_NOSNAP, 0, 0, ""}}; +} + +} // anonymous namespace + +class MemStoreClone : public StoreTestFixture { +public: + MemStoreClone() + : StoreTestFixture("memstore") + {} + void SetUp() override { + StoreTestFixture::SetUp(); + if (HasFailure()) { + return; + } + ObjectStore::Transaction t; + ch = store->create_new_collection(cid); + t.create_collection(cid, 4); + unsigned r = store->queue_transaction(ch, std::move(t)); + if (r != 0) { + derr << "failed to create collection with " << cpp_strerror(r) << dendl; + } + ASSERT_EQ(0U, r); + } + void TearDown() override { + ch.reset(); + StoreTestFixture::TearDown(); + } +}; + +// src 11[11 11 11 11]11 +// dst 22 22 22 22 22 22 +// res 22 11 11 11 11 22 +TEST_F(MemStoreClone, CloneRangeAllocated) +{ + ASSERT_TRUE(store); + + const auto src = make_ghobject("src1"); + const auto dst = make_ghobject("dst1"); + + bufferlist srcbl, dstbl, result, expected; + srcbl.append("111111111111"); + dstbl.append("222222222222"); + expected.append("221111111122"); + + ObjectStore::Transaction t; + t.write(cid, src, 0, 12, srcbl); + t.write(cid, dst, 0, 12, dstbl); + t.clone_range(cid, src, dst, 2, 8, 2); + ASSERT_EQ(0, store->queue_transaction(ch, std::move(t))); + ASSERT_EQ(12, store->read(ch, dst, 0, 12, result)); + ASSERT_EQ(expected, result); +} + +// src __[__ __ __ __]__ 11 11 +// dst 22 22 22 22 22 22 +// res 22 00 00 00 00 22 +TEST_F(MemStoreClone, CloneRangeHole) +{ + ASSERT_TRUE(store); + + const auto src = make_ghobject("src2"); + const auto dst = make_ghobject("dst2"); + + bufferlist 
srcbl, dstbl, result, expected; + srcbl.append("1111"); + dstbl.append("222222222222"); + expected.append("22\000\000\000\000\000\000\000\00022", 12); + + ObjectStore::Transaction t; + t.write(cid, src, 12, 4, srcbl); + t.write(cid, dst, 0, 12, dstbl); + t.clone_range(cid, src, dst, 2, 8, 2); + ASSERT_EQ(0, store->queue_transaction(ch, std::move(t))); + ASSERT_EQ(12, store->read(ch, dst, 0, 12, result)); + ASSERT_EQ(expected, result); +} + +// src __[__ __ __ 11]11 +// dst 22 22 22 22 22 22 +// res 22 00 00 00 11 22 +TEST_F(MemStoreClone, CloneRangeHoleStart) +{ + ASSERT_TRUE(store); + + const auto src = make_ghobject("src3"); + const auto dst = make_ghobject("dst3"); + + bufferlist srcbl, dstbl, result, expected; + srcbl.append("1111"); + dstbl.append("222222222222"); + expected.append("22\000\000\000\000\000\0001122", 12); + + ObjectStore::Transaction t; + t.write(cid, src, 8, 4, srcbl); + t.write(cid, dst, 0, 12, dstbl); + t.clone_range(cid, src, dst, 2, 8, 2); + ASSERT_EQ(0, store->queue_transaction(ch, std::move(t))); + ASSERT_EQ(12, store->read(ch, dst, 0, 12, result)); + ASSERT_EQ(expected, result); +} + +// src 11[11 __ __ 11]11 +// dst 22 22 22 22 22 22 +// res 22 11 00 00 11 22 +TEST_F(MemStoreClone, CloneRangeHoleMiddle) +{ + ASSERT_TRUE(store); + + const auto src = make_ghobject("src4"); + const auto dst = make_ghobject("dst4"); + + bufferlist srcbl, dstbl, result, expected; + srcbl.append("1111"); + dstbl.append("222222222222"); + expected.append("2211\000\000\000\0001122", 12); + + ObjectStore::Transaction t; + t.write(cid, src, 0, 4, srcbl); + t.write(cid, src, 8, 4, srcbl); + t.write(cid, dst, 0, 12, dstbl); + t.clone_range(cid, src, dst, 2, 8, 2); + ASSERT_EQ(0, store->queue_transaction(ch, std::move(t))); + ASSERT_EQ(12, store->read(ch, dst, 0, 12, result)); + ASSERT_EQ(expected, result); +} + +// src 11[11 __ __ __]__ 11 11 +// dst 22 22 22 22 22 22 +// res 22 11 00 00 00 22 +TEST_F(MemStoreClone, CloneRangeHoleEnd) +{ + ASSERT_TRUE(store); + + 
const auto src = make_ghobject("src5"); + const auto dst = make_ghobject("dst5"); + + bufferlist srcbl, dstbl, result, expected; + srcbl.append("1111"); + dstbl.append("222222222222"); + expected.append("2211\000\000\000\000\000\00022", 12); + + ObjectStore::Transaction t; + t.write(cid, src, 0, 4, srcbl); + t.write(cid, src, 12, 4, srcbl); + t.write(cid, dst, 0, 12, dstbl); + t.clone_range(cid, src, dst, 2, 8, 2); + ASSERT_EQ(0, store->queue_transaction(ch, std::move(t))); + ASSERT_EQ(12, store->read(ch, dst, 0, 12, result)); + ASSERT_EQ(expected, result); +} + +int main(int argc, char** argv) +{ + // default to memstore + map<string,string> defaults = { + { "osd_objectstore", "memstore" }, + { "osd_data", "msc.test_temp_dir" }, + { "memstore_page_size", "4" } + }; + + auto args = argv_to_vec(argc, argv); + auto cct = global_init(&defaults, args, CEPH_ENTITY_TYPE_CLIENT, + CODE_ENVIRONMENT_UTILITY, + CINIT_FLAG_NO_DEFAULT_CONFIG_FILE); + common_init_finish(g_ceph_context); + + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/test/objectstore/test_transaction.cc b/src/test/objectstore/test_transaction.cc new file mode 100644 index 000000000..381b9df7d --- /dev/null +++ b/src/test/objectstore/test_transaction.cc @@ -0,0 +1,215 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 Casey Bodley <cbodley@redhat.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#include "os/ObjectStore.h" +#include <gtest/gtest.h> +#include "common/Clock.h" +#include "include/utime.h" +#include <boost/tuple/tuple.hpp> + +using namespace std; + +TEST(Transaction, MoveConstruct) +{ + auto a = ObjectStore::Transaction{}; + a.nop(); + ASSERT_FALSE(a.empty()); + + // move-construct in b + auto b = std::move(a); + ASSERT_TRUE(a.empty()); + ASSERT_FALSE(b.empty()); +} + +TEST(Transaction, MoveAssign) +{ + auto a = ObjectStore::Transaction{}; + a.nop(); + ASSERT_FALSE(a.empty()); + + auto b = ObjectStore::Transaction{}; + b = std::move(a); // move-assign to b + ASSERT_TRUE(a.empty()); + ASSERT_FALSE(b.empty()); +} + +TEST(Transaction, CopyConstruct) +{ + auto a = ObjectStore::Transaction{}; + a.nop(); + ASSERT_FALSE(a.empty()); + + auto b = a; // copy-construct in b + ASSERT_FALSE(a.empty()); + ASSERT_FALSE(b.empty()); +} + +TEST(Transaction, CopyAssign) +{ + auto a = ObjectStore::Transaction{}; + a.nop(); + ASSERT_FALSE(a.empty()); + + auto b = ObjectStore::Transaction{}; + b = a; // copy-assign to b + ASSERT_FALSE(a.empty()); + ASSERT_FALSE(b.empty()); +} + +TEST(Transaction, Swap) +{ + auto a = ObjectStore::Transaction{}; + a.nop(); + ASSERT_FALSE(a.empty()); + + auto b = ObjectStore::Transaction{}; + std::swap(a, b); // swap a and b + ASSERT_TRUE(a.empty()); + ASSERT_FALSE(b.empty()); +} + +ObjectStore::Transaction generate_transaction() +{ + auto a = ObjectStore::Transaction{}; + a.nop(); + + coll_t cid; + object_t obj("test_name"); + snapid_t snap(0); + hobject_t hoid(obj, "key", snap, 0, 0, "nspace"); + ghobject_t oid(hoid); + + coll_t acid; + object_t aobj("another_test_name"); + snapid_t asnap(0); + hobject_t ahoid(obj, "another_key", snap, 0, 0, "another_nspace"); + ghobject_t aoid(hoid); + std::set<string> keys; + keys.insert("any_1"); + keys.insert("any_2"); + keys.insert("any_3"); + + bufferlist bl; + bl.append_zero(4096); + + a.write(cid, oid, 1, 4096, bl, 0); + + a.omap_setkeys(acid, aoid, bl); + + a.omap_rmkeys(cid, 
aoid, keys); + + a.touch(acid, oid); + + return a; +} + +TEST(Transaction, MoveRangesDelSrcObj) +{ + auto t = ObjectStore::Transaction{}; + t.nop(); + + coll_t c(spg_t(pg_t(1,2), shard_id_t::NO_SHARD)); + + ghobject_t o1(hobject_t("obj", "", 123, 456, -1, "")); + ghobject_t o2(hobject_t("obj2", "", 123, 456, -1, "")); + vector<std::pair<uint64_t, uint64_t>> move_info = { + make_pair(1, 5), + make_pair(10, 5) + }; + + t.touch(c, o1); + bufferlist bl; + bl.append("some data"); + t.write(c, o1, 1, bl.length(), bl); + t.write(c, o1, 10, bl.length(), bl); + + t.clone(c, o1, o2); + bl.append("some other data"); + t.write(c, o2, 1, bl.length(), bl); +} + +TEST(Transaction, GetNumBytes) +{ + auto a = ObjectStore::Transaction{}; + a.nop(); + ASSERT_TRUE(a.get_encoded_bytes() == a.get_encoded_bytes_test()); + + coll_t cid; + object_t obj("test_name"); + snapid_t snap(0); + hobject_t hoid(obj, "key", snap, 0, 0, "nspace"); + ghobject_t oid(hoid); + + coll_t acid; + object_t aobj("another_test_name"); + snapid_t asnap(0); + hobject_t ahoid(obj, "another_key", snap, 0, 0, "another_nspace"); + ghobject_t aoid(hoid); + std::set<string> keys; + keys.insert("any_1"); + keys.insert("any_2"); + keys.insert("any_3"); + + bufferlist bl; + bl.append_zero(4096); + + a.write(cid, oid, 1, 4096, bl, 0); + ASSERT_TRUE(a.get_encoded_bytes() == a.get_encoded_bytes_test()); + + a.omap_setkeys(acid, aoid, bl); + ASSERT_TRUE(a.get_encoded_bytes() == a.get_encoded_bytes_test()); + + a.omap_rmkeys(cid, aoid, keys); + ASSERT_TRUE(a.get_encoded_bytes() == a.get_encoded_bytes_test()); + + a.touch(acid, oid); + ASSERT_TRUE(a.get_encoded_bytes() == a.get_encoded_bytes_test()); +} + +void bench_num_bytes(bool legacy) +{ + const int max = 2500000; + auto a = generate_transaction(); + + if (legacy) { + cout << "get_encoded_bytes_test: "; + } else { + cout << "get_encoded_bytes: "; + } + + utime_t start = ceph_clock_now(); + if (legacy) { + for (int i = 0; i < max; ++i) { + a.get_encoded_bytes_test(); + } + 
} else { + for (int i = 0; i < max; ++i) { + a.get_encoded_bytes(); + } + } + + utime_t end = ceph_clock_now(); + cout << max << " encodes in " << (end - start) << std::endl; + +} + +TEST(Transaction, GetNumBytesBenchLegacy) +{ + bench_num_bytes(true); +} + +TEST(Transaction, GetNumBytesBenchCurrent) +{ + bench_num_bytes(false); +} |