// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab

#include <set>
#include <map>
#include <string>
#include <memory>
#include <errno.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <filesystem>

#include "rocksdb/db.h"
#include "rocksdb/table.h"
#include "rocksdb/env.h"
#include "rocksdb/slice.h"
#include "rocksdb/cache.h"
#include "rocksdb/filter_policy.h"
#include "rocksdb/utilities/convenience.h"
#include "rocksdb/utilities/table_properties_collectors.h"
#include "rocksdb/merge_operator.h"

#include "common/perf_counters.h"
#include "common/PriorityCache.h"
#include "include/common_fwd.h"
#include "include/scope_guard.h"
#include "include/str_list.h"
#include "include/stringify.h"
#include "include/str_map.h"
#include "KeyValueDB.h"
#include "RocksDBStore.h"

#include "common/debug.h"

#define dout_context cct
#define dout_subsys ceph_subsys_rocksdb
#undef dout_prefix
#define dout_prefix *_dout << "rocksdb: "

namespace fs = std::filesystem;

using std::function;
using std::list;
using std::map;
using std::ostream;
using std::pair;
using std::set;
using std::string;
using std::unique_ptr;
using std::vector;

using ceph::bufferlist;
using ceph::bufferptr;
using ceph::Formatter;

static const char* sharding_def_dir = "sharding";
static const char* sharding_def_file = "sharding/def";
static const char* sharding_recreate = "sharding/recreate_columns";
static const char* resharding_column_lock = "reshardingXcommencingXlocked";

static bufferlist to_bufferlist(rocksdb::Slice in) {
  bufferlist bl;
  bl.append(bufferptr(in.data(), in.size()));
  return bl;
}

static rocksdb::SliceParts prepare_sliceparts(const bufferlist &bl,
                                              vector<rocksdb::Slice> *slices)
{
  unsigned n = 0;
  for (auto& buf : bl.buffers()) {
    (*slices)[n].data_ = buf.c_str();
    (*slices)[n].size_ = buf.length();
    n++;
  }
  return rocksdb::SliceParts(slices->data(), slices->size());
}

//
// One of these for the default rocksdb column family, routing each prefix
// to the appropriate MergeOperator.
//
class RocksDBStore::MergeOperatorRouter
  : public rocksdb::AssociativeMergeOperator
{
  RocksDBStore& store;
public:
  const char *Name() const override {
    // Construct a name that rocksDB will validate against. We want to
    // do this in a way that doesn't constrain the ordering of calls
    // to set_merge_operator, so sort the merge operators and then
    // construct a name from all of those parts.
    store.assoc_name.clear();
    map<string, string> names;

    for (auto& p : store.merge_ops) {
      names[p.first] = p.second->name();
    }
    for (auto& p : names) {
      store.assoc_name += '.';
      store.assoc_name += p.first;
      store.assoc_name += ':';
      store.assoc_name += p.second;
    }
    return store.assoc_name.c_str();
  }

  explicit MergeOperatorRouter(RocksDBStore &_store) : store(_store) {}

  bool Merge(const rocksdb::Slice& key,
             const rocksdb::Slice* existing_value,
             const rocksdb::Slice& value,
             std::string* new_value,
             rocksdb::Logger* logger) const override {
    // for default column family
    // extract prefix from key and compare against each registered merge op;
    // even though merge operator for explicit CF is included in merge_ops,
    // it won't be picked up, since it won't match.
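    // Illustrative example: keys in the default column family are encoded as
    // "<prefix>\0<key>" (see combine_strings() and split_key() elsewhere in
    // this file), so a registered prefix "b" matches a raw key such as
    // "b\0foo": the prefix bytes compare equal and the byte just past the
    // prefix is the 0 separator checked below.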
    for (auto& p : store.merge_ops) {
      if (p.first.compare(0, p.first.length(),
                          key.data(), p.first.length()) == 0 &&
          key.data()[p.first.length()] == 0) {
        if (existing_value) {
          p.second->merge(existing_value->data(), existing_value->size(),
                          value.data(), value.size(),
                          new_value);
        } else {
          p.second->merge_nonexistent(value.data(), value.size(), new_value);
        }
        break;
      }
    }
    return true; // OK :)
  }
};

//
// One of these per non-default column family, linked directly to the
// merge operator for that CF/prefix (if any).
//
class RocksDBStore::MergeOperatorLinker
  : public rocksdb::AssociativeMergeOperator
{
private:
  std::shared_ptr<KeyValueDB::MergeOperator> mop;
public:
  explicit MergeOperatorLinker(const std::shared_ptr<KeyValueDB::MergeOperator> &o)
    : mop(o) {}

  const char *Name() const override {
    return mop->name();
  }

  bool Merge(const rocksdb::Slice& key,
             const rocksdb::Slice* existing_value,
             const rocksdb::Slice& value,
             std::string* new_value,
             rocksdb::Logger* logger) const override {
    if (existing_value) {
      mop->merge(existing_value->data(), existing_value->size(),
                 value.data(), value.size(),
                 new_value);
    } else {
      mop->merge_nonexistent(value.data(), value.size(), new_value);
    }
    return true;
  }
};

int RocksDBStore::set_merge_operator(
  const string& prefix,
  std::shared_ptr<KeyValueDB::MergeOperator> mop)
{
  // If you fail here, it's because you can't do this on an open database
  ceph_assert(db == nullptr);
  merge_ops.push_back(std::make_pair(prefix, mop));
  return 0;
}

class CephRocksdbLogger : public rocksdb::Logger {
  CephContext *cct;
public:
  explicit CephRocksdbLogger(CephContext *c) : cct(c) {
    cct->get();
  }
  ~CephRocksdbLogger() override {
    cct->put();
  }

  // Write an entry to the log file with the specified format.
  void Logv(const char* format, va_list ap) override {
    Logv(rocksdb::INFO_LEVEL, format, ap);
  }

  // Write an entry to the log file with the specified log level
  // and format.  Any log with level under the internal log level
  // of *this (see @SetInfoLogLevel and @GetInfoLogLevel) will not be
  // printed.
  void Logv(const rocksdb::InfoLogLevel log_level, const char* format,
            va_list ap) override {
    int v = rocksdb::NUM_INFO_LOG_LEVELS - log_level - 1;
    dout(ceph::dout::need_dynamic(v));
    char buf[65536];
    vsnprintf(buf, sizeof(buf), format, ap);
    *_dout << buf << dendl;
  }
};

rocksdb::Logger *create_rocksdb_ceph_logger() {
  return new CephRocksdbLogger(g_ceph_context);
}

static int string2bool(const string &val, bool &b_val)
{
  if (strcasecmp(val.c_str(), "false") == 0) {
    b_val = false;
    return 0;
  } else if (strcasecmp(val.c_str(), "true") == 0) {
    b_val = true;
    return 0;
  } else {
    std::string err;
    int b = strict_strtol(val.c_str(), 10, &err);
    if (!err.empty())
      return -EINVAL;
    b_val = !!b;
    return 0;
  }
}

namespace rocksdb {
extern std::string trim(const std::string& str);
}

// this function is a modification of rocksdb's StringToMap:
// 1) accepts ' \n ; as separators
// 2) leaves compound options with enclosing { and }
rocksdb::Status StringToMap(const std::string& opts_str,
                            std::unordered_map<std::string, std::string>* opts_map)
{
  using rocksdb::Status;
  using rocksdb::trim;
  assert(opts_map);
  // Example:
  //   opts_str = "write_buffer_size=1024;max_write_buffer_number=2;"
  //              "nested_opt={opt1=1;opt2=2};max_bytes_for_level_base=100"
  size_t pos = 0;
  std::string opts = trim(opts_str);
  while (pos < opts.size()) {
    size_t eq_pos = opts.find('=', pos);
    if (eq_pos == std::string::npos) {
      return Status::InvalidArgument("Mismatched key value pair, '=' expected");
    }
    std::string key = trim(opts.substr(pos, eq_pos - pos));
    if (key.empty()) {
      return Status::InvalidArgument("Empty key found");
    }

    // skip space after '=' and look for '{' for possible nested options
    pos = eq_pos + 1;
    while (pos < opts.size() && isspace(opts[pos])) {
      ++pos;
    }
    // Empty value at the end
    if (pos >= opts.size()) {
      (*opts_map)[key] = "";
      break;
    }
    if (opts[pos] == '{') {
      int count = 1;
      size_t brace_pos = pos + 1;
      while (brace_pos < opts.size()) {
        if (opts[brace_pos] == '{') {
          ++count;
        } else if (opts[brace_pos] == '}') {
          --count;
          if (count == 0) {
            break;
          }
        }
        ++brace_pos;
      }
      // found the matching closing brace
      if (count == 0) {
        //include both '{' and '}'
        (*opts_map)[key] = trim(opts.substr(pos, brace_pos - pos + 1));
        // skip all whitespace and move to the next ';,'
        // brace_pos points to the matching '}'
        pos = brace_pos + 1;
        while (pos < opts.size() && isspace(opts[pos])) {
          ++pos;
        }
        if (pos < opts.size() && opts[pos] != ';' && opts[pos] != ',') {
          return Status::InvalidArgument(
              "Unexpected chars after nested options");
        }
        ++pos;
      } else {
        return Status::InvalidArgument(
            "Mismatched curly braces for nested options");
      }
    } else {
      size_t sc_pos = opts.find_first_of(",;", pos);
      if (sc_pos == std::string::npos) {
        (*opts_map)[key] = trim(opts.substr(pos));
        // It either ends with a trailing , ; or the last key-value pair
        break;
      } else {
        (*opts_map)[key] = trim(opts.substr(pos, sc_pos - pos));
      }
      pos = sc_pos + 1;
    }
  }
  return Status::OK();
}

int RocksDBStore::tryInterpret(const string &key,
                               const string &val,
                               rocksdb::Options &opt)
{
  if (key == "compaction_threads") {
    std::string err;
    int f = strict_iecstrtoll(val, &err);
    if (!err.empty())
      return -EINVAL;
    //Low priority threadpool is used for compaction
    opt.env->SetBackgroundThreads(f, rocksdb::Env::Priority::LOW);
  } else if (key == "flusher_threads") {
    std::string err;
    int f = strict_iecstrtoll(val, &err);
    if (!err.empty())
      return -EINVAL;
    //High priority threadpool is used for flusher
    opt.env->SetBackgroundThreads(f, rocksdb::Env::Priority::HIGH);
  } else if (key == "compact_on_mount") {
    int ret = string2bool(val, compact_on_mount);
    if
(ret != 0) return ret; } else if (key == "disableWAL") { int ret = string2bool(val, disableWAL); if (ret != 0) return ret; } else { //unrecognize config options. return -EINVAL; } return 0; } int RocksDBStore::ParseOptionsFromString(const string &opt_str, rocksdb::Options &opt) { return ParseOptionsFromStringStatic(cct, opt_str, opt, [&](const string& k, const string& v, rocksdb::Options& o) { return tryInterpret(k, v, o); } ); } int RocksDBStore::ParseOptionsFromStringStatic( CephContext *cct, const string& opt_str, rocksdb::Options& opt, function interp) { // keep aligned with func tryInterpret const set need_interp_keys = {"compaction_threads", "flusher_threads", "compact_on_mount", "disableWAL"}; rocksdb::Status status; std::unordered_map str_map; status = StringToMap(opt_str, &str_map); if (!status.ok()) { dout(5) << __func__ << " error '" << status.getState() << "' while parsing options '" << opt_str << "'" << dendl; return -EINVAL; } for (auto it = str_map.begin(); it != str_map.end(); ++it) { string this_opt = it->first + "=" + it->second; rocksdb::Status status = rocksdb::GetOptionsFromString(opt, this_opt, &opt); int r = 0; if (!status.ok()) { if (interp != nullptr) { r = interp(it->first, it->second, opt); } else if (!need_interp_keys.count(it->first)) { r = -1; } if (r < 0) { derr << status.ToString() << dendl; return -EINVAL; } } lgeneric_dout(cct, 1) << " set rocksdb option " << it->first << " = " << it->second << dendl; } return 0; } int RocksDBStore::init(string _options_str) { options_str = _options_str; rocksdb::Options opt; //try parse options if (options_str.length()) { int r = ParseOptionsFromString(options_str, opt); if (r != 0) { return -EINVAL; } } return 0; } int RocksDBStore::create_db_dir() { if (env) { unique_ptr dir; env->NewDirectory(path, &dir); } else { if (!fs::exists(path)) { std::error_code ec; if (!fs::create_directory(path, ec)) { derr << __func__ << " failed to create " << path << ": " << ec.message() << dendl; return -ec.value(); } fs::permissions(path, fs::perms::owner_all | fs::perms::group_read | fs::perms::group_exec | fs::perms::others_read | fs::perms::others_exec); } } return 0; } int RocksDBStore::install_cf_mergeop( const string &key_prefix, rocksdb::ColumnFamilyOptions *cf_opt) { ceph_assert(cf_opt != nullptr); cf_opt->merge_operator.reset(); for (auto& i : merge_ops) { if (i.first == key_prefix) { cf_opt->merge_operator.reset(new MergeOperatorLinker(i.second)); } } return 0; } int RocksDBStore::create_and_open(ostream &out, const std::string& cfs) { int r = create_db_dir(); if (r < 0) return r; return do_open(out, true, false, cfs); } std::shared_ptr RocksDBStore::create_block_cache( const std::string& cache_type, size_t cache_size, double cache_prio_high) { std::shared_ptr cache; auto shard_bits = cct->_conf->rocksdb_cache_shard_bits; if (cache_type == "binned_lru") { cache = rocksdb_cache::NewBinnedLRUCache(cct, cache_size, shard_bits, false, cache_prio_high); } else if (cache_type == "lru") { cache = rocksdb::NewLRUCache(cache_size, shard_bits); } else if (cache_type == "clock") { cache = rocksdb::NewClockCache(cache_size, shard_bits); if (!cache) { derr << "rocksdb_cache_type '" << cache << "' chosen, but RocksDB not compiled with LibTBB. 
" << dendl; } } else { derr << "unrecognized rocksdb_cache_type '" << cache_type << "'" << dendl; } return cache; } int RocksDBStore::load_rocksdb_options(bool create_if_missing, rocksdb::Options& opt) { rocksdb::Status status; if (options_str.length()) { int r = ParseOptionsFromString(options_str, opt); if (r != 0) { return -EINVAL; } } if (cct->_conf->rocksdb_perf) { dbstats = rocksdb::CreateDBStatistics(); opt.statistics = dbstats; } opt.create_if_missing = create_if_missing; if (kv_options.count("separate_wal_dir")) { opt.wal_dir = path + ".wal"; } // Since ceph::for_each_substr doesn't return a value and // std::stoull does throw, we may as well just catch everything here. try { if (kv_options.count("db_paths")) { list paths; get_str_list(kv_options["db_paths"], "; \t", paths); for (auto& p : paths) { size_t pos = p.find(','); if (pos == std::string::npos) { derr << __func__ << " invalid db path item " << p << " in " << kv_options["db_paths"] << dendl; return -EINVAL; } string path = p.substr(0, pos); string size_str = p.substr(pos + 1); uint64_t size = atoll(size_str.c_str()); if (!size) { derr << __func__ << " invalid db path item " << p << " in " << kv_options["db_paths"] << dendl; return -EINVAL; } opt.db_paths.push_back(rocksdb::DbPath(path, size)); dout(10) << __func__ << " db_path " << path << " size " << size << dendl; } } } catch (const std::system_error& e) { return -e.code().value(); } if (cct->_conf->rocksdb_log_to_ceph_log) { opt.info_log.reset(new CephRocksdbLogger(cct)); } if (priv) { dout(10) << __func__ << " using custom Env " << priv << dendl; opt.env = static_cast(priv); } else { env = opt.env; } opt.env->SetAllowNonOwnerAccess(false); // caches if (!set_cache_flag) { cache_size = cct->_conf->rocksdb_cache_size; } uint64_t row_cache_size = cache_size * cct->_conf->rocksdb_cache_row_ratio; uint64_t block_cache_size = cache_size - row_cache_size; bbt_opts.block_cache = create_block_cache(cct->_conf->rocksdb_cache_type, block_cache_size); if (!bbt_opts.block_cache) { return -EINVAL; } bbt_opts.block_size = cct->_conf->rocksdb_block_size; if (row_cache_size > 0) opt.row_cache = rocksdb::NewLRUCache(row_cache_size, cct->_conf->rocksdb_cache_shard_bits); uint64_t bloom_bits = cct->_conf.get_val("rocksdb_bloom_bits_per_key"); if (bloom_bits > 0) { dout(10) << __func__ << " set bloom filter bits per key to " << bloom_bits << dendl; bbt_opts.filter_policy.reset(rocksdb::NewBloomFilterPolicy(bloom_bits)); } using std::placeholders::_1; if (cct->_conf.with_val("rocksdb_index_type", std::bind(std::equal_to(), _1, "binary_search"))) bbt_opts.index_type = rocksdb::BlockBasedTableOptions::IndexType::kBinarySearch; if (cct->_conf.with_val("rocksdb_index_type", std::bind(std::equal_to(), _1, "hash_search"))) bbt_opts.index_type = rocksdb::BlockBasedTableOptions::IndexType::kHashSearch; if (cct->_conf.with_val("rocksdb_index_type", std::bind(std::equal_to(), _1, "two_level"))) bbt_opts.index_type = rocksdb::BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; if (!bbt_opts.no_block_cache) { bbt_opts.cache_index_and_filter_blocks = cct->_conf.get_val("rocksdb_cache_index_and_filter_blocks"); bbt_opts.cache_index_and_filter_blocks_with_high_priority = cct->_conf.get_val("rocksdb_cache_index_and_filter_blocks_with_high_priority"); bbt_opts.pin_l0_filter_and_index_blocks_in_cache = cct->_conf.get_val("rocksdb_pin_l0_filter_and_index_blocks_in_cache"); } bbt_opts.partition_filters = cct->_conf.get_val("rocksdb_partition_filters"); if (cct->_conf.get_val("rocksdb_metadata_block_size") 
> 0) bbt_opts.metadata_block_size = cct->_conf.get_val("rocksdb_metadata_block_size"); opt.table_factory.reset(rocksdb::NewBlockBasedTableFactory(bbt_opts)); dout(10) << __func__ << " block size " << cct->_conf->rocksdb_block_size << ", block_cache size " << byte_u_t(block_cache_size) << ", row_cache size " << byte_u_t(row_cache_size) << "; shards " << (1 << cct->_conf->rocksdb_cache_shard_bits) << ", type " << cct->_conf->rocksdb_cache_type << dendl; opt.merge_operator.reset(new MergeOperatorRouter(*this)); comparator = opt.comparator; return 0; } void RocksDBStore::add_column_family(const std::string& cf_name, uint32_t hash_l, uint32_t hash_h, size_t shard_idx, rocksdb::ColumnFamilyHandle *handle) { dout(10) << __func__ << " column_name=" << cf_name << " shard_idx=" << shard_idx << " hash_l=" << hash_l << " hash_h=" << hash_h << " handle=" << (void*) handle << dendl; bool exists = cf_handles.count(cf_name) > 0; auto& column = cf_handles[cf_name]; if (exists) { ceph_assert(hash_l == column.hash_l); ceph_assert(hash_h == column.hash_h); } else { ceph_assert(hash_l < hash_h); column.hash_l = hash_l; column.hash_h = hash_h; } if (column.handles.size() <= shard_idx) column.handles.resize(shard_idx + 1); column.handles[shard_idx] = handle; cf_ids_to_prefix.emplace(handle->GetID(), cf_name); } bool RocksDBStore::is_column_family(const std::string& prefix) { return cf_handles.count(prefix); } std::string_view RocksDBStore::get_key_hash_view(const prefix_shards& shards, const char* key, const size_t keylen) { uint32_t hash_l = std::min(shards.hash_l, keylen); uint32_t hash_h = std::min(shards.hash_h, keylen); return { key + hash_l, hash_h - hash_l }; } rocksdb::ColumnFamilyHandle *RocksDBStore::get_key_cf(const prefix_shards& shards, const char* key, const size_t keylen) { auto sv = get_key_hash_view(shards, key, keylen); uint32_t hash = ceph_str_hash_rjenkins(sv.data(), sv.size()); return shards.handles[hash % shards.handles.size()]; } rocksdb::ColumnFamilyHandle *RocksDBStore::get_cf_handle(const std::string& prefix, const std::string& key) { auto iter = cf_handles.find(prefix); if (iter == cf_handles.end()) { return nullptr; } else { if (iter->second.handles.size() == 1) { return iter->second.handles[0]; } else { return get_key_cf(iter->second, key.data(), key.size()); } } } rocksdb::ColumnFamilyHandle *RocksDBStore::get_cf_handle(const std::string& prefix, const char* key, size_t keylen) { auto iter = cf_handles.find(prefix); if (iter == cf_handles.end()) { return nullptr; } else { if (iter->second.handles.size() == 1) { return iter->second.handles[0]; } else { return get_key_cf(iter->second, key, keylen); } } } /** * If the specified IteratorBounds arg has both an upper and a lower bound defined, and they have equal placement hash * strings, we can be sure that the entire iteration range exists in a single CF. In that case, we return the relevant * CF handle. In all other cases, we return a nullptr to indicate that the specified bounds cannot necessarily be mapped * to a single CF. 
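 *
 * Illustrative example: for a column sharded on key bytes [0, 8) (hash_l=0,
 * hash_h=8), the bounds "ABCDEFGH0000" and "ABCDEFGH9999" yield the same
 * placement view "ABCDEFGH", so the whole range maps to a single shard and
 * that shard's handle is returned; bounds with differing views return nullptr.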
*/ rocksdb::ColumnFamilyHandle *RocksDBStore::check_cf_handle_bounds(const cf_handles_iterator& iter, const IteratorBounds& bounds) { if (!bounds.lower_bound || !bounds.upper_bound) { return nullptr; } ceph_assert(iter != cf_handles.end()); ceph_assert(iter->second.handles.size() != 1); if (iter->second.hash_l != 0) { return nullptr; } auto lower_bound_hash_str = get_key_hash_view(iter->second, bounds.lower_bound->data(), bounds.lower_bound->size()); auto upper_bound_hash_str = get_key_hash_view(iter->second, bounds.upper_bound->data(), bounds.upper_bound->size()); if (lower_bound_hash_str == upper_bound_hash_str) { auto key = *bounds.lower_bound; return get_key_cf(iter->second, key.data(), key.size()); } else { return nullptr; } } /** * Definition of sharding: * space-separated list of: column_def [ '=' options ] * column_def := column_name '(' shard_count ')' * column_def := column_name '(' shard_count ',' hash_begin '-' ')' * column_def := column_name '(' shard_count ',' hash_begin '-' hash_end ')' * I=write_buffer_size=1048576 O(6) m(7,10-) prefix(4,0-10)=disable_auto_compactions=true,max_bytes_for_level_base=1048576 */ bool RocksDBStore::parse_sharding_def(const std::string_view text_def_in, std::vector& sharding_def, char const* *error_position, std::string *error_msg) { std::string_view text_def = text_def_in; char const* error_position_local = nullptr; std::string error_msg_local; if (error_position == nullptr) { error_position = &error_position_local; } *error_position = nullptr; if (error_msg == nullptr) { error_msg = &error_msg_local; error_msg->clear(); } sharding_def.clear(); while (!text_def.empty()) { std::string_view options; std::string_view name; size_t shard_cnt = 1; uint32_t l_bound = 0; uint32_t h_bound = std::numeric_limits::max(); std::string_view column_def; size_t spos = text_def.find(' '); if (spos == std::string_view::npos) { column_def = text_def; text_def = std::string_view(text_def.end(), 0); } else { column_def = text_def.substr(0, spos); text_def = text_def.substr(spos + 1); } size_t eqpos = column_def.find('='); if (eqpos != std::string_view::npos) { options = column_def.substr(eqpos + 1); column_def = column_def.substr(0, eqpos); } size_t bpos = column_def.find('('); if (bpos != std::string_view::npos) { name = column_def.substr(0, bpos); const char* nptr = &column_def[bpos + 1]; char* endptr; shard_cnt = strtol(nptr, &endptr, 10); if (nptr == endptr) { *error_position = nptr; *error_msg = "expecting integer"; break; } nptr = endptr; if (*nptr == ',') { nptr++; l_bound = strtol(nptr, &endptr, 10); if (nptr == endptr) { *error_position = nptr; *error_msg = "expecting integer"; break; } nptr = endptr; if (*nptr != '-') { *error_position = nptr; *error_msg = "expecting '-'"; break; } nptr++; h_bound = strtol(nptr, &endptr, 10); if (nptr == endptr) { h_bound = std::numeric_limits::max(); } nptr = endptr; } if (*nptr != ')') { *error_position = nptr; *error_msg = "expecting ')'"; break; } } else { name = column_def; } sharding_def.emplace_back(std::string(name), shard_cnt, std::string(options), l_bound, h_bound); } return *error_position == nullptr; } void RocksDBStore::sharding_def_to_columns(const std::vector& sharding_def, std::vector& columns) { columns.clear(); for (size_t i = 0; i < sharding_def.size(); i++) { if (sharding_def[i].shard_cnt == 1) { columns.push_back(sharding_def[i].name); } else { for (size_t j = 0; j < sharding_def[i].shard_cnt; j++) { columns.push_back(sharding_def[i].name + "-" + std::to_string(j)); } } } } int 
RocksDBStore::create_shards(const rocksdb::Options& opt, const std::vector& sharding_def) { for (auto& p : sharding_def) { // copy default CF settings, block cache, merge operators as // the base for new CF rocksdb::ColumnFamilyOptions cf_opt(opt); rocksdb::Status status; // apply options to column family int r = update_column_family_options(p.name, p.options, &cf_opt); if (r != 0) { return r; } for (size_t idx = 0; idx < p.shard_cnt; idx++) { std::string cf_name; if (p.shard_cnt == 1) cf_name = p.name; else cf_name = p.name + "-" + std::to_string(idx); rocksdb::ColumnFamilyHandle *cf; status = db->CreateColumnFamily(cf_opt, cf_name, &cf); if (!status.ok()) { derr << __func__ << " Failed to create rocksdb column family: " << cf_name << dendl; return -EINVAL; } // store the new CF handle add_column_family(p.name, p.hash_l, p.hash_h, idx, cf); } } return 0; } int RocksDBStore::apply_sharding(const rocksdb::Options& opt, const std::string& sharding_text) { // create and open column families if (!sharding_text.empty()) { bool b; int r; rocksdb::Status status; std::vector sharding_def; char const* error_position; std::string error_msg; b = parse_sharding_def(sharding_text, sharding_def, &error_position, &error_msg); if (!b) { dout(1) << __func__ << " bad sharding: " << dendl; dout(1) << __func__ << sharding_text << dendl; dout(1) << __func__ << std::string(error_position - &sharding_text[0], ' ') << "^" << error_msg << dendl; return -EINVAL; } r = create_shards(opt, sharding_def); if (r != 0 ) { derr << __func__ << " create_shards failed error=" << r << dendl; return r; } opt.env->CreateDir(sharding_def_dir); status = rocksdb::WriteStringToFile(opt.env, sharding_text, sharding_def_file, true); if (!status.ok()) { derr << __func__ << " cannot write to " << sharding_def_file << dendl; return -EIO; } } else { opt.env->DeleteFile(sharding_def_file); } return 0; } // linking to rocksdb function defined in options_helper.cc // it can parse nested params like "nested_opt={opt1=1;opt2=2}" extern rocksdb::Status rocksdb::StringToMap(const std::string& opts_str, std::unordered_map* opts_map); // Splits column family options from single string into name->value column_opts_map. // The split is done using RocksDB parser that understands "{" and "}", so it // properly extracts compound options. // If non-RocksDB option "block_cache" is defined it is extracted to block_cache_opt. int RocksDBStore::split_column_family_options(const std::string& options, std::unordered_map* opt_map, std::string* block_cache_opt) { dout(20) << __func__ << " options=" << options << dendl; rocksdb::Status status = rocksdb::StringToMap(options, opt_map); if (!status.ok()) { dout(5) << __func__ << " error '" << status.getState() << "' while parsing options '" << options << "'" << dendl; return -EINVAL; } // if "block_cache" option exists, then move it to separate string if (auto it = opt_map->find("block_cache"); it != opt_map->end()) { *block_cache_opt = it->second; opt_map->erase(it); } else { block_cache_opt->clear(); } return 0; } // Updates column family options. // Take options from more_options and apply them to cf_opt. // Allowed options are exactly the same as allowed for column families in RocksDB. // Ceph addition is "block_cache" option that is translated to block_cache and // allows to specialize separate block cache for O column family. 
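// Illustrative example (syntax inferred from split_column_family_options()
// and apply_block_cache_options() below): an options string such as
//   "write_buffer_size=33554432,block_cache={type=binned_lru;size=1G;high_ratio=0.5}"
// has its compound "block_cache" value split out and applied separately,
// while the remaining options go through RocksDB's own option parser.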
// // base_name - name of column without shard suffix: "-"+number // options - additional options to apply // cf_opt - column family options to update int RocksDBStore::update_column_family_options(const std::string& base_name, const std::string& more_options, rocksdb::ColumnFamilyOptions* cf_opt) { std::unordered_map options_map; std::string block_cache_opt; rocksdb::Status status; int r = split_column_family_options(more_options, &options_map, &block_cache_opt); if (r != 0) { dout(5) << __func__ << " failed to parse options; column family=" << base_name << " options=" << more_options << dendl; return r; } status = rocksdb::GetColumnFamilyOptionsFromMap(*cf_opt, options_map, cf_opt); if (!status.ok()) { dout(5) << __func__ << " invalid column family optionsp; column family=" << base_name << " options=" << more_options << dendl; dout(5) << __func__ << " RocksDB error='" << status.getState() << "'" << dendl; return -EINVAL; } if (base_name != rocksdb::kDefaultColumnFamilyName) { // default cf has its merge operator defined in load_rocksdb_options, should not override it install_cf_mergeop(base_name, cf_opt); } if (!block_cache_opt.empty()) { r = apply_block_cache_options(base_name, block_cache_opt, cf_opt); if (r != 0) { // apply_block_cache_options already does all necessary douts return r; } } // Set Compact on Deletion Factory if (cct->_conf->rocksdb_cf_compact_on_deletion) { size_t sliding_window = cct->_conf->rocksdb_cf_compact_on_deletion_sliding_window; size_t trigger = cct->_conf->rocksdb_cf_compact_on_deletion_trigger; cf_opt->table_properties_collector_factories.emplace_back( rocksdb::NewCompactOnDeletionCollectorFactory(sliding_window, trigger)); } return 0; } int RocksDBStore::apply_block_cache_options(const std::string& column_name, const std::string& block_cache_opt, rocksdb::ColumnFamilyOptions* cf_opt) { rocksdb::Status status; std::unordered_map cache_options_map; status = rocksdb::StringToMap(block_cache_opt, &cache_options_map); if (!status.ok()) { dout(5) << __func__ << " invalid block cache options; column=" << column_name << " options=" << block_cache_opt << dendl; dout(5) << __func__ << " RocksDB error='" << status.getState() << "'" << dendl; return -EINVAL; } bool require_new_block_cache = false; std::string cache_type = cct->_conf->rocksdb_cache_type; if (const auto it = cache_options_map.find("type"); it != cache_options_map.end()) { cache_type = it->second; cache_options_map.erase(it); require_new_block_cache = true; } size_t cache_size = cct->_conf->rocksdb_cache_size; if (auto it = cache_options_map.find("size"); it != cache_options_map.end()) { std::string error; cache_size = strict_iecstrtoll(it->second.c_str(), &error); if (!error.empty()) { dout(10) << __func__ << " invalid size: '" << it->second << "'" << dendl; return -EINVAL; } cache_options_map.erase(it); require_new_block_cache = true; } double high_pri_pool_ratio = 0.0; if (auto it = cache_options_map.find("high_ratio"); it != cache_options_map.end()) { std::string error; high_pri_pool_ratio = strict_strtod(it->second.c_str(), &error); if (!error.empty()) { dout(10) << __func__ << " invalid high_pri (float): '" << it->second << "'" << dendl; return -EINVAL; } cache_options_map.erase(it); require_new_block_cache = true; } rocksdb::BlockBasedTableOptions column_bbt_opts; status = GetBlockBasedTableOptionsFromMap(bbt_opts, cache_options_map, &column_bbt_opts); if (!status.ok()) { dout(5) << __func__ << " invalid block cache options; column=" << column_name << " options=" << block_cache_opt << dendl; 
dout(5) << __func__ << " RocksDB error='" << status.getState() << "'" << dendl; return -EINVAL; } std::shared_ptr block_cache; if (column_bbt_opts.no_block_cache) { // clear all settings except no_block_cache // rocksdb does not like then column_bbt_opts = rocksdb::BlockBasedTableOptions(); column_bbt_opts.no_block_cache = true; } else { if (require_new_block_cache) { block_cache = create_block_cache(cache_type, cache_size, high_pri_pool_ratio); if (!block_cache) { dout(5) << __func__ << " failed to create block cache for params: " << block_cache_opt << dendl; return -EINVAL; } } else { block_cache = bbt_opts.block_cache; } } column_bbt_opts.block_cache = block_cache; cf_bbt_opts[column_name] = column_bbt_opts; cf_opt->table_factory.reset(NewBlockBasedTableFactory(cf_bbt_opts[column_name])); return 0; } int RocksDBStore::verify_sharding(const rocksdb::Options& opt, std::vector& existing_cfs, std::vector >& existing_cfs_shard, std::vector& missing_cfs, std::vector >& missing_cfs_shard) { rocksdb::Status status; std::string stored_sharding_text; status = opt.env->FileExists(sharding_def_file); if (status.ok()) { status = rocksdb::ReadFileToString(opt.env, sharding_def_file, &stored_sharding_text); if(!status.ok()) { derr << __func__ << " cannot read from " << sharding_def_file << dendl; return -EIO; } dout(20) << __func__ << " sharding=" << stored_sharding_text << dendl; } else { dout(30) << __func__ << " no sharding" << dendl; //no "sharding_def" present } //check if sharding_def matches stored_sharding_def std::vector stored_sharding_def; parse_sharding_def(stored_sharding_text, stored_sharding_def); std::sort(stored_sharding_def.begin(), stored_sharding_def.end(), [](ColumnFamily& a, ColumnFamily& b) { return a.name < b.name; } ); std::vector rocksdb_cfs; status = rocksdb::DB::ListColumnFamilies(rocksdb::DBOptions(opt), path, &rocksdb_cfs); if (!status.ok()) { derr << __func__ << " unable to list column families: " << status.ToString() << dendl; return -EIO; } dout(5) << __func__ << " column families from rocksdb: " << rocksdb_cfs << dendl; auto emplace_cf = [&] (const RocksDBStore::ColumnFamily& column, int32_t shard_id, const std::string& shard_name, const rocksdb::ColumnFamilyOptions& opt) { if (std::find(rocksdb_cfs.begin(), rocksdb_cfs.end(), shard_name) != rocksdb_cfs.end()) { existing_cfs.emplace_back(shard_name, opt); existing_cfs_shard.emplace_back(shard_id, column); } else { missing_cfs.emplace_back(shard_name, opt); missing_cfs_shard.emplace_back(shard_id, column); } }; for (auto& column : stored_sharding_def) { rocksdb::ColumnFamilyOptions cf_opt(opt); int r = update_column_family_options(column.name, column.options, &cf_opt); if (r != 0) { return r; } if (column.shard_cnt == 1) { emplace_cf(column, 0, column.name, cf_opt); } else { for (size_t i = 0; i < column.shard_cnt; i++) { std::string cf_name = column.name + "-" + std::to_string(i); emplace_cf(column, i, cf_name, cf_opt); } } } existing_cfs.emplace_back("default", opt); if (existing_cfs.size() != rocksdb_cfs.size()) { std::vector columns_from_stored; sharding_def_to_columns(stored_sharding_def, columns_from_stored); derr << __func__ << " extra columns in rocksdb. 
rocksdb columns = " << rocksdb_cfs << " target columns = " << columns_from_stored << dendl; return -EIO; } return 0; } std::ostream& operator<<(std::ostream& out, const RocksDBStore::ColumnFamily& cf) { out << "("; out << cf.name; out << ","; out << cf.shard_cnt; out << ","; out << cf.hash_l; out << "-"; if (cf.hash_h != std::numeric_limits::max()) { out << cf.hash_h; } out << ","; out << cf.options; out << ")"; return out; } int RocksDBStore::do_open(ostream &out, bool create_if_missing, bool open_readonly, const std::string& sharding_text) { ceph_assert(!(create_if_missing && open_readonly)); rocksdb::Options opt; int r = load_rocksdb_options(create_if_missing, opt); if (r) { dout(1) << __func__ << " load rocksdb options failed" << dendl; return r; } rocksdb::Status status; if (create_if_missing) { status = rocksdb::DB::Open(opt, path, &db); if (!status.ok()) { derr << status.ToString() << dendl; return -EINVAL; } r = apply_sharding(opt, sharding_text); if (r < 0) { return r; } default_cf = db->DefaultColumnFamily(); } else { std::vector existing_cfs; std::vector > existing_cfs_shard; std::vector missing_cfs; std::vector > missing_cfs_shard; r = verify_sharding(opt, existing_cfs, existing_cfs_shard, missing_cfs, missing_cfs_shard); if (r < 0) { return r; } std::string sharding_recreate_text; status = rocksdb::ReadFileToString(opt.env, sharding_recreate, &sharding_recreate_text); bool recreate_mode = status.ok() && sharding_recreate_text == "1"; ceph_assert(!recreate_mode || !open_readonly); if (recreate_mode == false && missing_cfs.size() != 0) { // We do not accept when there are missing column families, except case that we are during resharding. // We can get into this case if resharding was interrupted. It gives a chance to continue. // Opening DB is only allowed in read-only mode. 
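      // Illustrative note: the "sharding/recreate_columns" marker is written
      // with the value "1" by repair() further down in this file; only when
      // that marker is present may do_open() recreate missing shard column
      // families instead of refusing to open.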
if (open_readonly == false && std::find_if(missing_cfs.begin(), missing_cfs.end(), [](const rocksdb::ColumnFamilyDescriptor& c) { return c.name == resharding_column_lock; } ) != missing_cfs.end()) { derr << __func__ << " missing column families: " << missing_cfs_shard << dendl; return -EIO; } } if (existing_cfs.empty()) { // no column families if (open_readonly) { status = rocksdb::DB::OpenForReadOnly(opt, path, &db); } else { status = rocksdb::DB::Open(opt, path, &db); } if (!status.ok()) { derr << status.ToString() << dendl; return -EINVAL; } default_cf = db->DefaultColumnFamily(); } else { std::vector handles; if (open_readonly) { status = rocksdb::DB::OpenForReadOnly(rocksdb::DBOptions(opt), path, existing_cfs, &handles, &db); } else { status = rocksdb::DB::Open(rocksdb::DBOptions(opt), path, existing_cfs, &handles, &db); } if (!status.ok()) { derr << status.ToString() << dendl; return -EINVAL; } ceph_assert(existing_cfs.size() == existing_cfs_shard.size() + 1); ceph_assert(handles.size() == existing_cfs.size()); dout(10) << __func__ << " existing_cfs=" << existing_cfs.size() << dendl; for (size_t i = 0; i < existing_cfs_shard.size(); i++) { add_column_family(existing_cfs_shard[i].second.name, existing_cfs_shard[i].second.hash_l, existing_cfs_shard[i].second.hash_h, existing_cfs_shard[i].first, handles[i]); } default_cf = handles[handles.size() - 1]; must_close_default_cf = true; if (missing_cfs.size() > 0 && std::find_if(missing_cfs.begin(), missing_cfs.end(), [](const rocksdb::ColumnFamilyDescriptor& c) { return c.name == resharding_column_lock; } ) == missing_cfs.end()) { dout(10) << __func__ << " missing_cfs=" << missing_cfs.size() << dendl; ceph_assert(recreate_mode); ceph_assert(missing_cfs.size() == missing_cfs_shard.size()); for (size_t i = 0; i < missing_cfs.size(); i++) { rocksdb::ColumnFamilyHandle *cf; status = db->CreateColumnFamily(missing_cfs[i].options, missing_cfs[i].name, &cf); if (!status.ok()) { derr << __func__ << " Failed to create rocksdb column family: " << missing_cfs[i].name << dendl; return -EINVAL; } add_column_family(missing_cfs_shard[i].second.name, missing_cfs_shard[i].second.hash_l, missing_cfs_shard[i].second.hash_h, missing_cfs_shard[i].first, cf); } opt.env->DeleteFile(sharding_recreate); } } } ceph_assert(default_cf != nullptr); PerfCountersBuilder plb(cct, "rocksdb", l_rocksdb_first, l_rocksdb_last); plb.add_time_avg(l_rocksdb_get_latency, "get_latency", "Get latency"); plb.add_time_avg(l_rocksdb_submit_latency, "submit_latency", "Submit Latency"); plb.add_time_avg(l_rocksdb_submit_sync_latency, "submit_sync_latency", "Submit Sync Latency"); plb.add_u64_counter(l_rocksdb_compact, "compact", "Compactions"); plb.add_u64_counter(l_rocksdb_compact_range, "compact_range", "Compactions by range"); plb.add_u64_counter(l_rocksdb_compact_queue_merge, "compact_queue_merge", "Mergings of ranges in compaction queue"); plb.add_u64(l_rocksdb_compact_queue_len, "compact_queue_len", "Length of compaction queue"); plb.add_time_avg(l_rocksdb_write_wal_time, "rocksdb_write_wal_time", "Rocksdb write wal time"); plb.add_time_avg(l_rocksdb_write_memtable_time, "rocksdb_write_memtable_time", "Rocksdb write memtable time"); plb.add_time_avg(l_rocksdb_write_delay_time, "rocksdb_write_delay_time", "Rocksdb write delay time"); plb.add_time_avg(l_rocksdb_write_pre_and_post_process_time, "rocksdb_write_pre_and_post_time", "total time spent on writing a record, excluding write process"); logger = plb.create_perf_counters(); cct->get_perfcounters_collection()->add(logger); if 
(compact_on_mount) { derr << "Compacting rocksdb store..." << dendl; compact(); derr << "Finished compacting rocksdb store" << dendl; } return 0; } int RocksDBStore::_test_init(const string& dir) { rocksdb::Options options; options.create_if_missing = true; rocksdb::DB *db; rocksdb::Status status = rocksdb::DB::Open(options, dir, &db); delete db; db = nullptr; return status.ok() ? 0 : -EIO; } RocksDBStore::~RocksDBStore() { close(); if (priv) { delete static_cast(priv); } } void RocksDBStore::close() { // stop compaction thread compact_queue_lock.lock(); if (compact_thread.is_started()) { dout(1) << __func__ << " waiting for compaction thread to stop" << dendl; compact_queue_stop = true; compact_queue_cond.notify_all(); compact_queue_lock.unlock(); compact_thread.join(); dout(1) << __func__ << " compaction thread to stopped" << dendl; } else { compact_queue_lock.unlock(); } if (logger) { cct->get_perfcounters_collection()->remove(logger); delete logger; logger = nullptr; } // Ensure db is destroyed before dependent db_cache and filterpolicy for (auto& p : cf_handles) { for (size_t i = 0; i < p.second.handles.size(); i++) { db->DestroyColumnFamilyHandle(p.second.handles[i]); } } cf_handles.clear(); if (must_close_default_cf) { db->DestroyColumnFamilyHandle(default_cf); must_close_default_cf = false; } default_cf = nullptr; delete db; db = nullptr; } int RocksDBStore::repair(std::ostream &out) { rocksdb::Status status; rocksdb::Options opt; int r = load_rocksdb_options(false, opt); if (r) { dout(1) << __func__ << " load rocksdb options failed" << dendl; out << "load rocksdb options failed" << std::endl; return r; } //need to save sharding definition, repairDB will delete files it does not know std::string stored_sharding_text; status = opt.env->FileExists(sharding_def_file); if (status.ok()) { status = rocksdb::ReadFileToString(opt.env, sharding_def_file, &stored_sharding_text); if (!status.ok()) { stored_sharding_text.clear(); } } dout(10) << __func__ << " stored_sharding: " << stored_sharding_text << dendl; status = rocksdb::RepairDB(path, opt); bool repaired = status.ok(); if (!stored_sharding_text.empty()) { //recreate markers even if repair failed opt.env->CreateDir(sharding_def_dir); status = rocksdb::WriteStringToFile(opt.env, stored_sharding_text, sharding_def_file, true); if (!status.ok()) { derr << __func__ << " cannot write to " << sharding_def_file << dendl; return -1; } status = rocksdb::WriteStringToFile(opt.env, "1", sharding_recreate, true); if (!status.ok()) { derr << __func__ << " cannot write to " << sharding_recreate << dendl; return -1; } // fiinalize sharding recreate if (do_open(out, false, false)) { derr << __func__ << " cannot finalize repair" << dendl; return -1; } close(); } if (repaired && status.ok()) { return 0; } else { out << "repair rocksdb failed : " << status.ToString() << std::endl; return -1; } } void RocksDBStore::split_stats(const std::string &s, char delim, std::vector &elems) { std::stringstream ss; ss.str(s); std::string item; while (std::getline(ss, item, delim)) { elems.push_back(item); } } bool RocksDBStore::get_property( const std::string &property, uint64_t *out) { return db->GetIntProperty(property, out); } int64_t RocksDBStore::estimate_prefix_size(const string& prefix, const string& key_prefix) { uint64_t size = 0; auto p_iter = cf_handles.find(prefix); if (p_iter != cf_handles.end()) { for (auto cf : p_iter->second.handles) { uint64_t s = 0; string start = key_prefix + string(1, '\x00'); string limit = key_prefix + 
string("\xff\xff\xff\xff"); rocksdb::Range r(start, limit); db->GetApproximateSizes(cf, &r, 1, &s); size += s; } } else { string start = combine_strings(prefix , key_prefix); string limit = combine_strings(prefix , key_prefix + "\xff\xff\xff\xff"); rocksdb::Range r(start, limit); db->GetApproximateSizes(default_cf, &r, 1, &size); } return size; } void RocksDBStore::get_statistics(Formatter *f) { if (!cct->_conf->rocksdb_perf) { dout(20) << __func__ << " RocksDB perf is disabled, can't probe for stats" << dendl; return; } if (cct->_conf->rocksdb_collect_compaction_stats) { std::string stat_str; bool status = db->GetProperty("rocksdb.stats", &stat_str); if (status) { f->open_object_section("rocksdb_statistics"); f->dump_string("rocksdb_compaction_statistics", ""); vector stats; split_stats(stat_str, '\n', stats); for (auto st :stats) { f->dump_string("", st); } f->close_section(); } } if (cct->_conf->rocksdb_collect_extended_stats) { if (dbstats) { f->open_object_section("rocksdb_extended_statistics"); string stat_str = dbstats->ToString(); vector stats; split_stats(stat_str, '\n', stats); f->dump_string("rocksdb_extended_statistics", ""); for (auto st :stats) { f->dump_string(".", st); } f->close_section(); } f->open_object_section("rocksdbstore_perf_counters"); logger->dump_formatted(f, false, false); f->close_section(); } if (cct->_conf->rocksdb_collect_memory_stats) { f->open_object_section("rocksdb_memtable_statistics"); std::string str; if (!bbt_opts.no_block_cache) { str.append(stringify(bbt_opts.block_cache->GetUsage())); f->dump_string("block_cache_usage", str.data()); str.clear(); str.append(stringify(bbt_opts.block_cache->GetPinnedUsage())); f->dump_string("block_cache_pinned_blocks_usage", str); str.clear(); } db->GetProperty("rocksdb.cur-size-all-mem-tables", &str); f->dump_string("rocksdb_memtable_usage", str); str.clear(); db->GetProperty("rocksdb.estimate-table-readers-mem", &str); f->dump_string("rocksdb_index_filter_blocks_usage", str); f->close_section(); } } struct RocksDBStore::RocksWBHandler: public rocksdb::WriteBatch::Handler { RocksWBHandler(const RocksDBStore& db) : db(db) {} const RocksDBStore& db; std::stringstream seen; int num_seen = 0; void dump(const char* op_name, uint32_t column_family_id, const rocksdb::Slice& key_in, const rocksdb::Slice* value = nullptr) { string prefix; string key; ssize_t size = value ? 
value->size() : -1; seen << std::endl << op_name << "("; if (column_family_id == 0) { db.split_key(key_in, &prefix, &key); } else { auto it = db.cf_ids_to_prefix.find(column_family_id); ceph_assert(it != db.cf_ids_to_prefix.end()); prefix = it->second; key = key_in.ToString(); } seen << " prefix = " << prefix; seen << " key = " << pretty_binary_string(key); if (size != -1) seen << " value size = " << std::to_string(size); seen << ")"; num_seen++; } void Put(const rocksdb::Slice& key, const rocksdb::Slice& value) override { dump("Put", 0, key, &value); } rocksdb::Status PutCF(uint32_t column_family_id, const rocksdb::Slice& key, const rocksdb::Slice& value) override { dump("PutCF", column_family_id, key, &value); return rocksdb::Status::OK(); } void SingleDelete(const rocksdb::Slice& key) override { dump("SingleDelete", 0, key); } rocksdb::Status SingleDeleteCF(uint32_t column_family_id, const rocksdb::Slice& key) override { dump("SingleDeleteCF", column_family_id, key); return rocksdb::Status::OK(); } void Delete(const rocksdb::Slice& key) override { dump("Delete", 0, key); } rocksdb::Status DeleteCF(uint32_t column_family_id, const rocksdb::Slice& key) override { dump("DeleteCF", column_family_id, key); return rocksdb::Status::OK(); } void Merge(const rocksdb::Slice& key, const rocksdb::Slice& value) override { dump("Merge", 0, key, &value); } rocksdb::Status MergeCF(uint32_t column_family_id, const rocksdb::Slice& key, const rocksdb::Slice& value) override { dump("MergeCF", column_family_id, key, &value); return rocksdb::Status::OK(); } bool Continue() override { return num_seen < 50; } }; int RocksDBStore::submit_common(rocksdb::WriteOptions& woptions, KeyValueDB::Transaction t) { // enable rocksdb breakdown // considering performance overhead, default is disabled if (cct->_conf->rocksdb_perf) { rocksdb::SetPerfLevel(rocksdb::PerfLevel::kEnableTimeExceptForMutex); rocksdb::get_perf_context()->Reset(); } RocksDBTransactionImpl * _t = static_cast(t.get()); woptions.disableWAL = disableWAL; lgeneric_subdout(cct, rocksdb, 30) << __func__; RocksWBHandler bat_txc(*this); _t->bat.Iterate(&bat_txc); *_dout << " Rocksdb transaction: " << bat_txc.seen.str() << dendl; rocksdb::Status s = db->Write(woptions, &_t->bat); if (!s.ok()) { RocksWBHandler rocks_txc(*this); _t->bat.Iterate(&rocks_txc); derr << __func__ << " error: " << s.ToString() << " code = " << s.code() << " Rocksdb transaction: " << rocks_txc.seen.str() << dendl; } if (cct->_conf->rocksdb_perf) { utime_t write_memtable_time; utime_t write_delay_time; utime_t write_wal_time; utime_t write_pre_and_post_process_time; write_wal_time.set_from_double( static_cast(rocksdb::get_perf_context()->write_wal_time)/1000000000); write_memtable_time.set_from_double( static_cast(rocksdb::get_perf_context()->write_memtable_time)/1000000000); write_delay_time.set_from_double( static_cast(rocksdb::get_perf_context()->write_delay_time)/1000000000); write_pre_and_post_process_time.set_from_double( static_cast(rocksdb::get_perf_context()->write_pre_and_post_process_time)/1000000000); logger->tinc(l_rocksdb_write_memtable_time, write_memtable_time); logger->tinc(l_rocksdb_write_delay_time, write_delay_time); logger->tinc(l_rocksdb_write_wal_time, write_wal_time); logger->tinc(l_rocksdb_write_pre_and_post_process_time, write_pre_and_post_process_time); } return s.ok() ? 
0 : -1; } int RocksDBStore::submit_transaction(KeyValueDB::Transaction t) { utime_t start = ceph_clock_now(); rocksdb::WriteOptions woptions; woptions.sync = false; int result = submit_common(woptions, t); utime_t lat = ceph_clock_now() - start; logger->tinc(l_rocksdb_submit_latency, lat); return result; } int RocksDBStore::submit_transaction_sync(KeyValueDB::Transaction t) { utime_t start = ceph_clock_now(); rocksdb::WriteOptions woptions; // if disableWAL, sync can't set woptions.sync = !disableWAL; int result = submit_common(woptions, t); utime_t lat = ceph_clock_now() - start; logger->tinc(l_rocksdb_submit_sync_latency, lat); return result; } RocksDBStore::RocksDBTransactionImpl::RocksDBTransactionImpl(RocksDBStore *_db) { db = _db; } void RocksDBStore::RocksDBTransactionImpl::put_bat( rocksdb::WriteBatch& bat, rocksdb::ColumnFamilyHandle *cf, const string &key, const bufferlist &to_set_bl) { // bufferlist::c_str() is non-constant, so we can't call c_str() if (to_set_bl.is_contiguous() && to_set_bl.length() > 0) { bat.Put(cf, rocksdb::Slice(key), rocksdb::Slice(to_set_bl.buffers().front().c_str(), to_set_bl.length())); } else { rocksdb::Slice key_slice(key); vector value_slices(to_set_bl.get_num_buffers()); bat.Put(cf, rocksdb::SliceParts(&key_slice, 1), prepare_sliceparts(to_set_bl, &value_slices)); } } void RocksDBStore::RocksDBTransactionImpl::set( const string &prefix, const string &k, const bufferlist &to_set_bl) { auto cf = db->get_cf_handle(prefix, k); if (cf) { put_bat(bat, cf, k, to_set_bl); } else { string key = combine_strings(prefix, k); put_bat(bat, db->default_cf, key, to_set_bl); } } void RocksDBStore::RocksDBTransactionImpl::set( const string &prefix, const char *k, size_t keylen, const bufferlist &to_set_bl) { auto cf = db->get_cf_handle(prefix, k, keylen); if (cf) { string key(k, keylen); // fixme? 
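    // Illustrative note: the copy above is needed because put_bat() takes the
    // key as a const std::string&; a Slice-based overload could presumably
    // avoid it, which appears to be what the "fixme" refers to.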
put_bat(bat, cf, key, to_set_bl); } else { string key; combine_strings(prefix, k, keylen, &key); put_bat(bat, cf, key, to_set_bl); } } void RocksDBStore::RocksDBTransactionImpl::rmkey(const string &prefix, const string &k) { auto cf = db->get_cf_handle(prefix, k); if (cf) { bat.Delete(cf, rocksdb::Slice(k)); } else { bat.Delete(db->default_cf, combine_strings(prefix, k)); } } void RocksDBStore::RocksDBTransactionImpl::rmkey(const string &prefix, const char *k, size_t keylen) { auto cf = db->get_cf_handle(prefix, k, keylen); if (cf) { bat.Delete(cf, rocksdb::Slice(k, keylen)); } else { string key; combine_strings(prefix, k, keylen, &key); bat.Delete(db->default_cf, rocksdb::Slice(key)); } } void RocksDBStore::RocksDBTransactionImpl::rm_single_key(const string &prefix, const string &k) { auto cf = db->get_cf_handle(prefix, k); if (cf) { bat.SingleDelete(cf, k); } else { bat.SingleDelete(db->default_cf, combine_strings(prefix, k)); } } void RocksDBStore::RocksDBTransactionImpl::rmkeys_by_prefix(const string &prefix) { auto p_iter = db->cf_handles.find(prefix); if (p_iter == db->cf_handles.end()) { uint64_t cnt = db->get_delete_range_threshold(); bat.SetSavePoint(); auto it = db->get_iterator(prefix); for (it->seek_to_first(); it->valid() && (--cnt) != 0; it->next()) { bat.Delete(db->default_cf, combine_strings(prefix, it->key())); } if (cnt == 0) { bat.RollbackToSavePoint(); string endprefix = prefix; endprefix.push_back('\x01'); bat.DeleteRange(db->default_cf, combine_strings(prefix, string()), combine_strings(endprefix, string())); } else { bat.PopSavePoint(); } } else { ceph_assert(p_iter->second.handles.size() >= 1); for (auto cf : p_iter->second.handles) { uint64_t cnt = db->get_delete_range_threshold(); bat.SetSavePoint(); auto it = db->new_shard_iterator(cf); for (it->seek_to_first(); it->valid() && (--cnt) != 0; it->next()) { bat.Delete(cf, it->key()); } if (cnt == 0) { bat.RollbackToSavePoint(); string endprefix = "\xff\xff\xff\xff"; // FIXME: this is cheating... 
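        // Illustrative note: within a sharded column family every key already
        // belongs to this prefix, so deleting ["", "\xff\xff\xff\xff") is used
        // as an approximation of "delete everything"; keys that compare
        // greater than or equal to four 0xff bytes would escape this range,
        // which is presumably why the comment above calls it cheating.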
bat.DeleteRange(cf, string(), endprefix); } else { bat.PopSavePoint(); } } } } void RocksDBStore::RocksDBTransactionImpl::rm_range_keys(const string &prefix, const string &start, const string &end) { ldout(db->cct, 10) << __func__ << " enter prefix=" << prefix << " start=" << pretty_binary_string(start) << " end=" << pretty_binary_string(end) << dendl; auto p_iter = db->cf_handles.find(prefix); uint64_t cnt = db->get_delete_range_threshold(); if (p_iter == db->cf_handles.end()) { uint64_t cnt0 = cnt; bat.SetSavePoint(); auto it = db->get_iterator(prefix); for (it->lower_bound(start); it->valid() && db->comparator->Compare(it->key(), end) < 0 && (--cnt) != 0; it->next()) { bat.Delete(db->default_cf, combine_strings(prefix, it->key())); } ldout(db->cct, 15) << __func__ << " count = " << cnt0 - cnt << dendl; if (cnt == 0) { ldout(db->cct, 10) << __func__ << " p_iter == end(), resorting to DeleteRange" << dendl; bat.RollbackToSavePoint(); bat.DeleteRange(db->default_cf, rocksdb::Slice(combine_strings(prefix, start)), rocksdb::Slice(combine_strings(prefix, end))); } else { bat.PopSavePoint(); } } else if (cnt == 0) { ceph_assert(p_iter->second.handles.size() >= 1); for (auto cf : p_iter->second.handles) { ldout(db->cct, 10) << __func__ << " p_iter != end(), resorting to DeleteRange" << dendl; bat.DeleteRange(cf, rocksdb::Slice(start), rocksdb::Slice(end)); } } else { auto bounds = KeyValueDB::IteratorBounds(); bounds.lower_bound = start; bounds.upper_bound = end; ceph_assert(p_iter->second.handles.size() >= 1); for (auto cf : p_iter->second.handles) { cnt = db->get_delete_range_threshold(); uint64_t cnt0 = cnt; bat.SetSavePoint(); auto it = db->new_shard_iterator(cf, prefix, bounds); for (it->lower_bound(start); it->valid() && (--cnt) != 0; it->next()) { bat.Delete(cf, it->key()); } ldout(db->cct, 10) << __func__ << " count = " << cnt0 - cnt << dendl; if (cnt == 0) { ldout(db->cct, 10) << __func__ << " p_iter != end(), resorting to DeleteRange" << dendl; bat.RollbackToSavePoint(); bat.DeleteRange(cf, rocksdb::Slice(start), rocksdb::Slice(end)); } else { bat.PopSavePoint(); } } } ldout(db->cct, 10) << __func__ << " end" << dendl; } void RocksDBStore::RocksDBTransactionImpl::merge( const string &prefix, const string &k, const bufferlist &to_set_bl) { auto cf = db->get_cf_handle(prefix, k); if (cf) { // bufferlist::c_str() is non-constant, so we can't call c_str() if (to_set_bl.is_contiguous() && to_set_bl.length() > 0) { bat.Merge( cf, rocksdb::Slice(k), rocksdb::Slice(to_set_bl.buffers().front().c_str(), to_set_bl.length())); } else { // make a copy rocksdb::Slice key_slice(k); vector value_slices(to_set_bl.get_num_buffers()); bat.Merge(cf, rocksdb::SliceParts(&key_slice, 1), prepare_sliceparts(to_set_bl, &value_slices)); } } else { string key = combine_strings(prefix, k); // bufferlist::c_str() is non-constant, so we can't call c_str() if (to_set_bl.is_contiguous() && to_set_bl.length() > 0) { bat.Merge( db->default_cf, rocksdb::Slice(key), rocksdb::Slice(to_set_bl.buffers().front().c_str(), to_set_bl.length())); } else { // make a copy rocksdb::Slice key_slice(key); vector value_slices(to_set_bl.get_num_buffers()); bat.Merge( db->default_cf, rocksdb::SliceParts(&key_slice, 1), prepare_sliceparts(to_set_bl, &value_slices)); } } } int RocksDBStore::get( const string &prefix, const std::set &keys, std::map *out) { rocksdb::PinnableSlice value; utime_t start = ceph_clock_now(); if (cf_handles.count(prefix) > 0) { for (auto& key : keys) { auto cf_handle = get_cf_handle(prefix, key); auto status = 
db->Get(rocksdb::ReadOptions(), cf_handle, rocksdb::Slice(key), &value); if (status.ok()) { (*out)[key].append(value.data(), value.size()); } else if (status.IsIOError()) { ceph_abort_msg(status.getState()); } value.Reset(); } } else { for (auto& key : keys) { string k = combine_strings(prefix, key); auto status = db->Get(rocksdb::ReadOptions(), default_cf, rocksdb::Slice(k), &value); if (status.ok()) { (*out)[key].append(value.data(), value.size()); } else if (status.IsIOError()) { ceph_abort_msg(status.getState()); } value.Reset(); } } utime_t lat = ceph_clock_now() - start; logger->tinc(l_rocksdb_get_latency, lat); return 0; } int RocksDBStore::get( const string &prefix, const string &key, bufferlist *out) { ceph_assert(out && (out->length() == 0)); utime_t start = ceph_clock_now(); int r = 0; rocksdb::PinnableSlice value; rocksdb::Status s; auto cf = get_cf_handle(prefix, key); if (cf) { s = db->Get(rocksdb::ReadOptions(), cf, rocksdb::Slice(key), &value); } else { string k = combine_strings(prefix, key); s = db->Get(rocksdb::ReadOptions(), default_cf, rocksdb::Slice(k), &value); } if (s.ok()) { out->append(value.data(), value.size()); } else if (s.IsNotFound()) { r = -ENOENT; } else { ceph_abort_msg(s.getState()); } utime_t lat = ceph_clock_now() - start; logger->tinc(l_rocksdb_get_latency, lat); return r; } int RocksDBStore::get( const string& prefix, const char *key, size_t keylen, bufferlist *out) { ceph_assert(out && (out->length() == 0)); utime_t start = ceph_clock_now(); int r = 0; rocksdb::PinnableSlice value; rocksdb::Status s; auto cf = get_cf_handle(prefix, key, keylen); if (cf) { s = db->Get(rocksdb::ReadOptions(), cf, rocksdb::Slice(key, keylen), &value); } else { string k; combine_strings(prefix, key, keylen, &k); s = db->Get(rocksdb::ReadOptions(), default_cf, rocksdb::Slice(k), &value); } if (s.ok()) { out->append(value.data(), value.size()); } else if (s.IsNotFound()) { r = -ENOENT; } else { ceph_abort_msg(s.getState()); } utime_t lat = ceph_clock_now() - start; logger->tinc(l_rocksdb_get_latency, lat); return r; } int RocksDBStore::split_key(rocksdb::Slice in, string *prefix, string *key) { size_t prefix_len = 0; // Find separator inside Slice char* separator = (char*) memchr(in.data(), 0, in.size()); if (separator == NULL) return -EINVAL; prefix_len = size_t(separator - in.data()); if (prefix_len >= in.size()) return -EINVAL; // Fetch prefix and/or key directly from Slice if (prefix) *prefix = string(in.data(), prefix_len); if (key) *key = string(separator+1, in.size()-prefix_len-1); return 0; } void RocksDBStore::compact() { logger->inc(l_rocksdb_compact); rocksdb::CompactRangeOptions options; db->CompactRange(options, default_cf, nullptr, nullptr); for (auto cf : cf_handles) { for (auto shard_cf : cf.second.handles) { db->CompactRange( options, shard_cf, nullptr, nullptr); } } } void RocksDBStore::compact_thread_entry() { std::unique_lock l{compact_queue_lock}; dout(10) << __func__ << " enter" << dendl; while (!compact_queue_stop) { if (!compact_queue.empty()) { auto range = compact_queue.front(); compact_queue.pop_front(); logger->set(l_rocksdb_compact_queue_len, compact_queue.size()); l.unlock(); logger->inc(l_rocksdb_compact_range); if (range.first.empty() && range.second.empty()) { compact(); } else { compact_range(range.first, range.second); } l.lock(); continue; } dout(10) << __func__ << " waiting" << dendl; compact_queue_cond.wait(l); } dout(10) << __func__ << " exit" << dendl; } void RocksDBStore::compact_range_async(const string& start, const string& end) 
{ std::lock_guard l(compact_queue_lock); // try to merge adjacent ranges. this is O(n), but the queue should // be short. note that we do not cover all overlap cases and merge // opportunities here, but we capture the ones we currently need. list<pair<string,string>>::iterator p = compact_queue.begin(); while (p != compact_queue.end()) { if (p->first == start && p->second == end) { // dup; no-op return; } if (start <= p->first && p->first <= end) { // new region crosses start of existing range // select the larger right bound compact_queue.push_back(make_pair(start, end > p->second ? end : p->second)); compact_queue.erase(p); logger->inc(l_rocksdb_compact_queue_merge); break; } if (start <= p->second && p->second <= end) { // new region crosses end of existing range // p->first < p->second and p->second <= end, so p->first <= end; // we did not take the previous branch, so start > p->first. compact_queue.push_back(make_pair(p->first, end)); compact_queue.erase(p); logger->inc(l_rocksdb_compact_queue_merge); break; } ++p; } if (p == compact_queue.end()) { // no merge, new entry. compact_queue.push_back(make_pair(start, end)); logger->set(l_rocksdb_compact_queue_len, compact_queue.size()); } compact_queue_cond.notify_all(); if (!compact_thread.is_started()) { compact_thread.create("rstore_compact"); } } bool RocksDBStore::check_omap_dir(string &omap_dir) { rocksdb::Options options; options.create_if_missing = true; rocksdb::DB *db; rocksdb::Status status = rocksdb::DB::Open(options, omap_dir, &db); delete db; db = nullptr; return status.ok(); } void RocksDBStore::compact_range(const string& start, const string& end) { rocksdb::CompactRangeOptions options; rocksdb::Slice cstart(start); rocksdb::Slice cend(end); string prefix_start, key_start; string prefix_end, key_end; string key_highest = "\xff\xff\xff\xff"; //cheating string key_lowest = ""; auto compact_range = [&] (const decltype(cf_handles)::iterator column_it, const std::string& start, const std::string& end) { rocksdb::Slice cstart(start); rocksdb::Slice cend(end); for (const auto& shard_it : column_it->second.handles) { db->CompactRange(options, shard_it, &cstart, &cend); } }; db->CompactRange(options, default_cf, &cstart, &cend); split_key(cstart, &prefix_start, &key_start); split_key(cend, &prefix_end, &key_end); if (prefix_start == prefix_end) { const auto& column = cf_handles.find(prefix_start); if (column != cf_handles.end()) { compact_range(column, key_start, key_end); } } else { auto column = cf_handles.find(prefix_start); if (column != cf_handles.end()) { compact_range(column, key_start, key_highest); ++column; } const auto& column_end = cf_handles.find(prefix_end); while (column != column_end) { compact_range(column, key_lowest, key_highest); column++; } if (column != cf_handles.end()) { compact_range(column, key_lowest, key_end); } } } RocksDBStore::RocksDBWholeSpaceIteratorImpl::~RocksDBWholeSpaceIteratorImpl() { delete dbiter; } int RocksDBStore::RocksDBWholeSpaceIteratorImpl::seek_to_first() { dbiter->SeekToFirst(); ceph_assert(!dbiter->status().IsIOError()); return dbiter->status().ok() ? 0 : -1; } int RocksDBStore::RocksDBWholeSpaceIteratorImpl::seek_to_first(const string &prefix) { rocksdb::Slice slice_prefix(prefix); dbiter->Seek(slice_prefix); ceph_assert(!dbiter->status().IsIOError()); return dbiter->status().ok() ? 0 : -1; } int RocksDBStore::RocksDBWholeSpaceIteratorImpl::seek_to_last() { dbiter->SeekToLast(); ceph_assert(!dbiter->status().IsIOError()); return dbiter->status().ok() ? 0 : -1; }
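// Editorial note (hedged): keys in the default column family are laid out as "<prefix>\0<key>" (built by combine_strings(), parsed by split_key()), and past_prefix(p) returns p + '\x01', the smallest string sorting after every "<p>\0..." key. For example, with prefix "P":
//
//   combine_strings("P", "k1")  ->  "P\0k1"
//   past_prefix("P")            ->  "P\x01"   (> "P\0k1", < "Q\0...")
//
// seek_to_last(prefix) below Seek()s to that bound and steps back once, and raw_key_is_prefixed() relies on the same '\0' separator.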
int RocksDBStore::RocksDBWholeSpaceIteratorImpl::seek_to_last(const string &prefix) { string limit = past_prefix(prefix); rocksdb::Slice slice_limit(limit); dbiter->Seek(slice_limit); if (!dbiter->Valid()) { dbiter->SeekToLast(); } else { dbiter->Prev(); } return dbiter->status().ok() ? 0 : -1; } int RocksDBStore::RocksDBWholeSpaceIteratorImpl::upper_bound(const string &prefix, const string &after) { lower_bound(prefix, after); if (valid()) { pair<string,string> key = raw_key(); if (key.first == prefix && key.second == after) next(); } return dbiter->status().ok() ? 0 : -1; } int RocksDBStore::RocksDBWholeSpaceIteratorImpl::lower_bound(const string &prefix, const string &to) { string bound = combine_strings(prefix, to); rocksdb::Slice slice_bound(bound); dbiter->Seek(slice_bound); return dbiter->status().ok() ? 0 : -1; } bool RocksDBStore::RocksDBWholeSpaceIteratorImpl::valid() { return dbiter->Valid(); } int RocksDBStore::RocksDBWholeSpaceIteratorImpl::next() { if (valid()) { dbiter->Next(); } ceph_assert(!dbiter->status().IsIOError()); return dbiter->status().ok() ? 0 : -1; } int RocksDBStore::RocksDBWholeSpaceIteratorImpl::prev() { if (valid()) { dbiter->Prev(); } ceph_assert(!dbiter->status().IsIOError()); return dbiter->status().ok() ? 0 : -1; } string RocksDBStore::RocksDBWholeSpaceIteratorImpl::key() { string out_key; split_key(dbiter->key(), 0, &out_key); return out_key; } pair<string,string> RocksDBStore::RocksDBWholeSpaceIteratorImpl::raw_key() { string prefix, key; split_key(dbiter->key(), &prefix, &key); return make_pair(prefix, key); } bool RocksDBStore::RocksDBWholeSpaceIteratorImpl::raw_key_is_prefixed(const string &prefix) { // Look for "prefix\0" right in rocksdb::Slice rocksdb::Slice key = dbiter->key(); if ((key.size() > prefix.length()) && (key[prefix.length()] == '\0')) { return memcmp(key.data(), prefix.c_str(), prefix.length()) == 0; } else { return false; } } bufferlist RocksDBStore::RocksDBWholeSpaceIteratorImpl::value() { return to_bufferlist(dbiter->value()); } size_t RocksDBStore::RocksDBWholeSpaceIteratorImpl::key_size() { return dbiter->key().size(); } size_t RocksDBStore::RocksDBWholeSpaceIteratorImpl::value_size() { return dbiter->value().size(); } bufferptr RocksDBStore::RocksDBWholeSpaceIteratorImpl::value_as_ptr() { rocksdb::Slice val = dbiter->value(); return bufferptr(val.data(), val.size()); } int RocksDBStore::RocksDBWholeSpaceIteratorImpl::status() { return dbiter->status().ok() ? 0 : -1; } string RocksDBStore::past_prefix(const string &prefix) { string limit = prefix; limit.push_back(1); return limit; } class CFIteratorImpl : public KeyValueDB::IteratorImpl { protected: string prefix; rocksdb::Iterator *dbiter; const KeyValueDB::IteratorBounds bounds; const rocksdb::Slice iterate_lower_bound; const rocksdb::Slice iterate_upper_bound; public: explicit CFIteratorImpl(const RocksDBStore* db, const std::string& p, rocksdb::ColumnFamilyHandle* cf, KeyValueDB::IteratorBounds bounds_) : prefix(p), bounds(std::move(bounds_)), iterate_lower_bound(make_slice(bounds.lower_bound)), iterate_upper_bound(make_slice(bounds.upper_bound)) { auto options = rocksdb::ReadOptions(); if (db->cct->_conf->osd_rocksdb_iterator_bounds_enabled) { if (bounds.lower_bound) { options.iterate_lower_bound = &iterate_lower_bound; } if (bounds.upper_bound) { options.iterate_upper_bound = &iterate_upper_bound; } } dbiter = db->db->NewIterator(options, cf); } ~CFIteratorImpl() { delete dbiter; } int seek_to_first() override { dbiter->SeekToFirst(); return dbiter->status().ok() ? 0 : -1; }
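// Editorial note (assumptions flagged): CFIteratorImpl walks a single column family, where keys are stored without the "<prefix>\0" header; the prefix is only re-attached in raw_key(). When osd_rocksdb_iterator_bounds_enabled is set, the supplied IteratorBounds become RocksDB iterate_lower_bound / iterate_upper_bound, which can keep RocksDB from scanning data (and tombstones) outside the requested range. A hypothetical caller might pass:
//
//   KeyValueDB::IteratorBounds b;        // sketch only; bounds are plain keys within the CF
//   b.lower_bound = "obj.0000";
//   b.upper_bound = "obj.9999";
//   auto it = store->get_iterator("p", 0, std::move(b));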
int seek_to_last() override { dbiter->SeekToLast(); return dbiter->status().ok() ? 0 : -1; } int upper_bound(const string &after) override { lower_bound(after); if (valid() && (key() == after)) { next(); } return dbiter->status().ok() ? 0 : -1; } int lower_bound(const string &to) override { rocksdb::Slice slice_bound(to); dbiter->Seek(slice_bound); return dbiter->status().ok() ? 0 : -1; } int next() override { if (valid()) { dbiter->Next(); } return dbiter->status().ok() ? 0 : -1; } int prev() override { if (valid()) { dbiter->Prev(); } return dbiter->status().ok() ? 0 : -1; } bool valid() override { return dbiter->Valid(); } string key() override { return dbiter->key().ToString(); } std::pair<std::string, std::string> raw_key() override { return make_pair(prefix, key()); } bufferlist value() override { return to_bufferlist(dbiter->value()); } bufferptr value_as_ptr() override { rocksdb::Slice val = dbiter->value(); return bufferptr(val.data(), val.size()); } int status() override { return dbiter->status().ok() ? 0 : -1; } }; //merge column iterators and the default-CF (rest) iterator class WholeMergeIteratorImpl : public KeyValueDB::WholeSpaceIteratorImpl { private: RocksDBStore* db; KeyValueDB::WholeSpaceIterator main; std::map<std::string, KeyValueDB::Iterator> shards; std::map<std::string, KeyValueDB::Iterator>::iterator current_shard; enum {on_main, on_shard} smaller; public: WholeMergeIteratorImpl(RocksDBStore* db) : db(db) , main(db->get_default_cf_iterator()) { for (auto& e : db->cf_handles) { shards.emplace(e.first, db->get_iterator(e.first)); } } // returns true if the current key in main is smaller than the one in shards // an invalid iterator compares larger than any actual value bool is_main_smaller() { if (main->valid()) { if (current_shard != shards.end()) { auto main_rk = main->raw_key(); ceph_assert(current_shard->second->valid()); auto shards_rk = current_shard->second->raw_key(); if (main_rk.first < shards_rk.first) return true; if (main_rk.first > shards_rk.first) return false; return main_rk.second < shards_rk.second; } else { return true; } } else { if (current_shard != shards.end()) { return false; } else { //this means that neither is valid //we select main to be smaller, so valid() will signal properly return true; } } } int seek_to_first() override { int r0 = main->seek_to_first(); int r1 = 0; // find first shard that has some data current_shard = shards.begin(); while (current_shard != shards.end()) { r1 = current_shard->second->seek_to_first(); if (r1 != 0 || current_shard->second->valid()) { //this is the first shard that will yield some keys break; } ++current_shard; } smaller = is_main_smaller() ? on_main : on_shard; return r0 == 0 && r1 == 0 ? 0 : -1; } int seek_to_first(const std::string &prefix) override { int r0 = main->seek_to_first(prefix); int r1 = 0; // find first shard that has some data current_shard = shards.lower_bound(prefix); while (current_shard != shards.end()) { r1 = current_shard->second->seek_to_first(); if (r1 != 0 || current_shard->second->valid()) { //this is the first shard that will yield some keys break; } ++current_shard; } smaller = is_main_smaller() ? on_main : on_shard; return r0 == 0 && r1 == 0 ? 0 : -1; }; int seek_to_last() override { int r0 = main->seek_to_last(); int r1 = 0; r1 = shards_seek_to_last(); //if we have 2 candidates, we need to select if (main->valid()) { if (shards_valid()) { if (is_main_smaller()) { smaller = on_shard; main->next(); } else { smaller = on_main; shards_next(); } } else { smaller = on_main; } } else { if (shards_valid()) { smaller = on_shard; } else { smaller = on_main; } } return r0 == 0 && r1 == 0 ?
0 : -1; } int seek_to_last(const std::string &prefix) override { int r0 = main->seek_to_last(prefix); int r1 = 0; // find last shard that has some data bool found = false; current_shard = shards.lower_bound(prefix); while (current_shard != shards.begin()) { r1 = current_shard->second->seek_to_last(); if (r1 != 0) break; if (current_shard->second->valid()) { found = true; break; } } //if we have 2 candidates, we need to select if (main->valid() && found) { if (is_main_smaller()) { main->next(); } else { shards_next(); } } if (!found) { //set shards state that properly represents eof current_shard = shards.end(); } smaller = is_main_smaller() ? on_main : on_shard; return r0 == 0 && r1 == 0 ? 0 : -1; } int upper_bound(const std::string &prefix, const std::string &after) override { int r0 = main->upper_bound(prefix, after); int r1 = 0; if (r0 != 0) return r0; current_shard = shards.lower_bound(prefix); if (current_shard != shards.end()) { bool located = false; if (current_shard->first == prefix) { r1 = current_shard->second->upper_bound(after); if (r1 != 0) return r1; if (current_shard->second->valid()) { located = true; } } if (!located) { while (current_shard != shards.end()) { r1 = current_shard->second->seek_to_first(); if (r1 != 0) return r1; if (current_shard->second->valid()) break; ++current_shard; } } } smaller = is_main_smaller() ? on_main : on_shard; return 0; } int lower_bound(const std::string &prefix, const std::string &to) override { int r0 = main->lower_bound(prefix, to); int r1 = 0; if (r0 != 0) return r0; current_shard = shards.lower_bound(prefix); if (current_shard != shards.end()) { bool located = false; if (current_shard->first == prefix) { r1 = current_shard->second->lower_bound(to); if (r1 != 0) return r1; if (current_shard->second->valid()) { located = true; } } if (!located) { while (current_shard != shards.end()) { r1 = current_shard->second->seek_to_first(); if (r1 != 0) return r1; if (current_shard->second->valid()) break; ++current_shard; } } } smaller = is_main_smaller() ? on_main : on_shard; return 0; } bool valid() override { if (smaller == on_main) { return main->valid(); } else { if (current_shard == shards.end()) return false; return current_shard->second->valid(); } }; int next() override { int r; if (smaller == on_main) { r = main->next(); } else { r = shards_next(); } if (r != 0) return r; smaller = is_main_smaller() ? on_main : on_shard; return 0; } int prev() override { int r; bool main_was_valid = false; if (main->valid()) { main_was_valid = true; r = main->prev(); } else { r = main->seek_to_last(); } if (r != 0) return r; bool shards_was_valid = false; if (shards_valid()) { shards_was_valid = true; r = shards_prev(); } else { r = shards_seek_to_last(); } if (r != 0) return r; if (!main->valid() && !shards_valid()) { //end, no previous. 
set marker so valid() can work smaller = on_main; return 0; } //if one is valid, select it //if both are valid, select the larger and advance the other if (main->valid()) { if (shards_valid()) { if (is_main_smaller()) { smaller = on_shard; if (main_was_valid) { if (main->valid()) { r = main->next(); } else { r = main->seek_to_first(); } } else { //if we have resurrected main, kill it if (main->valid()) { main->next(); } } } else { smaller = on_main; if (shards_was_valid) { if (shards_valid()) { r = shards_next(); } else { r = shards_seek_to_first(); } } else { //if we have resurrected shards, kill it if (shards_valid()) { shards_next(); } } } } else { smaller = on_main; r = shards_seek_to_first(); } } else { smaller = on_shard; r = main->seek_to_first(); } return r; } std::string key() override { if (smaller == on_main) { return main->key(); } else { return current_shard->second->key(); } } std::pair<std::string, std::string> raw_key() override { if (smaller == on_main) { return main->raw_key(); } else { return { current_shard->first, current_shard->second->key() }; } } bool raw_key_is_prefixed(const std::string &prefix) override { if (smaller == on_main) { return main->raw_key_is_prefixed(prefix); } else { return current_shard->first == prefix; } } ceph::buffer::list value() override { if (smaller == on_main) { return main->value(); } else { return current_shard->second->value(); } } int status() override { //because we already had to inspect the key, it must be ok return 0; } size_t key_size() override { if (smaller == on_main) { return main->key_size(); } else { return current_shard->second->key().size(); } } size_t value_size() override { if (smaller == on_main) { return main->value_size(); } else { return current_shard->second->value().length(); } } int shards_valid() { if (current_shard == shards.end()) return false; return current_shard->second->valid(); } int shards_next() { if (current_shard == shards.end()) { //illegal to next() on !valid() return -1; } int r = 0; r = current_shard->second->next(); if (r != 0) return r; if (current_shard->second->valid()) return 0; //current shard exhausted, search for the next key ++current_shard; while (current_shard != shards.end()) { r = current_shard->second->seek_to_first(); if (r != 0) return r; if (current_shard->second->valid()) break; ++current_shard; } //whether we found a key or not, this is a success return 0; } int shards_prev() { if (current_shard == shards.end()) { //illegal to prev() on !valid() return -1; } int r = current_shard->second->prev(); while (r == 0) { if (current_shard->second->valid()) { break; } if (current_shard == shards.begin()) { //we have reached the pre-first element //this makes it !valid(), but guarantees next() moves to the first element break; } --current_shard; r = current_shard->second->seek_to_last(); } return r; } int shards_seek_to_last() { int r = 0; current_shard = shards.end(); if (current_shard == shards.begin()) { //no shards at all return 0; } while (current_shard != shards.begin()) { --current_shard; r = current_shard->second->seek_to_last(); if (r != 0) return r; if (current_shard->second->valid()) { return 0; } } //no keys at all current_shard = shards.end(); return r; } int shards_seek_to_first() { int r = 0; current_shard = shards.begin(); while (current_shard != shards.end()) { r = current_shard->second->seek_to_first(); if (r != 0) break; if (current_shard->second->valid()) { //this is the first shard that will yield some keys break; } ++current_shard; } return r; } }; class ShardMergeIteratorImpl : public KeyValueDB::IteratorImpl { private:
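// Editorial note: KeyLess (below) keeps `iters` ordered by each iterator's current key using the column family comparator. An exhausted (invalid) iterator compares greater than any valid one, and two invalid iterators compare equal (both directions return false), so after std::sort the smallest live position is always iters[0]; next()'s bubble-up preserves that invariant.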
struct KeyLess { private: const rocksdb::Comparator* comparator; public: KeyLess(const rocksdb::Comparator* comparator) : comparator(comparator) { }; bool operator()(rocksdb::Iterator* a, rocksdb::Iterator* b) const { if (a->Valid()) { if (b->Valid()) { return comparator->Compare(a->key(), b->key()) < 0; } else { return true; } } else { if (b->Valid()) { return false; } else { return false; } } } }; const RocksDBStore* db; KeyLess keyless; string prefix; const KeyValueDB::IteratorBounds bounds; const rocksdb::Slice iterate_lower_bound; const rocksdb::Slice iterate_upper_bound; std::vector<rocksdb::Iterator*> iters; public: explicit ShardMergeIteratorImpl(const RocksDBStore* db, const std::string& prefix, const std::vector<rocksdb::ColumnFamilyHandle*>& shards, KeyValueDB::IteratorBounds bounds_) : db(db), keyless(db->comparator), prefix(prefix), bounds(std::move(bounds_)), iterate_lower_bound(make_slice(bounds.lower_bound)), iterate_upper_bound(make_slice(bounds.upper_bound)) { iters.reserve(shards.size()); auto options = rocksdb::ReadOptions(); if (db->cct->_conf->osd_rocksdb_iterator_bounds_enabled) { if (bounds.lower_bound) { options.iterate_lower_bound = &iterate_lower_bound; } if (bounds.upper_bound) { options.iterate_upper_bound = &iterate_upper_bound; } } for (auto& s : shards) { iters.push_back(db->db->NewIterator(options, s)); } } ~ShardMergeIteratorImpl() { for (auto& it : iters) { delete it; } } int seek_to_first() override { for (auto& it : iters) { it->SeekToFirst(); if (!it->status().ok()) { return -1; } } //all iterators positioned, sort std::sort(iters.begin(), iters.end(), keyless); return 0; } int seek_to_last() override { for (auto& it : iters) { it->SeekToLast(); if (!it->status().ok()) { return -1; } } for (size_t i = 1; i < iters.size(); i++) { if (iters[0]->Valid()) { if (iters[i]->Valid()) { if (keyless(iters[0], iters[i])) { std::swap(iters[0], iters[i]); } } else { //iters[i] empty } } else { if (iters[i]->Valid()) { std::swap(iters[0], iters[i]); } } //it might happen that the cf was empty if (iters[i]->Valid()) { iters[i]->Next(); } } //no need to sort, as at most 1 iterator is valid now return 0; } int upper_bound(const string &after) override { rocksdb::Slice slice_bound(after); for (auto& it : iters) { it->Seek(slice_bound); if (it->Valid() && it->key() == after) { it->Next(); } if (!it->status().ok()) { return -1; } } std::sort(iters.begin(), iters.end(), keyless); return 0; } int lower_bound(const string &to) override { rocksdb::Slice slice_bound(to); for (auto& it : iters) { it->Seek(slice_bound); if (!it->status().ok()) { return -1; } } std::sort(iters.begin(), iters.end(), keyless); return 0; } int next() override { int r = -1; if (iters[0]->Valid()) { iters[0]->Next(); if (iters[0]->status().ok()) { r = 0; //bubble up for (size_t i = 0; i < iters.size() - 1; i++) { if (keyless(iters[i], iters[i + 1])) { //order is correct again, stop break; } std::swap(iters[i], iters[i + 1]); } } } return r; } // iters are sorted, so // a[0] < b[0] < c[0] < d[0] // a[0] > a[-1], a[0] > b[-1], a[0] > c[-1], a[0] > d[-1] // so, prev() will be one of: // a[-1], b[-1], c[-1], d[-1] // prev() will be the one that is *largest* of them // // alg: // 1. go prev() on each iterator we can // 2. select largest key from those iterators // 3. go next() on all iterators except (2) // 4. sort
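// Worked example of the algorithm above (editorial sketch, hypothetical keys): suppose three shards hold a={2,5}, b={3,7}, c={4,9} and, after lower_bound("5"), the iterators sit at a=5, b=7, c=9 with iters[0] == a.
//   1. Prev() each iterator: a->2, b->3, c->4
//   2. the largest of {2,3,4} is c=4, so c becomes the new front
//   3. a and b are Next()-ed back to 5 and 7
//   4. the front is rotated into place, leaving iters = {c=4, a=5, b=7}, still sorted, so valid()/key()/value() keep reading iters[0].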
int prev() override { std::vector<rocksdb::Iterator*> prev_done; //1 for (auto it: iters) { if (it->Valid()) { it->Prev(); if (it->Valid()) { prev_done.push_back(it); } else { it->SeekToFirst(); } } else { it->SeekToLast(); if (it->Valid()) { prev_done.push_back(it); } } } if (prev_done.size() == 0) { /* there is no previous element */ if (iters[0]->Valid()) { iters[0]->Prev(); ceph_assert(!iters[0]->Valid()); } return 0; } //2,3 rocksdb::Iterator* highest = prev_done[0]; for (size_t i = 1; i < prev_done.size(); i++) { if (keyless(highest, prev_done[i])) { highest->Next(); highest = prev_done[i]; } else { prev_done[i]->Next(); } } //4 //insert highest at the beginning, and shift values until we reach highest //the untouched rest is sorted - we just prev()/next() them rocksdb::Iterator* hold = highest; for (size_t i = 0; i < iters.size(); i++) { std::swap(hold, iters[i]); if (hold == highest) break; } ceph_assert(hold == highest); return 0; } bool valid() override { return iters[0]->Valid(); } string key() override { return iters[0]->key().ToString(); } std::pair<std::string, std::string> raw_key() override { return make_pair(prefix, key()); } bufferlist value() override { return to_bufferlist(iters[0]->value()); } bufferptr value_as_ptr() override { rocksdb::Slice val = iters[0]->value(); return bufferptr(val.data(), val.size()); } int status() override { return iters[0]->status().ok() ? 0 : -1; } }; KeyValueDB::Iterator RocksDBStore::get_iterator(const std::string& prefix, IteratorOpts opts, IteratorBounds bounds) { auto cf_it = cf_handles.find(prefix); if (cf_it != cf_handles.end()) { rocksdb::ColumnFamilyHandle* cf = nullptr; if (cf_it->second.handles.size() == 1) { cf = cf_it->second.handles[0]; } else if (cct->_conf->osd_rocksdb_iterator_bounds_enabled) { cf = check_cf_handle_bounds(cf_it, bounds); } if (cf) { return std::make_shared<CFIteratorImpl>( this, prefix, cf, std::move(bounds)); } else { return std::make_shared<ShardMergeIteratorImpl>( this, prefix, cf_it->second.handles, std::move(bounds)); } } else { // use wholespace engine if no cfs are configured // or use default cf otherwise as there is no // matching cf for the specified prefix. auto w_it = cf_handles.size() == 0 || prefix.empty() ? get_wholespace_iterator(opts) : get_default_cf_iterator(); return KeyValueDB::make_iterator(prefix, w_it); } } RocksDBStore::WholeSpaceIterator RocksDBStore::new_shard_iterator(rocksdb::ColumnFamilyHandle* cf) { return std::make_shared<RocksDBWholeSpaceIteratorImpl>( this, cf, 0); } KeyValueDB::Iterator RocksDBStore::new_shard_iterator(rocksdb::ColumnFamilyHandle* cf, const std::string& prefix, IteratorBounds bounds) { return std::make_shared<CFIteratorImpl>( this, prefix, cf, std::move(bounds)); } RocksDBStore::WholeSpaceIterator RocksDBStore::get_wholespace_iterator(IteratorOpts opts) { if (cf_handles.size() == 0) { return std::make_shared<RocksDBWholeSpaceIteratorImpl>( this, default_cf, opts); } else { return std::make_shared<WholeMergeIteratorImpl>(this); } } RocksDBStore::WholeSpaceIterator RocksDBStore::get_default_cf_iterator() { return std::make_shared<RocksDBWholeSpaceIteratorImpl>(this, default_cf, 0); } int RocksDBStore::prepare_for_reshard(const std::string& new_sharding, RocksDBStore::columns_t& to_process_columns) { //0. lock db from opening //1. list existing columns //2. apply merge operator to (main + columns) opts //3. prepare std::vector<rocksdb::ColumnFamilyDescriptor> existing_cfs //4. open db, acquire existing column handles //5. calculate missing columns //6. create missing columns //7. construct cf_handles according to new sharding //8. check if all cf_handles are filled
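// Editorial note on step 0 (hedged): the "lock" is taken by appending the resharding_column_lock marker ("reshardingXcommencingXlocked") to the sharding/def file. While that marker is present a normal open of the store is expected to refuse to proceed, so an interrupted reshard cannot be used with half-moved keys; the marker only disappears when reshard() rewrites sharding/def with the new sharding text at the very end.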
bool b; std::vector<ColumnFamily> new_sharding_def; char const* error_position; std::string error_msg; b = parse_sharding_def(new_sharding, new_sharding_def, &error_position, &error_msg); if (!b) { dout(1) << __func__ << " bad sharding: " << dendl; dout(1) << __func__ << new_sharding << dendl; dout(1) << __func__ << std::string(error_position - &new_sharding[0], ' ') << "^" << error_msg << dendl; return -EINVAL; } //0. lock db from opening std::string stored_sharding_text; rocksdb::ReadFileToString(env, sharding_def_file, &stored_sharding_text); if (stored_sharding_text.find(resharding_column_lock) == string::npos) { rocksdb::Status status; if (stored_sharding_text.size() != 0) stored_sharding_text += " "; stored_sharding_text += resharding_column_lock; env->CreateDir(sharding_def_dir); status = rocksdb::WriteStringToFile(env, stored_sharding_text, sharding_def_file, true); if (!status.ok()) { derr << __func__ << " cannot write to " << sharding_def_file << dendl; return -EIO; } } //1. list existing columns rocksdb::Status status; std::vector<std::string> existing_columns; rocksdb::Options opt; int r = load_rocksdb_options(false, opt); if (r) { dout(1) << __func__ << " load rocksdb options failed" << dendl; return r; } status = rocksdb::DB::ListColumnFamilies(rocksdb::DBOptions(opt), path, &existing_columns); if (!status.ok()) { derr << "Unable to list column families: " << status.ToString() << dendl; return -EINVAL; } dout(5) << "existing columns = " << existing_columns << dendl; //2. apply merge operator to (main + columns) opts //3. prepare std::vector<rocksdb::ColumnFamilyDescriptor> cfs_to_open std::vector<rocksdb::ColumnFamilyDescriptor> cfs_to_open; for (const auto& full_name : existing_columns) { //split col_name into <base_name>-<shard_idx> std::string base_name; size_t pos = full_name.find('-'); if (std::string::npos == pos) base_name = full_name; else base_name = full_name.substr(0,pos); rocksdb::ColumnFamilyOptions cf_opt(opt); // search if we have options for this column std::string options; for (const auto& nsd : new_sharding_def) { if (nsd.name == base_name) { options = nsd.options; break; } } int r = update_column_family_options(base_name, options, &cf_opt); if (r != 0) { return r; } cfs_to_open.emplace_back(full_name, cf_opt); } //4. open db, acquire existing column handles std::vector<rocksdb::ColumnFamilyHandle*> handles; status = rocksdb::DB::Open(rocksdb::DBOptions(opt), path, cfs_to_open, &handles, &db); if (!status.ok()) { derr << status.ToString() << dendl; return -EINVAL; } for (size_t i = 0; i < cfs_to_open.size(); i++) { dout(10) << "column " << cfs_to_open[i].name << " handle " << (void*)handles[i] << dendl; } //5. calculate missing columns std::vector<std::string> new_sharding_columns; std::vector<std::string> missing_columns; sharding_def_to_columns(new_sharding_def, new_sharding_columns); dout(5) << "target columns = " << new_sharding_columns << dendl; for (const auto& n : new_sharding_columns) { bool found = false; for (const auto& e : existing_columns) { if (n == e) { found = true; break; } } if (!found) { missing_columns.push_back(n); } } dout(5) << "missing columns = " << missing_columns << dendl; //6. create missing columns
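// Editorial note: shard column families are named "<base>-<index>". A column sharded three ways exists as "p-0", "p-1", "p-2" (hypothetical base "p"), while an unsharded column is just "p"; the same parsing appears in step 3 above and in reshard_cleanup(). Step 6 therefore only creates the names reported missing by step 5, looking options up by base name.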
for (const auto& full_name : missing_columns) { std::string base_name; size_t pos = full_name.find('-'); if (std::string::npos == pos) base_name = full_name; else base_name = full_name.substr(0,pos); rocksdb::ColumnFamilyOptions cf_opt(opt); // search if we have options for this column std::string options; for (const auto& nsd : new_sharding_def) { if (nsd.name == base_name) { options = nsd.options; break; } } int r = update_column_family_options(base_name, options, &cf_opt); if (r != 0) { return r; } rocksdb::ColumnFamilyHandle *cf; status = db->CreateColumnFamily(cf_opt, full_name, &cf); if (!status.ok()) { derr << __func__ << " Failed to create rocksdb column family: " << full_name << dendl; return -EINVAL; } dout(10) << "created column " << full_name << " handle = " << (void*)cf << dendl; existing_columns.push_back(full_name); handles.push_back(cf); } //7. construct cf_handles according to new sharding for (size_t i = 0; i < existing_columns.size(); i++) { std::string full_name = existing_columns[i]; rocksdb::ColumnFamilyHandle *cf = handles[i]; std::string base_name; size_t shard_idx = 0; size_t pos = full_name.find('-'); dout(10) << "processing column " << full_name << dendl; if (std::string::npos == pos) { base_name = full_name; } else { base_name = full_name.substr(0,pos); shard_idx = atoi(full_name.substr(pos+1).c_str()); } if (rocksdb::kDefaultColumnFamilyName == base_name) { default_cf = handles[i]; must_close_default_cf = true; std::unique_ptr<rocksdb::ColumnFamilyHandle, std::function<void(rocksdb::ColumnFamilyHandle*)>> ptr{ cf, [](rocksdb::ColumnFamilyHandle*) {}}; to_process_columns.emplace(full_name, std::move(ptr)); } else { for (const auto& nsd : new_sharding_def) { if (nsd.name == base_name) { if (shard_idx < nsd.shard_cnt) { add_column_family(base_name, nsd.hash_l, nsd.hash_h, shard_idx, cf); } else { //ignore columns with index larger than the shard count } break; } } std::unique_ptr<rocksdb::ColumnFamilyHandle, std::function<void(rocksdb::ColumnFamilyHandle*)>> ptr{ cf, [this](rocksdb::ColumnFamilyHandle* handle) { db->DestroyColumnFamilyHandle(handle); }}; to_process_columns.emplace(full_name, std::move(ptr)); } } //8. check if all cf_handles are filled for (const auto& col : cf_handles) { for (size_t i = 0; i < col.second.handles.size(); i++) { if (col.second.handles[i] == nullptr) { derr << "missing handle for column " << col.first << " shard " << i << dendl; return -EIO; } } } return 0; } int RocksDBStore::reshard_cleanup(const RocksDBStore::columns_t& current_columns) { std::vector<std::string> new_sharding_columns; for (const auto& [name, handle] : cf_handles) { if (handle.handles.size() == 1) { new_sharding_columns.push_back(name); } else { for (size_t i = 0; i < handle.handles.size(); i++) { new_sharding_columns.push_back(name + "-" + std::to_string(i)); } } } for (auto& [name, handle] : current_columns) { auto found = std::find(new_sharding_columns.begin(), new_sharding_columns.end(), name) != new_sharding_columns.end(); if (found || name == rocksdb::kDefaultColumnFamilyName) { dout(5) << "Column " << name << " is part of new sharding." << dendl; continue; } dout(5) << "Column " << name << " not part of new sharding. Deleting." << dendl;
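// Editorial note: dropping such a column is safe only because reshard() has already moved every key out of it (process_column() deletes from the source and writes to the target column in the same WriteBatch); the SeekToFirst()/!Valid() assertion below is a last guard that the column really is empty before DropColumnFamily() is called.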
// verify that column is empty std::unique_ptr<rocksdb::Iterator> it{ db->NewIterator(rocksdb::ReadOptions(), handle.get())}; ceph_assert(it); it->SeekToFirst(); ceph_assert(!it->Valid()); if (rocksdb::Status status = db->DropColumnFamily(handle.get()); !status.ok()) { derr << __func__ << " Failed to drop column: " << name << dendl; return -EINVAL; } } return 0; } int RocksDBStore::reshard(const std::string& new_sharding, const RocksDBStore::resharding_ctrl* ctrl_in) { resharding_ctrl ctrl = ctrl_in ? *ctrl_in : resharding_ctrl(); size_t bytes_in_batch = 0; size_t keys_in_batch = 0; size_t bytes_per_iterator = 0; size_t keys_per_iterator = 0; size_t keys_processed = 0; size_t keys_moved = 0; auto flush_batch = [&](rocksdb::WriteBatch* batch) { dout(10) << "flushing batch, " << keys_in_batch << " keys, for " << bytes_in_batch << " bytes" << dendl; rocksdb::WriteOptions woptions; woptions.sync = true; rocksdb::Status s = db->Write(woptions, batch); ceph_assert(s.ok()); bytes_in_batch = 0; keys_in_batch = 0; batch->Clear(); }; auto process_column = [&](rocksdb::ColumnFamilyHandle* handle, const std::string& fixed_prefix) { dout(5) << " column=" << (void*)handle << " prefix=" << fixed_prefix << dendl; std::unique_ptr<rocksdb::Iterator> it{ db->NewIterator(rocksdb::ReadOptions(), handle)}; ceph_assert(it); rocksdb::WriteBatch bat; for (it->SeekToFirst(); it->Valid(); it->Next()) { rocksdb::Slice raw_key = it->key(); dout(30) << "key=" << pretty_binary_string(raw_key.ToString()) << dendl; //check if we need to refresh the iterator if (bytes_per_iterator >= ctrl.bytes_per_iterator || keys_per_iterator >= ctrl.keys_per_iterator) { dout(8) << "refreshing iterator" << dendl; bytes_per_iterator = 0; keys_per_iterator = 0; std::string raw_key_str = raw_key.ToString(); it.reset(db->NewIterator(rocksdb::ReadOptions(), handle)); ceph_assert(it); it->Seek(raw_key_str); ceph_assert(it->Valid()); raw_key = it->key(); } rocksdb::Slice value = it->value(); std::string prefix, key; if (fixed_prefix.size() == 0) { split_key(raw_key, &prefix, &key); } else { prefix = fixed_prefix; key = raw_key.ToString(); } keys_processed++; if ((keys_processed % 10000) == 0) { dout(10) << "processed " << keys_processed << " keys, moved " << keys_moved << dendl; } rocksdb::ColumnFamilyHandle* new_handle = get_cf_handle(prefix, key); if (new_handle == nullptr) { new_handle = default_cf; } if (handle == new_handle) { continue; } std::string new_raw_key; if (new_handle == default_cf) { new_raw_key = combine_strings(prefix, key); } else { new_raw_key = key; } bat.Delete(handle, raw_key); bat.Put(new_handle, new_raw_key, value); dout(25) << "moving " << (void*)handle << "/" << pretty_binary_string(raw_key.ToString()) << " to " << (void*)new_handle << "/" << pretty_binary_string(new_raw_key) << " size " << value.size() << dendl; keys_moved++; bytes_in_batch += new_raw_key.size() * 2 + value.size(); keys_in_batch++; bytes_per_iterator += new_raw_key.size() * 2 + value.size(); keys_per_iterator++; //check if we need to write the batch if (bytes_in_batch >= ctrl.bytes_per_batch || keys_in_batch >= ctrl.keys_per_batch) { flush_batch(&bat); if (ctrl.unittest_fail_after_first_batch) { return -1000; } } } if (bat.Count() > 0) { flush_batch(&bat); } return 0; }; auto close_column_handles = make_scope_guard([this] { cf_handles.clear(); close(); }); columns_t to_process_columns; int r = prepare_for_reshard(new_sharding, to_process_columns); if (r != 0) { dout(1) << "failed to prepare db for reshard" << dendl; return r; } for (auto& [name, handle] : to_process_columns) { dout(5) <<
"Processing column=" << name << " handle=" << handle.get() << dendl; if (name == rocksdb::kDefaultColumnFamilyName) { ceph_assert(handle.get() == default_cf); r = process_column(default_cf, std::string()); } else { std::string fixed_prefix = name.substr(0, name.find('-')); dout(10) << "Prefix: " << fixed_prefix << dendl; r = process_column(handle.get(), fixed_prefix); } if (r != 0) { derr << "Error processing column " << name << dendl; return r; } if (ctrl.unittest_fail_after_processing_column) { return -1001; } } r = reshard_cleanup(to_process_columns); if (r != 0) { dout(5) << "failed to cleanup after reshard" << dendl; return r; } if (ctrl.unittest_fail_after_successful_processing) { return -1002; } env->CreateDir(sharding_def_dir); if (auto status = rocksdb::WriteStringToFile(env, new_sharding, sharding_def_file, true); !status.ok()) { derr << __func__ << " cannot write to " << sharding_def_file << dendl; return -EIO; } return r; } bool RocksDBStore::get_sharding(std::string& sharding) { rocksdb::Status status; std::string stored_sharding_text; bool result = false; sharding.clear(); status = env->FileExists(sharding_def_file); if (status.ok()) { status = rocksdb::ReadFileToString(env, sharding_def_file, &stored_sharding_text); if(status.ok()) { result = true; sharding = stored_sharding_text; } } return result; }