summaryrefslogtreecommitdiffstats
path: root/src/mon/OSDMonitor.cc
diff options
context:
space:
mode:
Diffstat (limited to 'src/mon/OSDMonitor.cc')
-rw-r--r--src/mon/OSDMonitor.cc14832
1 files changed, 14832 insertions, 0 deletions
diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc
new file mode 100644
index 000000000..3191ed5bf
--- /dev/null
+++ b/src/mon/OSDMonitor.cc
@@ -0,0 +1,14832 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
+ * Copyright (C) 2014 Red Hat <contact@redhat.com>
+ *
+ * Author: Loic Dachary <loic@dachary.org>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <algorithm>
+#include <boost/algorithm/string.hpp>
+#include <experimental/iterator>
+#include <locale>
+#include <sstream>
+
+#include "mon/OSDMonitor.h"
+#include "mon/Monitor.h"
+#include "mon/MDSMonitor.h"
+#include "mon/MgrStatMonitor.h"
+#include "mon/AuthMonitor.h"
+#include "mon/KVMonitor.h"
+
+#include "mon/MonitorDBStore.h"
+#include "mon/Session.h"
+
+#include "crush/CrushWrapper.h"
+#include "crush/CrushTester.h"
+#include "crush/CrushTreeDumper.h"
+
+#include "messages/MOSDBeacon.h"
+#include "messages/MOSDFailure.h"
+#include "messages/MOSDMarkMeDown.h"
+#include "messages/MOSDMarkMeDead.h"
+#include "messages/MOSDFull.h"
+#include "messages/MOSDMap.h"
+#include "messages/MMonGetOSDMap.h"
+#include "messages/MOSDBoot.h"
+#include "messages/MOSDAlive.h"
+#include "messages/MPoolOp.h"
+#include "messages/MPoolOpReply.h"
+#include "messages/MOSDPGCreate.h"
+#include "messages/MOSDPGCreate2.h"
+#include "messages/MOSDPGCreated.h"
+#include "messages/MOSDPGTemp.h"
+#include "messages/MOSDPGReadyToMerge.h"
+#include "messages/MMonCommand.h"
+#include "messages/MRemoveSnaps.h"
+#include "messages/MOSDScrub.h"
+#include "messages/MRoute.h"
+#include "messages/MMonGetPurgedSnaps.h"
+#include "messages/MMonGetPurgedSnapsReply.h"
+
+#include "common/TextTable.h"
+#include "common/Timer.h"
+#include "common/ceph_argparse.h"
+#include "common/perf_counters.h"
+#include "common/PriorityCache.h"
+#include "common/strtol.h"
+#include "common/numa.h"
+
+#include "common/config.h"
+#include "common/errno.h"
+
+#include "erasure-code/ErasureCodePlugin.h"
+#include "compressor/Compressor.h"
+#include "common/Checksummer.h"
+
+#include "include/compat.h"
+#include "include/ceph_assert.h"
+#include "include/stringify.h"
+#include "include/util.h"
+#include "common/cmdparse.h"
+#include "include/str_list.h"
+#include "include/str_map.h"
+#include "include/scope_guard.h"
+#include "perfglue/heap_profiler.h"
+
+#include "auth/cephx/CephxKeyServer.h"
+#include "osd/OSDCap.h"
+
+#include "json_spirit/json_spirit_reader.h"
+
+#include <boost/algorithm/string/predicate.hpp>
+
+using std::dec;
+using std::hex;
+using std::list;
+using std::map;
+using std::make_pair;
+using std::ostringstream;
+using std::pair;
+using std::set;
+using std::string;
+using std::stringstream;
+using std::to_string;
+using std::vector;
+
+using ceph::bufferlist;
+using ceph::decode;
+using ceph::encode;
+using ceph::ErasureCodeInterfaceRef;
+using ceph::ErasureCodePluginRegistry;
+using ceph::ErasureCodeProfile;
+using ceph::Formatter;
+using ceph::JSONFormatter;
+using ceph::make_message;
+
+#define dout_subsys ceph_subsys_mon
+static const string OSD_PG_CREATING_PREFIX("osd_pg_creating");
+static const string OSD_METADATA_PREFIX("osd_metadata");
+static const string OSD_SNAP_PREFIX("osd_snap");
+
+/*
+
+ OSD snapshot metadata
+ ---------------------
+
+ -- starting with mimic, removed in octopus --
+
+ "removed_epoch_%llu_%08lx" % (pool, epoch)
+ -> interval_set<snapid_t>
+
+ "removed_snap_%llu_%016llx" % (pool, last_snap)
+ -> { first_snap, end_snap, epoch } (last_snap = end_snap - 1)
+
+
+ -- starting with mimic --
+
+ "purged_snap_%llu_%016llx" % (pool, last_snap)
+ -> { first_snap, end_snap, epoch } (last_snap = end_snap - 1)
+
+ - note that the {removed,purged}_snap put the last snap in they key so
+ that we can use forward iteration only to search for an epoch in an
+ interval. e.g., to test if epoch N is removed/purged, we'll find a key
+ >= N that either does or doesn't contain the given snap.
+
+
+ -- starting with octopus --
+
+ "purged_epoch_%08lx" % epoch
+ -> map<int64_t,interval_set<snapid_t>>
+
+ */
+using namespace TOPNSPC::common;
+namespace {
+
+struct OSDMemCache : public PriorityCache::PriCache {
+ OSDMonitor *osdmon;
+ int64_t cache_bytes[PriorityCache::Priority::LAST+1] = {0};
+ int64_t committed_bytes = 0;
+ double cache_ratio = 0;
+
+ OSDMemCache(OSDMonitor *m) : osdmon(m) {};
+
+ virtual uint64_t _get_used_bytes() const = 0;
+
+ virtual int64_t request_cache_bytes(
+ PriorityCache::Priority pri, uint64_t total_cache) const {
+ int64_t assigned = get_cache_bytes(pri);
+
+ switch (pri) {
+ // All cache items are currently set to have PRI1 priority
+ case PriorityCache::Priority::PRI1:
+ {
+ int64_t request = _get_used_bytes();
+ return (request > assigned) ? request - assigned : 0;
+ }
+ default:
+ break;
+ }
+ return -EOPNOTSUPP;
+ }
+
+ virtual int64_t get_cache_bytes(PriorityCache::Priority pri) const {
+ return cache_bytes[pri];
+ }
+
+ virtual int64_t get_cache_bytes() const {
+ int64_t total = 0;
+
+ for (int i = 0; i < PriorityCache::Priority::LAST + 1; i++) {
+ PriorityCache::Priority pri = static_cast<PriorityCache::Priority>(i);
+ total += get_cache_bytes(pri);
+ }
+ return total;
+ }
+
+ virtual void set_cache_bytes(PriorityCache::Priority pri, int64_t bytes) {
+ cache_bytes[pri] = bytes;
+ }
+ virtual void add_cache_bytes(PriorityCache::Priority pri, int64_t bytes) {
+ cache_bytes[pri] += bytes;
+ }
+ virtual int64_t commit_cache_size(uint64_t total_cache) {
+ committed_bytes = PriorityCache::get_chunk(
+ get_cache_bytes(), total_cache);
+ return committed_bytes;
+ }
+ virtual int64_t get_committed_size() const {
+ return committed_bytes;
+ }
+ virtual double get_cache_ratio() const {
+ return cache_ratio;
+ }
+ virtual void set_cache_ratio(double ratio) {
+ cache_ratio = ratio;
+ }
+ virtual string get_cache_name() const = 0;
+};
+
+struct IncCache : public OSDMemCache {
+ IncCache(OSDMonitor *m) : OSDMemCache(m) {};
+
+ virtual uint64_t _get_used_bytes() const {
+ return osdmon->inc_osd_cache.get_bytes();
+ }
+
+ virtual string get_cache_name() const {
+ return "OSDMap Inc Cache";
+ }
+
+ uint64_t _get_num_osdmaps() const {
+ return osdmon->inc_osd_cache.get_size();
+ }
+};
+
+struct FullCache : public OSDMemCache {
+ FullCache(OSDMonitor *m) : OSDMemCache(m) {};
+
+ virtual uint64_t _get_used_bytes() const {
+ return osdmon->full_osd_cache.get_bytes();
+ }
+
+ virtual string get_cache_name() const {
+ return "OSDMap Full Cache";
+ }
+
+ uint64_t _get_num_osdmaps() const {
+ return osdmon->full_osd_cache.get_size();
+ }
+};
+
+std::shared_ptr<IncCache> inc_cache;
+std::shared_ptr<FullCache> full_cache;
+
+const uint32_t MAX_POOL_APPLICATIONS = 4;
+const uint32_t MAX_POOL_APPLICATION_KEYS = 64;
+const uint32_t MAX_POOL_APPLICATION_LENGTH = 128;
+
+bool is_osd_writable(const OSDCapGrant& grant, const std::string* pool_name) {
+ // Note: this doesn't include support for the application tag match
+ if ((grant.spec.allow & OSD_CAP_W) != 0) {
+ auto& match = grant.match;
+ if (match.is_match_all()) {
+ return true;
+ } else if (pool_name != nullptr &&
+ !match.pool_namespace.pool_name.empty() &&
+ match.pool_namespace.pool_name == *pool_name) {
+ return true;
+ }
+ }
+ return false;
+}
+
+bool is_unmanaged_snap_op_permitted(CephContext* cct,
+ const KeyServer& key_server,
+ const EntityName& entity_name,
+ const MonCap& mon_caps,
+ const entity_addr_t& peer_socket_addr,
+ const std::string* pool_name)
+{
+ typedef std::map<std::string, std::string> CommandArgs;
+
+ if (mon_caps.is_capable(
+ cct, entity_name, "osd",
+ "osd pool op unmanaged-snap",
+ (pool_name == nullptr ?
+ CommandArgs{} /* pool DNE, require unrestricted cap */ :
+ CommandArgs{{"poolname", *pool_name}}),
+ false, true, false,
+ peer_socket_addr)) {
+ return true;
+ }
+
+ AuthCapsInfo caps_info;
+ if (!key_server.get_service_caps(entity_name, CEPH_ENTITY_TYPE_OSD,
+ caps_info)) {
+ dout(10) << "unable to locate OSD cap data for " << entity_name
+ << " in auth db" << dendl;
+ return false;
+ }
+
+ string caps_str;
+ if (caps_info.caps.length() > 0) {
+ auto p = caps_info.caps.cbegin();
+ try {
+ decode(caps_str, p);
+ } catch (const ceph::buffer::error &err) {
+ derr << "corrupt OSD cap data for " << entity_name << " in auth db"
+ << dendl;
+ return false;
+ }
+ }
+
+ OSDCap osd_cap;
+ if (!osd_cap.parse(caps_str, nullptr)) {
+ dout(10) << "unable to parse OSD cap data for " << entity_name
+ << " in auth db" << dendl;
+ return false;
+ }
+
+ // if the entity has write permissions in one or all pools, permit
+ // usage of unmanaged-snapshots
+ if (osd_cap.allow_all()) {
+ return true;
+ }
+
+ for (auto& grant : osd_cap.grants) {
+ if (grant.profile.is_valid()) {
+ for (auto& profile_grant : grant.profile_grants) {
+ if (is_osd_writable(profile_grant, pool_name)) {
+ return true;
+ }
+ }
+ } else if (is_osd_writable(grant, pool_name)) {
+ return true;
+ }
+ }
+
+ return false;
+}
+
+} // anonymous namespace
+
+void LastEpochClean::Lec::report(unsigned pg_num, ps_t ps,
+ epoch_t last_epoch_clean)
+{
+ if (ps >= pg_num) {
+ // removed PG
+ return;
+ }
+ epoch_by_pg.resize(pg_num, 0);
+ const auto old_lec = epoch_by_pg[ps];
+ if (old_lec >= last_epoch_clean) {
+ // stale lec
+ return;
+ }
+ epoch_by_pg[ps] = last_epoch_clean;
+ if (last_epoch_clean < floor) {
+ floor = last_epoch_clean;
+ } else if (last_epoch_clean > floor) {
+ if (old_lec == floor) {
+ // probably should increase floor?
+ auto new_floor = std::min_element(std::begin(epoch_by_pg),
+ std::end(epoch_by_pg));
+ floor = *new_floor;
+ }
+ }
+ if (ps != next_missing) {
+ return;
+ }
+ for (; next_missing < epoch_by_pg.size(); next_missing++) {
+ if (epoch_by_pg[next_missing] == 0) {
+ break;
+ }
+ }
+}
+
+void LastEpochClean::remove_pool(uint64_t pool)
+{
+ report_by_pool.erase(pool);
+}
+
+void LastEpochClean::report(unsigned pg_num, const pg_t& pg,
+ epoch_t last_epoch_clean)
+{
+ auto& lec = report_by_pool[pg.pool()];
+ return lec.report(pg_num, pg.ps(), last_epoch_clean);
+}
+
+epoch_t LastEpochClean::get_lower_bound(const OSDMap& latest) const
+{
+ auto floor = latest.get_epoch();
+ for (auto& pool : latest.get_pools()) {
+ auto reported = report_by_pool.find(pool.first);
+ if (reported == report_by_pool.end()) {
+ return 0;
+ }
+ if (reported->second.next_missing < pool.second.get_pg_num()) {
+ return 0;
+ }
+ if (reported->second.floor < floor) {
+ floor = reported->second.floor;
+ }
+ }
+ return floor;
+}
+
+void LastEpochClean::dump(Formatter *f) const
+{
+ f->open_array_section("per_pool");
+
+ for (auto& [pool, lec] : report_by_pool) {
+ f->open_object_section("pool");
+ f->dump_unsigned("poolid", pool);
+ f->dump_unsigned("floor", lec.floor);
+ f->close_section();
+ }
+
+ f->close_section();
+}
+
+class C_UpdateCreatingPGs : public Context {
+public:
+ OSDMonitor *osdmon;
+ utime_t start;
+ epoch_t epoch;
+ C_UpdateCreatingPGs(OSDMonitor *osdmon, epoch_t e) :
+ osdmon(osdmon), start(ceph_clock_now()), epoch(e) {}
+ void finish(int r) override {
+ if (r >= 0) {
+ utime_t end = ceph_clock_now();
+ dout(10) << "osdmap epoch " << epoch << " mapping took "
+ << (end - start) << " seconds" << dendl;
+ osdmon->update_creating_pgs();
+ osdmon->check_pg_creates_subs();
+ }
+ }
+};
+
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, mon, osdmap)
+static ostream& _prefix(std::ostream *_dout, Monitor &mon, const OSDMap& osdmap) {
+ return *_dout << "mon." << mon.name << "@" << mon.rank
+ << "(" << mon.get_state_name()
+ << ").osd e" << osdmap.get_epoch() << " ";
+}
+
+OSDMonitor::OSDMonitor(
+ CephContext *cct,
+ Monitor &mn,
+ Paxos &p,
+ const string& service_name)
+ : PaxosService(mn, p, service_name),
+ cct(cct),
+ inc_osd_cache(g_conf()->mon_osd_cache_size),
+ full_osd_cache(g_conf()->mon_osd_cache_size),
+ has_osdmap_manifest(false),
+ mapper(mn.cct, &mn.cpu_tp)
+{
+ inc_cache = std::make_shared<IncCache>(this);
+ full_cache = std::make_shared<FullCache>(this);
+ cct->_conf.add_observer(this);
+ int r = _set_cache_sizes();
+ if (r < 0) {
+ derr << __func__ << " using default osd cache size - mon_osd_cache_size ("
+ << g_conf()->mon_osd_cache_size
+ << ") without priority cache management"
+ << dendl;
+ }
+}
+
+const char **OSDMonitor::get_tracked_conf_keys() const
+{
+ static const char* KEYS[] = {
+ "mon_memory_target",
+ "mon_memory_autotune",
+ "rocksdb_cache_size",
+ NULL
+ };
+ return KEYS;
+}
+
+void OSDMonitor::handle_conf_change(const ConfigProxy& conf,
+ const std::set<std::string> &changed)
+{
+ dout(10) << __func__ << " " << changed << dendl;
+
+ if (changed.count("mon_memory_autotune")) {
+ _set_cache_autotuning();
+ }
+ if (changed.count("mon_memory_target") ||
+ changed.count("rocksdb_cache_size")) {
+ int r = _update_mon_cache_settings();
+ if (r < 0) {
+ derr << __func__ << " mon_memory_target:"
+ << g_conf()->mon_memory_target
+ << " rocksdb_cache_size:"
+ << g_conf()->rocksdb_cache_size
+ << ". Unable to update cache size."
+ << dendl;
+ }
+ }
+}
+
+void OSDMonitor::_set_cache_autotuning()
+{
+ if (!g_conf()->mon_memory_autotune && pcm != nullptr) {
+ // Disable cache autotuning
+ std::lock_guard l(balancer_lock);
+ pcm = nullptr;
+ }
+
+ if (g_conf()->mon_memory_autotune && pcm == nullptr) {
+ int r = register_cache_with_pcm();
+ if (r < 0) {
+ dout(10) << __func__
+ << " Error while registering osdmon caches with pcm."
+ << " Cache auto tuning not enabled."
+ << dendl;
+ mon_memory_autotune = false;
+ } else {
+ mon_memory_autotune = true;
+ }
+ }
+}
+
+int OSDMonitor::_update_mon_cache_settings()
+{
+ if (g_conf()->mon_memory_target <= 0 ||
+ g_conf()->mon_memory_target < mon_memory_min ||
+ g_conf()->rocksdb_cache_size <= 0) {
+ return -EINVAL;
+ }
+
+ if (pcm == nullptr && rocksdb_binned_kv_cache == nullptr) {
+ derr << __func__ << " not using pcm and rocksdb" << dendl;
+ return -EINVAL;
+ }
+
+ uint64_t old_mon_memory_target = mon_memory_target;
+ uint64_t old_rocksdb_cache_size = rocksdb_cache_size;
+
+ // Set the new pcm memory cache sizes
+ mon_memory_target = g_conf()->mon_memory_target;
+ rocksdb_cache_size = g_conf()->rocksdb_cache_size;
+
+ uint64_t base = mon_memory_base;
+ double fragmentation = mon_memory_fragmentation;
+ uint64_t target = mon_memory_target;
+ uint64_t min = mon_memory_min;
+ uint64_t max = min;
+
+ uint64_t ltarget = (1.0 - fragmentation) * target;
+ if (ltarget > base + min) {
+ max = ltarget - base;
+ }
+
+ int r = _set_cache_ratios();
+ if (r < 0) {
+ derr << __func__ << " Cache ratios for pcm could not be set."
+ << " Review the kv (rocksdb) and mon_memory_target sizes."
+ << dendl;
+ mon_memory_target = old_mon_memory_target;
+ rocksdb_cache_size = old_rocksdb_cache_size;
+ return -EINVAL;
+ }
+
+ if (mon_memory_autotune && pcm != nullptr) {
+ std::lock_guard l(balancer_lock);
+ // set pcm cache levels
+ pcm->set_target_memory(target);
+ pcm->set_min_memory(min);
+ pcm->set_max_memory(max);
+ // tune memory based on new values
+ pcm->tune_memory();
+ pcm->balance();
+ _set_new_cache_sizes();
+ dout(1) << __func__ << " Updated mon cache setting."
+ << " target: " << target
+ << " min: " << min
+ << " max: " << max
+ << dendl;
+ }
+ return 0;
+}
+
+int OSDMonitor::_set_cache_sizes()
+{
+ if (g_conf()->mon_memory_autotune) {
+ // set the new osdmon cache targets to be managed by pcm
+ mon_osd_cache_size = g_conf()->mon_osd_cache_size;
+ rocksdb_cache_size = g_conf()->rocksdb_cache_size;
+ mon_memory_base = cct->_conf.get_val<Option::size_t>("osd_memory_base");
+ mon_memory_fragmentation = cct->_conf.get_val<double>("osd_memory_expected_fragmentation");
+ mon_memory_target = g_conf()->mon_memory_target;
+ mon_memory_min = g_conf()->mon_osd_cache_size_min;
+ if (mon_memory_target <= 0 || mon_memory_min <= 0) {
+ derr << __func__ << " mon_memory_target:" << mon_memory_target
+ << " mon_memory_min:" << mon_memory_min
+ << ". Invalid size option(s) provided."
+ << dendl;
+ return -EINVAL;
+ }
+ // Set the initial inc and full LRU cache sizes
+ inc_osd_cache.set_bytes(mon_memory_min);
+ full_osd_cache.set_bytes(mon_memory_min);
+ mon_memory_autotune = g_conf()->mon_memory_autotune;
+ }
+ return 0;
+}
+
+bool OSDMonitor::_have_pending_crush()
+{
+ return pending_inc.crush.length() > 0;
+}
+
+CrushWrapper &OSDMonitor::_get_stable_crush()
+{
+ return *osdmap.crush;
+}
+
+void OSDMonitor::_get_pending_crush(CrushWrapper& newcrush)
+{
+ bufferlist bl;
+ if (pending_inc.crush.length())
+ bl = pending_inc.crush;
+ else
+ osdmap.crush->encode(bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
+
+ auto p = bl.cbegin();
+ newcrush.decode(p);
+}
+
+void OSDMonitor::create_initial()
+{
+ dout(10) << "create_initial for " << mon.monmap->fsid << dendl;
+
+ OSDMap newmap;
+
+ bufferlist bl;
+ mon.store->get("mkfs", "osdmap", bl);
+
+ if (bl.length()) {
+ newmap.decode(bl);
+ newmap.set_fsid(mon.monmap->fsid);
+ } else {
+ newmap.build_simple(cct, 0, mon.monmap->fsid, 0);
+ }
+ newmap.set_epoch(1);
+ newmap.created = newmap.modified = ceph_clock_now();
+
+ // new clusters should sort bitwise by default.
+ newmap.set_flag(CEPH_OSDMAP_SORTBITWISE);
+
+ newmap.flags |=
+ CEPH_OSDMAP_RECOVERY_DELETES |
+ CEPH_OSDMAP_PURGED_SNAPDIRS |
+ CEPH_OSDMAP_PGLOG_HARDLIMIT;
+ newmap.full_ratio = g_conf()->mon_osd_full_ratio;
+ if (newmap.full_ratio > 1.0) newmap.full_ratio /= 100;
+ newmap.backfillfull_ratio = g_conf()->mon_osd_backfillfull_ratio;
+ if (newmap.backfillfull_ratio > 1.0) newmap.backfillfull_ratio /= 100;
+ newmap.nearfull_ratio = g_conf()->mon_osd_nearfull_ratio;
+ if (newmap.nearfull_ratio > 1.0) newmap.nearfull_ratio /= 100;
+
+ // new cluster should require latest by default
+ if (g_conf().get_val<bool>("mon_debug_no_require_pacific")) {
+ if (g_conf().get_val<bool>("mon_debug_no_require_octopus")) {
+ derr << __func__ << " mon_debug_no_require_pacific and octopus=true" << dendl;
+ newmap.require_osd_release = ceph_release_t::nautilus;
+ } else {
+ derr << __func__ << " mon_debug_no_require_pacific=true" << dendl;
+ newmap.require_osd_release = ceph_release_t::octopus;
+ }
+ } else {
+ newmap.require_osd_release = ceph_release_t::pacific;
+ }
+
+ if (newmap.require_osd_release >= ceph_release_t::octopus) {
+ ceph_release_t r = ceph_release_from_name(
+ g_conf()->mon_osd_initial_require_min_compat_client);
+ if (!r) {
+ ceph_abort_msg("mon_osd_initial_require_min_compat_client is not valid");
+ }
+ newmap.require_min_compat_client = r;
+ }
+
+ // encode into pending incremental
+ uint64_t features = newmap.get_encoding_features();
+ newmap.encode(pending_inc.fullmap,
+ features | CEPH_FEATURE_RESERVED);
+ pending_inc.full_crc = newmap.get_crc();
+ dout(20) << " full crc " << pending_inc.full_crc << dendl;
+}
+
+void OSDMonitor::get_store_prefixes(std::set<string>& s) const
+{
+ s.insert(service_name);
+ s.insert(OSD_PG_CREATING_PREFIX);
+ s.insert(OSD_METADATA_PREFIX);
+ s.insert(OSD_SNAP_PREFIX);
+}
+
+void OSDMonitor::update_from_paxos(bool *need_bootstrap)
+{
+ // we really don't care if the version has been updated, because we may
+ // have trimmed without having increased the last committed; yet, we may
+ // need to update the in-memory manifest.
+ load_osdmap_manifest();
+
+ version_t version = get_last_committed();
+ if (version == osdmap.epoch)
+ return;
+ ceph_assert(version > osdmap.epoch);
+
+ dout(15) << "update_from_paxos paxos e " << version
+ << ", my e " << osdmap.epoch << dendl;
+
+ int prev_num_up_osd = osdmap.num_up_osd;
+
+ if (mapping_job) {
+ if (!mapping_job->is_done()) {
+ dout(1) << __func__ << " mapping job "
+ << mapping_job.get() << " did not complete, "
+ << mapping_job->shards << " left, canceling" << dendl;
+ mapping_job->abort();
+ }
+ mapping_job.reset();
+ }
+
+ load_health();
+
+ /*
+ * We will possibly have a stashed latest that *we* wrote, and we will
+ * always be sure to have the oldest full map in the first..last range
+ * due to encode_trim_extra(), which includes the oldest full map in the trim
+ * transaction.
+ *
+ * encode_trim_extra() does not however write the full map's
+ * version to 'full_latest'. This is only done when we are building the
+ * full maps from the incremental versions. But don't panic! We make sure
+ * that the following conditions find whichever full map version is newer.
+ */
+ version_t latest_full = get_version_latest_full();
+ if (latest_full == 0 && get_first_committed() > 1)
+ latest_full = get_first_committed();
+
+ if (get_first_committed() > 1 &&
+ latest_full < get_first_committed()) {
+ // the monitor could be just sync'ed with its peer, and the latest_full key
+ // is not encoded in the paxos commits in encode_pending(), so we need to
+ // make sure we get it pointing to a proper version.
+ version_t lc = get_last_committed();
+ version_t fc = get_first_committed();
+
+ dout(10) << __func__ << " looking for valid full map in interval"
+ << " [" << fc << ", " << lc << "]" << dendl;
+
+ latest_full = 0;
+ for (version_t v = lc; v >= fc; v--) {
+ string full_key = "full_" + stringify(v);
+ if (mon.store->exists(get_service_name(), full_key)) {
+ dout(10) << __func__ << " found latest full map v " << v << dendl;
+ latest_full = v;
+ break;
+ }
+ }
+
+ ceph_assert(latest_full > 0);
+ auto t(std::make_shared<MonitorDBStore::Transaction>());
+ put_version_latest_full(t, latest_full);
+ mon.store->apply_transaction(t);
+ dout(10) << __func__ << " updated the on-disk full map version to "
+ << latest_full << dendl;
+ }
+
+ if ((latest_full > 0) && (latest_full > osdmap.epoch)) {
+ bufferlist latest_bl;
+ get_version_full(latest_full, latest_bl);
+ ceph_assert(latest_bl.length() != 0);
+ dout(7) << __func__ << " loading latest full map e" << latest_full << dendl;
+ osdmap = OSDMap();
+ osdmap.decode(latest_bl);
+ }
+
+ bufferlist bl;
+ if (!mon.store->get(OSD_PG_CREATING_PREFIX, "creating", bl)) {
+ auto p = bl.cbegin();
+ std::lock_guard<std::mutex> l(creating_pgs_lock);
+ creating_pgs.decode(p);
+ dout(7) << __func__ << " loading creating_pgs last_scan_epoch "
+ << creating_pgs.last_scan_epoch
+ << " with " << creating_pgs.pgs.size() << " pgs" << dendl;
+ } else {
+ dout(1) << __func__ << " missing creating pgs; upgrade from post-kraken?"
+ << dendl;
+ }
+
+ // walk through incrementals
+ MonitorDBStore::TransactionRef t;
+ size_t tx_size = 0;
+ while (version > osdmap.epoch) {
+ bufferlist inc_bl;
+ int err = get_version(osdmap.epoch+1, inc_bl);
+ ceph_assert(err == 0);
+ ceph_assert(inc_bl.length());
+ // set priority cache manager levels if the osdmap is
+ // being populated for the first time.
+ if (mon_memory_autotune && pcm == nullptr) {
+ int r = register_cache_with_pcm();
+ if (r < 0) {
+ dout(10) << __func__
+ << " Error while registering osdmon caches with pcm."
+ << " Proceeding without cache auto tuning."
+ << dendl;
+ }
+ }
+
+ dout(7) << "update_from_paxos applying incremental " << osdmap.epoch+1
+ << dendl;
+ OSDMap::Incremental inc(inc_bl);
+ err = osdmap.apply_incremental(inc);
+ ceph_assert(err == 0);
+
+ if (!t)
+ t.reset(new MonitorDBStore::Transaction);
+
+ // Write out the full map for all past epochs. Encode the full
+ // map with the same features as the incremental. If we don't
+ // know, use the quorum features. If we don't know those either,
+ // encode with all features.
+ uint64_t f = inc.encode_features;
+ if (!f)
+ f = mon.get_quorum_con_features();
+ if (!f)
+ f = -1;
+ bufferlist full_bl;
+ osdmap.encode(full_bl, f | CEPH_FEATURE_RESERVED);
+ tx_size += full_bl.length();
+
+ bufferlist orig_full_bl;
+ get_version_full(osdmap.epoch, orig_full_bl);
+ if (orig_full_bl.length()) {
+ // the primary provided the full map
+ ceph_assert(inc.have_crc);
+ if (inc.full_crc != osdmap.crc) {
+ // This will happen if the mons were running mixed versions in
+ // the past or some other circumstance made the full encoded
+ // maps divergent. Reloading here will bring us back into
+ // sync with the primary for this and all future maps. OSDs
+ // will also be brought back into sync when they discover the
+ // crc mismatch and request a full map from a mon.
+ derr << __func__ << " full map CRC mismatch, resetting to canonical"
+ << dendl;
+
+ dout(20) << __func__ << " my (bad) full osdmap:\n";
+ JSONFormatter jf(true);
+ jf.dump_object("osdmap", osdmap);
+ jf.flush(*_dout);
+ *_dout << "\nhexdump:\n";
+ full_bl.hexdump(*_dout);
+ *_dout << dendl;
+
+ osdmap = OSDMap();
+ osdmap.decode(orig_full_bl);
+
+ dout(20) << __func__ << " canonical full osdmap:\n";
+ JSONFormatter jf(true);
+ jf.dump_object("osdmap", osdmap);
+ jf.flush(*_dout);
+ *_dout << "\nhexdump:\n";
+ orig_full_bl.hexdump(*_dout);
+ *_dout << dendl;
+ }
+ } else {
+ ceph_assert(!inc.have_crc);
+ put_version_full(t, osdmap.epoch, full_bl);
+ }
+ put_version_latest_full(t, osdmap.epoch);
+
+ // share
+ dout(1) << osdmap << dendl;
+
+ if (osdmap.epoch == 1) {
+ t->erase("mkfs", "osdmap");
+ }
+
+ if (tx_size > g_conf()->mon_sync_max_payload_size*2) {
+ mon.store->apply_transaction(t);
+ t = MonitorDBStore::TransactionRef();
+ tx_size = 0;
+ }
+ for (const auto [osd, state] : inc.new_state) {
+ if (state & CEPH_OSD_UP) {
+ // could be marked up *or* down, but we're too lazy to check which
+ last_osd_report.erase(osd);
+ }
+ if (state & CEPH_OSD_OUT) {
+ // could be marked in *or* out, but we can safely drop it
+ osd_epochs.erase(osd);
+ }
+ }
+ for (const auto [osd, weight] : inc.new_weight) {
+ if (weight == CEPH_OSD_OUT) {
+ // manually marked out, so drop it
+ osd_epochs.erase(osd);
+ }
+ }
+ }
+
+ if (t) {
+ mon.store->apply_transaction(t);
+ }
+
+ bool marked_osd_down = false;
+ for (int o = 0; o < osdmap.get_max_osd(); o++) {
+ if (osdmap.is_out(o))
+ continue;
+ auto found = down_pending_out.find(o);
+ if (osdmap.is_down(o)) {
+ // populate down -> out map
+ if (found == down_pending_out.end()) {
+ dout(10) << " adding osd." << o << " to down_pending_out map" << dendl;
+ down_pending_out[o] = ceph_clock_now();
+ marked_osd_down = true;
+ }
+ } else {
+ if (found != down_pending_out.end()) {
+ dout(10) << " removing osd." << o << " from down_pending_out map" << dendl;
+ down_pending_out.erase(found);
+ }
+ }
+ }
+ // XXX: need to trim MonSession connected with a osd whose id > max_osd?
+
+ check_osdmap_subs();
+ check_pg_creates_subs();
+
+ share_map_with_random_osd();
+ update_logger();
+ process_failures();
+
+ // make sure our feature bits reflect the latest map
+ update_msgr_features();
+
+ if (!mon.is_leader()) {
+ // will be called by on_active() on the leader, avoid doing so twice
+ start_mapping();
+ }
+ if (osdmap.stretch_mode_enabled) {
+ dout(20) << "Stretch mode enabled in this map" << dendl;
+ mon.try_engage_stretch_mode();
+ if (osdmap.degraded_stretch_mode) {
+ dout(20) << "Degraded stretch mode set in this map" << dendl;
+ if (!osdmap.recovering_stretch_mode) {
+ mon.set_degraded_stretch_mode();
+ if (prev_num_up_osd < osdmap.num_up_osd &&
+ (osdmap.num_up_osd / (double)osdmap.num_osd) >
+ cct->_conf.get_val<double>("mon_stretch_cluster_recovery_ratio")) {
+ // TODO: This works for 2-site clusters when the OSD maps are appropriately
+ // trimmed and everything is "normal" but not if you have a lot of out OSDs
+ // you're ignoring or in some really degenerate failure cases
+ dout(10) << "Enabling recovery stretch mode in this map" << dendl;
+ mon.go_recovery_stretch_mode();
+ }
+ } else {
+ mon.set_recovery_stretch_mode();
+ }
+ } else {
+ mon.set_healthy_stretch_mode();
+ }
+ if (marked_osd_down &&
+ (!osdmap.degraded_stretch_mode || osdmap.recovering_stretch_mode)) {
+ dout(20) << "Checking degraded stretch mode due to osd changes" << dendl;
+ mon.maybe_go_degraded_stretch_mode();
+ }
+ }
+}
+
+int OSDMonitor::register_cache_with_pcm()
+{
+ if (mon_memory_target <= 0 || mon_memory_min <= 0) {
+ derr << __func__ << " Invalid memory size specified for mon caches."
+ << " Caches will not be auto-tuned."
+ << dendl;
+ return -EINVAL;
+ }
+ uint64_t base = mon_memory_base;
+ double fragmentation = mon_memory_fragmentation;
+ // For calculating total target memory, consider rocksdb cache size.
+ uint64_t target = mon_memory_target;
+ uint64_t min = mon_memory_min;
+ uint64_t max = min;
+
+ // Apply the same logic as in bluestore to set the max amount
+ // of memory to use for cache. Assume base memory for OSDMaps
+ // and then add in some overhead for fragmentation.
+ uint64_t ltarget = (1.0 - fragmentation) * target;
+ if (ltarget > base + min) {
+ max = ltarget - base;
+ }
+
+ rocksdb_binned_kv_cache = mon.store->get_priority_cache();
+ if (!rocksdb_binned_kv_cache) {
+ derr << __func__ << " not using rocksdb" << dendl;
+ return -EINVAL;
+ }
+
+ int r = _set_cache_ratios();
+ if (r < 0) {
+ derr << __func__ << " Cache ratios for pcm could not be set."
+ << " Review the kv (rocksdb) and mon_memory_target sizes."
+ << dendl;
+ return -EINVAL;
+ }
+
+ pcm = std::make_shared<PriorityCache::Manager>(
+ cct, min, max, target, true);
+ pcm->insert("kv", rocksdb_binned_kv_cache, true);
+ pcm->insert("inc", inc_cache, true);
+ pcm->insert("full", full_cache, true);
+ dout(1) << __func__ << " pcm target: " << target
+ << " pcm max: " << max
+ << " pcm min: " << min
+ << " inc_osd_cache size: " << inc_osd_cache.get_size()
+ << dendl;
+ return 0;
+}
+
+int OSDMonitor::_set_cache_ratios()
+{
+ double old_cache_kv_ratio = cache_kv_ratio;
+
+ // Set the cache ratios for kv(rocksdb), inc and full caches
+ cache_kv_ratio = (double)rocksdb_cache_size / (double)mon_memory_target;
+ if (cache_kv_ratio >= 1.0) {
+ derr << __func__ << " Cache kv ratio (" << cache_kv_ratio
+ << ") must be in range [0,<1.0]."
+ << dendl;
+ cache_kv_ratio = old_cache_kv_ratio;
+ return -EINVAL;
+ }
+ rocksdb_binned_kv_cache->set_cache_ratio(cache_kv_ratio);
+ cache_inc_ratio = cache_full_ratio = (1.0 - cache_kv_ratio) / 2;
+ inc_cache->set_cache_ratio(cache_inc_ratio);
+ full_cache->set_cache_ratio(cache_full_ratio);
+
+ dout(1) << __func__ << " kv ratio " << cache_kv_ratio
+ << " inc ratio " << cache_inc_ratio
+ << " full ratio " << cache_full_ratio
+ << dendl;
+ return 0;
+}
+
+void OSDMonitor::start_mapping()
+{
+ // initiate mapping job
+ if (mapping_job) {
+ dout(10) << __func__ << " canceling previous mapping_job " << mapping_job.get()
+ << dendl;
+ mapping_job->abort();
+ }
+ if (!osdmap.get_pools().empty()) {
+ auto fin = new C_UpdateCreatingPGs(this, osdmap.get_epoch());
+ mapping_job = mapping.start_update(osdmap, mapper,
+ g_conf()->mon_osd_mapping_pgs_per_chunk);
+ dout(10) << __func__ << " started mapping job " << mapping_job.get()
+ << " at " << fin->start << dendl;
+ mapping_job->set_finish_event(fin);
+ } else {
+ dout(10) << __func__ << " no pools, no mapping job" << dendl;
+ mapping_job = nullptr;
+ }
+}
+
+void OSDMonitor::update_msgr_features()
+{
+ const int types[] = {
+ entity_name_t::TYPE_OSD,
+ entity_name_t::TYPE_CLIENT,
+ entity_name_t::TYPE_MDS,
+ entity_name_t::TYPE_MON
+ };
+ for (int type : types) {
+ uint64_t mask;
+ uint64_t features = osdmap.get_features(type, &mask);
+ if ((mon.messenger->get_policy(type).features_required & mask) != features) {
+ dout(0) << "crush map has features " << features << ", adjusting msgr requires" << dendl;
+ ceph::net::Policy p = mon.messenger->get_policy(type);
+ p.features_required = (p.features_required & ~mask) | features;
+ mon.messenger->set_policy(type, p);
+ }
+ }
+}
+
+void OSDMonitor::on_active()
+{
+ update_logger();
+
+ if (mon.is_leader()) {
+ mon.clog->debug() << "osdmap " << osdmap;
+ if (!priority_convert) {
+ // Only do this once at start-up
+ convert_pool_priorities();
+ priority_convert = true;
+ }
+ } else {
+ list<MonOpRequestRef> ls;
+ take_all_failures(ls);
+ while (!ls.empty()) {
+ MonOpRequestRef op = ls.front();
+ op->mark_osdmon_event(__func__);
+ dispatch(op);
+ ls.pop_front();
+ }
+ }
+ start_mapping();
+}
+
+void OSDMonitor::on_restart()
+{
+ last_osd_report.clear();
+}
+
+void OSDMonitor::on_shutdown()
+{
+ dout(10) << __func__ << dendl;
+ if (mapping_job) {
+ dout(10) << __func__ << " canceling previous mapping_job " << mapping_job.get()
+ << dendl;
+ mapping_job->abort();
+ }
+
+ // discard failure info, waiters
+ list<MonOpRequestRef> ls;
+ take_all_failures(ls);
+ ls.clear();
+}
+
+void OSDMonitor::update_logger()
+{
+ dout(10) << "update_logger" << dendl;
+
+ mon.cluster_logger->set(l_cluster_num_osd, osdmap.get_num_osds());
+ mon.cluster_logger->set(l_cluster_num_osd_up, osdmap.get_num_up_osds());
+ mon.cluster_logger->set(l_cluster_num_osd_in, osdmap.get_num_in_osds());
+ mon.cluster_logger->set(l_cluster_osd_epoch, osdmap.get_epoch());
+}
+
+void OSDMonitor::create_pending()
+{
+ pending_inc = OSDMap::Incremental(osdmap.epoch+1);
+ pending_inc.fsid = mon.monmap->fsid;
+ pending_metadata.clear();
+ pending_metadata_rm.clear();
+ pending_pseudo_purged_snaps.clear();
+
+ dout(10) << "create_pending e " << pending_inc.epoch << dendl;
+
+ // safety checks (this shouldn't really happen)
+ {
+ if (osdmap.backfillfull_ratio <= 0) {
+ pending_inc.new_backfillfull_ratio = g_conf()->mon_osd_backfillfull_ratio;
+ if (pending_inc.new_backfillfull_ratio > 1.0)
+ pending_inc.new_backfillfull_ratio /= 100;
+ dout(1) << __func__ << " setting backfillfull_ratio = "
+ << pending_inc.new_backfillfull_ratio << dendl;
+ }
+ if (osdmap.full_ratio <= 0) {
+ pending_inc.new_full_ratio = g_conf()->mon_osd_full_ratio;
+ if (pending_inc.new_full_ratio > 1.0)
+ pending_inc.new_full_ratio /= 100;
+ dout(1) << __func__ << " setting full_ratio = "
+ << pending_inc.new_full_ratio << dendl;
+ }
+ if (osdmap.nearfull_ratio <= 0) {
+ pending_inc.new_nearfull_ratio = g_conf()->mon_osd_nearfull_ratio;
+ if (pending_inc.new_nearfull_ratio > 1.0)
+ pending_inc.new_nearfull_ratio /= 100;
+ dout(1) << __func__ << " setting nearfull_ratio = "
+ << pending_inc.new_nearfull_ratio << dendl;
+ }
+ }
+
+ // Rewrite CRUSH rule IDs if they are using legacy "ruleset"
+ // structure.
+ if (osdmap.crush->has_legacy_rule_ids()) {
+ CrushWrapper newcrush;
+ _get_pending_crush(newcrush);
+
+ // First, for all pools, work out which rule they really used
+ // by resolving ruleset to rule.
+ for (const auto &i : osdmap.get_pools()) {
+ const auto pool_id = i.first;
+ const auto &pool = i.second;
+ int new_rule_id = newcrush.find_rule(pool.crush_rule,
+ pool.type, pool.size);
+
+ dout(1) << __func__ << " rewriting pool "
+ << osdmap.get_pool_name(pool_id) << " crush ruleset "
+ << pool.crush_rule << " -> rule id " << new_rule_id << dendl;
+ if (pending_inc.new_pools.count(pool_id) == 0) {
+ pending_inc.new_pools[pool_id] = pool;
+ }
+ pending_inc.new_pools[pool_id].crush_rule = new_rule_id;
+ }
+
+ // Now, go ahead and renumber all the rules so that their
+ // rule_id field corresponds to their position in the array
+ auto old_to_new = newcrush.renumber_rules();
+ dout(1) << __func__ << " Rewrote " << old_to_new << " crush IDs:" << dendl;
+ for (const auto &i : old_to_new) {
+ dout(1) << __func__ << " " << i.first << " -> " << i.second << dendl;
+ }
+ pending_inc.crush.clear();
+ newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
+ }
+}
+
+creating_pgs_t
+OSDMonitor::update_pending_pgs(const OSDMap::Incremental& inc,
+ const OSDMap& nextmap)
+{
+ dout(10) << __func__ << dendl;
+ creating_pgs_t pending_creatings;
+ {
+ std::lock_guard<std::mutex> l(creating_pgs_lock);
+ pending_creatings = creating_pgs;
+ }
+ // check for new or old pools
+ if (pending_creatings.last_scan_epoch < inc.epoch) {
+ unsigned queued = 0;
+ queued += scan_for_creating_pgs(osdmap.get_pools(),
+ inc.old_pools,
+ inc.modified,
+ &pending_creatings);
+ queued += scan_for_creating_pgs(inc.new_pools,
+ inc.old_pools,
+ inc.modified,
+ &pending_creatings);
+ dout(10) << __func__ << " " << queued << " pools queued" << dendl;
+ for (auto deleted_pool : inc.old_pools) {
+ auto removed = pending_creatings.remove_pool(deleted_pool);
+ dout(10) << __func__ << " " << removed
+ << " pg removed because containing pool deleted: "
+ << deleted_pool << dendl;
+ last_epoch_clean.remove_pool(deleted_pool);
+ }
+ // pgmon updates its creating_pgs in check_osd_map() which is called by
+ // on_active() and check_osd_map() could be delayed if lease expires, so its
+ // creating_pgs could be stale in comparison with the one of osdmon. let's
+ // trim them here. otherwise, they will be added back after being erased.
+ unsigned removed = 0;
+ for (auto& pg : pending_created_pgs) {
+ dout(20) << __func__ << " noting created pg " << pg << dendl;
+ pending_creatings.created_pools.insert(pg.pool());
+ removed += pending_creatings.pgs.erase(pg);
+ }
+ pending_created_pgs.clear();
+ dout(10) << __func__ << " " << removed
+ << " pgs removed because they're created" << dendl;
+ pending_creatings.last_scan_epoch = osdmap.get_epoch();
+ }
+
+ // filter out any pgs that shouldn't exist.
+ {
+ auto i = pending_creatings.pgs.begin();
+ while (i != pending_creatings.pgs.end()) {
+ if (!nextmap.pg_exists(i->first)) {
+ dout(10) << __func__ << " removing pg " << i->first
+ << " which should not exist" << dendl;
+ i = pending_creatings.pgs.erase(i);
+ } else {
+ ++i;
+ }
+ }
+ }
+
+ // process queue
+ unsigned max = std::max<int64_t>(1, g_conf()->mon_osd_max_creating_pgs);
+ const auto total = pending_creatings.pgs.size();
+ while (pending_creatings.pgs.size() < max &&
+ !pending_creatings.queue.empty()) {
+ auto p = pending_creatings.queue.begin();
+ int64_t poolid = p->first;
+ dout(10) << __func__ << " pool " << poolid
+ << " created " << p->second.created
+ << " modified " << p->second.modified
+ << " [" << p->second.start << "-" << p->second.end << ")"
+ << dendl;
+ int64_t n = std::min<int64_t>(max - pending_creatings.pgs.size(),
+ p->second.end - p->second.start);
+ ps_t first = p->second.start;
+ ps_t end = first + n;
+ for (ps_t ps = first; ps < end; ++ps) {
+ const pg_t pgid{ps, static_cast<uint64_t>(poolid)};
+ // NOTE: use the *current* epoch as the PG creation epoch so that the
+ // OSD does not have to generate a long set of PastIntervals.
+ pending_creatings.pgs.emplace(
+ pgid,
+ creating_pgs_t::pg_create_info(inc.epoch,
+ p->second.modified));
+ dout(10) << __func__ << " adding " << pgid << dendl;
+ }
+ p->second.start = end;
+ if (p->second.done()) {
+ dout(10) << __func__ << " done with queue for " << poolid << dendl;
+ pending_creatings.queue.erase(p);
+ } else {
+ dout(10) << __func__ << " pool " << poolid
+ << " now [" << p->second.start << "-" << p->second.end << ")"
+ << dendl;
+ }
+ }
+ dout(10) << __func__ << " queue remaining: " << pending_creatings.queue.size()
+ << " pools" << dendl;
+
+ if (mon.monmap->min_mon_release >= ceph_release_t::octopus) {
+ // walk creating pgs' history and past_intervals forward
+ for (auto& i : pending_creatings.pgs) {
+ // this mirrors PG::start_peering_interval()
+ pg_t pgid = i.first;
+
+ // this is a bit imprecise, but sufficient?
+ struct min_size_predicate_t : public IsPGRecoverablePredicate {
+ const pg_pool_t *pi;
+ bool operator()(const set<pg_shard_t> &have) const {
+ return have.size() >= pi->min_size;
+ }
+ explicit min_size_predicate_t(const pg_pool_t *i) : pi(i) {}
+ } min_size_predicate(nextmap.get_pg_pool(pgid.pool()));
+
+ vector<int> up, acting;
+ int up_primary, acting_primary;
+ nextmap.pg_to_up_acting_osds(
+ pgid, &up, &up_primary, &acting, &acting_primary);
+ if (i.second.history.epoch_created == 0) {
+ // new pg entry, set it up
+ i.second.up = up;
+ i.second.acting = acting;
+ i.second.up_primary = up_primary;
+ i.second.acting_primary = acting_primary;
+ i.second.history = pg_history_t(i.second.create_epoch,
+ i.second.create_stamp);
+ dout(10) << __func__ << " pg " << pgid << " just added, "
+ << " up " << i.second.up
+ << " p " << i.second.up_primary
+ << " acting " << i.second.acting
+ << " p " << i.second.acting_primary
+ << " history " << i.second.history
+ << " past_intervals " << i.second.past_intervals
+ << dendl;
+ } else {
+ std::stringstream debug;
+ if (PastIntervals::check_new_interval(
+ i.second.acting_primary, acting_primary,
+ i.second.acting, acting,
+ i.second.up_primary, up_primary,
+ i.second.up, up,
+ i.second.history.same_interval_since,
+ i.second.history.last_epoch_clean,
+ &nextmap,
+ &osdmap,
+ pgid,
+ min_size_predicate,
+ &i.second.past_intervals,
+ &debug)) {
+ epoch_t e = inc.epoch;
+ i.second.history.same_interval_since = e;
+ if (i.second.up != up) {
+ i.second.history.same_up_since = e;
+ }
+ if (i.second.acting_primary != acting_primary) {
+ i.second.history.same_primary_since = e;
+ }
+ if (pgid.is_split(
+ osdmap.get_pg_num(pgid.pool()),
+ nextmap.get_pg_num(pgid.pool()),
+ nullptr)) {
+ i.second.history.last_epoch_split = e;
+ }
+ dout(10) << __func__ << " pg " << pgid << " new interval,"
+ << " up " << i.second.up << " -> " << up
+ << " p " << i.second.up_primary << " -> " << up_primary
+ << " acting " << i.second.acting << " -> " << acting
+ << " p " << i.second.acting_primary << " -> "
+ << acting_primary
+ << " history " << i.second.history
+ << " past_intervals " << i.second.past_intervals
+ << dendl;
+ dout(20) << " debug: " << debug.str() << dendl;
+ i.second.up = up;
+ i.second.acting = acting;
+ i.second.up_primary = up_primary;
+ i.second.acting_primary = acting_primary;
+ }
+ }
+ }
+ }
+ dout(10) << __func__
+ << " " << (pending_creatings.pgs.size() - total)
+ << "/" << pending_creatings.pgs.size()
+ << " pgs added from queued pools" << dendl;
+ return pending_creatings;
+}
+
+void OSDMonitor::maybe_prime_pg_temp()
+{
+ bool all = false;
+ if (pending_inc.crush.length()) {
+ dout(10) << __func__ << " new crush map, all" << dendl;
+ all = true;
+ }
+
+ if (!pending_inc.new_up_client.empty()) {
+ dout(10) << __func__ << " new up osds, all" << dendl;
+ all = true;
+ }
+
+ // check for interesting OSDs
+ set<int> osds;
+ for (auto p = pending_inc.new_state.begin();
+ !all && p != pending_inc.new_state.end();
+ ++p) {
+ if ((p->second & CEPH_OSD_UP) &&
+ osdmap.is_up(p->first)) {
+ osds.insert(p->first);
+ }
+ }
+ for (auto p = pending_inc.new_weight.begin();
+ !all && p != pending_inc.new_weight.end();
+ ++p) {
+ if (osdmap.exists(p->first) && p->second < osdmap.get_weight(p->first)) {
+ // weight reduction
+ osds.insert(p->first);
+ } else {
+ dout(10) << __func__ << " osd." << p->first << " weight increase, all"
+ << dendl;
+ all = true;
+ }
+ }
+
+ if (!all && osds.empty())
+ return;
+
+ if (!all) {
+ unsigned estimate =
+ mapping.get_osd_acting_pgs(*osds.begin()).size() * osds.size();
+ if (estimate > mapping.get_num_pgs() *
+ g_conf()->mon_osd_prime_pg_temp_max_estimate) {
+ dout(10) << __func__ << " estimate " << estimate << " pgs on "
+ << osds.size() << " osds >= "
+ << g_conf()->mon_osd_prime_pg_temp_max_estimate << " of total "
+ << mapping.get_num_pgs() << " pgs, all"
+ << dendl;
+ all = true;
+ } else {
+ dout(10) << __func__ << " estimate " << estimate << " pgs on "
+ << osds.size() << " osds" << dendl;
+ }
+ }
+
+ OSDMap next;
+ next.deepish_copy_from(osdmap);
+ next.apply_incremental(pending_inc);
+
+ if (next.get_pools().empty()) {
+ dout(10) << __func__ << " no pools, no pg_temp priming" << dendl;
+ } else if (all) {
+ PrimeTempJob job(next, this);
+ mapper.queue(&job, g_conf()->mon_osd_mapping_pgs_per_chunk, {});
+ if (job.wait_for(g_conf()->mon_osd_prime_pg_temp_max_time)) {
+ dout(10) << __func__ << " done in " << job.get_duration() << dendl;
+ } else {
+ dout(10) << __func__ << " did not finish in "
+ << g_conf()->mon_osd_prime_pg_temp_max_time
+ << ", stopping" << dendl;
+ job.abort();
+ }
+ } else {
+ dout(10) << __func__ << " " << osds.size() << " interesting osds" << dendl;
+ utime_t stop = ceph_clock_now();
+ stop += g_conf()->mon_osd_prime_pg_temp_max_time;
+ const int chunk = 1000;
+ int n = chunk;
+ std::unordered_set<pg_t> did_pgs;
+ for (auto osd : osds) {
+ auto& pgs = mapping.get_osd_acting_pgs(osd);
+ dout(20) << __func__ << " osd." << osd << " " << pgs << dendl;
+ for (auto pgid : pgs) {
+ if (!did_pgs.insert(pgid).second) {
+ continue;
+ }
+ prime_pg_temp(next, pgid);
+ if (--n <= 0) {
+ n = chunk;
+ if (ceph_clock_now() > stop) {
+ dout(10) << __func__ << " consumed more than "
+ << g_conf()->mon_osd_prime_pg_temp_max_time
+ << " seconds, stopping"
+ << dendl;
+ return;
+ }
+ }
+ }
+ }
+ }
+}
+
+void OSDMonitor::prime_pg_temp(
+ const OSDMap& next,
+ pg_t pgid)
+{
+ // TODO: remove this creating_pgs direct access?
+ if (creating_pgs.pgs.count(pgid)) {
+ return;
+ }
+ if (!osdmap.pg_exists(pgid)) {
+ return;
+ }
+
+ vector<int> up, acting;
+ mapping.get(pgid, &up, nullptr, &acting, nullptr);
+
+ vector<int> next_up, next_acting;
+ int next_up_primary, next_acting_primary;
+ next.pg_to_up_acting_osds(pgid, &next_up, &next_up_primary,
+ &next_acting, &next_acting_primary);
+ if (acting == next_acting &&
+ !(up != acting && next_up == next_acting))
+ return; // no change since last epoch
+
+ if (acting.empty())
+ return; // if previously empty now we can be no worse off
+ const pg_pool_t *pool = next.get_pg_pool(pgid.pool());
+ if (pool && acting.size() < pool->min_size)
+ return; // can be no worse off than before
+
+ if (next_up == next_acting) {
+ acting.clear();
+ dout(20) << __func__ << " next_up == next_acting now, clear pg_temp"
+ << dendl;
+ }
+
+ dout(20) << __func__ << " " << pgid << " " << up << "/" << acting
+ << " -> " << next_up << "/" << next_acting
+ << ", priming " << acting
+ << dendl;
+ {
+ std::lock_guard l(prime_pg_temp_lock);
+ // do not touch a mapping if a change is pending
+ pending_inc.new_pg_temp.emplace(
+ pgid,
+ mempool::osdmap::vector<int>(acting.begin(), acting.end()));
+ }
+}
+
+/**
+ * @note receiving a transaction in this function gives a fair amount of
+ * freedom to the service implementation if it does need it. It shouldn't.
+ */
+void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t)
+{
+ dout(10) << "encode_pending e " << pending_inc.epoch
+ << dendl;
+
+ if (do_prune(t)) {
+ dout(1) << __func__ << " osdmap full prune encoded e"
+ << pending_inc.epoch << dendl;
+ }
+
+ // finalize up pending_inc
+ pending_inc.modified = ceph_clock_now();
+
+ int r = pending_inc.propagate_base_properties_to_tiers(cct, osdmap);
+ ceph_assert(r == 0);
+
+ if (mapping_job) {
+ if (!mapping_job->is_done()) {
+ dout(1) << __func__ << " skipping prime_pg_temp; mapping job "
+ << mapping_job.get() << " did not complete, "
+ << mapping_job->shards << " left" << dendl;
+ mapping_job->abort();
+ } else if (mapping.get_epoch() < osdmap.get_epoch()) {
+ dout(1) << __func__ << " skipping prime_pg_temp; mapping job "
+ << mapping_job.get() << " is prior epoch "
+ << mapping.get_epoch() << dendl;
+ } else {
+ if (g_conf()->mon_osd_prime_pg_temp) {
+ maybe_prime_pg_temp();
+ }
+ }
+ } else if (g_conf()->mon_osd_prime_pg_temp) {
+ dout(1) << __func__ << " skipping prime_pg_temp; mapping job did not start"
+ << dendl;
+ }
+ mapping_job.reset();
+
+ // ensure we don't have blank new_state updates. these are interrpeted as
+ // CEPH_OSD_UP (and almost certainly not what we want!).
+ auto p = pending_inc.new_state.begin();
+ while (p != pending_inc.new_state.end()) {
+ if (p->second == 0) {
+ dout(10) << "new_state for osd." << p->first << " is 0, removing" << dendl;
+ p = pending_inc.new_state.erase(p);
+ } else {
+ if (p->second & CEPH_OSD_UP) {
+ pending_inc.new_last_up_change = pending_inc.modified;
+ }
+ ++p;
+ }
+ }
+ if (!pending_inc.new_up_client.empty()) {
+ pending_inc.new_last_up_change = pending_inc.modified;
+ }
+ for (auto& i : pending_inc.new_weight) {
+ if (i.first >= osdmap.max_osd) {
+ if (i.second) {
+ // new osd is already marked in
+ pending_inc.new_last_in_change = pending_inc.modified;
+ break;
+ }
+ } else if (!!i.second != !!osdmap.osd_weight[i.first]) {
+ // existing osd marked in or out
+ pending_inc.new_last_in_change = pending_inc.modified;
+ break;
+ }
+ }
+
+ {
+ OSDMap tmp;
+ tmp.deepish_copy_from(osdmap);
+ tmp.apply_incremental(pending_inc);
+
+ // clean pg_temp mappings
+ OSDMap::clean_temps(cct, osdmap, tmp, &pending_inc);
+
+ // clean inappropriate pg_upmap/pg_upmap_items (if any)
+ {
+ // check every upmapped pg for now
+ // until we could reliably identify certain cases to ignore,
+ // which is obviously the hard part TBD..
+ vector<pg_t> pgs_to_check;
+ tmp.get_upmap_pgs(&pgs_to_check);
+ if (pgs_to_check.size() <
+ static_cast<uint64_t>(g_conf()->mon_clean_pg_upmaps_per_chunk * 2)) {
+ // not enough pgs, do it inline
+ tmp.clean_pg_upmaps(cct, &pending_inc);
+ } else {
+ CleanUpmapJob job(cct, tmp, pending_inc);
+ mapper.queue(&job, g_conf()->mon_clean_pg_upmaps_per_chunk, pgs_to_check);
+ job.wait();
+ }
+ }
+
+ // update creating pgs first so that we can remove the created pgid and
+ // process the pool flag removal below in the same osdmap epoch.
+ auto pending_creatings = update_pending_pgs(pending_inc, tmp);
+ bufferlist creatings_bl;
+ uint64_t features = CEPH_FEATURES_ALL;
+ if (mon.monmap->min_mon_release < ceph_release_t::octopus) {
+ dout(20) << __func__ << " encoding pending pgs without octopus features"
+ << dendl;
+ features &= ~CEPH_FEATURE_SERVER_OCTOPUS;
+ }
+ encode(pending_creatings, creatings_bl, features);
+ t->put(OSD_PG_CREATING_PREFIX, "creating", creatings_bl);
+
+ // remove any old (or incompat) POOL_CREATING flags
+ for (auto& i : tmp.get_pools()) {
+ if (tmp.require_osd_release < ceph_release_t::nautilus) {
+ // pre-nautilus OSDMaps shouldn't get this flag.
+ if (pending_inc.new_pools.count(i.first)) {
+ pending_inc.new_pools[i.first].flags &= ~pg_pool_t::FLAG_CREATING;
+ }
+ }
+ if (i.second.has_flag(pg_pool_t::FLAG_CREATING) &&
+ !pending_creatings.still_creating_pool(i.first)) {
+ dout(10) << __func__ << " done creating pool " << i.first
+ << ", clearing CREATING flag" << dendl;
+ if (pending_inc.new_pools.count(i.first) == 0) {
+ pending_inc.new_pools[i.first] = i.second;
+ }
+ pending_inc.new_pools[i.first].flags &= ~pg_pool_t::FLAG_CREATING;
+ }
+ }
+
+ // collect which pools are currently affected by
+ // the near/backfill/full osd(s),
+ // and set per-pool near/backfill/full flag instead
+ set<int64_t> full_pool_ids;
+ set<int64_t> backfillfull_pool_ids;
+ set<int64_t> nearfull_pool_ids;
+ tmp.get_full_pools(cct,
+ &full_pool_ids,
+ &backfillfull_pool_ids,
+ &nearfull_pool_ids);
+ if (full_pool_ids.empty() ||
+ backfillfull_pool_ids.empty() ||
+ nearfull_pool_ids.empty()) {
+ // normal case - no nearfull, backfillfull or full osds
+ // try cancel any improper nearfull/backfillfull/full pool
+ // flags first
+ for (auto &pool: tmp.get_pools()) {
+ auto p = pool.first;
+ if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL) &&
+ nearfull_pool_ids.empty()) {
+ dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
+ << "'s nearfull flag" << dendl;
+ if (pending_inc.new_pools.count(p) == 0) {
+ // load original pool info first!
+ pending_inc.new_pools[p] = pool.second;
+ }
+ pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
+ }
+ if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL) &&
+ backfillfull_pool_ids.empty()) {
+ dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
+ << "'s backfillfull flag" << dendl;
+ if (pending_inc.new_pools.count(p) == 0) {
+ pending_inc.new_pools[p] = pool.second;
+ }
+ pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
+ }
+ if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL) &&
+ full_pool_ids.empty()) {
+ if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
+ // set by EQUOTA, skipping
+ continue;
+ }
+ dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
+ << "'s full flag" << dendl;
+ if (pending_inc.new_pools.count(p) == 0) {
+ pending_inc.new_pools[p] = pool.second;
+ }
+ pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_FULL;
+ }
+ }
+ }
+ if (!full_pool_ids.empty()) {
+ dout(10) << __func__ << " marking pool(s) " << full_pool_ids
+ << " as full" << dendl;
+ for (auto &p: full_pool_ids) {
+ if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL)) {
+ continue;
+ }
+ if (pending_inc.new_pools.count(p) == 0) {
+ pending_inc.new_pools[p] = tmp.pools[p];
+ }
+ pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_FULL;
+ pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
+ pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
+ }
+ // cancel FLAG_FULL for pools which are no longer full too
+ for (auto &pool: tmp.get_pools()) {
+ auto p = pool.first;
+ if (full_pool_ids.count(p)) {
+ // skip pools we have just marked as full above
+ continue;
+ }
+ if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL) ||
+ tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
+ // don't touch if currently is not full
+ // or is running out of quota (and hence considered as full)
+ continue;
+ }
+ dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
+ << "'s full flag" << dendl;
+ if (pending_inc.new_pools.count(p) == 0) {
+ pending_inc.new_pools[p] = pool.second;
+ }
+ pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_FULL;
+ }
+ }
+ if (!backfillfull_pool_ids.empty()) {
+ for (auto &p: backfillfull_pool_ids) {
+ if (full_pool_ids.count(p)) {
+ // skip pools we have already considered as full above
+ continue;
+ }
+ if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
+ // make sure FLAG_FULL is truly set, so we are safe not
+ // to set a extra (redundant) FLAG_BACKFILLFULL flag
+ ceph_assert(tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL));
+ continue;
+ }
+ if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
+ // don't bother if pool is already marked as backfillfull
+ continue;
+ }
+ dout(10) << __func__ << " marking pool '" << tmp.pool_name[p]
+ << "'s as backfillfull" << dendl;
+ if (pending_inc.new_pools.count(p) == 0) {
+ pending_inc.new_pools[p] = tmp.pools[p];
+ }
+ pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_BACKFILLFULL;
+ pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
+ }
+ // cancel FLAG_BACKFILLFULL for pools
+ // which are no longer backfillfull too
+ for (auto &pool: tmp.get_pools()) {
+ auto p = pool.first;
+ if (full_pool_ids.count(p) || backfillfull_pool_ids.count(p)) {
+ // skip pools we have just marked as backfillfull/full above
+ continue;
+ }
+ if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
+ // and don't touch if currently is not backfillfull
+ continue;
+ }
+ dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
+ << "'s backfillfull flag" << dendl;
+ if (pending_inc.new_pools.count(p) == 0) {
+ pending_inc.new_pools[p] = pool.second;
+ }
+ pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
+ }
+ }
+ if (!nearfull_pool_ids.empty()) {
+ for (auto &p: nearfull_pool_ids) {
+ if (full_pool_ids.count(p) || backfillfull_pool_ids.count(p)) {
+ continue;
+ }
+ if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
+ // make sure FLAG_FULL is truly set, so we are safe not
+ // to set a extra (redundant) FLAG_NEARFULL flag
+ ceph_assert(tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL));
+ continue;
+ }
+ if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL)) {
+ // don't bother if pool is already marked as nearfull
+ continue;
+ }
+ dout(10) << __func__ << " marking pool '" << tmp.pool_name[p]
+ << "'s as nearfull" << dendl;
+ if (pending_inc.new_pools.count(p) == 0) {
+ pending_inc.new_pools[p] = tmp.pools[p];
+ }
+ pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_NEARFULL;
+ }
+ // cancel FLAG_NEARFULL for pools
+ // which are no longer nearfull too
+ for (auto &pool: tmp.get_pools()) {
+ auto p = pool.first;
+ if (full_pool_ids.count(p) ||
+ backfillfull_pool_ids.count(p) ||
+ nearfull_pool_ids.count(p)) {
+ // skip pools we have just marked as
+ // nearfull/backfillfull/full above
+ continue;
+ }
+ if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL)) {
+ // and don't touch if currently is not nearfull
+ continue;
+ }
+ dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
+ << "'s nearfull flag" << dendl;
+ if (pending_inc.new_pools.count(p) == 0) {
+ pending_inc.new_pools[p] = pool.second;
+ }
+ pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
+ }
+ }
+
+ // min_compat_client?
+ if (!tmp.require_min_compat_client) {
+ auto mv = tmp.get_min_compat_client();
+ dout(1) << __func__ << " setting require_min_compat_client to currently "
+ << "required " << mv << dendl;
+ mon.clog->info() << "setting require_min_compat_client to currently "
+ << "required " << mv;
+ pending_inc.new_require_min_compat_client = mv;
+ }
+
+ if (osdmap.require_osd_release < ceph_release_t::nautilus &&
+ tmp.require_osd_release >= ceph_release_t::nautilus) {
+ dout(10) << __func__ << " first nautilus+ epoch" << dendl;
+ // add creating flags?
+ for (auto& i : tmp.get_pools()) {
+ if (pending_creatings.still_creating_pool(i.first)) {
+ dout(10) << __func__ << " adding CREATING flag to pool " << i.first
+ << dendl;
+ if (pending_inc.new_pools.count(i.first) == 0) {
+ pending_inc.new_pools[i.first] = i.second;
+ }
+ pending_inc.new_pools[i.first].flags |= pg_pool_t::FLAG_CREATING;
+ }
+ }
+ // adjust blocklist items to all be TYPE_ANY
+ for (auto& i : tmp.blocklist) {
+ auto a = i.first;
+ a.set_type(entity_addr_t::TYPE_ANY);
+ pending_inc.new_blocklist[a] = i.second;
+ pending_inc.old_blocklist.push_back(i.first);
+ }
+ }
+
+ if (osdmap.require_osd_release < ceph_release_t::octopus &&
+ tmp.require_osd_release >= ceph_release_t::octopus) {
+ dout(10) << __func__ << " first octopus+ epoch" << dendl;
+
+ // adjust obsoleted cache modes
+ for (auto& [poolid, pi] : tmp.pools) {
+ if (pi.cache_mode == pg_pool_t::CACHEMODE_FORWARD) {
+ if (pending_inc.new_pools.count(poolid) == 0) {
+ pending_inc.new_pools[poolid] = pi;
+ }
+ dout(10) << __func__ << " switching pool " << poolid
+ << " cachemode from forward -> proxy" << dendl;
+ pending_inc.new_pools[poolid].cache_mode = pg_pool_t::CACHEMODE_PROXY;
+ }
+ if (pi.cache_mode == pg_pool_t::CACHEMODE_READFORWARD) {
+ if (pending_inc.new_pools.count(poolid) == 0) {
+ pending_inc.new_pools[poolid] = pi;
+ }
+ dout(10) << __func__ << " switching pool " << poolid
+ << " cachemode from readforward -> readproxy" << dendl;
+ pending_inc.new_pools[poolid].cache_mode =
+ pg_pool_t::CACHEMODE_READPROXY;
+ }
+ }
+
+ // clear removed_snaps for every pool
+ for (auto& [poolid, pi] : tmp.pools) {
+ if (pi.removed_snaps.empty()) {
+ continue;
+ }
+ if (pending_inc.new_pools.count(poolid) == 0) {
+ pending_inc.new_pools[poolid] = pi;
+ }
+ dout(10) << __func__ << " clearing pool " << poolid << " removed_snaps"
+ << dendl;
+ pending_inc.new_pools[poolid].removed_snaps.clear();
+ }
+
+ // create a combined purged snap epoch key for all purged snaps
+ // prior to this epoch, and store it in the current epoch (i.e.,
+ // the last pre-octopus epoch, just prior to the one we're
+ // encoding now).
+ auto it = mon.store->get_iterator(OSD_SNAP_PREFIX);
+ it->lower_bound("purged_snap_");
+ map<int64_t,snap_interval_set_t> combined;
+ while (it->valid()) {
+ if (it->key().find("purged_snap_") != 0) {
+ break;
+ }
+ string k = it->key();
+ long long unsigned pool;
+ int n = sscanf(k.c_str(), "purged_snap_%llu_", &pool);
+ if (n != 1) {
+ derr << __func__ << " invalid purged_snaps key '" << k << "'" << dendl;
+ } else {
+ bufferlist v = it->value();
+ auto p = v.cbegin();
+ snapid_t begin, end;
+ ceph::decode(begin, p);
+ ceph::decode(end, p);
+ combined[pool].insert(begin, end - begin);
+ }
+ it->next();
+ }
+ if (!combined.empty()) {
+ string k = make_purged_snap_epoch_key(pending_inc.epoch - 1);
+ bufferlist v;
+ ceph::encode(combined, v);
+ t->put(OSD_SNAP_PREFIX, k, v);
+ dout(10) << __func__ << " recording pre-octopus purged_snaps in epoch "
+ << (pending_inc.epoch - 1) << ", " << v.length() << " bytes"
+ << dendl;
+ } else {
+ dout(10) << __func__ << " there were no pre-octopus purged snaps"
+ << dendl;
+ }
+
+ // clean out the old removed_snap_ and removed_epoch keys
+ // ('`' is ASCII '_' + 1)
+ t->erase_range(OSD_SNAP_PREFIX, "removed_snap_", "removed_snap`");
+ t->erase_range(OSD_SNAP_PREFIX, "removed_epoch_", "removed_epoch`");
+ }
+ }
+
+ // tell me about it
+ for (auto i = pending_inc.new_state.begin();
+ i != pending_inc.new_state.end();
+ ++i) {
+ int s = i->second ? i->second : CEPH_OSD_UP;
+ if (s & CEPH_OSD_UP) {
+ dout(2) << " osd." << i->first << " DOWN" << dendl;
+ // Reset laggy parameters if failure interval exceeds a threshold.
+ const osd_xinfo_t& xi = osdmap.get_xinfo(i->first);
+ if ((xi.laggy_probability || xi.laggy_interval) && xi.down_stamp.sec()) {
+ int last_failure_interval = pending_inc.modified.sec() - xi.down_stamp.sec();
+ if (grace_interval_threshold_exceeded(last_failure_interval)) {
+ set_default_laggy_params(i->first);
+ }
+ }
+ }
+ if (s & CEPH_OSD_EXISTS)
+ dout(2) << " osd." << i->first << " DNE" << dendl;
+ }
+ for (auto i = pending_inc.new_up_client.begin();
+ i != pending_inc.new_up_client.end();
+ ++i) {
+ //FIXME: insert cluster addresses too
+ dout(2) << " osd." << i->first << " UP " << i->second << dendl;
+ }
+ for (map<int32_t,uint32_t>::iterator i = pending_inc.new_weight.begin();
+ i != pending_inc.new_weight.end();
+ ++i) {
+ if (i->second == CEPH_OSD_OUT) {
+ dout(2) << " osd." << i->first << " OUT" << dendl;
+ } else if (i->second == CEPH_OSD_IN) {
+ dout(2) << " osd." << i->first << " IN" << dendl;
+ } else {
+ dout(2) << " osd." << i->first << " WEIGHT " << hex << i->second << dec << dendl;
+ }
+ }
+
+ // features for osdmap and its incremental
+ uint64_t features;
+
+ // encode full map and determine its crc
+ OSDMap tmp;
+ {
+ tmp.deepish_copy_from(osdmap);
+ tmp.apply_incremental(pending_inc);
+
+ // determine appropriate features
+ features = tmp.get_encoding_features();
+ dout(10) << __func__ << " encoding full map with "
+ << tmp.require_osd_release
+ << " features " << features << dendl;
+
+ // the features should be a subset of the mon quorum's features!
+ ceph_assert((features & ~mon.get_quorum_con_features()) == 0);
+
+ bufferlist fullbl;
+ encode(tmp, fullbl, features | CEPH_FEATURE_RESERVED);
+ pending_inc.full_crc = tmp.get_crc();
+
+ // include full map in the txn. note that old monitors will
+ // overwrite this. new ones will now skip the local full map
+ // encode and reload from this.
+ put_version_full(t, pending_inc.epoch, fullbl);
+ }
+
+ // encode
+ ceph_assert(get_last_committed() + 1 == pending_inc.epoch);
+ bufferlist bl;
+ encode(pending_inc, bl, features | CEPH_FEATURE_RESERVED);
+
+ dout(20) << " full_crc " << tmp.get_crc()
+ << " inc_crc " << pending_inc.inc_crc << dendl;
+
+ /* put everything in the transaction */
+ put_version(t, pending_inc.epoch, bl);
+ put_last_committed(t, pending_inc.epoch);
+
+ // metadata, too!
+ for (map<int,bufferlist>::iterator p = pending_metadata.begin();
+ p != pending_metadata.end();
+ ++p)
+ t->put(OSD_METADATA_PREFIX, stringify(p->first), p->second);
+ for (set<int>::iterator p = pending_metadata_rm.begin();
+ p != pending_metadata_rm.end();
+ ++p)
+ t->erase(OSD_METADATA_PREFIX, stringify(*p));
+ pending_metadata.clear();
+ pending_metadata_rm.clear();
+
+ // purged_snaps
+ if (tmp.require_osd_release >= ceph_release_t::octopus &&
+ !pending_inc.new_purged_snaps.empty()) {
+ // all snaps purged this epoch (across all pools)
+ string k = make_purged_snap_epoch_key(pending_inc.epoch);
+ bufferlist v;
+ encode(pending_inc.new_purged_snaps, v);
+ t->put(OSD_SNAP_PREFIX, k, v);
+ }
+ for (auto& i : pending_inc.new_purged_snaps) {
+ for (auto q = i.second.begin();
+ q != i.second.end();
+ ++q) {
+ insert_purged_snap_update(i.first, q.get_start(), q.get_end(),
+ pending_inc.epoch,
+ t);
+ }
+ }
+ for (auto& [pool, snaps] : pending_pseudo_purged_snaps) {
+ for (auto snap : snaps) {
+ insert_purged_snap_update(pool, snap, snap + 1,
+ pending_inc.epoch,
+ t);
+ }
+ }
+
+ // health
+ health_check_map_t next;
+ tmp.check_health(cct, &next);
+ encode_health(next, t);
+}
+
+int OSDMonitor::load_metadata(int osd, map<string, string>& m, ostream *err)
+{
+ bufferlist bl;
+ int r = mon.store->get(OSD_METADATA_PREFIX, stringify(osd), bl);
+ if (r < 0)
+ return r;
+ try {
+ auto p = bl.cbegin();
+ decode(m, p);
+ }
+ catch (ceph::buffer::error& e) {
+ if (err)
+ *err << "osd." << osd << " metadata is corrupt";
+ return -EIO;
+ }
+ return 0;
+}
+
+void OSDMonitor::count_metadata(const string& field, map<string,int> *out)
+{
+ for (int osd = 0; osd < osdmap.get_max_osd(); ++osd) {
+ if (osdmap.is_up(osd)) {
+ map<string,string> meta;
+ load_metadata(osd, meta, nullptr);
+ auto p = meta.find(field);
+ if (p == meta.end()) {
+ (*out)["unknown"]++;
+ } else {
+ (*out)[p->second]++;
+ }
+ }
+ }
+}
+
+void OSDMonitor::count_metadata(const string& field, Formatter *f)
+{
+ map<string,int> by_val;
+ count_metadata(field, &by_val);
+ f->open_object_section(field.c_str());
+ for (auto& p : by_val) {
+ f->dump_int(p.first.c_str(), p.second);
+ }
+ f->close_section();
+}
+
+void OSDMonitor::get_versions(std::map<string, list<string>> &versions)
+{
+ for (int osd = 0; osd < osdmap.get_max_osd(); ++osd) {
+ if (osdmap.is_up(osd)) {
+ map<string,string> meta;
+ load_metadata(osd, meta, nullptr);
+ auto p = meta.find("ceph_version_short");
+ if (p == meta.end()) continue;
+ versions[p->second].push_back(string("osd.") + stringify(osd));
+ }
+ }
+}
+
+int OSDMonitor::get_osd_objectstore_type(int osd, string *type)
+{
+ map<string, string> metadata;
+ int r = load_metadata(osd, metadata, nullptr);
+ if (r < 0)
+ return r;
+
+ auto it = metadata.find("osd_objectstore");
+ if (it == metadata.end())
+ return -ENOENT;
+ *type = it->second;
+ return 0;
+}
+
+bool OSDMonitor::is_pool_currently_all_bluestore(int64_t pool_id,
+ const pg_pool_t &pool,
+ ostream *err)
+{
+ // just check a few pgs for efficiency - this can't give a guarantee anyway,
+ // since filestore osds could always join the pool later
+ set<int> checked_osds;
+ for (unsigned ps = 0; ps < std::min(8u, pool.get_pg_num()); ++ps) {
+ vector<int> up, acting;
+ pg_t pgid(ps, pool_id);
+ osdmap.pg_to_up_acting_osds(pgid, up, acting);
+ for (int osd : up) {
+ if (checked_osds.find(osd) != checked_osds.end())
+ continue;
+ string objectstore_type;
+ int r = get_osd_objectstore_type(osd, &objectstore_type);
+ // allow with missing metadata, e.g. due to an osd never booting yet
+ if (r < 0 || objectstore_type == "bluestore") {
+ checked_osds.insert(osd);
+ continue;
+ }
+ *err << "osd." << osd << " uses " << objectstore_type;
+ return false;
+ }
+ }
+ return true;
+}
+
+int OSDMonitor::dump_osd_metadata(int osd, Formatter *f, ostream *err)
+{
+ map<string,string> m;
+ if (int r = load_metadata(osd, m, err))
+ return r;
+ for (map<string,string>::iterator p = m.begin(); p != m.end(); ++p)
+ f->dump_string(p->first.c_str(), p->second);
+ return 0;
+}
+
+void OSDMonitor::print_nodes(Formatter *f)
+{
+ // group OSDs by their hosts
+ map<string, list<int> > osds; // hostname => osd
+ for (int osd = 0; osd < osdmap.get_max_osd(); osd++) {
+ map<string, string> m;
+ if (load_metadata(osd, m, NULL)) {
+ continue;
+ }
+ map<string, string>::iterator hostname = m.find("hostname");
+ if (hostname == m.end()) {
+ // not likely though
+ continue;
+ }
+ osds[hostname->second].push_back(osd);
+ }
+
+ dump_services(f, osds, "osd");
+}
+
+void OSDMonitor::share_map_with_random_osd()
+{
+ if (osdmap.get_num_up_osds() == 0) {
+ dout(10) << __func__ << " no up osds, don't share with anyone" << dendl;
+ return;
+ }
+
+ MonSession *s = mon.session_map.get_random_osd_session(&osdmap);
+ if (!s) {
+ dout(10) << __func__ << " no up osd on our session map" << dendl;
+ return;
+ }
+
+ dout(10) << "committed, telling random " << s->name
+ << " all about it" << dendl;
+
+ // get feature of the peer
+ // use quorum_con_features, if it's an anonymous connection.
+ uint64_t features = s->con_features ? s->con_features :
+ mon.get_quorum_con_features();
+ // whatev, they'll request more if they need it
+ MOSDMap *m = build_incremental(osdmap.get_epoch() - 1, osdmap.get_epoch(), features);
+ s->con->send_message(m);
+ // NOTE: do *not* record osd has up to this epoch (as we do
+ // elsewhere) as they may still need to request older values.
+}
+
+version_t OSDMonitor::get_trim_to() const
+{
+ if (mon.get_quorum().empty()) {
+ dout(10) << __func__ << " quorum not formed, trim_to = 0" << dendl;
+ return 0;
+ }
+
+ {
+ std::lock_guard<std::mutex> l(creating_pgs_lock);
+ if (!creating_pgs.pgs.empty()) {
+ dout(10) << __func__ << " pgs creating, trim_to = 0" << dendl;
+ return 0;
+ }
+ }
+
+ if (g_conf().get_val<bool>("mon_debug_block_osdmap_trim")) {
+ dout(0) << __func__
+ << " blocking osdmap trim"
+ << " ('mon_debug_block_osdmap_trim' set to 'true')"
+ << " trim_to = 0" << dendl;
+ return 0;
+ }
+
+ {
+ epoch_t floor = get_min_last_epoch_clean();
+ dout(10) << " min_last_epoch_clean " << floor << dendl;
+ if (g_conf()->mon_osd_force_trim_to > 0 &&
+ g_conf()->mon_osd_force_trim_to < (int)get_last_committed()) {
+ floor = g_conf()->mon_osd_force_trim_to;
+ dout(10) << __func__
+ << " explicit mon_osd_force_trim_to = " << floor << dendl;
+ }
+ unsigned min = g_conf()->mon_min_osdmap_epochs;
+ if (floor + min > get_last_committed()) {
+ if (min < get_last_committed())
+ floor = get_last_committed() - min;
+ else
+ floor = 0;
+ }
+ if (floor > get_first_committed()) {
+ dout(10) << __func__ << " trim_to = " << floor << dendl;
+ return floor;
+ }
+ }
+ dout(10) << __func__ << " trim_to = 0" << dendl;
+ return 0;
+}
+
+epoch_t OSDMonitor::get_min_last_epoch_clean() const
+{
+ auto floor = last_epoch_clean.get_lower_bound(osdmap);
+ // also scan osd epochs
+ // don't trim past the oldest reported osd epoch
+ for (auto [osd, epoch] : osd_epochs) {
+ if (epoch < floor) {
+ floor = epoch;
+ }
+ }
+ return floor;
+}
+
+void OSDMonitor::encode_trim_extra(MonitorDBStore::TransactionRef tx,
+ version_t first)
+{
+ dout(10) << __func__ << " including full map for e " << first << dendl;
+ bufferlist bl;
+ get_version_full(first, bl);
+ put_version_full(tx, first, bl);
+
+ if (has_osdmap_manifest &&
+ first > osdmap_manifest.get_first_pinned()) {
+ _prune_update_trimmed(tx, first);
+ }
+}
+
+
+/* full osdmap prune
+ *
+ * for more information, please refer to doc/dev/mon-osdmap-prune.rst
+ */
+
+void OSDMonitor::load_osdmap_manifest()
+{
+ bool store_has_manifest =
+ mon.store->exists(get_service_name(), "osdmap_manifest");
+
+ if (!store_has_manifest) {
+ if (!has_osdmap_manifest) {
+ return;
+ }
+
+ dout(20) << __func__
+ << " dropping osdmap manifest from memory." << dendl;
+ osdmap_manifest = osdmap_manifest_t();
+ has_osdmap_manifest = false;
+ return;
+ }
+
+ dout(20) << __func__
+ << " osdmap manifest detected in store; reload." << dendl;
+
+ bufferlist manifest_bl;
+ int r = get_value("osdmap_manifest", manifest_bl);
+ if (r < 0) {
+ derr << __func__ << " unable to read osdmap version manifest" << dendl;
+ ceph_abort_msg("error reading manifest");
+ }
+ osdmap_manifest.decode(manifest_bl);
+ has_osdmap_manifest = true;
+
+ dout(10) << __func__ << " store osdmap manifest pinned ("
+ << osdmap_manifest.get_first_pinned()
+ << " .. "
+ << osdmap_manifest.get_last_pinned()
+ << ")"
+ << dendl;
+}
+
+bool OSDMonitor::should_prune() const
+{
+ version_t first = get_first_committed();
+ version_t last = get_last_committed();
+ version_t min_osdmap_epochs =
+ g_conf().get_val<int64_t>("mon_min_osdmap_epochs");
+ version_t prune_min =
+ g_conf().get_val<uint64_t>("mon_osdmap_full_prune_min");
+ version_t prune_interval =
+ g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval");
+ version_t last_pinned = osdmap_manifest.get_last_pinned();
+ version_t last_to_pin = last - min_osdmap_epochs;
+
+ // Make it or break it constraints.
+ //
+ // If any of these conditions fails, we will not prune, regardless of
+ // whether we have an on-disk manifest with an on-going pruning state.
+ //
+ if ((last - first) <= min_osdmap_epochs) {
+ // between the first and last committed epochs, we don't have
+ // enough epochs to trim, much less to prune.
+ dout(10) << __func__
+ << " currently holding only " << (last - first)
+ << " epochs (min osdmap epochs: " << min_osdmap_epochs
+ << "); do not prune."
+ << dendl;
+ return false;
+
+ } else if ((last_to_pin - first) < prune_min) {
+ // between the first committed epoch and the last epoch we would prune,
+ // we simply don't have enough versions over the minimum to prune maps.
+ dout(10) << __func__
+ << " could only prune " << (last_to_pin - first)
+ << " epochs (" << first << ".." << last_to_pin << "), which"
+ " is less than the required minimum (" << prune_min << ")"
+ << dendl;
+ return false;
+
+ } else if (has_osdmap_manifest && last_pinned >= last_to_pin) {
+ dout(10) << __func__
+ << " we have pruned as far as we can; do not prune."
+ << dendl;
+ return false;
+
+ } else if (last_pinned + prune_interval > last_to_pin) {
+ dout(10) << __func__
+ << " not enough epochs to form an interval (last pinned: "
+ << last_pinned << ", last to pin: "
+ << last_to_pin << ", interval: " << prune_interval << ")"
+ << dendl;
+ return false;
+ }
+
+ dout(15) << __func__
+ << " should prune (" << last_pinned << ".." << last_to_pin << ")"
+ << " lc (" << first << ".." << last << ")"
+ << dendl;
+ return true;
+}
+
+void OSDMonitor::_prune_update_trimmed(
+ MonitorDBStore::TransactionRef tx,
+ version_t first)
+{
+ dout(10) << __func__
+ << " first " << first
+ << " last_pinned " << osdmap_manifest.get_last_pinned()
+ << dendl;
+
+ osdmap_manifest_t manifest = osdmap_manifest;
+
+ if (!manifest.is_pinned(first)) {
+ manifest.pin(first);
+ }
+
+ set<version_t>::iterator p_end = manifest.pinned.find(first);
+ set<version_t>::iterator p = manifest.pinned.begin();
+ manifest.pinned.erase(p, p_end);
+ ceph_assert(manifest.get_first_pinned() == first);
+
+ if (manifest.get_last_pinned() == first+1 ||
+ manifest.pinned.size() == 1) {
+ // we reached the end of the line, as pinned maps go; clean up our
+ // manifest, and let `should_prune()` decide whether we should prune
+ // again.
+ tx->erase(get_service_name(), "osdmap_manifest");
+ return;
+ }
+
+ bufferlist bl;
+ manifest.encode(bl);
+ tx->put(get_service_name(), "osdmap_manifest", bl);
+}
+
+void OSDMonitor::prune_init(osdmap_manifest_t& manifest)
+{
+ dout(1) << __func__ << dendl;
+
+ version_t pin_first;
+
+ // verify constrainsts on stable in-memory state
+ if (!has_osdmap_manifest) {
+ // we must have never pruned, OR if we pruned the state must no longer
+ // be relevant (i.e., the state must have been removed alongside with
+ // the trim that *must* have removed past the last pinned map in a
+ // previous prune).
+ ceph_assert(osdmap_manifest.pinned.empty());
+ ceph_assert(!mon.store->exists(get_service_name(), "osdmap_manifest"));
+ pin_first = get_first_committed();
+
+ } else {
+ // we must have pruned in the past AND its state is still relevant
+ // (i.e., even if we trimmed, we still hold pinned maps in the manifest,
+ // and thus we still hold a manifest in the store).
+ ceph_assert(!osdmap_manifest.pinned.empty());
+ ceph_assert(osdmap_manifest.get_first_pinned() == get_first_committed());
+ ceph_assert(osdmap_manifest.get_last_pinned() < get_last_committed());
+
+ dout(10) << __func__
+ << " first_pinned " << osdmap_manifest.get_first_pinned()
+ << " last_pinned " << osdmap_manifest.get_last_pinned()
+ << dendl;
+
+ pin_first = osdmap_manifest.get_last_pinned();
+ }
+
+ manifest.pin(pin_first);
+}
+
+bool OSDMonitor::_prune_sanitize_options() const
+{
+ uint64_t prune_interval =
+ g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval");
+ uint64_t prune_min =
+ g_conf().get_val<uint64_t>("mon_osdmap_full_prune_min");
+ uint64_t txsize =
+ g_conf().get_val<uint64_t>("mon_osdmap_full_prune_txsize");
+
+ bool r = true;
+
+ if (prune_interval == 0) {
+ derr << __func__
+ << " prune is enabled BUT prune interval is zero; abort."
+ << dendl;
+ r = false;
+ } else if (prune_interval == 1) {
+ derr << __func__
+ << " prune interval is equal to one, which essentially means"
+ " no pruning; abort."
+ << dendl;
+ r = false;
+ }
+ if (prune_min == 0) {
+ derr << __func__
+ << " prune is enabled BUT prune min is zero; abort."
+ << dendl;
+ r = false;
+ }
+ if (prune_interval > prune_min) {
+ derr << __func__
+ << " impossible to ascertain proper prune interval because"
+ << " it is greater than the minimum prune epochs"
+ << " (min: " << prune_min << ", interval: " << prune_interval << ")"
+ << dendl;
+ r = false;
+ }
+
+ if (txsize < prune_interval - 1) {
+ derr << __func__
+ << " 'mon_osdmap_full_prune_txsize' (" << txsize
+ << ") < 'mon_osdmap_full_prune_interval-1' (" << prune_interval - 1
+ << "); abort." << dendl;
+ r = false;
+ }
+ return r;
+}
+
+bool OSDMonitor::is_prune_enabled() const {
+ return g_conf().get_val<bool>("mon_osdmap_full_prune_enabled");
+}
+
+bool OSDMonitor::is_prune_supported() const {
+ return mon.get_required_mon_features().contains_any(
+ ceph::features::mon::FEATURE_OSDMAP_PRUNE);
+}
+
+/** do_prune
+ *
+ * @returns true if has side-effects; false otherwise.
+ */
+bool OSDMonitor::do_prune(MonitorDBStore::TransactionRef tx)
+{
+ bool enabled = is_prune_enabled();
+
+ dout(1) << __func__ << " osdmap full prune "
+ << ( enabled ? "enabled" : "disabled")
+ << dendl;
+
+ if (!enabled || !_prune_sanitize_options() || !should_prune()) {
+ return false;
+ }
+
+ // we are beyond the minimum prune versions, we need to remove maps because
+ // otherwise the store will grow unbounded and we may end up having issues
+ // with available disk space or store hangs.
+
+ // we will not pin all versions. We will leave a buffer number of versions.
+ // this allows us the monitor to trim maps without caring too much about
+ // pinned maps, and then allow us to use another ceph-mon without these
+ // capabilities, without having to repair the store.
+
+ osdmap_manifest_t manifest = osdmap_manifest;
+
+ version_t first = get_first_committed();
+ version_t last = get_last_committed();
+
+ version_t last_to_pin = last - g_conf()->mon_min_osdmap_epochs;
+ version_t last_pinned = manifest.get_last_pinned();
+ uint64_t prune_interval =
+ g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval");
+ uint64_t txsize =
+ g_conf().get_val<uint64_t>("mon_osdmap_full_prune_txsize");
+
+ prune_init(manifest);
+
+ // we need to get rid of some osdmaps
+
+ dout(5) << __func__
+ << " lc (" << first << " .. " << last << ")"
+ << " last_pinned " << last_pinned
+ << " interval " << prune_interval
+ << " last_to_pin " << last_to_pin
+ << dendl;
+
+ // We will be erasing maps as we go.
+ //
+ // We will erase all maps between `last_pinned` and the `next_to_pin`.
+ //
+ // If `next_to_pin` happens to be greater than `last_to_pin`, then
+ // we stop pruning. We could prune the maps between `next_to_pin` and
+ // `last_to_pin`, but by not doing it we end up with neater pruned
+ // intervals, aligned with `prune_interval`. Besides, this should not be a
+ // problem as long as `prune_interval` is set to a sane value, instead of
+ // hundreds or thousands of maps.
+
+ auto map_exists = [this](version_t v) {
+ string k = mon.store->combine_strings("full", v);
+ return mon.store->exists(get_service_name(), k);
+ };
+
+ // 'interval' represents the number of maps from the last pinned
+ // i.e., if we pinned version 1 and have an interval of 10, we're pinning
+ // version 11 next; all intermediate versions will be removed.
+ //
+ // 'txsize' represents the maximum number of versions we'll be removing in
+ // this iteration. If 'txsize' is large enough to perform multiple passes
+ // pinning and removing maps, we will do so; if not, we'll do at least one
+ // pass. We are quite relaxed about honouring 'txsize', but we'll always
+ // ensure that we never go *over* the maximum.
+
+ // e.g., if we pin 1 and 11, we're removing versions [2..10]; i.e., 9 maps.
+ uint64_t removal_interval = prune_interval - 1;
+
+ if (txsize < removal_interval) {
+ dout(5) << __func__
+ << " setting txsize to removal interval size ("
+ << removal_interval << " versions"
+ << dendl;
+ txsize = removal_interval;
+ }
+ ceph_assert(removal_interval > 0);
+
+ uint64_t num_pruned = 0;
+ while (num_pruned + removal_interval <= txsize) {
+ last_pinned = manifest.get_last_pinned();
+
+ if (last_pinned + prune_interval > last_to_pin) {
+ break;
+ }
+ ceph_assert(last_pinned < last_to_pin);
+
+ version_t next_pinned = last_pinned + prune_interval;
+ ceph_assert(next_pinned <= last_to_pin);
+ manifest.pin(next_pinned);
+
+ dout(20) << __func__
+ << " last_pinned " << last_pinned
+ << " next_pinned " << next_pinned
+ << " num_pruned " << num_pruned
+ << " removal interval (" << (last_pinned+1)
+ << ".." << (next_pinned-1) << ")"
+ << " txsize " << txsize << dendl;
+
+ ceph_assert(map_exists(last_pinned));
+ ceph_assert(map_exists(next_pinned));
+
+ for (version_t v = last_pinned+1; v < next_pinned; ++v) {
+ ceph_assert(!manifest.is_pinned(v));
+
+ dout(20) << __func__ << " pruning full osdmap e" << v << dendl;
+ string full_key = mon.store->combine_strings("full", v);
+ tx->erase(get_service_name(), full_key);
+ ++num_pruned;
+ }
+ }
+
+ ceph_assert(num_pruned > 0);
+
+ bufferlist bl;
+ manifest.encode(bl);
+ tx->put(get_service_name(), "osdmap_manifest", bl);
+
+ return true;
+}
+
+
+// -------------
+
+bool OSDMonitor::preprocess_query(MonOpRequestRef op)
+{
+ op->mark_osdmon_event(__func__);
+ Message *m = op->get_req();
+ dout(10) << "preprocess_query " << *m << " from " << m->get_orig_source_inst() << dendl;
+
+ switch (m->get_type()) {
+ // READs
+ case MSG_MON_COMMAND:
+ try {
+ return preprocess_command(op);
+ } catch (const bad_cmd_get& e) {
+ bufferlist bl;
+ mon.reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
+ return true;
+ }
+ case CEPH_MSG_MON_GET_OSDMAP:
+ return preprocess_get_osdmap(op);
+
+ // damp updates
+ case MSG_OSD_MARK_ME_DOWN:
+ return preprocess_mark_me_down(op);
+ case MSG_OSD_MARK_ME_DEAD:
+ return preprocess_mark_me_dead(op);
+ case MSG_OSD_FULL:
+ return preprocess_full(op);
+ case MSG_OSD_FAILURE:
+ return preprocess_failure(op);
+ case MSG_OSD_BOOT:
+ return preprocess_boot(op);
+ case MSG_OSD_ALIVE:
+ return preprocess_alive(op);
+ case MSG_OSD_PG_CREATED:
+ return preprocess_pg_created(op);
+ case MSG_OSD_PG_READY_TO_MERGE:
+ return preprocess_pg_ready_to_merge(op);
+ case MSG_OSD_PGTEMP:
+ return preprocess_pgtemp(op);
+ case MSG_OSD_BEACON:
+ return preprocess_beacon(op);
+
+ case CEPH_MSG_POOLOP:
+ return preprocess_pool_op(op);
+
+ case MSG_REMOVE_SNAPS:
+ return preprocess_remove_snaps(op);
+
+ case MSG_MON_GET_PURGED_SNAPS:
+ return preprocess_get_purged_snaps(op);
+
+ default:
+ ceph_abort();
+ return true;
+ }
+}
+
+bool OSDMonitor::prepare_update(MonOpRequestRef op)
+{
+ op->mark_osdmon_event(__func__);
+ Message *m = op->get_req();
+ dout(7) << "prepare_update " << *m << " from " << m->get_orig_source_inst() << dendl;
+
+ switch (m->get_type()) {
+ // damp updates
+ case MSG_OSD_MARK_ME_DOWN:
+ return prepare_mark_me_down(op);
+ case MSG_OSD_MARK_ME_DEAD:
+ return prepare_mark_me_dead(op);
+ case MSG_OSD_FULL:
+ return prepare_full(op);
+ case MSG_OSD_FAILURE:
+ return prepare_failure(op);
+ case MSG_OSD_BOOT:
+ return prepare_boot(op);
+ case MSG_OSD_ALIVE:
+ return prepare_alive(op);
+ case MSG_OSD_PG_CREATED:
+ return prepare_pg_created(op);
+ case MSG_OSD_PGTEMP:
+ return prepare_pgtemp(op);
+ case MSG_OSD_PG_READY_TO_MERGE:
+ return prepare_pg_ready_to_merge(op);
+ case MSG_OSD_BEACON:
+ return prepare_beacon(op);
+
+ case MSG_MON_COMMAND:
+ try {
+ return prepare_command(op);
+ } catch (const bad_cmd_get& e) {
+ bufferlist bl;
+ mon.reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
+ return true;
+ }
+
+ case CEPH_MSG_POOLOP:
+ return prepare_pool_op(op);
+
+ case MSG_REMOVE_SNAPS:
+ return prepare_remove_snaps(op);
+
+
+ default:
+ ceph_abort();
+ }
+
+ return false;
+}
+
+bool OSDMonitor::should_propose(double& delay)
+{
+ dout(10) << "should_propose" << dendl;
+
+ // if full map, propose immediately! any subsequent changes will be clobbered.
+ if (pending_inc.fullmap.length())
+ return true;
+
+ // adjust osd weights?
+ if (!osd_weight.empty() &&
+ osd_weight.size() == (unsigned)osdmap.get_max_osd()) {
+ dout(0) << " adjusting osd weights based on " << osd_weight << dendl;
+ osdmap.adjust_osd_weights(osd_weight, pending_inc);
+ delay = 0.0;
+ osd_weight.clear();
+ return true;
+ }
+
+ return PaxosService::should_propose(delay);
+}
+
+
+
+// ---------------------------
+// READs
+
+bool OSDMonitor::preprocess_get_osdmap(MonOpRequestRef op)
+{
+ op->mark_osdmon_event(__func__);
+ auto m = op->get_req<MMonGetOSDMap>();
+
+ uint64_t features = mon.get_quorum_con_features();
+ if (op->get_session() && op->get_session()->con_features)
+ features = op->get_session()->con_features;
+
+ dout(10) << __func__ << " " << *m << dendl;
+ MOSDMap *reply = new MOSDMap(mon.monmap->fsid, features);
+ epoch_t first = get_first_committed();
+ epoch_t last = osdmap.get_epoch();
+ int max = g_conf()->osd_map_message_max;
+ ssize_t max_bytes = g_conf()->osd_map_message_max_bytes;
+ for (epoch_t e = std::max(first, m->get_full_first());
+ e <= std::min(last, m->get_full_last()) && max > 0 && max_bytes > 0;
+ ++e, --max) {
+ bufferlist& bl = reply->maps[e];
+ int r = get_version_full(e, features, bl);
+ ceph_assert(r >= 0);
+ max_bytes -= bl.length();
+ }
+ for (epoch_t e = std::max(first, m->get_inc_first());
+ e <= std::min(last, m->get_inc_last()) && max > 0 && max_bytes > 0;
+ ++e, --max) {
+ bufferlist& bl = reply->incremental_maps[e];
+ int r = get_version(e, features, bl);
+ ceph_assert(r >= 0);
+ max_bytes -= bl.length();
+ }
+ reply->oldest_map = first;
+ reply->newest_map = last;
+ mon.send_reply(op, reply);
+ return true;
+}
+
+
+// ---------------------------
+// UPDATEs
+
+// failure --
+
+bool OSDMonitor::check_source(MonOpRequestRef op, uuid_d fsid) {
+ // check permissions
+ MonSession *session = op->get_session();
+ if (!session)
+ return true;
+ if (!session->is_capable("osd", MON_CAP_X)) {
+ dout(0) << "got MOSDFailure from entity with insufficient caps "
+ << session->caps << dendl;
+ return true;
+ }
+ if (fsid != mon.monmap->fsid) {
+ dout(0) << "check_source: on fsid " << fsid
+ << " != " << mon.monmap->fsid << dendl;
+ return true;
+ }
+ return false;
+}
+
+
+bool OSDMonitor::preprocess_failure(MonOpRequestRef op)
+{
+ op->mark_osdmon_event(__func__);
+ auto m = op->get_req<MOSDFailure>();
+ // who is target_osd
+ int badboy = m->get_target_osd();
+
+ // check permissions
+ if (check_source(op, m->fsid))
+ goto didit;
+
+ // first, verify the reporting host is valid
+ if (m->get_orig_source().is_osd()) {
+ int from = m->get_orig_source().num();
+ if (!osdmap.exists(from) ||
+ !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()) ||
+ (osdmap.is_down(from) && m->if_osd_failed())) {
+ dout(5) << "preprocess_failure from dead osd." << from
+ << ", ignoring" << dendl;
+ send_incremental(op, m->get_epoch()+1);
+ goto didit;
+ }
+ }
+
+
+ // weird?
+ if (osdmap.is_down(badboy)) {
+ dout(5) << "preprocess_failure dne(/dup?): osd." << m->get_target_osd()
+ << " " << m->get_target_addrs()
+ << ", from " << m->get_orig_source() << dendl;
+ if (m->get_epoch() < osdmap.get_epoch())
+ send_incremental(op, m->get_epoch()+1);
+ goto didit;
+ }
+ if (osdmap.get_addrs(badboy) != m->get_target_addrs()) {
+ dout(5) << "preprocess_failure wrong osd: report osd." << m->get_target_osd()
+ << " " << m->get_target_addrs()
+ << " != map's " << osdmap.get_addrs(badboy)
+ << ", from " << m->get_orig_source() << dendl;
+ if (m->get_epoch() < osdmap.get_epoch())
+ send_incremental(op, m->get_epoch()+1);
+ goto didit;
+ }
+
+ // already reported?
+ if (osdmap.is_down(badboy) ||
+ osdmap.get_up_from(badboy) > m->get_epoch()) {
+ dout(5) << "preprocess_failure dup/old: osd." << m->get_target_osd()
+ << " " << m->get_target_addrs()
+ << ", from " << m->get_orig_source() << dendl;
+ if (m->get_epoch() < osdmap.get_epoch())
+ send_incremental(op, m->get_epoch()+1);
+ goto didit;
+ }
+
+ if (!can_mark_down(badboy)) {
+ dout(5) << "preprocess_failure ignoring report of osd."
+ << m->get_target_osd() << " " << m->get_target_addrs()
+ << " from " << m->get_orig_source() << dendl;
+ goto didit;
+ }
+
+ dout(10) << "preprocess_failure new: osd." << m->get_target_osd()
+ << " " << m->get_target_addrs()
+ << ", from " << m->get_orig_source() << dendl;
+ return false;
+
+ didit:
+ mon.no_reply(op);
+ return true;
+}
+
+class C_AckMarkedDown : public C_MonOp {
+ OSDMonitor *osdmon;
+public:
+ C_AckMarkedDown(
+ OSDMonitor *osdmon,
+ MonOpRequestRef op)
+ : C_MonOp(op), osdmon(osdmon) {}
+
+ void _finish(int r) override {
+ if (r == 0) {
+ auto m = op->get_req<MOSDMarkMeDown>();
+ osdmon->mon.send_reply(
+ op,
+ new MOSDMarkMeDown(
+ m->fsid,
+ m->target_osd,
+ m->target_addrs,
+ m->get_epoch(),
+ false)); // ACK itself does not request an ack
+ } else if (r == -EAGAIN) {
+ osdmon->dispatch(op);
+ } else {
+ ceph_abort_msgf("C_AckMarkedDown: unknown result %d", r);
+ }
+ }
+ ~C_AckMarkedDown() override {
+ }
+};
+
+bool OSDMonitor::preprocess_mark_me_down(MonOpRequestRef op)
+{
+ op->mark_osdmon_event(__func__);
+ auto m = op->get_req<MOSDMarkMeDown>();
+ int from = m->target_osd;
+
+ // check permissions
+ if (check_source(op, m->fsid))
+ goto reply;
+
+ // first, verify the reporting host is valid
+ if (!m->get_orig_source().is_osd())
+ goto reply;
+
+ if (!osdmap.exists(from) ||
+ osdmap.is_down(from) ||
+ osdmap.get_addrs(from) != m->target_addrs) {
+ dout(5) << "preprocess_mark_me_down from dead osd."
+ << from << ", ignoring" << dendl;
+ send_incremental(op, m->get_epoch()+1);
+ goto reply;
+ }
+
+ // no down might be set
+ if (!can_mark_down(from))
+ goto reply;
+
+ dout(10) << "MOSDMarkMeDown for: " << m->get_orig_source()
+ << " " << m->target_addrs << dendl;
+ return false;
+
+ reply:
+ if (m->request_ack) {
+ Context *c(new C_AckMarkedDown(this, op));
+ c->complete(0);
+ }
+ return true;
+}
+
+bool OSDMonitor::prepare_mark_me_down(MonOpRequestRef op)
+{
+ op->mark_osdmon_event(__func__);
+ auto m = op->get_req<MOSDMarkMeDown>();
+ int target_osd = m->target_osd;
+
+ ceph_assert(osdmap.is_up(target_osd));
+ ceph_assert(osdmap.get_addrs(target_osd) == m->target_addrs);
+
+ mon.clog->info() << "osd." << target_osd << " marked itself " << ((m->down_and_dead) ? "down and dead" : "down");
+ pending_inc.new_state[target_osd] = CEPH_OSD_UP;
+ if (m->down_and_dead) {
+ if (!pending_inc.new_xinfo.count(target_osd)) {
+ pending_inc.new_xinfo[target_osd] = osdmap.osd_xinfo[target_osd];
+ }
+ pending_inc.new_xinfo[target_osd].dead_epoch = m->get_epoch();
+ }
+ if (m->request_ack)
+ wait_for_finished_proposal(op, new C_AckMarkedDown(this, op));
+ return true;
+}
+
+bool OSDMonitor::preprocess_mark_me_dead(MonOpRequestRef op)
+{
+ op->mark_osdmon_event(__func__);
+ auto m = op->get_req<MOSDMarkMeDead>();
+ int from = m->target_osd;
+
+ // check permissions
+ if (check_source(op, m->fsid)) {
+ mon.no_reply(op);
+ return true;
+ }
+
+ // first, verify the reporting host is valid
+ if (!m->get_orig_source().is_osd()) {
+ mon.no_reply(op);
+ return true;
+ }
+
+ if (!osdmap.exists(from) ||
+ !osdmap.is_down(from)) {
+ dout(5) << __func__ << " from nonexistent or up osd." << from
+ << ", ignoring" << dendl;
+ send_incremental(op, m->get_epoch()+1);
+ mon.no_reply(op);
+ return true;
+ }
+
+ return false;
+}
+
+bool OSDMonitor::prepare_mark_me_dead(MonOpRequestRef op)
+{
+ op->mark_osdmon_event(__func__);
+ auto m = op->get_req<MOSDMarkMeDead>();
+ int target_osd = m->target_osd;
+
+ ceph_assert(osdmap.is_down(target_osd));
+
+ mon.clog->info() << "osd." << target_osd << " marked itself dead as of e"
+ << m->get_epoch();
+ if (!pending_inc.new_xinfo.count(target_osd)) {
+ pending_inc.new_xinfo[target_osd] = osdmap.osd_xinfo[target_osd];
+ }
+ pending_inc.new_xinfo[target_osd].dead_epoch = m->get_epoch();
+ wait_for_finished_proposal(
+ op,
+ new LambdaContext(
+ [op, this] (int r) {
+ if (r >= 0) {
+ mon.no_reply(op); // ignore on success
+ }
+ }
+ ));
+ return true;
+}
+
+bool OSDMonitor::can_mark_down(int i)
+{
+ if (osdmap.is_nodown(i)) {
+ dout(5) << __func__ << " osd." << i << " is marked as nodown, "
+ << "will not mark it down" << dendl;
+ return false;
+ }
+
+ int num_osds = osdmap.get_num_osds();
+ if (num_osds == 0) {
+ dout(5) << __func__ << " no osds" << dendl;
+ return false;
+ }
+ int up = osdmap.get_num_up_osds() - pending_inc.get_net_marked_down(&osdmap);
+ float up_ratio = (float)up / (float)num_osds;
+ if (up_ratio < g_conf()->mon_osd_min_up_ratio) {
+ dout(2) << __func__ << " current up_ratio " << up_ratio << " < min "
+ << g_conf()->mon_osd_min_up_ratio
+ << ", will not mark osd." << i << " down" << dendl;
+ return false;
+ }
+ return true;
+}
+
+bool OSDMonitor::can_mark_up(int i)
+{
+ if (osdmap.is_noup(i)) {
+ dout(5) << __func__ << " osd." << i << " is marked as noup, "
+ << "will not mark it up" << dendl;
+ return false;
+ }
+
+ return true;
+}
+
+/**
+ * @note the parameter @p i apparently only exists here so we can output the
+ * osd's id on messages.
+ */
+bool OSDMonitor::can_mark_out(int i)
+{
+ if (osdmap.is_noout(i)) {
+ dout(5) << __func__ << " osd." << i << " is marked as noout, "
+ << "will not mark it out" << dendl;
+ return false;
+ }
+
+ int num_osds = osdmap.get_num_osds();
+ if (num_osds == 0) {
+ dout(5) << __func__ << " no osds" << dendl;
+ return false;
+ }
+ int in = osdmap.get_num_in_osds() - pending_inc.get_net_marked_out(&osdmap);
+ float in_ratio = (float)in / (float)num_osds;
+ if (in_ratio < g_conf()->mon_osd_min_in_ratio) {
+ if (i >= 0)
+ dout(5) << __func__ << " current in_ratio " << in_ratio << " < min "
+ << g_conf()->mon_osd_min_in_ratio
+ << ", will not mark osd." << i << " out" << dendl;
+ else
+ dout(5) << __func__ << " current in_ratio " << in_ratio << " < min "
+ << g_conf()->mon_osd_min_in_ratio
+ << ", will not mark osds out" << dendl;
+ return false;
+ }
+
+ return true;
+}
+
+bool OSDMonitor::can_mark_in(int i)
+{
+ if (osdmap.is_noin(i)) {
+ dout(5) << __func__ << " osd." << i << " is marked as noin, "
+ << "will not mark it in" << dendl;
+ return false;
+ }
+
+ return true;
+}
+
+bool OSDMonitor::check_failures(utime_t now)
+{
+ bool found_failure = false;
+ auto p = failure_info.begin();
+ while (p != failure_info.end()) {
+ auto& [target_osd, fi] = *p;
+ if (can_mark_down(target_osd) &&
+ check_failure(now, target_osd, fi)) {
+ found_failure = true;
+ ++p;
+ } else if (is_failure_stale(now, fi)) {
+ dout(10) << " dropping stale failure_info for osd." << target_osd
+ << " from " << fi.reporters.size() << " reporters"
+ << dendl;
+ p = failure_info.erase(p);
+ } else {
+ ++p;
+ }
+ }
+ return found_failure;
+}
+
+utime_t OSDMonitor::get_grace_time(utime_t now,
+ int target_osd,
+ failure_info_t& fi) const
+{
+ utime_t orig_grace(g_conf()->osd_heartbeat_grace, 0);
+ if (!g_conf()->mon_osd_adjust_heartbeat_grace) {
+ return orig_grace;
+ }
+ utime_t grace = orig_grace;
+ double halflife = (double)g_conf()->mon_osd_laggy_halflife;
+ double decay_k = ::log(.5) / halflife;
+
+ // scale grace period based on historical probability of 'lagginess'
+ // (false positive failures due to slowness).
+ const osd_xinfo_t& xi = osdmap.get_xinfo(target_osd);
+ const utime_t failed_for = now - fi.get_failed_since();
+ double decay = exp((double)failed_for * decay_k);
+ dout(20) << " halflife " << halflife << " decay_k " << decay_k
+ << " failed_for " << failed_for << " decay " << decay << dendl;
+ double my_grace = decay * (double)xi.laggy_interval * xi.laggy_probability;
+ grace += my_grace;
+
+ // consider the peers reporting a failure a proxy for a potential
+ // 'subcluster' over the overall cluster that is similarly
+ // laggy. this is clearly not true in all cases, but will sometimes
+ // help us localize the grace correction to a subset of the system
+ // (say, a rack with a bad switch) that is unhappy.
+ double peer_grace = 0;
+ for (auto& [reporter, report] : fi.reporters) {
+ if (osdmap.exists(reporter)) {
+ const osd_xinfo_t& xi = osdmap.get_xinfo(reporter);
+ utime_t elapsed = now - xi.down_stamp;
+ double decay = exp((double)elapsed * decay_k);
+ peer_grace += decay * (double)xi.laggy_interval * xi.laggy_probability;
+ }
+ }
+ peer_grace /= (double)fi.reporters.size();
+ grace += peer_grace;
+ dout(10) << " osd." << target_osd << " has "
+ << fi.reporters.size() << " reporters, "
+ << grace << " grace (" << orig_grace << " + " << my_grace
+ << " + " << peer_grace << "), max_failed_since " << fi.get_failed_since()
+ << dendl;
+
+ return grace;
+}
+
+bool OSDMonitor::check_failure(utime_t now, int target_osd, failure_info_t& fi)
+{
+ // already pending failure?
+ if (pending_inc.new_state.count(target_osd) &&
+ pending_inc.new_state[target_osd] & CEPH_OSD_UP) {
+ dout(10) << " already pending failure" << dendl;
+ return true;
+ }
+
+ set<string> reporters_by_subtree;
+ auto reporter_subtree_level = g_conf().get_val<string>("mon_osd_reporter_subtree_level");
+ ceph_assert(fi.reporters.size());
+ for (auto p = fi.reporters.begin(); p != fi.reporters.end();) {
+ // get the parent bucket whose type matches with "reporter_subtree_level".
+ // fall back to OSD if the level doesn't exist.
+ if (osdmap.exists(p->first)) {
+ auto reporter_loc = osdmap.crush->get_full_location(p->first);
+ if (auto iter = reporter_loc.find(reporter_subtree_level);
+ iter == reporter_loc.end()) {
+ reporters_by_subtree.insert("osd." + to_string(p->first));
+ } else {
+ reporters_by_subtree.insert(iter->second);
+ }
+ ++p;
+ } else {
+ fi.cancel_report(p->first);;
+ p = fi.reporters.erase(p);
+ }
+ }
+ if (reporters_by_subtree.size() < g_conf().get_val<uint64_t>("mon_osd_min_down_reporters")) {
+ return false;
+ }
+ const utime_t failed_for = now - fi.get_failed_since();
+ const utime_t grace = get_grace_time(now, target_osd, fi);
+ if (failed_for >= grace) {
+ dout(1) << " we have enough reporters to mark osd." << target_osd
+ << " down" << dendl;
+ pending_inc.new_state[target_osd] = CEPH_OSD_UP;
+
+ mon.clog->info() << "osd." << target_osd << " failed ("
+ << osdmap.crush->get_full_location_ordered_string(
+ target_osd)
+ << ") ("
+ << (int)reporters_by_subtree.size()
+ << " reporters from different "
+ << reporter_subtree_level << " after "
+ << failed_for << " >= grace " << grace << ")";
+ return true;
+ }
+ return false;
+}
+
+bool OSDMonitor::is_failure_stale(utime_t now, failure_info_t& fi) const
+{
+ // if it takes too long to either cancel the report to mark the osd down,
+ // some reporters must have failed to cancel their reports. let's just
+ // forget these reports.
+ const utime_t failed_for = now - fi.get_failed_since();
+ auto heartbeat_grace = cct->_conf.get_val<int64_t>("osd_heartbeat_grace");
+ auto heartbeat_stale = cct->_conf.get_val<int64_t>("osd_heartbeat_stale");
+ return failed_for >= (heartbeat_grace + heartbeat_stale);
+}
+
+void OSDMonitor::force_failure(int target_osd, int by)
+{
+ // already pending failure?
+ if (pending_inc.new_state.count(target_osd) &&
+ pending_inc.new_state[target_osd] & CEPH_OSD_UP) {
+ dout(10) << " already pending failure" << dendl;
+ return;
+ }
+
+ dout(1) << " we're forcing failure of osd." << target_osd << dendl;
+ pending_inc.new_state[target_osd] = CEPH_OSD_UP;
+ if (!pending_inc.new_xinfo.count(target_osd)) {
+ pending_inc.new_xinfo[target_osd] = osdmap.osd_xinfo[target_osd];
+ }
+ pending_inc.new_xinfo[target_osd].dead_epoch = pending_inc.epoch;
+
+ mon.clog->info() << "osd." << target_osd << " failed ("
+ << osdmap.crush->get_full_location_ordered_string(target_osd)
+ << ") (connection refused reported by osd." << by << ")";
+ return;
+}
+
+bool OSDMonitor::prepare_failure(MonOpRequestRef op)
+{
+ op->mark_osdmon_event(__func__);
+ auto m = op->get_req<MOSDFailure>();
+ dout(1) << "prepare_failure osd." << m->get_target_osd()
+ << " " << m->get_target_addrs()
+ << " from " << m->get_orig_source()
+ << " is reporting failure:" << m->if_osd_failed() << dendl;
+
+ int target_osd = m->get_target_osd();
+ int reporter = m->get_orig_source().num();
+ ceph_assert(osdmap.is_up(target_osd));
+ ceph_assert(osdmap.get_addrs(target_osd) == m->get_target_addrs());
+
+ mon.no_reply(op);
+
+ if (m->if_osd_failed()) {
+ // calculate failure time
+ utime_t now = ceph_clock_now();
+ utime_t failed_since =
+ m->get_recv_stamp() - utime_t(m->failed_for, 0);
+
+ // add a report
+ if (m->is_immediate()) {
+ mon.clog->debug() << "osd." << m->get_target_osd()
+ << " reported immediately failed by "
+ << m->get_orig_source();
+ force_failure(target_osd, reporter);
+ return true;
+ }
+ mon.clog->debug() << "osd." << m->get_target_osd() << " reported failed by "
+ << m->get_orig_source();
+
+ failure_info_t& fi = failure_info[target_osd];
+ fi.add_report(reporter, failed_since, op);
+ return check_failure(now, target_osd, fi);
+ } else {
+ // remove the report
+ mon.clog->debug() << "osd." << m->get_target_osd()
+ << " failure report canceled by "
+ << m->get_orig_source();
+ if (failure_info.count(target_osd)) {
+ failure_info_t& fi = failure_info[target_osd];
+ fi.cancel_report(reporter);
+ if (fi.reporters.empty()) {
+ dout(10) << " removing last failure_info for osd." << target_osd
+ << dendl;
+ failure_info.erase(target_osd);
+ } else {
+ dout(10) << " failure_info for osd." << target_osd << " now "
+ << fi.reporters.size() << " reporters" << dendl;
+ }
+ } else {
+ dout(10) << " no failure_info for osd." << target_osd << dendl;
+ }
+ }
+
+ return false;
+}
+
+void OSDMonitor::process_failures()
+{
+ map<int,failure_info_t>::iterator p = failure_info.begin();
+ while (p != failure_info.end()) {
+ if (osdmap.is_up(p->first)) {
+ ++p;
+ } else {
+ dout(10) << "process_failures osd." << p->first << dendl;
+ list<MonOpRequestRef> ls;
+ p->second.take_report_messages(ls);
+ failure_info.erase(p++);
+
+ while (!ls.empty()) {
+ MonOpRequestRef o = ls.front();
+ if (o) {
+ o->mark_event(__func__);
+ MOSDFailure *m = o->get_req<MOSDFailure>();
+ send_latest(o, m->get_epoch());
+ mon.no_reply(o);
+ }
+ ls.pop_front();
+ }
+ }
+ }
+}
+
+void OSDMonitor::take_all_failures(list<MonOpRequestRef>& ls)
+{
+ dout(10) << __func__ << " on " << failure_info.size() << " osds" << dendl;
+
+ for (map<int,failure_info_t>::iterator p = failure_info.begin();
+ p != failure_info.end();
+ ++p) {
+ p->second.take_report_messages(ls);
+ }
+ failure_info.clear();
+}
+
+int OSDMonitor::get_grace_interval_threshold()
+{
+ int halflife = g_conf()->mon_osd_laggy_halflife;
+ // Scale the halflife period (default: 1_hr) by
+ // a factor (48) to calculate the threshold.
+ int grace_threshold_factor = 48;
+ return halflife * grace_threshold_factor;
+}
+
+bool OSDMonitor::grace_interval_threshold_exceeded(int last_failed_interval)
+{
+ int grace_interval_threshold_secs = get_grace_interval_threshold();
+ if (last_failed_interval > grace_interval_threshold_secs) {
+ dout(1) << " last_failed_interval " << last_failed_interval
+ << " > grace_interval_threshold_secs " << grace_interval_threshold_secs
+ << dendl;
+ return true;
+ }
+ return false;
+}
+
+void OSDMonitor::set_default_laggy_params(int target_osd)
+{
+ if (pending_inc.new_xinfo.count(target_osd) == 0) {
+ pending_inc.new_xinfo[target_osd] = osdmap.osd_xinfo[target_osd];
+ }
+ osd_xinfo_t& xi = pending_inc.new_xinfo[target_osd];
+ xi.down_stamp = pending_inc.modified;
+ xi.laggy_probability = 0.0;
+ xi.laggy_interval = 0;
+ dout(20) << __func__ << " reset laggy, now xi " << xi << dendl;
+}
+
+
+// boot --
+
+bool OSDMonitor::preprocess_boot(MonOpRequestRef op)
+{
+ op->mark_osdmon_event(__func__);
+ auto m = op->get_req<MOSDBoot>();
+ int from = m->get_orig_source_inst().name.num();
+
+ // check permissions, ignore if failed (no response expected)
+ MonSession *session = op->get_session();
+ if (!session)
+ goto ignore;
+ if (!session->is_capable("osd", MON_CAP_X)) {
+ dout(0) << "got preprocess_boot message from entity with insufficient caps"
+ << session->caps << dendl;
+ goto ignore;
+ }
+
+ if (m->sb.cluster_fsid != mon.monmap->fsid) {
+ dout(0) << "preprocess_boot on fsid " << m->sb.cluster_fsid
+ << " != " << mon.monmap->fsid << dendl;
+ goto ignore;
+ }
+
+ if (m->get_orig_source_inst().addr.is_blank_ip()) {
+ dout(0) << "preprocess_boot got blank addr for " << m->get_orig_source_inst() << dendl;
+ goto ignore;
+ }
+
+ ceph_assert(m->get_orig_source_inst().name.is_osd());
+
+ // force all osds to have gone through luminous prior to upgrade to nautilus
+ {
+ vector<string> missing;
+ if (!HAVE_FEATURE(m->osd_features, SERVER_LUMINOUS)) {
+ missing.push_back("CEPH_FEATURE_SERVER_LUMINOUS");
+ }
+ if (!HAVE_FEATURE(m->osd_features, SERVER_JEWEL)) {
+ missing.push_back("CEPH_FEATURE_SERVER_JEWEL");
+ }
+ if (!HAVE_FEATURE(m->osd_features, SERVER_KRAKEN)) {
+ missing.push_back("CEPH_FEATURE_SERVER_KRAKEN");
+ }
+ if (!HAVE_FEATURE(m->osd_features, OSD_RECOVERY_DELETES)) {
+ missing.push_back("CEPH_FEATURE_OSD_RECOVERY_DELETES");
+ }
+
+ if (!missing.empty()) {
+ using std::experimental::make_ostream_joiner;
+
+ stringstream ss;
+ copy(begin(missing), end(missing), make_ostream_joiner(ss, ";"));
+
+ mon.clog->info() << "disallowing boot of OSD "
+ << m->get_orig_source_inst()
+ << " because the osd lacks " << ss.str();
+ goto ignore;
+ }
+ }
+
+ // make sure osd versions do not span more than 3 releases
+ if (HAVE_FEATURE(m->osd_features, SERVER_OCTOPUS) &&
+ osdmap.require_osd_release < ceph_release_t::mimic) {
+ mon.clog->info() << "disallowing boot of octopus+ OSD "
+ << m->get_orig_source_inst()
+ << " because require_osd_release < mimic";
+ goto ignore;
+ }
+ if (HAVE_FEATURE(m->osd_features, SERVER_PACIFIC) &&
+ osdmap.require_osd_release < ceph_release_t::nautilus) {
+ mon.clog->info() << "disallowing boot of pacific+ OSD "
+ << m->get_orig_source_inst()
+ << " because require_osd_release < nautilus";
+ goto ignore;
+ }
+
+ // The release check here is required because for OSD_PGLOG_HARDLIMIT,
+ // we are reusing a jewel feature bit that was retired in luminous.
+ if (osdmap.require_osd_release >= ceph_release_t::luminous &&
+ osdmap.test_flag(CEPH_OSDMAP_PGLOG_HARDLIMIT) &&
+ !(m->osd_features & CEPH_FEATURE_OSD_PGLOG_HARDLIMIT)) {
+ mon.clog->info() << "disallowing boot of OSD "
+ << m->get_orig_source_inst()
+ << " because 'pglog_hardlimit' osdmap flag is set and OSD lacks the OSD_PGLOG_HARDLIMIT feature";
+ goto ignore;
+ }
+
+ if (osdmap.stretch_mode_enabled &&
+ !(m->osd_features & CEPH_FEATUREMASK_STRETCH_MODE)) {
+ mon.clog->info() << "disallowing boot of OSD "
+ << m->get_orig_source_inst()
+ << " because stretch mode is on and OSD lacks support";
+ goto ignore;
+ }
+
+ // already booted?
+ if (osdmap.is_up(from) &&
+ osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()) &&
+ osdmap.get_cluster_addrs(from).legacy_equals(m->cluster_addrs)) {
+ // yup.
+ dout(7) << "preprocess_boot dup from " << m->get_orig_source()
+ << " " << m->get_orig_source_addrs()
+ << " =~ " << osdmap.get_addrs(from) << dendl;
+ _booted(op, false);
+ return true;
+ }
+
+ if (osdmap.exists(from) &&
+ !osdmap.get_uuid(from).is_zero() &&
+ osdmap.get_uuid(from) != m->sb.osd_fsid) {
+ dout(7) << __func__ << " from " << m->get_orig_source_inst()
+ << " clashes with existing osd: different fsid"
+ << " (ours: " << osdmap.get_uuid(from)
+ << " ; theirs: " << m->sb.osd_fsid << ")" << dendl;
+ goto ignore;
+ }
+
+ if (osdmap.exists(from) &&
+ osdmap.get_info(from).up_from > m->version &&
+ osdmap.get_most_recent_addrs(from).legacy_equals(
+ m->get_orig_source_addrs())) {
+ dout(7) << "prepare_boot msg from before last up_from, ignoring" << dendl;
+ send_latest(op, m->sb.current_epoch+1);
+ return true;
+ }
+
+ // noup?
+ if (!can_mark_up(from)) {
+ dout(7) << "preprocess_boot ignoring boot from " << m->get_orig_source_inst() << dendl;
+ send_latest(op, m->sb.current_epoch+1);
+ return true;
+ }
+
+ dout(10) << "preprocess_boot from " << m->get_orig_source_inst() << dendl;
+ return false;
+
+ ignore:
+ return true;
+}
+
+bool OSDMonitor::prepare_boot(MonOpRequestRef op)
+{
+ op->mark_osdmon_event(__func__);
+ auto m = op->get_req<MOSDBoot>();
+ dout(7) << __func__ << " from " << m->get_source()
+ << " sb " << m->sb
+ << " client_addrs" << m->get_connection()->get_peer_addrs()
+ << " cluster_addrs " << m->cluster_addrs
+ << " hb_back_addrs " << m->hb_back_addrs
+ << " hb_front_addrs " << m->hb_front_addrs
+ << dendl;
+
+ ceph_assert(m->get_orig_source().is_osd());
+ int from = m->get_orig_source().num();
+
+ // does this osd exist?
+ if (from >= osdmap.get_max_osd()) {
+ dout(1) << "boot from osd." << from << " >= max_osd "
+ << osdmap.get_max_osd() << dendl;
+ return false;
+ }
+
+ int oldstate = osdmap.exists(from) ? osdmap.get_state(from) : CEPH_OSD_NEW;
+ if (pending_inc.new_state.count(from))
+ oldstate ^= pending_inc.new_state[from];
+
+ // already up? mark down first?
+ if (osdmap.is_up(from)) {
+ dout(7) << __func__ << " was up, first marking down osd." << from << " "
+ << osdmap.get_addrs(from) << dendl;
+ // preprocess should have caught these; if not, assert.
+ ceph_assert(!osdmap.get_addrs(from).legacy_equals(
+ m->get_orig_source_addrs()) ||
+ !osdmap.get_cluster_addrs(from).legacy_equals(m->cluster_addrs));
+ ceph_assert(osdmap.get_uuid(from) == m->sb.osd_fsid);
+
+ if (pending_inc.new_state.count(from) == 0 ||
+ (pending_inc.new_state[from] & CEPH_OSD_UP) == 0) {
+ // mark previous guy down
+ pending_inc.new_state[from] = CEPH_OSD_UP;
+ }
+ wait_for_finished_proposal(op, new C_RetryMessage(this, op));
+ } else if (pending_inc.new_up_client.count(from)) {
+ // already prepared, just wait
+ dout(7) << __func__ << " already prepared, waiting on "
+ << m->get_orig_source_addr() << dendl;
+ wait_for_finished_proposal(op, new C_RetryMessage(this, op));
+ } else {
+ // mark new guy up.
+ pending_inc.new_up_client[from] = m->get_orig_source_addrs();
+ pending_inc.new_up_cluster[from] = m->cluster_addrs;
+ pending_inc.new_hb_back_up[from] = m->hb_back_addrs;
+ pending_inc.new_hb_front_up[from] = m->hb_front_addrs;
+
+ down_pending_out.erase(from); // if any
+
+ if (m->sb.weight)
+ osd_weight[from] = m->sb.weight;
+
+ // set uuid?
+ dout(10) << " setting osd." << from << " uuid to " << m->sb.osd_fsid
+ << dendl;
+ if (!osdmap.exists(from) || osdmap.get_uuid(from) != m->sb.osd_fsid) {
+ // preprocess should have caught this; if not, assert.
+ ceph_assert(!osdmap.exists(from) || osdmap.get_uuid(from).is_zero());
+ pending_inc.new_uuid[from] = m->sb.osd_fsid;
+ }
+
+ // fresh osd?
+ if (m->sb.newest_map == 0 && osdmap.exists(from)) {
+ const osd_info_t& i = osdmap.get_info(from);
+ if (i.up_from > i.lost_at) {
+ dout(10) << " fresh osd; marking lost_at too" << dendl;
+ pending_inc.new_lost[from] = osdmap.get_epoch();
+ }
+ }
+
+ // metadata
+ bufferlist osd_metadata;
+ encode(m->metadata, osd_metadata);
+ pending_metadata[from] = osd_metadata;
+ pending_metadata_rm.erase(from);
+
+ // adjust last clean unmount epoch?
+ const osd_info_t& info = osdmap.get_info(from);
+ dout(10) << " old osd_info: " << info << dendl;
+ if (m->sb.mounted > info.last_clean_begin ||
+ (m->sb.mounted == info.last_clean_begin &&
+ m->sb.clean_thru > info.last_clean_end)) {
+ epoch_t begin = m->sb.mounted;
+ epoch_t end = m->sb.clean_thru;
+
+ dout(10) << __func__ << " osd." << from << " last_clean_interval "
+ << "[" << info.last_clean_begin << "," << info.last_clean_end
+ << ") -> [" << begin << "-" << end << ")"
+ << dendl;
+ pending_inc.new_last_clean_interval[from] =
+ pair<epoch_t,epoch_t>(begin, end);
+ }
+
+ if (pending_inc.new_xinfo.count(from) == 0)
+ pending_inc.new_xinfo[from] = osdmap.osd_xinfo[from];
+ osd_xinfo_t& xi = pending_inc.new_xinfo[from];
+ if (m->boot_epoch == 0) {
+ xi.laggy_probability *= (1.0 - g_conf()->mon_osd_laggy_weight);
+ xi.laggy_interval *= (1.0 - g_conf()->mon_osd_laggy_weight);
+ dout(10) << " not laggy, new xi " << xi << dendl;
+ } else {
+ if (xi.down_stamp.sec()) {
+ int interval = ceph_clock_now().sec() -
+ xi.down_stamp.sec();
+ if (g_conf()->mon_osd_laggy_max_interval &&
+ (interval > g_conf()->mon_osd_laggy_max_interval)) {
+ interval = g_conf()->mon_osd_laggy_max_interval;
+ }
+ xi.laggy_interval =
+ interval * g_conf()->mon_osd_laggy_weight +
+ xi.laggy_interval * (1.0 - g_conf()->mon_osd_laggy_weight);
+ }
+ xi.laggy_probability =
+ g_conf()->mon_osd_laggy_weight +
+ xi.laggy_probability * (1.0 - g_conf()->mon_osd_laggy_weight);
+ dout(10) << " laggy, now xi " << xi << dendl;
+ }
+
+ // set features shared by the osd
+ if (m->osd_features)
+ xi.features = m->osd_features;
+ else
+ xi.features = m->get_connection()->get_features();
+
+ // mark in?
+ if ((g_conf()->mon_osd_auto_mark_auto_out_in &&
+ (oldstate & CEPH_OSD_AUTOOUT)) ||
+ (g_conf()->mon_osd_auto_mark_new_in && (oldstate & CEPH_OSD_NEW)) ||
+ (g_conf()->mon_osd_auto_mark_in)) {
+ if (can_mark_in(from)) {
+ if (xi.old_weight > 0) {
+ pending_inc.new_weight[from] = xi.old_weight;
+ xi.old_weight = 0;
+ } else {
+ pending_inc.new_weight[from] = CEPH_OSD_IN;
+ }
+ } else {
+ dout(7) << __func__ << " NOIN set, will not mark in "
+ << m->get_orig_source_addr() << dendl;
+ }
+ }
+
+ // wait
+ wait_for_finished_proposal(op, new C_Booted(this, op));
+ }
+ return true;
+}
+
+void OSDMonitor::_booted(MonOpRequestRef op, bool logit)
+{
+ op->mark_osdmon_event(__func__);
+ auto m = op->get_req<MOSDBoot>();
+ dout(7) << "_booted " << m->get_orig_source_inst()
+ << " w " << m->sb.weight << " from " << m->sb.current_epoch << dendl;
+
+ if (logit) {
+ mon.clog->info() << m->get_source() << " " << m->get_orig_source_addrs()
+ << " boot";
+ }
+
+ send_latest(op, m->sb.current_epoch+1);
+}
+
+
+// -------------
+// full
+
+bool OSDMonitor::preprocess_full(MonOpRequestRef op)
+{
+ op->mark_osdmon_event(__func__);
+ auto m = op->get_req<MOSDFull>();
+ int from = m->get_orig_source().num();
+ set<string> state;
+ unsigned mask = CEPH_OSD_NEARFULL | CEPH_OSD_BACKFILLFULL | CEPH_OSD_FULL;
+
+ // check permissions, ignore if failed
+ MonSession *session = op->get_session();
+ if (!session)
+ goto ignore;
+ if (!session->is_capable("osd", MON_CAP_X)) {
+ dout(0) << "MOSDFull from entity with insufficient privileges:"
+ << session->caps << dendl;
+ goto ignore;
+ }
+
+ // ignore a full message from the osd instance that already went down
+ if (!osdmap.exists(from)) {
+ dout(7) << __func__ << " ignoring full message from nonexistent "
+ << m->get_orig_source_inst() << dendl;
+ goto ignore;
+ }
+ if ((!osdmap.is_up(from) &&
+ osdmap.get_most_recent_addrs(from).legacy_equals(
+ m->get_orig_source_addrs())) ||
+ (osdmap.is_up(from) &&
+ !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()))) {
+ dout(7) << __func__ << " ignoring full message from down "
+ << m->get_orig_source_inst() << dendl;
+ goto ignore;
+ }
+
+ OSDMap::calc_state_set(osdmap.get_state(from), state);
+
+ if ((osdmap.get_state(from) & mask) == m->state) {
+ dout(7) << __func__ << " state already " << state << " for osd." << from
+ << " " << m->get_orig_source_inst() << dendl;
+ _reply_map(op, m->version);
+ goto ignore;
+ }
+
+ dout(10) << __func__ << " want state " << state << " for osd." << from
+ << " " << m->get_orig_source_inst() << dendl;
+ return false;
+
+ ignore:
+ return true;
+}
+
+bool OSDMonitor::prepare_full(MonOpRequestRef op)
+{
+ op->mark_osdmon_event(__func__);
+ auto m = op->get_req<MOSDFull>();
+ const int from = m->get_orig_source().num();
+
+ const unsigned mask = CEPH_OSD_NEARFULL | CEPH_OSD_BACKFILLFULL | CEPH_OSD_FULL;
+ const unsigned want_state = m->state & mask; // safety first
+
+ unsigned cur_state = osdmap.get_state(from);
+ auto p = pending_inc.new_state.find(from);
+ if (p != pending_inc.new_state.end()) {
+ cur_state ^= p->second;
+ }
+ cur_state &= mask;
+
+ set<string> want_state_set, cur_state_set;
+ OSDMap::calc_state_set(want_state, want_state_set);
+ OSDMap::calc_state_set(cur_state, cur_state_set);
+
+ if (cur_state != want_state) {
+ if (p != pending_inc.new_state.end()) {
+ p->second &= ~mask;
+ } else {
+ pending_inc.new_state[from] = 0;
+ }
+ pending_inc.new_state[from] |= (osdmap.get_state(from) & mask) ^ want_state;
+ dout(7) << __func__ << " osd." << from << " " << cur_state_set
+ << " -> " << want_state_set << dendl;
+ } else {
+ dout(7) << __func__ << " osd." << from << " " << cur_state_set
+ << " = wanted " << want_state_set << ", just waiting" << dendl;
+ }
+
+ wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
+ return true;
+}
+
+// -------------
+// alive
+
+bool OSDMonitor::preprocess_alive(MonOpRequestRef op)
+{
+ op->mark_osdmon_event(__func__);
+ auto m = op->get_req<MOSDAlive>();
+ int from = m->get_orig_source().num();
+
+ // check permissions, ignore if failed
+ MonSession *session = op->get_session();
+ if (!session)
+ goto ignore;
+ if (!session->is_capable("osd", MON_CAP_X)) {
+ dout(0) << "attempt to send MOSDAlive from entity with insufficient privileges:"
+ << session->caps << dendl;
+ goto ignore;
+ }
+
+ if (!osdmap.is_up(from) ||
+ !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs())) {
+ dout(7) << "preprocess_alive ignoring alive message from down "
+ << m->get_orig_source() << " " << m->get_orig_source_addrs()
+ << dendl;
+ goto ignore;
+ }
+
+ if (osdmap.get_up_thru(from) >= m->want) {
+ // yup.
+ dout(7) << "preprocess_alive want up_thru " << m->want << " dup from " << m->get_orig_source_inst() << dendl;
+ _reply_map(op, m->version);
+ return true;
+ }
+
+ dout(10) << "preprocess_alive want up_thru " << m->want
+ << " from " << m->get_orig_source_inst() << dendl;
+ return false;
+
+ ignore:
+ return true;
+}
+
+bool OSDMonitor::prepare_alive(MonOpRequestRef op)
+{
+ op->mark_osdmon_event(__func__);
+ auto m = op->get_req<MOSDAlive>();
+ int from = m->get_orig_source().num();
+
+ if (0) { // we probably don't care much about these
+ mon.clog->debug() << m->get_orig_source_inst() << " alive";
+ }
+
+ dout(7) << "prepare_alive want up_thru " << m->want << " have " << m->version
+ << " from " << m->get_orig_source_inst() << dendl;
+
+ update_up_thru(from, m->version); // set to the latest map the OSD has
+ wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
+ return true;
+}
+
+void OSDMonitor::_reply_map(MonOpRequestRef op, epoch_t e)
+{
+ op->mark_osdmon_event(__func__);
+ dout(7) << "_reply_map " << e
+ << " from " << op->get_req()->get_orig_source_inst()
+ << dendl;
+ send_latest(op, e);
+}
+
+// pg_created
+bool OSDMonitor::preprocess_pg_created(MonOpRequestRef op)
+{
+ op->mark_osdmon_event(__func__);
+ auto m = op->get_req<MOSDPGCreated>();
+ dout(10) << __func__ << " " << *m << dendl;
+ auto session = op->get_session();
+ mon.no_reply(op);
+ if (!session) {
+ dout(10) << __func__ << ": no monitor session!" << dendl;
+ return true;
+ }
+ if (!session->is_capable("osd", MON_CAP_X)) {
+ derr << __func__ << " received from entity "
+ << "with insufficient privileges " << session->caps << dendl;
+ return true;
+ }
+ // always forward the "created!" to the leader
+ return false;
+}
+
+bool OSDMonitor::prepare_pg_created(MonOpRequestRef op)
+{
+ op->mark_osdmon_event(__func__);
+ auto m = op->get_req<MOSDPGCreated>();
+ dout(10) << __func__ << " " << *m << dendl;
+ auto src = m->get_orig_source();
+ auto from = src.num();
+ if (!src.is_osd() ||
+ !mon.osdmon()->osdmap.is_up(from) ||
+ !mon.osdmon()->osdmap.get_addrs(from).legacy_equals(
+ m->get_orig_source_addrs())) {
+ dout(1) << __func__ << " ignoring stats from non-active osd." << dendl;
+ return false;
+ }
+ pending_created_pgs.push_back(m->pgid);
+ return true;
+}
+
+bool OSDMonitor::preprocess_pg_ready_to_merge(MonOpRequestRef op)
+{
+ op->mark_osdmon_event(__func__);
+ auto m = op->get_req<MOSDPGReadyToMerge>();
+ dout(10) << __func__ << " " << *m << dendl;
+ const pg_pool_t *pi;
+ auto session = op->get_session();
+ if (!session) {
+ dout(10) << __func__ << ": no monitor session!" << dendl;
+ goto ignore;
+ }
+ if (!session->is_capable("osd", MON_CAP_X)) {
+ derr << __func__ << " received from entity "
+ << "with insufficient privileges " << session->caps << dendl;
+ goto ignore;
+ }
+ pi = osdmap.get_pg_pool(m->pgid.pool());
+ if (!pi) {
+ derr << __func__ << " pool for " << m->pgid << " dne" << dendl;
+ goto ignore;
+ }
+ if (pi->get_pg_num() <= m->pgid.ps()) {
+ dout(20) << " pg_num " << pi->get_pg_num() << " already < " << m->pgid << dendl;
+ goto ignore;
+ }
+ if (pi->get_pg_num() != m->pgid.ps() + 1) {
+ derr << " OSD trying to merge wrong pgid " << m->pgid << dendl;
+ goto ignore;
+ }
+ if (pi->get_pg_num_pending() > m->pgid.ps()) {
+ dout(20) << " pg_num_pending " << pi->get_pg_num_pending() << " > " << m->pgid << dendl;
+ goto ignore;
+ }
+ return false;
+
+ ignore:
+ mon.no_reply(op);
+ return true;
+}
+
+bool OSDMonitor::prepare_pg_ready_to_merge(MonOpRequestRef op)
+{
+ op->mark_osdmon_event(__func__);
+ auto m = op->get_req<MOSDPGReadyToMerge>();
+ dout(10) << __func__ << " " << *m << dendl;
+ pg_pool_t p;
+ if (pending_inc.new_pools.count(m->pgid.pool()))
+ p = pending_inc.new_pools[m->pgid.pool()];
+ else
+ p = *osdmap.get_pg_pool(m->pgid.pool());
+ if (p.get_pg_num() != m->pgid.ps() + 1 ||
+ p.get_pg_num_pending() > m->pgid.ps()) {
+ dout(10) << __func__
+ << " race with concurrent pg_num[_pending] update, will retry"
+ << dendl;
+ wait_for_finished_proposal(op, new C_RetryMessage(this, op));
+ return true;
+ }
+
+ if (m->ready) {
+ p.dec_pg_num(m->pgid,
+ pending_inc.epoch,
+ m->source_version,
+ m->target_version,
+ m->last_epoch_started,
+ m->last_epoch_clean);
+ p.last_change = pending_inc.epoch;
+ } else {
+ // back off the merge attempt!
+ p.set_pg_num_pending(p.get_pg_num());
+ }
+
+ // force pre-nautilus clients to resend their ops, since they
+ // don't understand pg_num_pending changes form a new interval
+ p.last_force_op_resend_prenautilus = pending_inc.epoch;
+
+ pending_inc.new_pools[m->pgid.pool()] = p;
+
+ auto prob = g_conf().get_val<double>("mon_inject_pg_merge_bounce_probability");
+ if (m->ready &&
+ prob > 0 &&
+ prob > (double)(rand() % 1000)/1000.0) {
+ derr << __func__ << " injecting pg merge pg_num bounce" << dendl;
+ auto n = new MMonCommand(mon.monmap->get_fsid());
+ n->set_connection(m->get_connection());
+ n->cmd = { "{\"prefix\":\"osd pool set\", \"pool\": \"" +
+ osdmap.get_pool_name(m->pgid.pool()) +
+ "\", \"var\": \"pg_num_actual\", \"val\": \"" +
+ stringify(m->pgid.ps() + 1) + "\"}" };
+ MonOpRequestRef nop = mon.op_tracker.create_request<MonOpRequest>(n);
+ nop->set_type_service();
+ wait_for_finished_proposal(op, new C_RetryMessage(this, nop));
+ } else {
+ wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
+ }
+ return true;
+}
+
+
+// -------------
+// pg_temp changes
+
+bool OSDMonitor::preprocess_pgtemp(MonOpRequestRef op)
+{
+ auto m = op->get_req<MOSDPGTemp>();
+ dout(10) << "preprocess_pgtemp " << *m << dendl;
+ mempool::osdmap::vector<int> empty;
+ int from = m->get_orig_source().num();
+ size_t ignore_cnt = 0;
+
+ // check caps
+ MonSession *session = op->get_session();
+ if (!session)
+ goto ignore;
+ if (!session->is_capable("osd", MON_CAP_X)) {
+ dout(0) << "attempt to send MOSDPGTemp from entity with insufficient caps "
+ << session->caps << dendl;
+ goto ignore;
+ }
+
+ if (!osdmap.is_up(from) ||
+ !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs())) {
+ dout(7) << "ignoring pgtemp message from down "
+ << m->get_orig_source() << " " << m->get_orig_source_addrs()
+ << dendl;
+ goto ignore;
+ }
+
+ if (m->forced) {
+ return false;
+ }
+
+ for (auto p = m->pg_temp.begin(); p != m->pg_temp.end(); ++p) {
+ dout(20) << " " << p->first
+ << (osdmap.pg_temp->count(p->first) ? osdmap.pg_temp->get(p->first) : empty)
+ << " -> " << p->second << dendl;
+
+ // does the pool exist?
+ if (!osdmap.have_pg_pool(p->first.pool())) {
+ /*
+ * 1. If the osdmap does not have the pool, it means the pool has been
+ * removed in-between the osd sending this message and us handling it.
+ * 2. If osdmap doesn't have the pool, it is safe to assume the pool does
+ * not exist in the pending either, as the osds would not send a
+ * message about a pool they know nothing about (yet).
+ * 3. However, if the pool does exist in the pending, then it must be a
+ * new pool, and not relevant to this message (see 1).
+ */
+ dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
+ << ": pool has been removed" << dendl;
+ ignore_cnt++;
+ continue;
+ }
+
+ int acting_primary = -1;
+ osdmap.pg_to_up_acting_osds(
+ p->first, nullptr, nullptr, nullptr, &acting_primary);
+ if (acting_primary != from) {
+ /* If the source isn't the primary based on the current osdmap, we know
+ * that the interval changed and that we can discard this message.
+ * Indeed, we must do so to avoid 16127 since we can't otherwise determine
+ * which of two pg temp mappings on the same pg is more recent.
+ */
+ dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
+ << ": primary has changed" << dendl;
+ ignore_cnt++;
+ continue;
+ }
+
+ // removal?
+ if (p->second.empty() && (osdmap.pg_temp->count(p->first) ||
+ osdmap.primary_temp->count(p->first)))
+ return false;
+ // change?
+ // NOTE: we assume that this will clear pg_primary, so consider
+ // an existing pg_primary field to imply a change
+ if (p->second.size() &&
+ (osdmap.pg_temp->count(p->first) == 0 ||
+ osdmap.pg_temp->get(p->first) != p->second ||
+ osdmap.primary_temp->count(p->first)))
+ return false;
+ }
+
+ // should we ignore all the pgs?
+ if (ignore_cnt == m->pg_temp.size())
+ goto ignore;
+
+ dout(7) << "preprocess_pgtemp e" << m->map_epoch << " no changes from " << m->get_orig_source_inst() << dendl;
+ _reply_map(op, m->map_epoch);
+ return true;
+
+ ignore:
+ mon.no_reply(op);
+ return true;
+}
+
+void OSDMonitor::update_up_thru(int from, epoch_t up_thru)
+{
+ epoch_t old_up_thru = osdmap.get_up_thru(from);
+ auto ut = pending_inc.new_up_thru.find(from);
+ if (ut != pending_inc.new_up_thru.end()) {
+ old_up_thru = ut->second;
+ }
+ if (up_thru > old_up_thru) {
+ // set up_thru too, so the osd doesn't have to ask again
+ pending_inc.new_up_thru[from] = up_thru;
+ }
+}
+
+bool OSDMonitor::prepare_pgtemp(MonOpRequestRef op)
+{
+ op->mark_osdmon_event(__func__);
+ auto m = op->get_req<MOSDPGTemp>();
+ int from = m->get_orig_source().num();
+ dout(7) << "prepare_pgtemp e" << m->map_epoch << " from " << m->get_orig_source_inst() << dendl;
+ for (map<pg_t,vector<int32_t> >::iterator p = m->pg_temp.begin(); p != m->pg_temp.end(); ++p) {
+ uint64_t pool = p->first.pool();
+ if (pending_inc.old_pools.count(pool)) {
+ dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
+ << ": pool pending removal" << dendl;
+ continue;
+ }
+ if (!osdmap.have_pg_pool(pool)) {
+ dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
+ << ": pool has been removed" << dendl;
+ continue;
+ }
+ pending_inc.new_pg_temp[p->first] =
+ mempool::osdmap::vector<int>(p->second.begin(), p->second.end());
+
+ // unconditionally clear pg_primary (until this message can encode
+ // a change for that, too.. at which point we need to also fix
+ // preprocess_pg_temp)
+ if (osdmap.primary_temp->count(p->first) ||
+ pending_inc.new_primary_temp.count(p->first))
+ pending_inc.new_primary_temp[p->first] = -1;
+ }
+
+ // set up_thru too, so the osd doesn't have to ask again
+ update_up_thru(from, m->map_epoch);
+
+ wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->map_epoch));
+ return true;
+}
+
+
+// ---
+
+bool OSDMonitor::preprocess_remove_snaps(MonOpRequestRef op)
+{
+ op->mark_osdmon_event(__func__);
+ auto m = op->get_req<MRemoveSnaps>();
+ dout(7) << "preprocess_remove_snaps " << *m << dendl;
+
+ // check privilege, ignore if failed
+ MonSession *session = op->get_session();
+ mon.no_reply(op);
+ if (!session)
+ goto ignore;
+ if (!session->caps.is_capable(
+ cct,
+ session->entity_name,
+ "osd", "osd pool rmsnap", {}, true, true, false,
+ session->get_peer_socket_addr())) {
+ dout(0) << "got preprocess_remove_snaps from entity with insufficient caps "
+ << session->caps << dendl;
+ goto ignore;
+ }
+
+ for (map<int, vector<snapid_t> >::iterator q = m->snaps.begin();
+ q != m->snaps.end();
+ ++q) {
+ if (!osdmap.have_pg_pool(q->first)) {
+ dout(10) << " ignoring removed_snaps " << q->second
+ << " on non-existent pool " << q->first << dendl;
+ continue;
+ }
+ const pg_pool_t *pi = osdmap.get_pg_pool(q->first);
+ for (vector<snapid_t>::iterator p = q->second.begin();
+ p != q->second.end();
+ ++p) {
+ if (*p > pi->get_snap_seq() ||
+ !_is_removed_snap(q->first, *p)) {
+ return false;
+ }
+ }
+ }
+
+ if (HAVE_FEATURE(m->get_connection()->get_features(), SERVER_OCTOPUS)) {
+ auto reply = make_message<MRemoveSnaps>();
+ reply->snaps = m->snaps;
+ mon.send_reply(op, reply.detach());
+ }
+
+ ignore:
+ return true;
+}
+
+bool OSDMonitor::prepare_remove_snaps(MonOpRequestRef op)
+{
+ op->mark_osdmon_event(__func__);
+ auto m = op->get_req<MRemoveSnaps>();
+ dout(7) << "prepare_remove_snaps " << *m << dendl;
+
+ for (auto& [pool, snaps] : m->snaps) {
+ if (!osdmap.have_pg_pool(pool)) {
+ dout(10) << " ignoring removed_snaps " << snaps
+ << " on non-existent pool " << pool << dendl;
+ continue;
+ }
+
+ pg_pool_t& pi = osdmap.pools[pool];
+ for (auto s : snaps) {
+ if (!_is_removed_snap(pool, s) &&
+ (!pending_inc.new_pools.count(pool) ||
+ !pending_inc.new_pools[pool].removed_snaps.contains(s)) &&
+ (!pending_inc.new_removed_snaps.count(pool) ||
+ !pending_inc.new_removed_snaps[pool].contains(s))) {
+ pg_pool_t *newpi = pending_inc.get_new_pool(pool, &pi);
+ if (osdmap.require_osd_release < ceph_release_t::octopus) {
+ newpi->removed_snaps.insert(s);
+ dout(10) << " pool " << pool << " removed_snaps added " << s
+ << " (now " << newpi->removed_snaps << ")" << dendl;
+ }
+ newpi->flags |= pg_pool_t::FLAG_SELFMANAGED_SNAPS;
+ if (s > newpi->get_snap_seq()) {
+ dout(10) << " pool " << pool << " snap_seq "
+ << newpi->get_snap_seq() << " -> " << s << dendl;
+ newpi->set_snap_seq(s);
+ }
+ newpi->set_snap_epoch(pending_inc.epoch);
+ dout(10) << " added pool " << pool << " snap " << s
+ << " to removed_snaps queue" << dendl;
+ pending_inc.new_removed_snaps[pool].insert(s);
+ }
+ }
+ }
+
+ if (HAVE_FEATURE(m->get_connection()->get_features(), SERVER_OCTOPUS)) {
+ auto reply = make_message<MRemoveSnaps>();
+ reply->snaps = m->snaps;
+ wait_for_finished_proposal(op, new C_ReplyOp(this, op, reply));
+ }
+
+ return true;
+}
+
+bool OSDMonitor::preprocess_get_purged_snaps(MonOpRequestRef op)
+{
+ op->mark_osdmon_event(__func__);
+ auto m = op->get_req<MMonGetPurgedSnaps>();
+ dout(7) << __func__ << " " << *m << dendl;
+
+ map<epoch_t,mempool::osdmap::map<int64_t,snap_interval_set_t>> r;
+
+ string k = make_purged_snap_epoch_key(m->start);
+ auto it = mon.store->get_iterator(OSD_SNAP_PREFIX);
+ it->upper_bound(k);
+ unsigned long epoch = m->last;
+ while (it->valid()) {
+ if (it->key().find("purged_epoch_") != 0) {
+ break;
+ }
+ string k = it->key();
+ int n = sscanf(k.c_str(), "purged_epoch_%lx", &epoch);
+ if (n != 1) {
+ derr << __func__ << " unable to parse key '" << it->key() << "'" << dendl;
+ } else if (epoch > m->last) {
+ break;
+ } else {
+ bufferlist bl = it->value();
+ auto p = bl.cbegin();
+ auto &v = r[epoch];
+ try {
+ ceph::decode(v, p);
+ } catch (ceph::buffer::error& e) {
+ derr << __func__ << " unable to parse value for key '" << it->key()
+ << "': \n";
+ bl.hexdump(*_dout);
+ *_dout << dendl;
+ }
+ n += 4 + v.size() * 16;
+ }
+ if (n > 1048576) {
+ // impose a semi-arbitrary limit to message size
+ break;
+ }
+ it->next();
+ }
+
+ auto reply = make_message<MMonGetPurgedSnapsReply>(m->start, epoch);
+ reply->purged_snaps.swap(r);
+ mon.send_reply(op, reply.detach());
+
+ return true;
+}
+
+// osd beacon
+bool OSDMonitor::preprocess_beacon(MonOpRequestRef op)
+{
+ op->mark_osdmon_event(__func__);
+ // check caps
+ auto session = op->get_session();
+ mon.no_reply(op);
+ if (!session) {
+ dout(10) << __func__ << " no monitor session!" << dendl;
+ return true;
+ }
+ if (!session->is_capable("osd", MON_CAP_X)) {
+ derr << __func__ << " received from entity "
+ << "with insufficient privileges " << session->caps << dendl;
+ return true;
+ }
+ // Always forward the beacon to the leader, even if they are the same as
+ // the old one. The leader will mark as down osds that haven't sent
+ // beacon for a few minutes.
+ return false;
+}
+
+bool OSDMonitor::prepare_beacon(MonOpRequestRef op)
+{
+ op->mark_osdmon_event(__func__);
+ const auto beacon = op->get_req<MOSDBeacon>();
+ const auto src = beacon->get_orig_source();
+ dout(10) << __func__ << " " << *beacon
+ << " from " << src << dendl;
+ int from = src.num();
+
+ if (!src.is_osd() ||
+ !osdmap.is_up(from) ||
+ !osdmap.get_addrs(from).legacy_equals(beacon->get_orig_source_addrs())) {
+ if (src.is_osd() && !osdmap.is_up(from)) {
+ // share some new maps with this guy in case it may not be
+ // aware of its own deadness...
+ send_latest(op, beacon->version+1);
+ }
+ dout(1) << " ignoring beacon from non-active osd." << from << dendl;
+ return false;
+ }
+
+ last_osd_report[from].first = ceph_clock_now();
+ last_osd_report[from].second = beacon->osd_beacon_report_interval;
+ osd_epochs[from] = beacon->version;
+
+ for (const auto& pg : beacon->pgs) {
+ if (auto* pool = osdmap.get_pg_pool(pg.pool()); pool != nullptr) {
+ unsigned pg_num = pool->get_pg_num();
+ last_epoch_clean.report(pg_num, pg, beacon->min_last_epoch_clean);
+ }
+ }
+
+ if (osdmap.osd_xinfo[from].last_purged_snaps_scrub <
+ beacon->last_purged_snaps_scrub) {
+ if (pending_inc.new_xinfo.count(from) == 0) {
+ pending_inc.new_xinfo[from] = osdmap.osd_xinfo[from];
+ }
+ pending_inc.new_xinfo[from].last_purged_snaps_scrub =
+ beacon->last_purged_snaps_scrub;
+ return true;
+ } else {
+ return false;
+ }
+}
+
+// ---------------
+// map helpers
+
+void OSDMonitor::send_latest(MonOpRequestRef op, epoch_t start)
+{
+ op->mark_osdmon_event(__func__);
+ dout(5) << "send_latest to " << op->get_req()->get_orig_source_inst()
+ << " start " << start << dendl;
+ if (start == 0)
+ send_full(op);
+ else
+ send_incremental(op, start);
+}
+
+
+MOSDMap *OSDMonitor::build_latest_full(uint64_t features)
+{
+ MOSDMap *r = new MOSDMap(mon.monmap->fsid, features);
+ get_version_full(osdmap.get_epoch(), features, r->maps[osdmap.get_epoch()]);
+ r->oldest_map = get_first_committed();
+ r->newest_map = osdmap.get_epoch();
+ return r;
+}
+
+MOSDMap *OSDMonitor::build_incremental(epoch_t from, epoch_t to, uint64_t features)
+{
+ dout(10) << "build_incremental [" << from << ".." << to << "] with features "
+ << std::hex << features << std::dec << dendl;
+ MOSDMap *m = new MOSDMap(mon.monmap->fsid, features);
+ m->oldest_map = get_first_committed();
+ m->newest_map = osdmap.get_epoch();
+
+ for (epoch_t e = to; e >= from && e > 0; e--) {
+ bufferlist bl;
+ int err = get_version(e, features, bl);
+ if (err == 0) {
+ ceph_assert(bl.length());
+ // if (get_version(e, bl) > 0) {
+ dout(20) << "build_incremental inc " << e << " "
+ << bl.length() << " bytes" << dendl;
+ m->incremental_maps[e] = bl;
+ } else {
+ ceph_assert(err == -ENOENT);
+ ceph_assert(!bl.length());
+ get_version_full(e, features, bl);
+ if (bl.length() > 0) {
+ //else if (get_version("full", e, bl) > 0) {
+ dout(20) << "build_incremental full " << e << " "
+ << bl.length() << " bytes" << dendl;
+ m->maps[e] = bl;
+ } else {
+ ceph_abort(); // we should have all maps.
+ }
+ }
+ }
+ return m;
+}
+
+void OSDMonitor::send_full(MonOpRequestRef op)
+{
+ op->mark_osdmon_event(__func__);
+ dout(5) << "send_full to " << op->get_req()->get_orig_source_inst() << dendl;
+ mon.send_reply(op, build_latest_full(op->get_session()->con_features));
+}
+
+void OSDMonitor::send_incremental(MonOpRequestRef op, epoch_t first)
+{
+ op->mark_osdmon_event(__func__);
+
+ MonSession *s = op->get_session();
+ ceph_assert(s);
+
+ if (s->proxy_con) {
+ // oh, we can tell the other mon to do it
+ dout(10) << __func__ << " asking proxying mon to send_incremental from "
+ << first << dendl;
+ MRoute *r = new MRoute(s->proxy_tid, NULL);
+ r->send_osdmap_first = first;
+ s->proxy_con->send_message(r);
+ op->mark_event("reply: send routed send_osdmap_first reply");
+ } else {
+ // do it ourselves
+ send_incremental(first, s, false, op);
+ }
+}
+
+void OSDMonitor::send_incremental(epoch_t first,
+ MonSession *session,
+ bool onetime,
+ MonOpRequestRef req)
+{
+ dout(5) << "send_incremental [" << first << ".." << osdmap.get_epoch() << "]"
+ << " to " << session->name << dendl;
+
+ // get feature of the peer
+ // use quorum_con_features, if it's an anonymous connection.
+ uint64_t features = session->con_features ? session->con_features :
+ mon.get_quorum_con_features();
+
+ if (first <= session->osd_epoch) {
+ dout(10) << __func__ << " " << session->name << " should already have epoch "
+ << session->osd_epoch << dendl;
+ first = session->osd_epoch + 1;
+ }
+
+ if (first < get_first_committed()) {
+ MOSDMap *m = new MOSDMap(osdmap.get_fsid(), features);
+ m->oldest_map = get_first_committed();
+ m->newest_map = osdmap.get_epoch();
+
+ first = get_first_committed();
+ bufferlist bl;
+ int err = get_version_full(first, features, bl);
+ ceph_assert(err == 0);
+ ceph_assert(bl.length());
+ dout(20) << "send_incremental starting with base full "
+ << first << " " << bl.length() << " bytes" << dendl;
+ m->maps[first] = bl;
+
+ if (req) {
+ mon.send_reply(req, m);
+ session->osd_epoch = first;
+ return;
+ } else {
+ session->con->send_message(m);
+ session->osd_epoch = first;
+ }
+ first++;
+ }
+
+ while (first <= osdmap.get_epoch()) {
+ epoch_t last = std::min<epoch_t>(first + g_conf()->osd_map_message_max - 1,
+ osdmap.get_epoch());
+ MOSDMap *m = build_incremental(first, last, features);
+
+ if (req) {
+ // send some maps. it may not be all of them, but it will get them
+ // started.
+ mon.send_reply(req, m);
+ } else {
+ session->con->send_message(m);
+ first = last + 1;
+ }
+ session->osd_epoch = last;
+ if (onetime || req)
+ break;
+ }
+}
+
+int OSDMonitor::get_version(version_t ver, bufferlist& bl)
+{
+ return get_version(ver, mon.get_quorum_con_features(), bl);
+}
+
+void OSDMonitor::reencode_incremental_map(bufferlist& bl, uint64_t features)
+{
+ OSDMap::Incremental inc;
+ auto q = bl.cbegin();
+ inc.decode(q);
+ // always encode with subset of osdmap's canonical features
+ uint64_t f = features & inc.encode_features;
+ dout(20) << __func__ << " " << inc.epoch << " with features " << f
+ << dendl;
+ bl.clear();
+ if (inc.fullmap.length()) {
+ // embedded full map?
+ OSDMap m;
+ m.decode(inc.fullmap);
+ inc.fullmap.clear();
+ m.encode(inc.fullmap, f | CEPH_FEATURE_RESERVED);
+ }
+ if (inc.crush.length()) {
+ // embedded crush map
+ CrushWrapper c;
+ auto p = inc.crush.cbegin();
+ c.decode(p);
+ inc.crush.clear();
+ c.encode(inc.crush, f);
+ }
+ inc.encode(bl, f | CEPH_FEATURE_RESERVED);
+}
+
+void OSDMonitor::reencode_full_map(bufferlist& bl, uint64_t features)
+{
+ OSDMap m;
+ auto q = bl.cbegin();
+ m.decode(q);
+ // always encode with subset of osdmap's canonical features
+ uint64_t f = features & m.get_encoding_features();
+ dout(20) << __func__ << " " << m.get_epoch() << " with features " << f
+ << dendl;
+ bl.clear();
+ m.encode(bl, f | CEPH_FEATURE_RESERVED);
+}
+
+int OSDMonitor::get_version(version_t ver, uint64_t features, bufferlist& bl)
+{
+ uint64_t significant_features = OSDMap::get_significant_features(features);
+ if (inc_osd_cache.lookup({ver, significant_features}, &bl)) {
+ return 0;
+ }
+ int ret = PaxosService::get_version(ver, bl);
+ if (ret < 0) {
+ return ret;
+ }
+ // NOTE: this check is imprecise; the OSDMap encoding features may
+ // be a subset of the latest mon quorum features, but worst case we
+ // reencode once and then cache the (identical) result under both
+ // feature masks.
+ if (significant_features !=
+ OSDMap::get_significant_features(mon.get_quorum_con_features())) {
+ reencode_incremental_map(bl, features);
+ }
+ inc_osd_cache.add_bytes({ver, significant_features}, bl);
+ return 0;
+}
+
+int OSDMonitor::get_inc(version_t ver, OSDMap::Incremental& inc)
+{
+ bufferlist inc_bl;
+ int err = get_version(ver, inc_bl);
+ ceph_assert(err == 0);
+ ceph_assert(inc_bl.length());
+
+ auto p = inc_bl.cbegin();
+ inc.decode(p);
+ dout(10) << __func__ << " "
+ << " epoch " << inc.epoch
+ << " inc_crc " << inc.inc_crc
+ << " full_crc " << inc.full_crc
+ << " encode_features " << inc.encode_features << dendl;
+ return 0;
+}
+
+int OSDMonitor::get_full_from_pinned_map(version_t ver, bufferlist& bl)
+{
+ dout(10) << __func__ << " ver " << ver << dendl;
+
+ version_t closest_pinned = osdmap_manifest.get_lower_closest_pinned(ver);
+ if (closest_pinned == 0) {
+ return -ENOENT;
+ }
+ if (closest_pinned > ver) {
+ dout(0) << __func__ << " pinned: " << osdmap_manifest.pinned << dendl;
+ }
+ ceph_assert(closest_pinned <= ver);
+
+ dout(10) << __func__ << " closest pinned ver " << closest_pinned << dendl;
+
+ // get osdmap incremental maps and apply on top of this one.
+ bufferlist osdm_bl;
+ bool has_cached_osdmap = false;
+ for (version_t v = ver-1; v >= closest_pinned; --v) {
+ if (full_osd_cache.lookup({v, mon.get_quorum_con_features()},
+ &osdm_bl)) {
+ dout(10) << __func__ << " found map in cache ver " << v << dendl;
+ closest_pinned = v;
+ has_cached_osdmap = true;
+ break;
+ }
+ }
+
+ if (!has_cached_osdmap) {
+ int err = PaxosService::get_version_full(closest_pinned, osdm_bl);
+ if (err != 0) {
+ derr << __func__ << " closest pinned map ver " << closest_pinned
+ << " not available! error: " << cpp_strerror(err) << dendl;
+ }
+ ceph_assert(err == 0);
+ }
+
+ ceph_assert(osdm_bl.length());
+
+ OSDMap osdm;
+ osdm.decode(osdm_bl);
+
+ dout(10) << __func__ << " loaded osdmap epoch " << closest_pinned
+ << " e" << osdm.epoch
+ << " crc " << osdm.get_crc()
+ << " -- applying incremental maps." << dendl;
+
+ uint64_t encode_features = 0;
+ for (version_t v = closest_pinned + 1; v <= ver; ++v) {
+ dout(20) << __func__ << " applying inc epoch " << v << dendl;
+
+ OSDMap::Incremental inc;
+ int err = get_inc(v, inc);
+ ceph_assert(err == 0);
+
+ encode_features = inc.encode_features;
+
+ err = osdm.apply_incremental(inc);
+ ceph_assert(err == 0);
+
+ // this block performs paranoid checks on map retrieval
+ if (g_conf().get_val<bool>("mon_debug_extra_checks") &&
+ inc.full_crc != 0) {
+
+ uint64_t f = encode_features;
+ if (!f) {
+ f = (mon.quorum_con_features ? mon.quorum_con_features : -1);
+ }
+
+ // encode osdmap to force calculating crcs
+ bufferlist tbl;
+ osdm.encode(tbl, f | CEPH_FEATURE_RESERVED);
+ // decode osdmap to compare crcs with what's expected by incremental
+ OSDMap tosdm;
+ tosdm.decode(tbl);
+
+ if (tosdm.get_crc() != inc.full_crc) {
+ derr << __func__
+ << " osdmap crc mismatch! (osdmap crc " << tosdm.get_crc()
+ << ", expected " << inc.full_crc << ")" << dendl;
+ ceph_abort_msg("osdmap crc mismatch");
+ }
+ }
+
+ // note: we cannot add the recently computed map to the cache, as is,
+ // because we have not encoded the map into a bl.
+ }
+
+ if (!encode_features) {
+ dout(10) << __func__
+ << " last incremental map didn't have features;"
+ << " defaulting to quorum's or all" << dendl;
+ encode_features =
+ (mon.quorum_con_features ? mon.quorum_con_features : -1);
+ }
+ osdm.encode(bl, encode_features | CEPH_FEATURE_RESERVED);
+
+ return 0;
+}
+
+int OSDMonitor::get_version_full(version_t ver, bufferlist& bl)
+{
+ return get_version_full(ver, mon.get_quorum_con_features(), bl);
+}
+
+int OSDMonitor::get_version_full(version_t ver, uint64_t features,
+ bufferlist& bl)
+{
+ uint64_t significant_features = OSDMap::get_significant_features(features);
+ if (full_osd_cache.lookup({ver, significant_features}, &bl)) {
+ return 0;
+ }
+ int ret = PaxosService::get_version_full(ver, bl);
+ if (ret == -ENOENT) {
+ // build map?
+ ret = get_full_from_pinned_map(ver, bl);
+ }
+ if (ret < 0) {
+ return ret;
+ }
+ // NOTE: this check is imprecise; the OSDMap encoding features may
+ // be a subset of the latest mon quorum features, but worst case we
+ // reencode once and then cache the (identical) result under both
+ // feature masks.
+ if (significant_features !=
+ OSDMap::get_significant_features(mon.get_quorum_con_features())) {
+ reencode_full_map(bl, features);
+ }
+ full_osd_cache.add_bytes({ver, significant_features}, bl);
+ return 0;
+}
+
+epoch_t OSDMonitor::blocklist(const entity_addrvec_t& av, utime_t until)
+{
+ dout(10) << "blocklist " << av << " until " << until << dendl;
+ for (auto a : av.v) {
+ if (osdmap.require_osd_release >= ceph_release_t::nautilus) {
+ a.set_type(entity_addr_t::TYPE_ANY);
+ } else {
+ a.set_type(entity_addr_t::TYPE_LEGACY);
+ }
+ pending_inc.new_blocklist[a] = until;
+ }
+ return pending_inc.epoch;
+}
+
+epoch_t OSDMonitor::blocklist(entity_addr_t a, utime_t until)
+{
+ if (osdmap.require_osd_release >= ceph_release_t::nautilus) {
+ a.set_type(entity_addr_t::TYPE_ANY);
+ } else {
+ a.set_type(entity_addr_t::TYPE_LEGACY);
+ }
+ dout(10) << "blocklist " << a << " until " << until << dendl;
+ pending_inc.new_blocklist[a] = until;
+ return pending_inc.epoch;
+}
+
+
+void OSDMonitor::check_osdmap_subs()
+{
+ dout(10) << __func__ << dendl;
+ if (!osdmap.get_epoch()) {
+ return;
+ }
+ auto osdmap_subs = mon.session_map.subs.find("osdmap");
+ if (osdmap_subs == mon.session_map.subs.end()) {
+ return;
+ }
+ auto p = osdmap_subs->second->begin();
+ while (!p.end()) {
+ auto sub = *p;
+ ++p;
+ check_osdmap_sub(sub);
+ }
+}
+
+void OSDMonitor::check_osdmap_sub(Subscription *sub)
+{
+ dout(10) << __func__ << " " << sub << " next " << sub->next
+ << (sub->onetime ? " (onetime)":" (ongoing)") << dendl;
+ if (sub->next <= osdmap.get_epoch()) {
+ if (sub->next >= 1)
+ send_incremental(sub->next, sub->session, sub->incremental_onetime);
+ else
+ sub->session->con->send_message(build_latest_full(sub->session->con_features));
+ if (sub->onetime)
+ mon.session_map.remove_sub(sub);
+ else
+ sub->next = osdmap.get_epoch() + 1;
+ }
+}
+
+void OSDMonitor::check_pg_creates_subs()
+{
+ if (!osdmap.get_num_up_osds()) {
+ return;
+ }
+ ceph_assert(osdmap.get_up_osd_features() & CEPH_FEATURE_MON_STATEFUL_SUB);
+ mon.with_session_map([this](const MonSessionMap& session_map) {
+ auto pg_creates_subs = session_map.subs.find("osd_pg_creates");
+ if (pg_creates_subs == session_map.subs.end()) {
+ return;
+ }
+ for (auto sub : *pg_creates_subs->second) {
+ check_pg_creates_sub(sub);
+ }
+ });
+}
+
+void OSDMonitor::check_pg_creates_sub(Subscription *sub)
+{
+ dout(20) << __func__ << " .. " << sub->session->name << dendl;
+ ceph_assert(sub->type == "osd_pg_creates");
+ // only send these if the OSD is up. we will check_subs() when they do
+ // come up so they will get the creates then.
+ if (sub->session->name.is_osd() &&
+ mon.osdmon()->osdmap.is_up(sub->session->name.num())) {
+ sub->next = send_pg_creates(sub->session->name.num(),
+ sub->session->con.get(),
+ sub->next);
+ }
+}
+
+void OSDMonitor::do_application_enable(int64_t pool_id,
+ const std::string &app_name,
+ const std::string &app_key,
+ const std::string &app_value,
+ bool force)
+{
+ ceph_assert(paxos.is_plugged() && is_writeable());
+
+ dout(20) << __func__ << ": pool_id=" << pool_id << ", app_name=" << app_name
+ << dendl;
+
+ ceph_assert(osdmap.require_osd_release >= ceph_release_t::luminous);
+
+ auto pp = osdmap.get_pg_pool(pool_id);
+ ceph_assert(pp != nullptr);
+
+ pg_pool_t p = *pp;
+ if (pending_inc.new_pools.count(pool_id)) {
+ p = pending_inc.new_pools[pool_id];
+ }
+
+ if (app_key.empty()) {
+ p.application_metadata.insert({app_name, {}});
+ } else {
+ if (force) {
+ p.application_metadata[app_name][app_key] = app_value;
+ } else {
+ p.application_metadata.insert({app_name, {{app_key, app_value}}});
+ }
+ }
+ p.last_change = pending_inc.epoch;
+ pending_inc.new_pools[pool_id] = p;
+}
+
+void OSDMonitor::do_set_pool_opt(int64_t pool_id,
+ pool_opts_t::key_t opt,
+ pool_opts_t::value_t val)
+{
+ dout(10) << __func__ << " pool: " << pool_id << " option: " << opt
+ << " val: " << val << dendl;
+ auto p = pending_inc.new_pools.try_emplace(
+ pool_id, *osdmap.get_pg_pool(pool_id));
+ p.first->second.opts.set(opt, val);
+}
+
+unsigned OSDMonitor::scan_for_creating_pgs(
+ const mempool::osdmap::map<int64_t,pg_pool_t>& pools,
+ const mempool::osdmap::set<int64_t>& removed_pools,
+ utime_t modified,
+ creating_pgs_t* creating_pgs) const
+{
+ unsigned queued = 0;
+ for (auto& p : pools) {
+ int64_t poolid = p.first;
+ if (creating_pgs->created_pools.count(poolid)) {
+ dout(10) << __func__ << " already created " << poolid << dendl;
+ continue;
+ }
+ const pg_pool_t& pool = p.second;
+ int ruleno = osdmap.crush->find_rule(pool.get_crush_rule(),
+ pool.get_type(), pool.get_size());
+ if (ruleno < 0 || !osdmap.crush->rule_exists(ruleno))
+ continue;
+
+ const auto last_scan_epoch = creating_pgs->last_scan_epoch;
+ const auto created = pool.get_last_change();
+ if (last_scan_epoch && created <= last_scan_epoch) {
+ dout(10) << __func__ << " no change in pool " << poolid
+ << " " << pool << dendl;
+ continue;
+ }
+ if (removed_pools.count(poolid)) {
+ dout(10) << __func__ << " pool is being removed: " << poolid
+ << " " << pool << dendl;
+ continue;
+ }
+ dout(10) << __func__ << " queueing pool create for " << poolid
+ << " " << pool << dendl;
+ creating_pgs->create_pool(poolid, pool.get_pg_num(),
+ created, modified);
+ queued++;
+ }
+ return queued;
+}
+
+void OSDMonitor::update_creating_pgs()
+{
+ dout(10) << __func__ << " " << creating_pgs.pgs.size() << " pgs creating, "
+ << creating_pgs.queue.size() << " pools in queue" << dendl;
+ decltype(creating_pgs_by_osd_epoch) new_pgs_by_osd_epoch;
+ std::lock_guard<std::mutex> l(creating_pgs_lock);
+ for (const auto& pg : creating_pgs.pgs) {
+ int acting_primary = -1;
+ auto pgid = pg.first;
+ if (!osdmap.pg_exists(pgid)) {
+ dout(20) << __func__ << " ignoring " << pgid << " which should not exist"
+ << dendl;
+ continue;
+ }
+ auto mapped = pg.second.create_epoch;
+ dout(20) << __func__ << " looking up " << pgid << "@" << mapped << dendl;
+ spg_t spgid(pgid);
+ mapping.get_primary_and_shard(pgid, &acting_primary, &spgid);
+ // check the previous creating_pgs, look for the target to whom the pg was
+ // previously mapped
+ for (const auto& pgs_by_epoch : creating_pgs_by_osd_epoch) {
+ const auto last_acting_primary = pgs_by_epoch.first;
+ for (auto& pgs: pgs_by_epoch.second) {
+ if (pgs.second.count(spgid)) {
+ if (last_acting_primary == acting_primary) {
+ mapped = pgs.first;
+ } else {
+ dout(20) << __func__ << " " << pgid << " "
+ << " acting_primary:" << last_acting_primary
+ << " -> " << acting_primary << dendl;
+ // note epoch if the target of the create message changed.
+ mapped = mapping.get_epoch();
+ }
+ break;
+ } else {
+ // newly creating
+ mapped = mapping.get_epoch();
+ }
+ }
+ }
+ dout(10) << __func__ << " will instruct osd." << acting_primary
+ << " to create " << pgid << "@" << mapped << dendl;
+ new_pgs_by_osd_epoch[acting_primary][mapped].insert(spgid);
+ }
+ creating_pgs_by_osd_epoch = std::move(new_pgs_by_osd_epoch);
+ creating_pgs_epoch = mapping.get_epoch();
+}
+
+epoch_t OSDMonitor::send_pg_creates(int osd, Connection *con, epoch_t next) const
+{
+ dout(30) << __func__ << " osd." << osd << " next=" << next
+ << " " << creating_pgs_by_osd_epoch << dendl;
+ std::lock_guard<std::mutex> l(creating_pgs_lock);
+ if (creating_pgs_epoch <= creating_pgs.last_scan_epoch) {
+ dout(20) << __func__
+ << " not using stale creating_pgs@" << creating_pgs_epoch << dendl;
+ // the subscribers will be updated when the mapping is completed anyway
+ return next;
+ }
+ auto creating_pgs_by_epoch = creating_pgs_by_osd_epoch.find(osd);
+ if (creating_pgs_by_epoch == creating_pgs_by_osd_epoch.end())
+ return next;
+ ceph_assert(!creating_pgs_by_epoch->second.empty());
+
+ MOSDPGCreate *oldm = nullptr; // for pre-mimic OSD compat
+ MOSDPGCreate2 *m = nullptr;
+
+ bool old = osdmap.require_osd_release < ceph_release_t::nautilus;
+
+ epoch_t last = 0;
+ for (auto epoch_pgs = creating_pgs_by_epoch->second.lower_bound(next);
+ epoch_pgs != creating_pgs_by_epoch->second.end(); ++epoch_pgs) {
+ auto epoch = epoch_pgs->first;
+ auto& pgs = epoch_pgs->second;
+ dout(20) << __func__ << " osd." << osd << " from " << next
+ << " : epoch " << epoch << " " << pgs.size() << " pgs" << dendl;
+ last = epoch;
+ for (auto& pg : pgs) {
+ // Need the create time from the monitor using its clock to set
+ // last_scrub_stamp upon pg creation.
+ auto create = creating_pgs.pgs.find(pg.pgid);
+ ceph_assert(create != creating_pgs.pgs.end());
+ if (old) {
+ if (!oldm) {
+ oldm = new MOSDPGCreate(creating_pgs_epoch);
+ }
+ oldm->mkpg.emplace(pg.pgid,
+ pg_create_t{create->second.create_epoch, pg.pgid, 0});
+ oldm->ctimes.emplace(pg.pgid, create->second.create_stamp);
+ } else {
+ if (!m) {
+ m = new MOSDPGCreate2(creating_pgs_epoch);
+ }
+ m->pgs.emplace(pg, make_pair(create->second.create_epoch,
+ create->second.create_stamp));
+ if (create->second.history.epoch_created) {
+ dout(20) << __func__ << " " << pg << " " << create->second.history
+ << " " << create->second.past_intervals << dendl;
+ m->pg_extra.emplace(pg, make_pair(create->second.history,
+ create->second.past_intervals));
+ }
+ }
+ dout(20) << __func__ << " will create " << pg
+ << " at " << create->second.create_epoch << dendl;
+ }
+ }
+ if (m) {
+ con->send_message(m);
+ } else if (oldm) {
+ con->send_message(oldm);
+ } else {
+ dout(20) << __func__ << " osd." << osd << " from " << next
+ << " has nothing to send" << dendl;
+ return next;
+ }
+
+ // sub is current through last + 1
+ return last + 1;
+}
+
+// TICK
+
+
+void OSDMonitor::tick()
+{
+ if (!is_active()) return;
+
+ dout(10) << osdmap << dendl;
+
+ // always update osdmap manifest, regardless of being the leader.
+ load_osdmap_manifest();
+
+ // always tune priority cache manager memory on leader and peons
+ if (ceph_using_tcmalloc() && mon_memory_autotune) {
+ std::lock_guard l(balancer_lock);
+ if (pcm != nullptr) {
+ pcm->tune_memory();
+ pcm->balance();
+ _set_new_cache_sizes();
+ dout(10) << "tick balancer "
+ << " inc cache_bytes: " << inc_cache->get_cache_bytes()
+ << " inc comtd_bytes: " << inc_cache->get_committed_size()
+ << " inc used_bytes: " << inc_cache->_get_used_bytes()
+ << " inc num_osdmaps: " << inc_cache->_get_num_osdmaps()
+ << dendl;
+ dout(10) << "tick balancer "
+ << " full cache_bytes: " << full_cache->get_cache_bytes()
+ << " full comtd_bytes: " << full_cache->get_committed_size()
+ << " full used_bytes: " << full_cache->_get_used_bytes()
+ << " full num_osdmaps: " << full_cache->_get_num_osdmaps()
+ << dendl;
+ }
+ }
+
+ if (!mon.is_leader()) return;
+
+ bool do_propose = false;
+ utime_t now = ceph_clock_now();
+
+ if (handle_osd_timeouts(now, last_osd_report)) {
+ do_propose = true;
+ }
+
+ // mark osds down?
+ if (check_failures(now)) {
+ do_propose = true;
+ }
+
+ // Force a proposal if we need to prune; pruning is performed on
+ // ``encode_pending()``, hence why we need to regularly trigger a proposal
+ // even if there's nothing going on.
+ if (is_prune_enabled() && should_prune()) {
+ do_propose = true;
+ }
+
+ // mark down osds out?
+
+ /* can_mark_out() checks if we can mark osds as being out. The -1 has no
+ * influence at all. The decision is made based on the ratio of "in" osds,
+ * and the function returns false if this ratio is lower that the minimum
+ * ratio set by g_conf()->mon_osd_min_in_ratio. So it's not really up to us.
+ */
+ if (can_mark_out(-1)) {
+ string down_out_subtree_limit = g_conf().get_val<string>(
+ "mon_osd_down_out_subtree_limit");
+ set<int> down_cache; // quick cache of down subtrees
+
+ map<int,utime_t>::iterator i = down_pending_out.begin();
+ while (i != down_pending_out.end()) {
+ int o = i->first;
+ utime_t down = now;
+ down -= i->second;
+ ++i;
+
+ if (osdmap.is_down(o) &&
+ osdmap.is_in(o) &&
+ can_mark_out(o)) {
+ utime_t orig_grace(g_conf()->mon_osd_down_out_interval, 0);
+ utime_t grace = orig_grace;
+ double my_grace = 0.0;
+
+ if (g_conf()->mon_osd_adjust_down_out_interval) {
+ // scale grace period the same way we do the heartbeat grace.
+ const osd_xinfo_t& xi = osdmap.get_xinfo(o);
+ double halflife = (double)g_conf()->mon_osd_laggy_halflife;
+ double decay_k = ::log(.5) / halflife;
+ double decay = exp((double)down * decay_k);
+ dout(20) << "osd." << o << " laggy halflife " << halflife << " decay_k " << decay_k
+ << " down for " << down << " decay " << decay << dendl;
+ my_grace = decay * (double)xi.laggy_interval * xi.laggy_probability;
+ grace += my_grace;
+ }
+
+ // is this an entire large subtree down?
+ if (down_out_subtree_limit.length()) {
+ int type = osdmap.crush->get_type_id(down_out_subtree_limit);
+ if (type > 0) {
+ if (osdmap.containing_subtree_is_down(cct, o, type, &down_cache)) {
+ dout(10) << "tick entire containing " << down_out_subtree_limit
+ << " subtree for osd." << o
+ << " is down; resetting timer" << dendl;
+ // reset timer, too.
+ down_pending_out[o] = now;
+ continue;
+ }
+ }
+ }
+
+ bool down_out = !osdmap.is_destroyed(o) &&
+ g_conf()->mon_osd_down_out_interval > 0 && down.sec() >= grace;
+ bool destroyed_out = osdmap.is_destroyed(o) &&
+ g_conf()->mon_osd_destroyed_out_interval > 0 &&
+ // this is not precise enough as we did not make a note when this osd
+ // was marked as destroyed, but let's not bother with that
+ // complexity for now.
+ down.sec() >= g_conf()->mon_osd_destroyed_out_interval;
+ if (down_out || destroyed_out) {
+ dout(10) << "tick marking osd." << o << " OUT after " << down
+ << " sec (target " << grace << " = " << orig_grace << " + " << my_grace << ")" << dendl;
+ pending_inc.new_weight[o] = CEPH_OSD_OUT;
+
+ // set the AUTOOUT bit.
+ if (pending_inc.new_state.count(o) == 0)
+ pending_inc.new_state[o] = 0;
+ pending_inc.new_state[o] |= CEPH_OSD_AUTOOUT;
+
+ // remember previous weight
+ if (pending_inc.new_xinfo.count(o) == 0)
+ pending_inc.new_xinfo[o] = osdmap.osd_xinfo[o];
+ pending_inc.new_xinfo[o].old_weight = osdmap.osd_weight[o];
+
+ do_propose = true;
+
+ mon.clog->info() << "Marking osd." << o << " out (has been down for "
+ << int(down.sec()) << " seconds)";
+ } else
+ continue;
+ }
+
+ down_pending_out.erase(o);
+ }
+ } else {
+ dout(10) << "tick NOOUT flag set, not checking down osds" << dendl;
+ }
+
+ // expire blocklisted items?
+ for (ceph::unordered_map<entity_addr_t,utime_t>::iterator p = osdmap.blocklist.begin();
+ p != osdmap.blocklist.end();
+ ++p) {
+ if (p->second < now) {
+ dout(10) << "expiring blocklist item " << p->first << " expired " << p->second << " < now " << now << dendl;
+ pending_inc.old_blocklist.push_back(p->first);
+ do_propose = true;
+ }
+ }
+ for (auto p = osdmap.range_blocklist.begin();
+ p != osdmap.range_blocklist.end();
+ ++p) {
+ if (p->second < now) {
+ dout(10) << "expiring range_blocklist item " << p->first
+ << " expired " << p->second << " < now " << now << dendl;
+ pending_inc.old_range_blocklist.push_back(p->first);
+ do_propose = true;
+ }
+ }
+
+ if (try_prune_purged_snaps()) {
+ do_propose = true;
+ }
+
+ if (update_pools_status())
+ do_propose = true;
+
+ if (do_propose ||
+ !pending_inc.new_pg_temp.empty()) // also propose if we adjusted pg_temp
+ propose_pending();
+}
+
+void OSDMonitor::_set_new_cache_sizes()
+{
+ uint64_t cache_size = 0;
+ int64_t inc_alloc = 0;
+ int64_t full_alloc = 0;
+ int64_t kv_alloc = 0;
+
+ if (pcm != nullptr && rocksdb_binned_kv_cache != nullptr) {
+ cache_size = pcm->get_tuned_mem();
+ inc_alloc = inc_cache->get_committed_size();
+ full_alloc = full_cache->get_committed_size();
+ kv_alloc = rocksdb_binned_kv_cache->get_committed_size();
+ }
+
+ inc_osd_cache.set_bytes(inc_alloc);
+ full_osd_cache.set_bytes(full_alloc);
+
+ dout(1) << __func__ << " cache_size:" << cache_size
+ << " inc_alloc: " << inc_alloc
+ << " full_alloc: " << full_alloc
+ << " kv_alloc: " << kv_alloc
+ << dendl;
+}
+
+bool OSDMonitor::handle_osd_timeouts(const utime_t &now,
+ std::map<int, std::pair<utime_t, int>> &last_osd_report)
+{
+ utime_t timeo(g_conf()->mon_osd_report_timeout, 0);
+ if (now - mon.get_leader_since() < timeo) {
+ // We haven't been the leader for long enough to consider OSD timeouts
+ return false;
+ }
+
+ int max_osd = osdmap.get_max_osd();
+ bool new_down = false;
+
+ for (int i=0; i < max_osd; ++i) {
+ dout(30) << __func__ << ": checking up on osd " << i << dendl;
+ if (!osdmap.exists(i)) {
+ last_osd_report.erase(i); // if any
+ continue;
+ }
+ if (!osdmap.is_up(i))
+ continue;
+ const std::map<int, std::pair<utime_t, int>>::const_iterator t = last_osd_report.find(i);
+ if (t == last_osd_report.end()) {
+ // it wasn't in the map; start the timer.
+ last_osd_report[i].first = now;
+ last_osd_report[i].second = 0;
+ } else if (can_mark_down(i)) {
+ utime_t diff = now - t->second.first;
+ // we use the max(mon_osd_report_timeout, 2*osd_beacon_report_interval) as timeout
+ // to allow for the osd to miss a beacon.
+ int mon_osd_report_timeout = g_conf()->mon_osd_report_timeout;
+ utime_t max_timeout(std::max(mon_osd_report_timeout, 2 * t->second.second), 0);
+ if (diff > max_timeout) {
+ mon.clog->info() << "osd." << i << " marked down after no beacon for "
+ << diff << " seconds";
+ derr << "no beacon from osd." << i << " since " << t->second.first
+ << ", " << diff << " seconds ago. marking down" << dendl;
+ pending_inc.new_state[i] = CEPH_OSD_UP;
+ new_down = true;
+ }
+ }
+ }
+ return new_down;
+}
+
+static void dump_cpu_list(Formatter *f, const char *name,
+ const string& strlist)
+{
+ cpu_set_t cpu_set;
+ size_t cpu_set_size;
+ if (parse_cpu_set_list(strlist.c_str(), &cpu_set_size, &cpu_set) < 0) {
+ return;
+ }
+ set<int> cpus = cpu_set_to_set(cpu_set_size, &cpu_set);
+ f->open_array_section(name);
+ for (auto cpu : cpus) {
+ f->dump_int("cpu", cpu);
+ }
+ f->close_section();
+}
+
+void OSDMonitor::dump_info(Formatter *f)
+{
+ f->open_object_section("osdmap");
+ osdmap.dump(f);
+ f->close_section();
+
+ f->open_array_section("osd_metadata");
+ for (int i=0; i<osdmap.get_max_osd(); ++i) {
+ if (osdmap.exists(i)) {
+ f->open_object_section("osd");
+ f->dump_unsigned("id", i);
+ dump_osd_metadata(i, f, NULL);
+ f->close_section();
+ }
+ }
+ f->close_section();
+
+ f->open_object_section("osdmap_clean_epochs");
+ f->dump_unsigned("min_last_epoch_clean", get_min_last_epoch_clean());
+
+ f->open_object_section("last_epoch_clean");
+ last_epoch_clean.dump(f);
+ f->close_section();
+
+ f->open_array_section("osd_epochs");
+ for (auto& osd_epoch : osd_epochs) {
+ f->open_object_section("osd");
+ f->dump_unsigned("id", osd_epoch.first);
+ f->dump_unsigned("epoch", osd_epoch.second);
+ f->close_section();
+ }
+ f->close_section(); // osd_epochs
+
+ f->close_section(); // osd_clean_epochs
+
+ f->dump_unsigned("osdmap_first_committed", get_first_committed());
+ f->dump_unsigned("osdmap_last_committed", get_last_committed());
+
+ f->open_object_section("crushmap");
+ osdmap.crush->dump(f);
+ f->close_section();
+
+ if (has_osdmap_manifest) {
+ f->open_object_section("osdmap_manifest");
+ osdmap_manifest.dump(f);
+ f->close_section();
+ }
+}
+
+namespace {
+ enum osd_pool_get_choices {
+ SIZE, MIN_SIZE,
+ PG_NUM, PGP_NUM, CRUSH_RULE, HASHPSPOOL, EC_OVERWRITES,
+ NODELETE, NOPGCHANGE, NOSIZECHANGE,
+ WRITE_FADVISE_DONTNEED, NOSCRUB, NODEEP_SCRUB,
+ HIT_SET_TYPE, HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP,
+ USE_GMT_HITSET, TARGET_MAX_OBJECTS, TARGET_MAX_BYTES,
+ CACHE_TARGET_DIRTY_RATIO, CACHE_TARGET_DIRTY_HIGH_RATIO,
+ CACHE_TARGET_FULL_RATIO,
+ CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE,
+ ERASURE_CODE_PROFILE, MIN_READ_RECENCY_FOR_PROMOTE,
+ MIN_WRITE_RECENCY_FOR_PROMOTE, FAST_READ,
+ HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N,
+ SCRUB_MIN_INTERVAL, SCRUB_MAX_INTERVAL, DEEP_SCRUB_INTERVAL,
+ RECOVERY_PRIORITY, RECOVERY_OP_PRIORITY, SCRUB_PRIORITY,
+ COMPRESSION_MODE, COMPRESSION_ALGORITHM, COMPRESSION_REQUIRED_RATIO,
+ COMPRESSION_MAX_BLOB_SIZE, COMPRESSION_MIN_BLOB_SIZE,
+ CSUM_TYPE, CSUM_MAX_BLOCK, CSUM_MIN_BLOCK, FINGERPRINT_ALGORITHM,
+ PG_AUTOSCALE_MODE, PG_NUM_MIN, TARGET_SIZE_BYTES, TARGET_SIZE_RATIO,
+ PG_AUTOSCALE_BIAS, DEDUP_TIER, DEDUP_CHUNK_ALGORITHM,
+ DEDUP_CDC_CHUNK_SIZE, PG_NUM_MAX, BULK };
+
+ std::set<osd_pool_get_choices>
+ subtract_second_from_first(const std::set<osd_pool_get_choices>& first,
+ const std::set<osd_pool_get_choices>& second)
+ {
+ std::set<osd_pool_get_choices> result;
+ std::set_difference(first.begin(), first.end(),
+ second.begin(), second.end(),
+ std::inserter(result, result.end()));
+ return result;
+ }
+}
+
+
+bool OSDMonitor::preprocess_command(MonOpRequestRef op)
+{
+ op->mark_osdmon_event(__func__);
+ auto m = op->get_req<MMonCommand>();
+ int r = 0;
+ bufferlist rdata;
+ stringstream ss, ds;
+
+ cmdmap_t cmdmap;
+ if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
+ string rs = ss.str();
+ mon.reply_command(op, -EINVAL, rs, get_last_committed());
+ return true;
+ }
+
+ MonSession *session = op->get_session();
+ if (!session) {
+ derr << __func__ << " no session" << dendl;
+ mon.reply_command(op, -EACCES, "access denied", get_last_committed());
+ return true;
+ }
+
+ string prefix;
+ cmd_getval(cmdmap, "prefix", prefix);
+
+ string format;
+ cmd_getval(cmdmap, "format", format, string("plain"));
+ boost::scoped_ptr<Formatter> f(Formatter::create(format));
+
+ if (prefix == "osd stat") {
+ if (f) {
+ f->open_object_section("osdmap");
+ osdmap.print_summary(f.get(), ds, "", true);
+ f->close_section();
+ f->flush(rdata);
+ } else {
+ osdmap.print_summary(nullptr, ds, "", true);
+ rdata.append(ds);
+ }
+ }
+ else if (prefix == "osd dump" ||
+ prefix == "osd tree" ||
+ prefix == "osd tree-from" ||
+ prefix == "osd ls" ||
+ prefix == "osd getmap" ||
+ prefix == "osd getcrushmap" ||
+ prefix == "osd ls-tree" ||
+ prefix == "osd info") {
+
+ epoch_t epoch = 0;
+ int64_t epochnum;
+ cmd_getval(cmdmap, "epoch", epochnum, (int64_t)osdmap.get_epoch());
+ epoch = epochnum;
+
+ bufferlist osdmap_bl;
+ int err = get_version_full(epoch, osdmap_bl);
+ if (err == -ENOENT) {
+ r = -ENOENT;
+ ss << "there is no map for epoch " << epoch;
+ goto reply;
+ }
+ ceph_assert(err == 0);
+ ceph_assert(osdmap_bl.length());
+
+ OSDMap *p;
+ if (epoch == osdmap.get_epoch()) {
+ p = &osdmap;
+ } else {
+ p = new OSDMap;
+ p->decode(osdmap_bl);
+ }
+
+ auto sg = make_scope_guard([&] {
+ if (p != &osdmap) {
+ delete p;
+ }
+ });
+
+ if (prefix == "osd dump") {
+ stringstream ds;
+ if (f) {
+ f->open_object_section("osdmap");
+ p->dump(f.get());
+ f->close_section();
+ f->flush(ds);
+ } else {
+ p->print(ds);
+ }
+ rdata.append(ds);
+ if (!f)
+ ds << " ";
+ } else if (prefix == "osd ls") {
+ if (f) {
+ f->open_array_section("osds");
+ for (int i = 0; i < osdmap.get_max_osd(); i++) {
+ if (osdmap.exists(i)) {
+ f->dump_int("osd", i);
+ }
+ }
+ f->close_section();
+ f->flush(ds);
+ } else {
+ bool first = true;
+ for (int i = 0; i < osdmap.get_max_osd(); i++) {
+ if (osdmap.exists(i)) {
+ if (!first)
+ ds << "\n";
+ first = false;
+ ds << i;
+ }
+ }
+ }
+ rdata.append(ds);
+ } else if (prefix == "osd info") {
+ int64_t osd_id;
+ bool do_single_osd = true;
+ if (!cmd_getval(cmdmap, "id", osd_id)) {
+ do_single_osd = false;
+ }
+
+ if (do_single_osd && !osdmap.exists(osd_id)) {
+ ss << "osd." << osd_id << " does not exist";
+ r = -EINVAL;
+ goto reply;
+ }
+
+ if (f) {
+ if (do_single_osd) {
+ osdmap.dump_osd(osd_id, f.get());
+ } else {
+ osdmap.dump_osds(f.get());
+ }
+ f->flush(ds);
+ } else {
+ if (do_single_osd) {
+ osdmap.print_osd(osd_id, ds);
+ } else {
+ osdmap.print_osds(ds);
+ }
+ }
+ rdata.append(ds);
+ } else if (prefix == "osd tree" || prefix == "osd tree-from") {
+ string bucket;
+ if (prefix == "osd tree-from") {
+ cmd_getval(cmdmap, "bucket", bucket);
+ if (!osdmap.crush->name_exists(bucket)) {
+ ss << "bucket '" << bucket << "' does not exist";
+ r = -ENOENT;
+ goto reply;
+ }
+ int id = osdmap.crush->get_item_id(bucket);
+ if (id >= 0) {
+ ss << "\"" << bucket << "\" is not a bucket";
+ r = -EINVAL;
+ goto reply;
+ }
+ }
+
+ vector<string> states;
+ cmd_getval(cmdmap, "states", states);
+ unsigned filter = 0;
+ for (auto& s : states) {
+ if (s == "up") {
+ filter |= OSDMap::DUMP_UP;
+ } else if (s == "down") {
+ filter |= OSDMap::DUMP_DOWN;
+ } else if (s == "in") {
+ filter |= OSDMap::DUMP_IN;
+ } else if (s == "out") {
+ filter |= OSDMap::DUMP_OUT;
+ } else if (s == "destroyed") {
+ filter |= OSDMap::DUMP_DESTROYED;
+ } else {
+ ss << "unrecognized state '" << s << "'";
+ r = -EINVAL;
+ goto reply;
+ }
+ }
+ if ((filter & (OSDMap::DUMP_IN|OSDMap::DUMP_OUT)) ==
+ (OSDMap::DUMP_IN|OSDMap::DUMP_OUT)) {
+ ss << "cannot specify both 'in' and 'out'";
+ r = -EINVAL;
+ goto reply;
+ }
+ if (((filter & (OSDMap::DUMP_UP|OSDMap::DUMP_DOWN)) ==
+ (OSDMap::DUMP_UP|OSDMap::DUMP_DOWN)) ||
+ ((filter & (OSDMap::DUMP_UP|OSDMap::DUMP_DESTROYED)) ==
+ (OSDMap::DUMP_UP|OSDMap::DUMP_DESTROYED)) ||
+ ((filter & (OSDMap::DUMP_DOWN|OSDMap::DUMP_DESTROYED)) ==
+ (OSDMap::DUMP_DOWN|OSDMap::DUMP_DESTROYED))) {
+ ss << "can specify only one of 'up', 'down' and 'destroyed'";
+ r = -EINVAL;
+ goto reply;
+ }
+ if (f) {
+ f->open_object_section("tree");
+ p->print_tree(f.get(), NULL, filter, bucket);
+ f->close_section();
+ f->flush(ds);
+ } else {
+ p->print_tree(NULL, &ds, filter, bucket);
+ }
+ rdata.append(ds);
+ } else if (prefix == "osd getmap") {
+ rdata.append(osdmap_bl);
+ ss << "got osdmap epoch " << p->get_epoch();
+ } else if (prefix == "osd getcrushmap") {
+ p->crush->encode(rdata, mon.get_quorum_con_features());
+ ss << p->get_crush_version();
+ } else if (prefix == "osd ls-tree") {
+ string bucket_name;
+ cmd_getval(cmdmap, "name", bucket_name);
+ set<int> osds;
+ r = p->get_osds_by_bucket_name(bucket_name, &osds);
+ if (r == -ENOENT) {
+ ss << "\"" << bucket_name << "\" does not exist";
+ goto reply;
+ } else if (r < 0) {
+ ss << "can not parse bucket name:\"" << bucket_name << "\"";
+ goto reply;
+ }
+
+ if (f) {
+ f->open_array_section("osds");
+ for (auto &i : osds) {
+ if (osdmap.exists(i)) {
+ f->dump_int("osd", i);
+ }
+ }
+ f->close_section();
+ f->flush(ds);
+ } else {
+ bool first = true;
+ for (auto &i : osds) {
+ if (osdmap.exists(i)) {
+ if (!first)
+ ds << "\n";
+ first = false;
+ ds << i;
+ }
+ }
+ }
+
+ rdata.append(ds);
+ }
+ } else if (prefix == "osd getmaxosd") {
+ if (f) {
+ f->open_object_section("getmaxosd");
+ f->dump_unsigned("epoch", osdmap.get_epoch());
+ f->dump_int("max_osd", osdmap.get_max_osd());
+ f->close_section();
+ f->flush(rdata);
+ } else {
+ ds << "max_osd = " << osdmap.get_max_osd() << " in epoch " << osdmap.get_epoch();
+ rdata.append(ds);
+ }
+ } else if (prefix == "osd utilization") {
+ string out;
+ osdmap.summarize_mapping_stats(NULL, NULL, &out, f.get());
+ if (f)
+ f->flush(rdata);
+ else
+ rdata.append(out);
+ r = 0;
+ goto reply;
+ } else if (prefix == "osd find") {
+ int64_t osd;
+ if (!cmd_getval(cmdmap, "id", osd)) {
+ ss << "unable to parse osd id value '"
+ << cmd_vartype_stringify(cmdmap["id"]) << "'";
+ r = -EINVAL;
+ goto reply;
+ }
+ if (!osdmap.exists(osd)) {
+ ss << "osd." << osd << " does not exist";
+ r = -ENOENT;
+ goto reply;
+ }
+ string format;
+ cmd_getval(cmdmap, "format", format);
+ boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
+ f->open_object_section("osd_location");
+ f->dump_int("osd", osd);
+ f->dump_object("addrs", osdmap.get_addrs(osd));
+ f->dump_stream("osd_fsid") << osdmap.get_uuid(osd);
+
+ // try to identify host, pod/container name, etc.
+ map<string,string> m;
+ load_metadata(osd, m, nullptr);
+ if (auto p = m.find("hostname"); p != m.end()) {
+ f->dump_string("host", p->second);
+ }
+ for (auto& k : {
+ "pod_name", "pod_namespace", // set by rook
+ "container_name" // set by cephadm, ceph-ansible
+ }) {
+ if (auto p = m.find(k); p != m.end()) {
+ f->dump_string(k, p->second);
+ }
+ }
+
+ // crush is helpful too
+ f->open_object_section("crush_location");
+ map<string,string> loc = osdmap.crush->get_full_location(osd);
+ for (map<string,string>::iterator p = loc.begin(); p != loc.end(); ++p)
+ f->dump_string(p->first.c_str(), p->second);
+ f->close_section();
+ f->close_section();
+ f->flush(rdata);
+ } else if (prefix == "osd metadata") {
+ int64_t osd = -1;
+ if (cmd_vartype_stringify(cmdmap["id"]).size() &&
+ !cmd_getval(cmdmap, "id", osd)) {
+ ss << "unable to parse osd id value '"
+ << cmd_vartype_stringify(cmdmap["id"]) << "'";
+ r = -EINVAL;
+ goto reply;
+ }
+ if (osd >= 0 && !osdmap.exists(osd)) {
+ ss << "osd." << osd << " does not exist";
+ r = -ENOENT;
+ goto reply;
+ }
+ string format;
+ cmd_getval(cmdmap, "format", format);
+ boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
+ if (osd >= 0) {
+ f->open_object_section("osd_metadata");
+ f->dump_unsigned("id", osd);
+ r = dump_osd_metadata(osd, f.get(), &ss);
+ if (r < 0)
+ goto reply;
+ f->close_section();
+ } else {
+ r = 0;
+ f->open_array_section("osd_metadata");
+ for (int i=0; i<osdmap.get_max_osd(); ++i) {
+ if (osdmap.exists(i)) {
+ f->open_object_section("osd");
+ f->dump_unsigned("id", i);
+ r = dump_osd_metadata(i, f.get(), NULL);
+ if (r == -EINVAL || r == -ENOENT) {
+ // Drop error, continue to get other daemons' metadata
+ dout(4) << "No metadata for osd." << i << dendl;
+ r = 0;
+ } else if (r < 0) {
+ // Unexpected error
+ goto reply;
+ }
+ f->close_section();
+ }
+ }
+ f->close_section();
+ }
+ f->flush(rdata);
+ } else if (prefix == "osd versions") {
+ if (!f)
+ f.reset(Formatter::create("json-pretty"));
+ count_metadata("ceph_version", f.get());
+ f->flush(rdata);
+ r = 0;
+ } else if (prefix == "osd count-metadata") {
+ if (!f)
+ f.reset(Formatter::create("json-pretty"));
+ string field;
+ cmd_getval(cmdmap, "property", field);
+ count_metadata(field, f.get());
+ f->flush(rdata);
+ r = 0;
+ } else if (prefix == "osd numa-status") {
+ TextTable tbl;
+ if (f) {
+ f->open_array_section("osds");
+ } else {
+ tbl.define_column("OSD", TextTable::LEFT, TextTable::RIGHT);
+ tbl.define_column("HOST", TextTable::LEFT, TextTable::LEFT);
+ tbl.define_column("NETWORK", TextTable::RIGHT, TextTable::RIGHT);
+ tbl.define_column("STORAGE", TextTable::RIGHT, TextTable::RIGHT);
+ tbl.define_column("AFFINITY", TextTable::RIGHT, TextTable::RIGHT);
+ tbl.define_column("CPUS", TextTable::LEFT, TextTable::LEFT);
+ }
+ for (int i=0; i<osdmap.get_max_osd(); ++i) {
+ if (osdmap.exists(i)) {
+ map<string,string> m;
+ ostringstream err;
+ if (load_metadata(i, m, &err) < 0) {
+ continue;
+ }
+ string host;
+ auto p = m.find("hostname");
+ if (p != m.end()) {
+ host = p->second;
+ }
+ if (f) {
+ f->open_object_section("osd");
+ f->dump_int("osd", i);
+ f->dump_string("host", host);
+ for (auto n : { "network_numa_node", "objectstore_numa_node",
+ "numa_node" }) {
+ p = m.find(n);
+ if (p != m.end()) {
+ f->dump_int(n, atoi(p->second.c_str()));
+ }
+ }
+ for (auto n : { "network_numa_nodes", "objectstore_numa_nodes" }) {
+ p = m.find(n);
+ if (p != m.end()) {
+ list<string> ls = get_str_list(p->second, ",");
+ f->open_array_section(n);
+ for (auto node : ls) {
+ f->dump_int("node", atoi(node.c_str()));
+ }
+ f->close_section();
+ }
+ }
+ for (auto n : { "numa_node_cpus" }) {
+ p = m.find(n);
+ if (p != m.end()) {
+ dump_cpu_list(f.get(), n, p->second);
+ }
+ }
+ f->close_section();
+ } else {
+ tbl << i;
+ tbl << host;
+ p = m.find("network_numa_nodes");
+ if (p != m.end()) {
+ tbl << p->second;
+ } else {
+ tbl << "-";
+ }
+ p = m.find("objectstore_numa_nodes");
+ if (p != m.end()) {
+ tbl << p->second;
+ } else {
+ tbl << "-";
+ }
+ p = m.find("numa_node");
+ auto q = m.find("numa_node_cpus");
+ if (p != m.end() && q != m.end()) {
+ tbl << p->second;
+ tbl << q->second;
+ } else {
+ tbl << "-";
+ tbl << "-";
+ }
+ tbl << TextTable::endrow;
+ }
+ }
+ }
+ if (f) {
+ f->close_section();
+ f->flush(rdata);
+ } else {
+ rdata.append(stringify(tbl));
+ }
+ } else if (prefix == "osd map") {
+ string poolstr, objstr, namespacestr;
+ cmd_getval(cmdmap, "pool", poolstr);
+ cmd_getval(cmdmap, "object", objstr);
+ cmd_getval(cmdmap, "nspace", namespacestr);
+
+ int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
+ if (pool < 0) {
+ ss << "pool " << poolstr << " does not exist";
+ r = -ENOENT;
+ goto reply;
+ }
+ object_locator_t oloc(pool, namespacestr);
+ object_t oid(objstr);
+ pg_t pgid = osdmap.object_locator_to_pg(oid, oloc);
+ pg_t mpgid = osdmap.raw_pg_to_pg(pgid);
+ vector<int> up, acting;
+ int up_p, acting_p;
+ osdmap.pg_to_up_acting_osds(mpgid, &up, &up_p, &acting, &acting_p);
+
+ string fullobjname;
+ if (!namespacestr.empty())
+ fullobjname = namespacestr + string("/") + oid.name;
+ else
+ fullobjname = oid.name;
+ if (f) {
+ f->open_object_section("osd_map");
+ f->dump_unsigned("epoch", osdmap.get_epoch());
+ f->dump_string("pool", poolstr);
+ f->dump_int("pool_id", pool);
+ f->dump_stream("objname") << fullobjname;
+ f->dump_stream("raw_pgid") << pgid;
+ f->dump_stream("pgid") << mpgid;
+ f->open_array_section("up");
+ for (vector<int>::iterator p = up.begin(); p != up.end(); ++p)
+ f->dump_int("osd", *p);
+ f->close_section();
+ f->dump_int("up_primary", up_p);
+ f->open_array_section("acting");
+ for (vector<int>::iterator p = acting.begin(); p != acting.end(); ++p)
+ f->dump_int("osd", *p);
+ f->close_section();
+ f->dump_int("acting_primary", acting_p);
+ f->close_section(); // osd_map
+ f->flush(rdata);
+ } else {
+ ds << "osdmap e" << osdmap.get_epoch()
+ << " pool '" << poolstr << "' (" << pool << ")"
+ << " object '" << fullobjname << "' ->"
+ << " pg " << pgid << " (" << mpgid << ")"
+ << " -> up (" << pg_vector_string(up) << ", p" << up_p << ") acting ("
+ << pg_vector_string(acting) << ", p" << acting_p << ")";
+ rdata.append(ds);
+ }
+
+ } else if (prefix == "pg map") {
+ pg_t pgid;
+ string pgidstr;
+ cmd_getval(cmdmap, "pgid", pgidstr);
+ if (!pgid.parse(pgidstr.c_str())) {
+ ss << "invalid pgid '" << pgidstr << "'";
+ r = -EINVAL;
+ goto reply;
+ }
+ vector<int> up, acting;
+ if (!osdmap.have_pg_pool(pgid.pool())) {
+ ss << "pg '" << pgidstr << "' does not exist";
+ r = -ENOENT;
+ goto reply;
+ }
+ pg_t mpgid = osdmap.raw_pg_to_pg(pgid);
+ osdmap.pg_to_up_acting_osds(pgid, up, acting);
+ if (f) {
+ f->open_object_section("pg_map");
+ f->dump_unsigned("epoch", osdmap.get_epoch());
+ f->dump_stream("raw_pgid") << pgid;
+ f->dump_stream("pgid") << mpgid;
+ f->open_array_section("up");
+ for (auto osd : up) {
+ f->dump_int("up_osd", osd);
+ }
+ f->close_section();
+ f->open_array_section("acting");
+ for (auto osd : acting) {
+ f->dump_int("acting_osd", osd);
+ }
+ f->close_section();
+ f->close_section();
+ f->flush(rdata);
+ } else {
+ ds << "osdmap e" << osdmap.get_epoch()
+ << " pg " << pgid << " (" << mpgid << ")"
+ << " -> up " << up << " acting " << acting;
+ rdata.append(ds);
+ }
+ goto reply;
+
+ } else if (prefix == "osd lspools") {
+ if (f)
+ f->open_array_section("pools");
+ for (map<int64_t, pg_pool_t>::iterator p = osdmap.pools.begin();
+ p != osdmap.pools.end();
+ ++p) {
+ if (f) {
+ f->open_object_section("pool");
+ f->dump_int("poolnum", p->first);
+ f->dump_string("poolname", osdmap.pool_name[p->first]);
+ f->close_section();
+ } else {
+ ds << p->first << ' ' << osdmap.pool_name[p->first];
+ if (next(p) != osdmap.pools.end()) {
+ ds << '\n';
+ }
+ }
+ }
+ if (f) {
+ f->close_section();
+ f->flush(ds);
+ }
+ rdata.append(ds);
+ } else if (prefix == "osd blocklist ls" ||
+ prefix == "osd blacklist ls") {
+ if (f)
+ f->open_array_section("blocklist");
+
+ for (ceph::unordered_map<entity_addr_t,utime_t>::iterator p = osdmap.blocklist.begin();
+ p != osdmap.blocklist.end();
+ ++p) {
+ if (f) {
+ f->open_object_section("entry");
+ f->dump_string("addr", p->first.get_legacy_str());
+ f->dump_stream("until") << p->second;
+ f->close_section();
+ } else {
+ stringstream ss;
+ string s;
+ ss << p->first << " " << p->second;
+ getline(ss, s);
+ s += "\n";
+ rdata.append(s);
+ }
+ }
+ if (f) {
+ f->close_section();
+ f->flush(rdata);
+ }
+ if (f)
+ f->open_array_section("range_blocklist");
+
+ for (auto p = osdmap.range_blocklist.begin();
+ p != osdmap.range_blocklist.end();
+ ++p) {
+ if (f) {
+ f->open_object_section("entry");
+ f->dump_string("range", p->first.get_legacy_str());
+ f->dump_stream("until") << p->second;
+ f->close_section();
+ } else {
+ stringstream ss;
+ string s;
+ ss << p->first << " " << p->second;
+ getline(ss, s);
+ s += "\n";
+ rdata.append(s);
+ }
+ }
+ if (f) {
+ f->close_section();
+ f->flush(rdata);
+ }
+ ss << "listed " << osdmap.blocklist.size() + osdmap.range_blocklist.size() << " entries";
+
+ } else if (prefix == "osd pool ls") {
+ string detail;
+ cmd_getval(cmdmap, "detail", detail);
+ if (!f && detail == "detail") {
+ ostringstream ss;
+ osdmap.print_pools(ss);
+ rdata.append(ss.str());
+ } else {
+ if (f)
+ f->open_array_section("pools");
+ for (map<int64_t,pg_pool_t>::const_iterator it = osdmap.get_pools().begin();
+ it != osdmap.get_pools().end();
+ ++it) {
+ if (f) {
+ if (detail == "detail") {
+ f->open_object_section("pool");
+ f->dump_int("pool_id", it->first);
+ f->dump_string("pool_name", osdmap.get_pool_name(it->first));
+ it->second.dump(f.get());
+ f->close_section();
+ } else {
+ f->dump_string("pool_name", osdmap.get_pool_name(it->first));
+ }
+ } else {
+ rdata.append(osdmap.get_pool_name(it->first) + "\n");
+ }
+ }
+ if (f) {
+ f->close_section();
+ f->flush(rdata);
+ }
+ }
+
+ } else if (prefix == "osd crush get-tunable") {
+ string tunable;
+ cmd_getval(cmdmap, "tunable", tunable);
+ ostringstream rss;
+ if (f)
+ f->open_object_section("tunable");
+ if (tunable == "straw_calc_version") {
+ if (f)
+ f->dump_int(tunable.c_str(), osdmap.crush->get_straw_calc_version());
+ else
+ rss << osdmap.crush->get_straw_calc_version() << "\n";
+ } else {
+ r = -EINVAL;
+ goto reply;
+ }
+ if (f) {
+ f->close_section();
+ f->flush(rdata);
+ } else {
+ rdata.append(rss.str());
+ }
+ r = 0;
+
+ } else if (prefix == "osd pool get") {
+ string poolstr;
+ cmd_getval(cmdmap, "pool", poolstr);
+ int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
+ if (pool < 0) {
+ ss << "unrecognized pool '" << poolstr << "'";
+ r = -ENOENT;
+ goto reply;
+ }
+
+ const pg_pool_t *p = osdmap.get_pg_pool(pool);
+ string var;
+ cmd_getval(cmdmap, "var", var);
+
+ typedef std::map<std::string, osd_pool_get_choices> choices_map_t;
+ const choices_map_t ALL_CHOICES = {
+ {"size", SIZE},
+ {"min_size", MIN_SIZE},
+ {"pg_num", PG_NUM}, {"pgp_num", PGP_NUM},
+ {"crush_rule", CRUSH_RULE}, {"hashpspool", HASHPSPOOL},
+ {"allow_ec_overwrites", EC_OVERWRITES}, {"nodelete", NODELETE},
+ {"nopgchange", NOPGCHANGE}, {"nosizechange", NOSIZECHANGE},
+ {"noscrub", NOSCRUB}, {"nodeep-scrub", NODEEP_SCRUB},
+ {"write_fadvise_dontneed", WRITE_FADVISE_DONTNEED},
+ {"hit_set_type", HIT_SET_TYPE}, {"hit_set_period", HIT_SET_PERIOD},
+ {"hit_set_count", HIT_SET_COUNT}, {"hit_set_fpp", HIT_SET_FPP},
+ {"use_gmt_hitset", USE_GMT_HITSET},
+ {"target_max_objects", TARGET_MAX_OBJECTS},
+ {"target_max_bytes", TARGET_MAX_BYTES},
+ {"cache_target_dirty_ratio", CACHE_TARGET_DIRTY_RATIO},
+ {"cache_target_dirty_high_ratio", CACHE_TARGET_DIRTY_HIGH_RATIO},
+ {"cache_target_full_ratio", CACHE_TARGET_FULL_RATIO},
+ {"cache_min_flush_age", CACHE_MIN_FLUSH_AGE},
+ {"cache_min_evict_age", CACHE_MIN_EVICT_AGE},
+ {"erasure_code_profile", ERASURE_CODE_PROFILE},
+ {"min_read_recency_for_promote", MIN_READ_RECENCY_FOR_PROMOTE},
+ {"min_write_recency_for_promote", MIN_WRITE_RECENCY_FOR_PROMOTE},
+ {"fast_read", FAST_READ},
+ {"hit_set_grade_decay_rate", HIT_SET_GRADE_DECAY_RATE},
+ {"hit_set_search_last_n", HIT_SET_SEARCH_LAST_N},
+ {"scrub_min_interval", SCRUB_MIN_INTERVAL},
+ {"scrub_max_interval", SCRUB_MAX_INTERVAL},
+ {"deep_scrub_interval", DEEP_SCRUB_INTERVAL},
+ {"recovery_priority", RECOVERY_PRIORITY},
+ {"recovery_op_priority", RECOVERY_OP_PRIORITY},
+ {"scrub_priority", SCRUB_PRIORITY},
+ {"compression_mode", COMPRESSION_MODE},
+ {"compression_algorithm", COMPRESSION_ALGORITHM},
+ {"compression_required_ratio", COMPRESSION_REQUIRED_RATIO},
+ {"compression_max_blob_size", COMPRESSION_MAX_BLOB_SIZE},
+ {"compression_min_blob_size", COMPRESSION_MIN_BLOB_SIZE},
+ {"csum_type", CSUM_TYPE},
+ {"csum_max_block", CSUM_MAX_BLOCK},
+ {"csum_min_block", CSUM_MIN_BLOCK},
+ {"fingerprint_algorithm", FINGERPRINT_ALGORITHM},
+ {"pg_autoscale_mode", PG_AUTOSCALE_MODE},
+ {"pg_num_min", PG_NUM_MIN},
+ {"pg_num_max", PG_NUM_MAX},
+ {"target_size_bytes", TARGET_SIZE_BYTES},
+ {"target_size_ratio", TARGET_SIZE_RATIO},
+ {"pg_autoscale_bias", PG_AUTOSCALE_BIAS},
+ {"dedup_tier", DEDUP_TIER},
+ {"dedup_chunk_algorithm", DEDUP_CHUNK_ALGORITHM},
+ {"dedup_cdc_chunk_size", DEDUP_CDC_CHUNK_SIZE},
+ {"bulk", BULK}
+ };
+
+ typedef std::set<osd_pool_get_choices> choices_set_t;
+
+ const choices_set_t ONLY_TIER_CHOICES = {
+ HIT_SET_TYPE, HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP,
+ TARGET_MAX_OBJECTS, TARGET_MAX_BYTES, CACHE_TARGET_FULL_RATIO,
+ CACHE_TARGET_DIRTY_RATIO, CACHE_TARGET_DIRTY_HIGH_RATIO,
+ CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE,
+ MIN_READ_RECENCY_FOR_PROMOTE,
+ MIN_WRITE_RECENCY_FOR_PROMOTE,
+ HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N
+ };
+ const choices_set_t ONLY_ERASURE_CHOICES = {
+ EC_OVERWRITES, ERASURE_CODE_PROFILE
+ };
+
+ choices_set_t selected_choices;
+ if (var == "all") {
+ for(choices_map_t::const_iterator it = ALL_CHOICES.begin();
+ it != ALL_CHOICES.end(); ++it) {
+ selected_choices.insert(it->second);
+ }
+
+ if(!p->is_tier()) {
+ selected_choices = subtract_second_from_first(selected_choices,
+ ONLY_TIER_CHOICES);
+ }
+
+ if(!p->is_erasure()) {
+ selected_choices = subtract_second_from_first(selected_choices,
+ ONLY_ERASURE_CHOICES);
+ }
+ } else /* var != "all" */ {
+ choices_map_t::const_iterator found = ALL_CHOICES.find(var);
+ if (found == ALL_CHOICES.end()) {
+ ss << "pool '" << poolstr
+ << "': invalid variable: '" << var << "'";
+ r = -EINVAL;
+ goto reply;
+ }
+
+ osd_pool_get_choices selected = found->second;
+
+ if (!p->is_tier() &&
+ ONLY_TIER_CHOICES.find(selected) != ONLY_TIER_CHOICES.end()) {
+ ss << "pool '" << poolstr
+ << "' is not a tier pool: variable not applicable";
+ r = -EACCES;
+ goto reply;
+ }
+
+ if (!p->is_erasure() &&
+ ONLY_ERASURE_CHOICES.find(selected)
+ != ONLY_ERASURE_CHOICES.end()) {
+ ss << "pool '" << poolstr
+ << "' is not a erasure pool: variable not applicable";
+ r = -EACCES;
+ goto reply;
+ }
+
+ if (pool_opts_t::is_opt_name(var) &&
+ !p->opts.is_set(pool_opts_t::get_opt_desc(var).key)) {
+ ss << "option '" << var << "' is not set on pool '" << poolstr << "'";
+ r = -ENOENT;
+ goto reply;
+ }
+
+ selected_choices.insert(selected);
+ }
+
+ if (f) {
+ f->open_object_section("pool");
+ f->dump_string("pool", poolstr);
+ f->dump_int("pool_id", pool);
+ for(choices_set_t::const_iterator it = selected_choices.begin();
+ it != selected_choices.end(); ++it) {
+ choices_map_t::const_iterator i;
+ for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
+ if (i->second == *it) {
+ break;
+ }
+ }
+ ceph_assert(i != ALL_CHOICES.end());
+ switch(*it) {
+ case PG_NUM:
+ f->dump_int("pg_num", p->get_pg_num());
+ break;
+ case PGP_NUM:
+ f->dump_int("pgp_num", p->get_pgp_num());
+ break;
+ case SIZE:
+ f->dump_int("size", p->get_size());
+ break;
+ case MIN_SIZE:
+ f->dump_int("min_size", p->get_min_size());
+ break;
+ case CRUSH_RULE:
+ if (osdmap.crush->rule_exists(p->get_crush_rule())) {
+ f->dump_string("crush_rule", osdmap.crush->get_rule_name(
+ p->get_crush_rule()));
+ } else {
+ f->dump_string("crush_rule", stringify(p->get_crush_rule()));
+ }
+ break;
+ case EC_OVERWRITES:
+ f->dump_bool("allow_ec_overwrites",
+ p->has_flag(pg_pool_t::FLAG_EC_OVERWRITES));
+ break;
+ case PG_AUTOSCALE_MODE:
+ f->dump_string("pg_autoscale_mode",
+ pg_pool_t::get_pg_autoscale_mode_name(
+ p->pg_autoscale_mode));
+ break;
+ case HASHPSPOOL:
+ case NODELETE:
+ case BULK:
+ case NOPGCHANGE:
+ case NOSIZECHANGE:
+ case WRITE_FADVISE_DONTNEED:
+ case NOSCRUB:
+ case NODEEP_SCRUB:
+ f->dump_bool(i->first.c_str(),
+ p->has_flag(pg_pool_t::get_flag_by_name(i->first)));
+ break;
+ case HIT_SET_PERIOD:
+ f->dump_int("hit_set_period", p->hit_set_period);
+ break;
+ case HIT_SET_COUNT:
+ f->dump_int("hit_set_count", p->hit_set_count);
+ break;
+ case HIT_SET_TYPE:
+ f->dump_string("hit_set_type",
+ HitSet::get_type_name(p->hit_set_params.get_type()));
+ break;
+ case HIT_SET_FPP:
+ {
+ if (p->hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
+ BloomHitSet::Params *bloomp =
+ static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
+ f->dump_float("hit_set_fpp", bloomp->get_fpp());
+ } else if(var != "all") {
+ f->close_section();
+ ss << "hit set is not of type Bloom; " <<
+ "invalid to get a false positive rate!";
+ r = -EINVAL;
+ goto reply;
+ }
+ }
+ break;
+ case USE_GMT_HITSET:
+ f->dump_bool("use_gmt_hitset", p->use_gmt_hitset);
+ break;
+ case TARGET_MAX_OBJECTS:
+ f->dump_unsigned("target_max_objects", p->target_max_objects);
+ break;
+ case TARGET_MAX_BYTES:
+ f->dump_unsigned("target_max_bytes", p->target_max_bytes);
+ break;
+ case CACHE_TARGET_DIRTY_RATIO:
+ f->dump_unsigned("cache_target_dirty_ratio_micro",
+ p->cache_target_dirty_ratio_micro);
+ f->dump_float("cache_target_dirty_ratio",
+ ((float)p->cache_target_dirty_ratio_micro/1000000));
+ break;
+ case CACHE_TARGET_DIRTY_HIGH_RATIO:
+ f->dump_unsigned("cache_target_dirty_high_ratio_micro",
+ p->cache_target_dirty_high_ratio_micro);
+ f->dump_float("cache_target_dirty_high_ratio",
+ ((float)p->cache_target_dirty_high_ratio_micro/1000000));
+ break;
+ case CACHE_TARGET_FULL_RATIO:
+ f->dump_unsigned("cache_target_full_ratio_micro",
+ p->cache_target_full_ratio_micro);
+ f->dump_float("cache_target_full_ratio",
+ ((float)p->cache_target_full_ratio_micro/1000000));
+ break;
+ case CACHE_MIN_FLUSH_AGE:
+ f->dump_unsigned("cache_min_flush_age", p->cache_min_flush_age);
+ break;
+ case CACHE_MIN_EVICT_AGE:
+ f->dump_unsigned("cache_min_evict_age", p->cache_min_evict_age);
+ break;
+ case ERASURE_CODE_PROFILE:
+ f->dump_string("erasure_code_profile", p->erasure_code_profile);
+ break;
+ case MIN_READ_RECENCY_FOR_PROMOTE:
+ f->dump_int("min_read_recency_for_promote",
+ p->min_read_recency_for_promote);
+ break;
+ case MIN_WRITE_RECENCY_FOR_PROMOTE:
+ f->dump_int("min_write_recency_for_promote",
+ p->min_write_recency_for_promote);
+ break;
+ case FAST_READ:
+ f->dump_int("fast_read", p->fast_read);
+ break;
+ case HIT_SET_GRADE_DECAY_RATE:
+ f->dump_int("hit_set_grade_decay_rate",
+ p->hit_set_grade_decay_rate);
+ break;
+ case HIT_SET_SEARCH_LAST_N:
+ f->dump_int("hit_set_search_last_n",
+ p->hit_set_search_last_n);
+ break;
+ case SCRUB_MIN_INTERVAL:
+ case SCRUB_MAX_INTERVAL:
+ case DEEP_SCRUB_INTERVAL:
+ case RECOVERY_PRIORITY:
+ case RECOVERY_OP_PRIORITY:
+ case SCRUB_PRIORITY:
+ case COMPRESSION_MODE:
+ case COMPRESSION_ALGORITHM:
+ case COMPRESSION_REQUIRED_RATIO:
+ case COMPRESSION_MAX_BLOB_SIZE:
+ case COMPRESSION_MIN_BLOB_SIZE:
+ case CSUM_TYPE:
+ case CSUM_MAX_BLOCK:
+ case CSUM_MIN_BLOCK:
+ case FINGERPRINT_ALGORITHM:
+ case PG_NUM_MIN:
+ case PG_NUM_MAX:
+ case TARGET_SIZE_BYTES:
+ case TARGET_SIZE_RATIO:
+ case PG_AUTOSCALE_BIAS:
+ case DEDUP_TIER:
+ case DEDUP_CHUNK_ALGORITHM:
+ case DEDUP_CDC_CHUNK_SIZE:
+ pool_opts_t::key_t key = pool_opts_t::get_opt_desc(i->first).key;
+ if (p->opts.is_set(key)) {
+ if(*it == CSUM_TYPE) {
+ int64_t val;
+ p->opts.get(pool_opts_t::CSUM_TYPE, &val);
+ f->dump_string(i->first.c_str(), Checksummer::get_csum_type_string(val));
+ } else {
+ p->opts.dump(i->first, f.get());
+ }
+ }
+ break;
+ }
+ }
+ f->close_section();
+ f->flush(rdata);
+ } else /* !f */ {
+ for(choices_set_t::const_iterator it = selected_choices.begin();
+ it != selected_choices.end(); ++it) {
+ choices_map_t::const_iterator i;
+ switch(*it) {
+ case PG_NUM:
+ ss << "pg_num: " << p->get_pg_num() << "\n";
+ break;
+ case PGP_NUM:
+ ss << "pgp_num: " << p->get_pgp_num() << "\n";
+ break;
+ case SIZE:
+ ss << "size: " << p->get_size() << "\n";
+ break;
+ case MIN_SIZE:
+ ss << "min_size: " << p->get_min_size() << "\n";
+ break;
+ case CRUSH_RULE:
+ if (osdmap.crush->rule_exists(p->get_crush_rule())) {
+ ss << "crush_rule: " << osdmap.crush->get_rule_name(
+ p->get_crush_rule()) << "\n";
+ } else {
+ ss << "crush_rule: " << p->get_crush_rule() << "\n";
+ }
+ break;
+ case PG_AUTOSCALE_MODE:
+ ss << "pg_autoscale_mode: " << pg_pool_t::get_pg_autoscale_mode_name(
+ p->pg_autoscale_mode) <<"\n";
+ break;
+ case HIT_SET_PERIOD:
+ ss << "hit_set_period: " << p->hit_set_period << "\n";
+ break;
+ case HIT_SET_COUNT:
+ ss << "hit_set_count: " << p->hit_set_count << "\n";
+ break;
+ case HIT_SET_TYPE:
+ ss << "hit_set_type: " <<
+ HitSet::get_type_name(p->hit_set_params.get_type()) << "\n";
+ break;
+ case HIT_SET_FPP:
+ {
+ if (p->hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
+ BloomHitSet::Params *bloomp =
+ static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
+ ss << "hit_set_fpp: " << bloomp->get_fpp() << "\n";
+ } else if(var != "all") {
+ ss << "hit set is not of type Bloom; " <<
+ "invalid to get a false positive rate!";
+ r = -EINVAL;
+ goto reply;
+ }
+ }
+ break;
+ case USE_GMT_HITSET:
+ ss << "use_gmt_hitset: " << p->use_gmt_hitset << "\n";
+ break;
+ case TARGET_MAX_OBJECTS:
+ ss << "target_max_objects: " << p->target_max_objects << "\n";
+ break;
+ case TARGET_MAX_BYTES:
+ ss << "target_max_bytes: " << p->target_max_bytes << "\n";
+ break;
+ case CACHE_TARGET_DIRTY_RATIO:
+ ss << "cache_target_dirty_ratio: "
+ << ((float)p->cache_target_dirty_ratio_micro/1000000) << "\n";
+ break;
+ case CACHE_TARGET_DIRTY_HIGH_RATIO:
+ ss << "cache_target_dirty_high_ratio: "
+ << ((float)p->cache_target_dirty_high_ratio_micro/1000000) << "\n";
+ break;
+ case CACHE_TARGET_FULL_RATIO:
+ ss << "cache_target_full_ratio: "
+ << ((float)p->cache_target_full_ratio_micro/1000000) << "\n";
+ break;
+ case CACHE_MIN_FLUSH_AGE:
+ ss << "cache_min_flush_age: " << p->cache_min_flush_age << "\n";
+ break;
+ case CACHE_MIN_EVICT_AGE:
+ ss << "cache_min_evict_age: " << p->cache_min_evict_age << "\n";
+ break;
+ case ERASURE_CODE_PROFILE:
+ ss << "erasure_code_profile: " << p->erasure_code_profile << "\n";
+ break;
+ case MIN_READ_RECENCY_FOR_PROMOTE:
+ ss << "min_read_recency_for_promote: " <<
+ p->min_read_recency_for_promote << "\n";
+ break;
+ case HIT_SET_GRADE_DECAY_RATE:
+ ss << "hit_set_grade_decay_rate: " <<
+ p->hit_set_grade_decay_rate << "\n";
+ break;
+ case HIT_SET_SEARCH_LAST_N:
+ ss << "hit_set_search_last_n: " <<
+ p->hit_set_search_last_n << "\n";
+ break;
+ case EC_OVERWRITES:
+ ss << "allow_ec_overwrites: " <<
+ (p->has_flag(pg_pool_t::FLAG_EC_OVERWRITES) ? "true" : "false") <<
+ "\n";
+ break;
+ case HASHPSPOOL:
+ case NODELETE:
+ case BULK:
+ case NOPGCHANGE:
+ case NOSIZECHANGE:
+ case WRITE_FADVISE_DONTNEED:
+ case NOSCRUB:
+ case NODEEP_SCRUB:
+ for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
+ if (i->second == *it)
+ break;
+ }
+ ceph_assert(i != ALL_CHOICES.end());
+ ss << i->first << ": " <<
+ (p->has_flag(pg_pool_t::get_flag_by_name(i->first)) ?
+ "true" : "false") << "\n";
+ break;
+ case MIN_WRITE_RECENCY_FOR_PROMOTE:
+ ss << "min_write_recency_for_promote: " <<
+ p->min_write_recency_for_promote << "\n";
+ break;
+ case FAST_READ:
+ ss << "fast_read: " << p->fast_read << "\n";
+ break;
+ case SCRUB_MIN_INTERVAL:
+ case SCRUB_MAX_INTERVAL:
+ case DEEP_SCRUB_INTERVAL:
+ case RECOVERY_PRIORITY:
+ case RECOVERY_OP_PRIORITY:
+ case SCRUB_PRIORITY:
+ case COMPRESSION_MODE:
+ case COMPRESSION_ALGORITHM:
+ case COMPRESSION_REQUIRED_RATIO:
+ case COMPRESSION_MAX_BLOB_SIZE:
+ case COMPRESSION_MIN_BLOB_SIZE:
+ case CSUM_TYPE:
+ case CSUM_MAX_BLOCK:
+ case CSUM_MIN_BLOCK:
+ case FINGERPRINT_ALGORITHM:
+ case PG_NUM_MIN:
+ case PG_NUM_MAX:
+ case TARGET_SIZE_BYTES:
+ case TARGET_SIZE_RATIO:
+ case PG_AUTOSCALE_BIAS:
+ case DEDUP_TIER:
+ case DEDUP_CHUNK_ALGORITHM:
+ case DEDUP_CDC_CHUNK_SIZE:
+ for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
+ if (i->second == *it)
+ break;
+ }
+ ceph_assert(i != ALL_CHOICES.end());
+ {
+ pool_opts_t::key_t key = pool_opts_t::get_opt_desc(i->first).key;
+ if (p->opts.is_set(key)) {
+ if(key == pool_opts_t::CSUM_TYPE) {
+ int64_t val;
+ p->opts.get(key, &val);
+ ss << i->first << ": " << Checksummer::get_csum_type_string(val) << "\n";
+ } else {
+ ss << i->first << ": " << p->opts.get(key) << "\n";
+ }
+ }
+ }
+ break;
+ }
+ rdata.append(ss.str());
+ ss.str("");
+ }
+ }
+ r = 0;
+ } else if (prefix == "osd pool get-quota") {
+ string pool_name;
+ cmd_getval(cmdmap, "pool", pool_name);
+
+ int64_t poolid = osdmap.lookup_pg_pool_name(pool_name);
+ if (poolid < 0) {
+ ceph_assert(poolid == -ENOENT);
+ ss << "unrecognized pool '" << pool_name << "'";
+ r = -ENOENT;
+ goto reply;
+ }
+ const pg_pool_t *p = osdmap.get_pg_pool(poolid);
+ const pool_stat_t* pstat = mon.mgrstatmon()->get_pool_stat(poolid);
+ if (!pstat) {
+ ss << "no stats for pool '" << pool_name << "'";
+ r = -ENOENT;
+ goto reply;
+ }
+ const object_stat_sum_t& sum = pstat->stats.sum;
+ if (f) {
+ f->open_object_section("pool_quotas");
+ f->dump_string("pool_name", pool_name);
+ f->dump_unsigned("pool_id", poolid);
+ f->dump_unsigned("quota_max_objects", p->quota_max_objects);
+ f->dump_int("current_num_objects", sum.num_objects);
+ f->dump_unsigned("quota_max_bytes", p->quota_max_bytes);
+ f->dump_int("current_num_bytes", sum.num_bytes);
+ f->close_section();
+ f->flush(rdata);
+ } else {
+ stringstream rs;
+ rs << "quotas for pool '" << pool_name << "':\n"
+ << " max objects: ";
+ if (p->quota_max_objects == 0)
+ rs << "N/A";
+ else {
+ rs << si_u_t(p->quota_max_objects) << " objects";
+ rs << " (current num objects: " << sum.num_objects << " objects)";
+ }
+ rs << "\n"
+ << " max bytes : ";
+ if (p->quota_max_bytes == 0)
+ rs << "N/A";
+ else {
+ rs << byte_u_t(p->quota_max_bytes);
+ rs << " (current num bytes: " << sum.num_bytes << " bytes)";
+ }
+ rdata.append(rs.str());
+ }
+ rdata.append("\n");
+ r = 0;
+ } else if (prefix == "osd crush rule list" ||
+ prefix == "osd crush rule ls") {
+ if (f) {
+ f->open_array_section("rules");
+ osdmap.crush->list_rules(f.get());
+ f->close_section();
+ f->flush(rdata);
+ } else {
+ ostringstream ss;
+ osdmap.crush->list_rules(&ss);
+ rdata.append(ss.str());
+ }
+ } else if (prefix == "osd crush rule ls-by-class") {
+ string class_name;
+ cmd_getval(cmdmap, "class", class_name);
+ if (class_name.empty()) {
+ ss << "no class specified";
+ r = -EINVAL;
+ goto reply;
+ }
+ set<int> rules;
+ r = osdmap.crush->get_rules_by_class(class_name, &rules);
+ if (r < 0) {
+ ss << "failed to get rules by class '" << class_name << "'";
+ goto reply;
+ }
+ if (f) {
+ f->open_array_section("rules");
+ for (auto &rule: rules) {
+ f->dump_string("name", osdmap.crush->get_rule_name(rule));
+ }
+ f->close_section();
+ f->flush(rdata);
+ } else {
+ ostringstream rs;
+ for (auto &rule: rules) {
+ rs << osdmap.crush->get_rule_name(rule) << "\n";
+ }
+ rdata.append(rs.str());
+ }
+ } else if (prefix == "osd crush rule dump") {
+ string name;
+ cmd_getval(cmdmap, "name", name);
+ string format;
+ cmd_getval(cmdmap, "format", format);
+ boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
+ if (name == "") {
+ f->open_array_section("rules");
+ osdmap.crush->dump_rules(f.get());
+ f->close_section();
+ } else {
+ int ruleno = osdmap.crush->get_rule_id(name);
+ if (ruleno < 0) {
+ ss << "unknown crush rule '" << name << "'";
+ r = ruleno;
+ goto reply;
+ }
+ osdmap.crush->dump_rule(ruleno, f.get());
+ }
+ ostringstream rs;
+ f->flush(rs);
+ rs << "\n";
+ rdata.append(rs.str());
+ } else if (prefix == "osd crush dump") {
+ string format;
+ cmd_getval(cmdmap, "format", format);
+ boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
+ f->open_object_section("crush_map");
+ osdmap.crush->dump(f.get());
+ f->close_section();
+ ostringstream rs;
+ f->flush(rs);
+ rs << "\n";
+ rdata.append(rs.str());
+ } else if (prefix == "osd crush show-tunables") {
+ string format;
+ cmd_getval(cmdmap, "format", format);
+ boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
+ f->open_object_section("crush_map_tunables");
+ osdmap.crush->dump_tunables(f.get());
+ f->close_section();
+ ostringstream rs;
+ f->flush(rs);
+ rs << "\n";
+ rdata.append(rs.str());
+ } else if (prefix == "osd crush tree") {
+ string shadow;
+ cmd_getval(cmdmap, "shadow", shadow);
+ bool show_shadow = shadow == "--show-shadow";
+ boost::scoped_ptr<Formatter> f(Formatter::create(format));
+ if (f) {
+ f->open_object_section("crush_tree");
+ osdmap.crush->dump_tree(nullptr,
+ f.get(),
+ osdmap.get_pool_names(),
+ show_shadow);
+ f->close_section();
+ f->flush(rdata);
+ } else {
+ ostringstream ss;
+ osdmap.crush->dump_tree(&ss,
+ nullptr,
+ osdmap.get_pool_names(),
+ show_shadow);
+ rdata.append(ss.str());
+ }
+ } else if (prefix == "osd crush ls") {
+ string name;
+ if (!cmd_getval(cmdmap, "node", name)) {
+ ss << "no node specified";
+ r = -EINVAL;
+ goto reply;
+ }
+ if (!osdmap.crush->name_exists(name)) {
+ ss << "node '" << name << "' does not exist";
+ r = -ENOENT;
+ goto reply;
+ }
+ int id = osdmap.crush->get_item_id(name);
+ list<int> result;
+ if (id >= 0) {
+ result.push_back(id);
+ } else {
+ int num = osdmap.crush->get_bucket_size(id);
+ for (int i = 0; i < num; ++i) {
+ result.push_back(osdmap.crush->get_bucket_item(id, i));
+ }
+ }
+ if (f) {
+ f->open_array_section("items");
+ for (auto i : result) {
+ f->dump_string("item", osdmap.crush->get_item_name(i));
+ }
+ f->close_section();
+ f->flush(rdata);
+ } else {
+ ostringstream ss;
+ for (auto i : result) {
+ ss << osdmap.crush->get_item_name(i) << "\n";
+ }
+ rdata.append(ss.str());
+ }
+ r = 0;
+ } else if (prefix == "osd crush class ls") {
+ boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
+ f->open_array_section("crush_classes");
+ for (auto i : osdmap.crush->class_name)
+ f->dump_string("class", i.second);
+ f->close_section();
+ f->flush(rdata);
+ } else if (prefix == "osd crush class ls-osd") {
+ string name;
+ cmd_getval(cmdmap, "class", name);
+ set<int> osds;
+ osdmap.crush->get_devices_by_class(name, &osds);
+ if (f) {
+ f->open_array_section("osds");
+ for (auto &osd: osds)
+ f->dump_int("osd", osd);
+ f->close_section();
+ f->flush(rdata);
+ } else {
+ bool first = true;
+ for (auto &osd : osds) {
+ if (!first)
+ ds << "\n";
+ first = false;
+ ds << osd;
+ }
+ rdata.append(ds);
+ }
+ } else if (prefix == "osd crush get-device-class") {
+ vector<string> idvec;
+ cmd_getval(cmdmap, "ids", idvec);
+ map<int, string> class_by_osd;
+ for (auto& id : idvec) {
+ ostringstream ts;
+ long osd = parse_osd_id(id.c_str(), &ts);
+ if (osd < 0) {
+ ss << "unable to parse osd id:'" << id << "'";
+ r = -EINVAL;
+ goto reply;
+ }
+ auto device_class = osdmap.crush->get_item_class(osd);
+ if (device_class)
+ class_by_osd[osd] = device_class;
+ else
+ class_by_osd[osd] = ""; // no class
+ }
+ if (f) {
+ f->open_array_section("osd_device_classes");
+ for (auto& i : class_by_osd) {
+ f->open_object_section("osd_device_class");
+ f->dump_int("osd", i.first);
+ f->dump_string("device_class", i.second);
+ f->close_section();
+ }
+ f->close_section();
+ f->flush(rdata);
+ } else {
+ if (class_by_osd.size() == 1) {
+ // for single input, make a clean output
+ ds << class_by_osd.begin()->second;
+ } else {
+ // note that we do not group osds by class here
+ for (auto it = class_by_osd.begin();
+ it != class_by_osd.end();
+ it++) {
+ ds << "osd." << it->first << ' ' << it->second;
+ if (next(it) != class_by_osd.end())
+ ds << '\n';
+ }
+ }
+ rdata.append(ds);
+ }
+ } else if (prefix == "osd erasure-code-profile ls") {
+ const auto &profiles = osdmap.get_erasure_code_profiles();
+ if (f)
+ f->open_array_section("erasure-code-profiles");
+ for (auto i = profiles.begin(); i != profiles.end(); ++i) {
+ if (f)
+ f->dump_string("profile", i->first.c_str());
+ else
+ rdata.append(i->first + "\n");
+ }
+ if (f) {
+ f->close_section();
+ ostringstream rs;
+ f->flush(rs);
+ rs << "\n";
+ rdata.append(rs.str());
+ }
+ } else if (prefix == "osd crush weight-set ls") {
+ boost::scoped_ptr<Formatter> f(Formatter::create(format));
+ if (f) {
+ f->open_array_section("weight_sets");
+ if (osdmap.crush->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS)) {
+ f->dump_string("pool", "(compat)");
+ }
+ for (auto& i : osdmap.crush->choose_args) {
+ if (i.first >= 0) {
+ f->dump_string("pool", osdmap.get_pool_name(i.first));
+ }
+ }
+ f->close_section();
+ f->flush(rdata);
+ } else {
+ ostringstream rs;
+ if (osdmap.crush->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS)) {
+ rs << "(compat)\n";
+ }
+ for (auto& i : osdmap.crush->choose_args) {
+ if (i.first >= 0) {
+ rs << osdmap.get_pool_name(i.first) << "\n";
+ }
+ }
+ rdata.append(rs.str());
+ }
+ } else if (prefix == "osd crush weight-set dump") {
+ boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty",
+ "json-pretty"));
+ osdmap.crush->dump_choose_args(f.get());
+ f->flush(rdata);
+ } else if (prefix == "osd erasure-code-profile get") {
+ string name;
+ cmd_getval(cmdmap, "name", name);
+ if (!osdmap.has_erasure_code_profile(name)) {
+ ss << "unknown erasure code profile '" << name << "'";
+ r = -ENOENT;
+ goto reply;
+ }
+ const map<string,string> &profile = osdmap.get_erasure_code_profile(name);
+ if (f)
+ f->open_object_section("profile");
+ for (map<string,string>::const_iterator i = profile.begin();
+ i != profile.end();
+ ++i) {
+ if (f)
+ f->dump_string(i->first.c_str(), i->second.c_str());
+ else
+ rdata.append(i->first + "=" + i->second + "\n");
+ }
+ if (f) {
+ f->close_section();
+ ostringstream rs;
+ f->flush(rs);
+ rs << "\n";
+ rdata.append(rs.str());
+ }
+ } else if (prefix == "osd pool application get") {
+ boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty",
+ "json-pretty"));
+ string pool_name;
+ cmd_getval(cmdmap, "pool", pool_name);
+ string app;
+ cmd_getval(cmdmap, "app", app);
+ string key;
+ cmd_getval(cmdmap, "key", key);
+
+ if (pool_name.empty()) {
+ // all
+ f->open_object_section("pools");
+ for (const auto &pool : osdmap.pools) {
+ std::string name("<unknown>");
+ const auto &pni = osdmap.pool_name.find(pool.first);
+ if (pni != osdmap.pool_name.end())
+ name = pni->second;
+ f->open_object_section(name.c_str());
+ for (auto &app_pair : pool.second.application_metadata) {
+ f->open_object_section(app_pair.first.c_str());
+ for (auto &kv_pair : app_pair.second) {
+ f->dump_string(kv_pair.first.c_str(), kv_pair.second);
+ }
+ f->close_section();
+ }
+ f->close_section(); // name
+ }
+ f->close_section(); // pools
+ f->flush(rdata);
+ } else {
+ int64_t pool = osdmap.lookup_pg_pool_name(pool_name.c_str());
+ if (pool < 0) {
+ ss << "unrecognized pool '" << pool_name << "'";
+ r = -ENOENT;
+ goto reply;
+ }
+ auto p = osdmap.get_pg_pool(pool);
+ // filter by pool
+ if (app.empty()) {
+ f->open_object_section(pool_name.c_str());
+ for (auto &app_pair : p->application_metadata) {
+ f->open_object_section(app_pair.first.c_str());
+ for (auto &kv_pair : app_pair.second) {
+ f->dump_string(kv_pair.first.c_str(), kv_pair.second);
+ }
+ f->close_section(); // application
+ }
+ f->close_section(); // pool_name
+ f->flush(rdata);
+ goto reply;
+ }
+
+ auto app_it = p->application_metadata.find(app);
+ if (app_it == p->application_metadata.end()) {
+ ss << "pool '" << pool_name << "' has no application '" << app << "'";
+ r = -ENOENT;
+ goto reply;
+ }
+ // filter by pool + app
+ if (key.empty()) {
+ f->open_object_section(app_it->first.c_str());
+ for (auto &kv_pair : app_it->second) {
+ f->dump_string(kv_pair.first.c_str(), kv_pair.second);
+ }
+ f->close_section(); // application
+ f->flush(rdata);
+ goto reply;
+ }
+ // filter by pool + app + key
+ auto key_it = app_it->second.find(key);
+ if (key_it == app_it->second.end()) {
+ ss << "application '" << app << "' on pool '" << pool_name
+ << "' does not have key '" << key << "'";
+ r = -ENOENT;
+ goto reply;
+ }
+ ss << key_it->second << "\n";
+ rdata.append(ss.str());
+ ss.str("");
+ }
+ } else if (prefix == "osd get-require-min-compat-client") {
+ ss << osdmap.require_min_compat_client << std::endl;
+ rdata.append(ss.str());
+ ss.str("");
+ goto reply;
+ } else if (prefix == "osd pool application enable" ||
+ prefix == "osd pool application disable" ||
+ prefix == "osd pool application set" ||
+ prefix == "osd pool application rm") {
+ bool changed = false;
+ r = preprocess_command_pool_application(prefix, cmdmap, ss, &changed);
+ if (r != 0) {
+ // Error, reply.
+ goto reply;
+ } else if (changed) {
+ // Valid mutation, proceed to prepare phase
+ return false;
+ } else {
+ // Idempotent case, reply
+ goto reply;
+ }
+ } else {
+ // try prepare update
+ return false;
+ }
+
+ reply:
+ string rs;
+ getline(ss, rs);
+ mon.reply_command(op, r, rs, rdata, get_last_committed());
+ return true;
+}
+
+void OSDMonitor::set_pool_flags(int64_t pool_id, uint64_t flags)
+{
+ pg_pool_t *pool = pending_inc.get_new_pool(pool_id,
+ osdmap.get_pg_pool(pool_id));
+ ceph_assert(pool);
+ pool->set_flag(flags);
+}
+
+void OSDMonitor::clear_pool_flags(int64_t pool_id, uint64_t flags)
+{
+ pg_pool_t *pool = pending_inc.get_new_pool(pool_id,
+ osdmap.get_pg_pool(pool_id));
+ ceph_assert(pool);
+ pool->unset_flag(flags);
+}
+
+string OSDMonitor::make_purged_snap_epoch_key(epoch_t epoch)
+{
+ char k[80];
+ snprintf(k, sizeof(k), "purged_epoch_%08lx", (unsigned long)epoch);
+ return k;
+}
+
+string OSDMonitor::make_purged_snap_key(int64_t pool, snapid_t snap)
+{
+ char k[80];
+ snprintf(k, sizeof(k), "purged_snap_%llu_%016llx",
+ (unsigned long long)pool, (unsigned long long)snap);
+ return k;
+}
+
+string OSDMonitor::make_purged_snap_key_value(
+ int64_t pool, snapid_t snap, snapid_t num,
+ epoch_t epoch, bufferlist *v)
+{
+ // encode the *last* epoch in the key so that we can use forward
+ // iteration only to search for an epoch in an interval.
+ encode(snap, *v);
+ encode(snap + num, *v);
+ encode(epoch, *v);
+ return make_purged_snap_key(pool, snap + num - 1);
+}
+
+
+int OSDMonitor::lookup_purged_snap(
+ int64_t pool, snapid_t snap,
+ snapid_t *begin, snapid_t *end)
+{
+ string k = make_purged_snap_key(pool, snap);
+ auto it = mon.store->get_iterator(OSD_SNAP_PREFIX);
+ it->lower_bound(k);
+ if (!it->valid()) {
+ dout(20) << __func__
+ << " pool " << pool << " snap " << snap
+ << " - key '" << k << "' not found" << dendl;
+ return -ENOENT;
+ }
+ if (it->key().find("purged_snap_") != 0) {
+ dout(20) << __func__
+ << " pool " << pool << " snap " << snap
+ << " - key '" << k << "' got '" << it->key()
+ << "', wrong prefix" << dendl;
+ return -ENOENT;
+ }
+ string gotk = it->key();
+ const char *format = "purged_snap_%llu_";
+ long long int keypool;
+ int n = sscanf(gotk.c_str(), format, &keypool);
+ if (n != 1) {
+ derr << __func__ << " invalid k '" << gotk << "'" << dendl;
+ return -ENOENT;
+ }
+ if (pool != keypool) {
+ dout(20) << __func__
+ << " pool " << pool << " snap " << snap
+ << " - key '" << k << "' got '" << gotk
+ << "', wrong pool " << keypool
+ << dendl;
+ return -ENOENT;
+ }
+ bufferlist v = it->value();
+ auto p = v.cbegin();
+ decode(*begin, p);
+ decode(*end, p);
+ if (snap < *begin || snap >= *end) {
+ dout(20) << __func__
+ << " pool " << pool << " snap " << snap
+ << " - found [" << *begin << "," << *end << "), no overlap"
+ << dendl;
+ return -ENOENT;
+ }
+ return 0;
+}
+
+void OSDMonitor::insert_purged_snap_update(
+ int64_t pool,
+ snapid_t start, snapid_t end,
+ epoch_t epoch,
+ MonitorDBStore::TransactionRef t)
+{
+ snapid_t before_begin, before_end;
+ snapid_t after_begin, after_end;
+ int b = lookup_purged_snap(pool, start - 1,
+ &before_begin, &before_end);
+ int a = lookup_purged_snap(pool, end,
+ &after_begin, &after_end);
+ if (!b && !a) {
+ dout(10) << __func__
+ << " [" << start << "," << end << ") - joins ["
+ << before_begin << "," << before_end << ") and ["
+ << after_begin << "," << after_end << ")" << dendl;
+ // erase only the begin record; we'll overwrite the end one.
+ t->erase(OSD_SNAP_PREFIX, make_purged_snap_key(pool, before_end - 1));
+ bufferlist v;
+ string k = make_purged_snap_key_value(pool,
+ before_begin, after_end - before_begin,
+ pending_inc.epoch, &v);
+ t->put(OSD_SNAP_PREFIX, k, v);
+ } else if (!b) {
+ dout(10) << __func__
+ << " [" << start << "," << end << ") - join with earlier ["
+ << before_begin << "," << before_end << ")" << dendl;
+ t->erase(OSD_SNAP_PREFIX, make_purged_snap_key(pool, before_end - 1));
+ bufferlist v;
+ string k = make_purged_snap_key_value(pool,
+ before_begin, end - before_begin,
+ pending_inc.epoch, &v);
+ t->put(OSD_SNAP_PREFIX, k, v);
+ } else if (!a) {
+ dout(10) << __func__
+ << " [" << start << "," << end << ") - join with later ["
+ << after_begin << "," << after_end << ")" << dendl;
+ // overwrite after record
+ bufferlist v;
+ string k = make_purged_snap_key_value(pool,
+ start, after_end - start,
+ pending_inc.epoch, &v);
+ t->put(OSD_SNAP_PREFIX, k, v);
+ } else {
+ dout(10) << __func__
+ << " [" << start << "," << end << ") - new"
+ << dendl;
+ bufferlist v;
+ string k = make_purged_snap_key_value(pool,
+ start, end - start,
+ pending_inc.epoch, &v);
+ t->put(OSD_SNAP_PREFIX, k, v);
+ }
+}
+
+bool OSDMonitor::try_prune_purged_snaps()
+{
+ if (!mon.mgrstatmon()->is_readable()) {
+ return false;
+ }
+ if (!pending_inc.new_purged_snaps.empty()) {
+ return false; // we already pruned for this epoch
+ }
+
+ unsigned max_prune = cct->_conf.get_val<uint64_t>(
+ "mon_max_snap_prune_per_epoch");
+ if (!max_prune) {
+ max_prune = 100000;
+ }
+ dout(10) << __func__ << " max_prune " << max_prune << dendl;
+
+ unsigned actually_pruned = 0;
+ auto& purged_snaps = mon.mgrstatmon()->get_digest().purged_snaps;
+ for (auto& p : osdmap.get_pools()) {
+ auto q = purged_snaps.find(p.first);
+ if (q == purged_snaps.end()) {
+ continue;
+ }
+ auto& purged = q->second;
+ if (purged.empty()) {
+ dout(20) << __func__ << " " << p.first << " nothing purged" << dendl;
+ continue;
+ }
+ dout(20) << __func__ << " pool " << p.first << " purged " << purged << dendl;
+ snap_interval_set_t to_prune;
+ unsigned maybe_pruned = actually_pruned;
+ for (auto i = purged.begin(); i != purged.end(); ++i) {
+ snapid_t begin = i.get_start();
+ auto end = i.get_start() + i.get_len();
+ snapid_t pbegin = 0, pend = 0;
+ int r = lookup_purged_snap(p.first, begin, &pbegin, &pend);
+ if (r == 0) {
+ // already purged.
+ // be a bit aggressive about backing off here, because the mon may
+ // do a lot of work going through this set, and if we know the
+ // purged set from the OSDs is at least *partly* stale we may as
+ // well wait for it to be fresh.
+ dout(20) << __func__ << " we've already purged " << pbegin
+ << "~" << (pend - pbegin) << dendl;
+ break; // next pool
+ }
+ if (pbegin && pbegin > begin && pbegin < end) {
+ // the tail of [begin,end) is purged; shorten the range
+ end = pbegin;
+ }
+ to_prune.insert(begin, end - begin);
+ maybe_pruned += end - begin;
+ if (maybe_pruned >= max_prune) {
+ break;
+ }
+ }
+ if (!to_prune.empty()) {
+ // PGs may still be reporting things as purged that we have already
+ // pruned from removed_snaps_queue.
+ snap_interval_set_t actual;
+ auto r = osdmap.removed_snaps_queue.find(p.first);
+ if (r != osdmap.removed_snaps_queue.end()) {
+ actual.intersection_of(to_prune, r->second);
+ }
+ actually_pruned += actual.size();
+ dout(10) << __func__ << " pool " << p.first << " reports pruned " << to_prune
+ << ", actual pruned " << actual << dendl;
+ if (!actual.empty()) {
+ pending_inc.new_purged_snaps[p.first].swap(actual);
+ }
+ }
+ if (actually_pruned >= max_prune) {
+ break;
+ }
+ }
+ dout(10) << __func__ << " actually pruned " << actually_pruned << dendl;
+ return !!actually_pruned;
+}
+
+bool OSDMonitor::update_pools_status()
+{
+ if (!mon.mgrstatmon()->is_readable())
+ return false;
+
+ bool ret = false;
+
+ auto& pools = osdmap.get_pools();
+ for (auto it = pools.begin(); it != pools.end(); ++it) {
+ const pool_stat_t *pstat = mon.mgrstatmon()->get_pool_stat(it->first);
+ if (!pstat)
+ continue;
+ const object_stat_sum_t& sum = pstat->stats.sum;
+ const pg_pool_t &pool = it->second;
+ const string& pool_name = osdmap.get_pool_name(it->first);
+
+ bool pool_is_full =
+ (pool.quota_max_bytes > 0 && (uint64_t)sum.num_bytes >= pool.quota_max_bytes) ||
+ (pool.quota_max_objects > 0 && (uint64_t)sum.num_objects >= pool.quota_max_objects);
+
+ if (pool.has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
+ if (pool_is_full)
+ continue;
+
+ mon.clog->info() << "pool '" << pool_name
+ << "' no longer out of quota; removing NO_QUOTA flag";
+ // below we cancel FLAG_FULL too, we'll set it again in
+ // OSDMonitor::encode_pending if it still fails the osd-full checking.
+ clear_pool_flags(it->first,
+ pg_pool_t::FLAG_FULL_QUOTA | pg_pool_t::FLAG_FULL);
+ ret = true;
+ } else {
+ if (!pool_is_full)
+ continue;
+
+ if (pool.quota_max_bytes > 0 &&
+ (uint64_t)sum.num_bytes >= pool.quota_max_bytes) {
+ mon.clog->warn() << "pool '" << pool_name << "' is full"
+ << " (reached quota's max_bytes: "
+ << byte_u_t(pool.quota_max_bytes) << ")";
+ }
+ if (pool.quota_max_objects > 0 &&
+ (uint64_t)sum.num_objects >= pool.quota_max_objects) {
+ mon.clog->warn() << "pool '" << pool_name << "' is full"
+ << " (reached quota's max_objects: "
+ << pool.quota_max_objects << ")";
+ }
+ // set both FLAG_FULL_QUOTA and FLAG_FULL
+ // note that below we try to cancel FLAG_BACKFILLFULL/NEARFULL too
+ // since FLAG_FULL should always take precedence
+ set_pool_flags(it->first,
+ pg_pool_t::FLAG_FULL_QUOTA | pg_pool_t::FLAG_FULL);
+ clear_pool_flags(it->first,
+ pg_pool_t::FLAG_NEARFULL |
+ pg_pool_t::FLAG_BACKFILLFULL);
+ ret = true;
+ }
+ }
+ return ret;
+}
+
+int OSDMonitor::prepare_new_pool(MonOpRequestRef op)
+{
+ op->mark_osdmon_event(__func__);
+ auto m = op->get_req<MPoolOp>();
+ dout(10) << "prepare_new_pool from " << m->get_connection() << dendl;
+ MonSession *session = op->get_session();
+ if (!session)
+ return -EPERM;
+ string erasure_code_profile;
+ stringstream ss;
+ string rule_name;
+ bool bulk = false;
+ int ret = 0;
+ ret = prepare_new_pool(m->name, m->crush_rule, rule_name,
+ 0, 0, 0, 0, 0, 0, 0.0,
+ erasure_code_profile,
+ pg_pool_t::TYPE_REPLICATED, 0, FAST_READ_OFF, {}, bulk,
+ &ss);
+
+ if (ret < 0) {
+ dout(10) << __func__ << " got " << ret << " " << ss.str() << dendl;
+ }
+ return ret;
+}
+
+int OSDMonitor::crush_rename_bucket(const string& srcname,
+ const string& dstname,
+ ostream *ss)
+{
+ int ret;
+ //
+ // Avoid creating a pending crush if it does not already exists and
+ // the rename would fail.
+ //
+ if (!_have_pending_crush()) {
+ ret = _get_stable_crush().can_rename_bucket(srcname,
+ dstname,
+ ss);
+ if (ret)
+ return ret;
+ }
+
+ CrushWrapper newcrush;
+ _get_pending_crush(newcrush);
+
+ ret = newcrush.rename_bucket(srcname,
+ dstname,
+ ss);
+ if (ret)
+ return ret;
+
+ pending_inc.crush.clear();
+ newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
+ *ss << "renamed bucket " << srcname << " into " << dstname;
+ return 0;
+}
+
+void OSDMonitor::check_legacy_ec_plugin(const string& plugin, const string& profile) const
+{
+ string replacement = "";
+
+ if (plugin == "jerasure_generic" ||
+ plugin == "jerasure_sse3" ||
+ plugin == "jerasure_sse4" ||
+ plugin == "jerasure_neon") {
+ replacement = "jerasure";
+ } else if (plugin == "shec_generic" ||
+ plugin == "shec_sse3" ||
+ plugin == "shec_sse4" ||
+ plugin == "shec_neon") {
+ replacement = "shec";
+ }
+
+ if (replacement != "") {
+ dout(0) << "WARNING: erasure coding profile " << profile << " uses plugin "
+ << plugin << " that has been deprecated. Please use "
+ << replacement << " instead." << dendl;
+ }
+}
+
+int OSDMonitor::normalize_profile(const string& profilename,
+ ErasureCodeProfile &profile,
+ bool force,
+ ostream *ss)
+{
+ ErasureCodeInterfaceRef erasure_code;
+ ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
+ ErasureCodeProfile::const_iterator plugin = profile.find("plugin");
+ check_legacy_ec_plugin(plugin->second, profilename);
+ int err = instance.factory(plugin->second,
+ g_conf().get_val<std::string>("erasure_code_dir"),
+ profile, &erasure_code, ss);
+ if (err) {
+ return err;
+ }
+
+ err = erasure_code->init(profile, ss);
+ if (err) {
+ return err;
+ }
+
+ auto it = profile.find("stripe_unit");
+ if (it != profile.end()) {
+ string err_str;
+ uint32_t stripe_unit = strict_iecstrtoll(it->second.c_str(), &err_str);
+ if (!err_str.empty()) {
+ *ss << "could not parse stripe_unit '" << it->second
+ << "': " << err_str << std::endl;
+ return -EINVAL;
+ }
+ uint32_t data_chunks = erasure_code->get_data_chunk_count();
+ uint32_t chunk_size = erasure_code->get_chunk_size(stripe_unit * data_chunks);
+ if (chunk_size != stripe_unit) {
+ *ss << "stripe_unit " << stripe_unit << " does not match ec profile "
+ << "alignment. Would be padded to " << chunk_size
+ << std::endl;
+ return -EINVAL;
+ }
+ if ((stripe_unit % 4096) != 0 && !force) {
+ *ss << "stripe_unit should be a multiple of 4096 bytes for best performance."
+ << "use --force to override this check" << std::endl;
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
+
+int OSDMonitor::crush_rule_create_erasure(const string &name,
+ const string &profile,
+ int *rule,
+ ostream *ss)
+{
+ int ruleid = osdmap.crush->get_rule_id(name);
+ if (ruleid != -ENOENT) {
+ *rule = osdmap.crush->get_rule_mask_ruleset(ruleid);
+ return -EEXIST;
+ }
+
+ CrushWrapper newcrush;
+ _get_pending_crush(newcrush);
+
+ ruleid = newcrush.get_rule_id(name);
+ if (ruleid != -ENOENT) {
+ *rule = newcrush.get_rule_mask_ruleset(ruleid);
+ return -EALREADY;
+ } else {
+ ErasureCodeInterfaceRef erasure_code;
+ int err = get_erasure_code(profile, &erasure_code, ss);
+ if (err) {
+ *ss << "failed to load plugin using profile " << profile << std::endl;
+ return err;
+ }
+
+ err = erasure_code->create_rule(name, newcrush, ss);
+ erasure_code.reset();
+ if (err < 0)
+ return err;
+ *rule = err;
+ pending_inc.crush.clear();
+ newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
+ return 0;
+ }
+}
+
+int OSDMonitor::get_erasure_code(const string &erasure_code_profile,
+ ErasureCodeInterfaceRef *erasure_code,
+ ostream *ss) const
+{
+ if (pending_inc.has_erasure_code_profile(erasure_code_profile))
+ return -EAGAIN;
+ ErasureCodeProfile profile =
+ osdmap.get_erasure_code_profile(erasure_code_profile);
+ ErasureCodeProfile::const_iterator plugin =
+ profile.find("plugin");
+ if (plugin == profile.end()) {
+ *ss << "cannot determine the erasure code plugin"
+ << " because there is no 'plugin' entry in the erasure_code_profile "
+ << profile << std::endl;
+ return -EINVAL;
+ }
+ check_legacy_ec_plugin(plugin->second, erasure_code_profile);
+ auto& instance = ErasureCodePluginRegistry::instance();
+ return instance.factory(plugin->second,
+ g_conf().get_val<std::string>("erasure_code_dir"),
+ profile, erasure_code, ss);
+}
+
+int OSDMonitor::check_cluster_features(uint64_t features,
+ stringstream &ss)
+{
+ stringstream unsupported_ss;
+ int unsupported_count = 0;
+ if ((mon.get_quorum_con_features() & features) != features) {
+ unsupported_ss << "the monitor cluster";
+ ++unsupported_count;
+ }
+
+ set<int32_t> up_osds;
+ osdmap.get_up_osds(up_osds);
+ for (set<int32_t>::iterator it = up_osds.begin();
+ it != up_osds.end(); ++it) {
+ const osd_xinfo_t &xi = osdmap.get_xinfo(*it);
+ if ((xi.features & features) != features) {
+ if (unsupported_count > 0)
+ unsupported_ss << ", ";
+ unsupported_ss << "osd." << *it;
+ unsupported_count ++;
+ }
+ }
+
+ if (unsupported_count > 0) {
+ ss << "features " << features << " unsupported by: "
+ << unsupported_ss.str();
+ return -ENOTSUP;
+ }
+
+ // check pending osd state, too!
+ for (map<int32_t,osd_xinfo_t>::const_iterator p =
+ pending_inc.new_xinfo.begin();
+ p != pending_inc.new_xinfo.end(); ++p) {
+ const osd_xinfo_t &xi = p->second;
+ if ((xi.features & features) != features) {
+ dout(10) << __func__ << " pending osd." << p->first
+ << " features are insufficient; retry" << dendl;
+ return -EAGAIN;
+ }
+ }
+
+ return 0;
+}
+
+bool OSDMonitor::validate_crush_against_features(const CrushWrapper *newcrush,
+ stringstream& ss)
+{
+ OSDMap::Incremental new_pending = pending_inc;
+ encode(*newcrush, new_pending.crush, mon.get_quorum_con_features());
+ OSDMap newmap;
+ newmap.deepish_copy_from(osdmap);
+ newmap.apply_incremental(new_pending);
+
+ // client compat
+ if (newmap.require_min_compat_client != ceph_release_t::unknown) {
+ auto mv = newmap.get_min_compat_client();
+ if (mv > newmap.require_min_compat_client) {
+ ss << "new crush map requires client version " << mv
+ << " but require_min_compat_client is "
+ << newmap.require_min_compat_client;
+ return false;
+ }
+ }
+
+ // osd compat
+ uint64_t features =
+ newmap.get_features(CEPH_ENTITY_TYPE_MON, NULL) |
+ newmap.get_features(CEPH_ENTITY_TYPE_OSD, NULL);
+ stringstream features_ss;
+ int r = check_cluster_features(features, features_ss);
+ if (r) {
+ ss << "Could not change CRUSH: " << features_ss.str();
+ return false;
+ }
+
+ return true;
+}
+
+bool OSDMonitor::erasure_code_profile_in_use(
+ const mempool::osdmap::map<int64_t, pg_pool_t> &pools,
+ const string &profile,
+ ostream *ss)
+{
+ bool found = false;
+ for (map<int64_t, pg_pool_t>::const_iterator p = pools.begin();
+ p != pools.end();
+ ++p) {
+ if (p->second.erasure_code_profile == profile && p->second.is_erasure()) {
+ *ss << osdmap.pool_name[p->first] << " ";
+ found = true;
+ }
+ }
+ if (found) {
+ *ss << "pool(s) are using the erasure code profile '" << profile << "'";
+ }
+ return found;
+}
+
+int OSDMonitor::parse_erasure_code_profile(const vector<string> &erasure_code_profile,
+ map<string,string> *erasure_code_profile_map,
+ ostream *ss)
+{
+ int r = g_conf().with_val<string>("osd_pool_default_erasure_code_profile",
+ get_json_str_map,
+ *ss,
+ erasure_code_profile_map,
+ true);
+ if (r)
+ return r;
+ ceph_assert((*erasure_code_profile_map).count("plugin"));
+ string default_plugin = (*erasure_code_profile_map)["plugin"];
+ map<string,string> user_map;
+ for (vector<string>::const_iterator i = erasure_code_profile.begin();
+ i != erasure_code_profile.end();
+ ++i) {
+ size_t equal = i->find('=');
+ if (equal == string::npos) {
+ user_map[*i] = string();
+ (*erasure_code_profile_map)[*i] = string();
+ } else {
+ const string key = i->substr(0, equal);
+ equal++;
+ const string value = i->substr(equal);
+ if (key.find("ruleset-") == 0) {
+ *ss << "property '" << key << "' is no longer supported; try "
+ << "'crush-" << key.substr(8) << "' instead";
+ return -EINVAL;
+ }
+ user_map[key] = value;
+ (*erasure_code_profile_map)[key] = value;
+ }
+ }
+
+ if (user_map.count("plugin") && user_map["plugin"] != default_plugin)
+ (*erasure_code_profile_map) = user_map;
+
+ return 0;
+}
+
+int OSDMonitor::prepare_pool_size(const unsigned pool_type,
+ const string &erasure_code_profile,
+ uint8_t repl_size,
+ unsigned *size, unsigned *min_size,
+ ostream *ss)
+{
+ int err = 0;
+ bool set_min_size = false;
+ switch (pool_type) {
+ case pg_pool_t::TYPE_REPLICATED:
+ if (osdmap.stretch_mode_enabled) {
+ if (repl_size == 0)
+ repl_size = g_conf().get_val<uint64_t>("mon_stretch_pool_size");
+ if (repl_size != g_conf().get_val<uint64_t>("mon_stretch_pool_size")) {
+ *ss << "prepare_pool_size: we are in stretch mode but size "
+ << repl_size << " does not match!";
+ return -EINVAL;
+ }
+ *min_size = g_conf().get_val<uint64_t>("mon_stretch_pool_min_size");
+ set_min_size = true;
+ }
+ if (repl_size == 0) {
+ repl_size = g_conf().get_val<uint64_t>("osd_pool_default_size");
+ }
+ *size = repl_size;
+ if (!set_min_size)
+ *min_size = g_conf().get_osd_pool_default_min_size(repl_size);
+ break;
+ case pg_pool_t::TYPE_ERASURE:
+ {
+ if (osdmap.stretch_mode_enabled) {
+ *ss << "prepare_pool_size: we are in stretch mode; cannot create EC pools!";
+ return -EINVAL;
+ }
+ ErasureCodeInterfaceRef erasure_code;
+ err = get_erasure_code(erasure_code_profile, &erasure_code, ss);
+ if (err == 0) {
+ *size = erasure_code->get_chunk_count();
+ *min_size =
+ erasure_code->get_data_chunk_count() +
+ std::min<int>(1, erasure_code->get_coding_chunk_count() - 1);
+ assert(*min_size <= *size);
+ assert(*min_size >= erasure_code->get_data_chunk_count());
+ }
+ }
+ break;
+ default:
+ *ss << "prepare_pool_size: " << pool_type << " is not a known pool type";
+ err = -EINVAL;
+ break;
+ }
+ return err;
+}
+
+int OSDMonitor::prepare_pool_stripe_width(const unsigned pool_type,
+ const string &erasure_code_profile,
+ uint32_t *stripe_width,
+ ostream *ss)
+{
+ int err = 0;
+ switch (pool_type) {
+ case pg_pool_t::TYPE_REPLICATED:
+ // ignored
+ break;
+ case pg_pool_t::TYPE_ERASURE:
+ {
+ ErasureCodeProfile profile =
+ osdmap.get_erasure_code_profile(erasure_code_profile);
+ ErasureCodeInterfaceRef erasure_code;
+ err = get_erasure_code(erasure_code_profile, &erasure_code, ss);
+ if (err)
+ break;
+ uint32_t data_chunks = erasure_code->get_data_chunk_count();
+ uint32_t stripe_unit = g_conf().get_val<Option::size_t>("osd_pool_erasure_code_stripe_unit");
+ auto it = profile.find("stripe_unit");
+ if (it != profile.end()) {
+ string err_str;
+ stripe_unit = strict_iecstrtoll(it->second.c_str(), &err_str);
+ ceph_assert(err_str.empty());
+ }
+ *stripe_width = data_chunks *
+ erasure_code->get_chunk_size(stripe_unit * data_chunks);
+ }
+ break;
+ default:
+ *ss << "prepare_pool_stripe_width: "
+ << pool_type << " is not a known pool type";
+ err = -EINVAL;
+ break;
+ }
+ return err;
+}
+
+int OSDMonitor::get_replicated_stretch_crush_rule()
+{
+ /* we don't write down the stretch rule anywhere, so
+ * we have to guess it. How? Look at all the pools
+ * and count up how many times a given rule is used
+ * on stretch pools and then return the one with
+ * the most users!
+ */
+ map<int,int> rule_counts;
+ for (const auto& pooli : osdmap.pools) {
+ const pg_pool_t& p = pooli.second;
+ if (p.is_replicated() && p.is_stretch_pool()) {
+ if (!rule_counts.count(p.crush_rule)) {
+ rule_counts[p.crush_rule] = 1;
+ } else {
+ ++rule_counts[p.crush_rule];
+ }
+ }
+ }
+
+ if (rule_counts.empty()) {
+ return -ENOENT;
+ }
+
+ int most_used_count = 0;
+ int most_used_rule = -1;
+ for (auto i : rule_counts) {
+ if (i.second > most_used_count) {
+ most_used_rule = i.first;
+ most_used_count = i.second;
+ }
+ }
+ ceph_assert(most_used_count > 0);
+ ceph_assert(most_used_rule >= 0);
+ return most_used_rule;
+}
+
+int OSDMonitor::prepare_pool_crush_rule(const unsigned pool_type,
+ const string &erasure_code_profile,
+ const string &rule_name,
+ int *crush_rule,
+ ostream *ss)
+{
+
+ if (*crush_rule < 0) {
+ switch (pool_type) {
+ case pg_pool_t::TYPE_REPLICATED:
+ {
+ if (rule_name == "") {
+ if (osdmap.stretch_mode_enabled) {
+ *crush_rule = get_replicated_stretch_crush_rule();
+ } else {
+ // Use default rule
+ *crush_rule = osdmap.crush->get_osd_pool_default_crush_replicated_ruleset(cct);
+ }
+ if (*crush_rule < 0) {
+ // Errors may happen e.g. if no valid rule is available
+ *ss << "No suitable CRUSH rule exists, check "
+ << "'osd pool default crush *' config options";
+ return -ENOENT;
+ }
+ } else {
+ return get_crush_rule(rule_name, crush_rule, ss);
+ }
+ }
+ break;
+ case pg_pool_t::TYPE_ERASURE:
+ {
+ int err = crush_rule_create_erasure(rule_name,
+ erasure_code_profile,
+ crush_rule, ss);
+ switch (err) {
+ case -EALREADY:
+ dout(20) << "prepare_pool_crush_rule: rule "
+ << rule_name << " try again" << dendl;
+ // fall through
+ case 0:
+ // need to wait for the crush rule to be proposed before proceeding
+ err = -EAGAIN;
+ break;
+ case -EEXIST:
+ err = 0;
+ break;
+ }
+ return err;
+ }
+ break;
+ default:
+ *ss << "prepare_pool_crush_rule: " << pool_type
+ << " is not a known pool type";
+ return -EINVAL;
+ }
+ } else {
+ if (!osdmap.crush->ruleset_exists(*crush_rule)) {
+ *ss << "CRUSH rule " << *crush_rule << " not found";
+ return -ENOENT;
+ }
+ }
+
+ return 0;
+}
+
+int OSDMonitor::get_crush_rule(const string &rule_name,
+ int *crush_rule,
+ ostream *ss)
+{
+ int ret;
+ ret = osdmap.crush->get_rule_id(rule_name);
+ if (ret != -ENOENT) {
+ // found it, use it
+ *crush_rule = ret;
+ } else {
+ CrushWrapper newcrush;
+ _get_pending_crush(newcrush);
+
+ ret = newcrush.get_rule_id(rule_name);
+ if (ret != -ENOENT) {
+ // found it, wait for it to be proposed
+ dout(20) << __func__ << ": rule " << rule_name
+ << " try again" << dendl;
+ return -EAGAIN;
+ } else {
+ // Cannot find it , return error
+ *ss << "specified rule " << rule_name << " doesn't exist";
+ return ret;
+ }
+ }
+ return 0;
+}
+
+int OSDMonitor::check_pg_num(int64_t pool, int pg_num, int size, ostream *ss)
+{
+ auto max_pgs_per_osd = g_conf().get_val<uint64_t>("mon_max_pg_per_osd");
+ auto num_osds = std::max(osdmap.get_num_in_osds(), 3u); // assume min cluster size 3
+ auto max_pgs = max_pgs_per_osd * num_osds;
+ uint64_t projected = 0;
+ if (pool < 0) {
+ projected += pg_num * size;
+ }
+ for (const auto& i : osdmap.get_pools()) {
+ if (i.first == pool) {
+ projected += pg_num * size;
+ } else {
+ projected += i.second.get_pg_num_target() * i.second.get_size();
+ }
+ }
+ if (projected > max_pgs) {
+ if (pool >= 0) {
+ *ss << "pool id " << pool;
+ }
+ *ss << " pg_num " << pg_num << " size " << size
+ << " would mean " << projected
+ << " total pgs, which exceeds max " << max_pgs
+ << " (mon_max_pg_per_osd " << max_pgs_per_osd
+ << " * num_in_osds " << num_osds << ")";
+ return -ERANGE;
+ }
+ return 0;
+}
+
+/**
+ * @param name The name of the new pool
+ * @param crush_rule The crush rule to use. If <0, will use the system default
+ * @param crush_rule_name The crush rule to use, if crush_rulset <0
+ * @param pg_num The pg_num to use. If set to 0, will use the system default
+ * @param pgp_num The pgp_num to use. If set to 0, will use the system default
+ * @param pg_num_min min pg_num
+ * @param pg_num_max max pg_num
+ * @param repl_size Replication factor, or 0 for default
+ * @param erasure_code_profile The profile name in OSDMap to be used for erasure code
+ * @param pool_type TYPE_ERASURE, or TYPE_REP
+ * @param expected_num_objects expected number of objects on the pool
+ * @param fast_read fast read type.
+ * @param ss human readable error message, if any.
+ *
+ * @return 0 on success, negative errno on failure.
+ */
+int OSDMonitor::prepare_new_pool(string& name,
+ int crush_rule,
+ const string &crush_rule_name,
+ unsigned pg_num, unsigned pgp_num,
+ unsigned pg_num_min,
+ unsigned pg_num_max,
+ const uint64_t repl_size,
+ const uint64_t target_size_bytes,
+ const float target_size_ratio,
+ const string &erasure_code_profile,
+ const unsigned pool_type,
+ const uint64_t expected_num_objects,
+ FastReadType fast_read,
+ const string& pg_autoscale_mode,
+ bool bulk,
+ ostream *ss)
+{
+ if (name.length() == 0)
+ return -EINVAL;
+ if (pg_num == 0)
+ pg_num = g_conf().get_val<uint64_t>("osd_pool_default_pg_num");
+ if (pgp_num == 0)
+ pgp_num = g_conf().get_val<uint64_t>("osd_pool_default_pgp_num");
+ if (!pgp_num)
+ pgp_num = pg_num;
+ if (pg_num > g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) {
+ *ss << "'pg_num' must be greater than 0 and less than or equal to "
+ << g_conf().get_val<uint64_t>("mon_max_pool_pg_num")
+ << " (you may adjust 'mon max pool pg num' for higher values)";
+ return -ERANGE;
+ }
+ if (pgp_num > pg_num) {
+ *ss << "'pgp_num' must be greater than 0 and lower or equal than 'pg_num'"
+ << ", which in this case is " << pg_num;
+ return -ERANGE;
+ }
+ if (pool_type == pg_pool_t::TYPE_REPLICATED && fast_read == FAST_READ_ON) {
+ *ss << "'fast_read' can only apply to erasure coding pool";
+ return -EINVAL;
+ }
+ int r;
+ r = prepare_pool_crush_rule(pool_type, erasure_code_profile,
+ crush_rule_name, &crush_rule, ss);
+ if (r) {
+ dout(10) << "prepare_pool_crush_rule returns " << r << dendl;
+ return r;
+ }
+ if (g_conf()->mon_osd_crush_smoke_test) {
+ CrushWrapper newcrush;
+ _get_pending_crush(newcrush);
+ ostringstream err;
+ CrushTester tester(newcrush, err);
+ tester.set_min_x(0);
+ tester.set_max_x(50);
+ tester.set_rule(crush_rule);
+ auto start = ceph::coarse_mono_clock::now();
+ r = tester.test_with_fork(g_conf()->mon_lease);
+ auto duration = ceph::coarse_mono_clock::now() - start;
+ if (r < 0) {
+ dout(10) << "tester.test_with_fork returns " << r
+ << ": " << err.str() << dendl;
+ *ss << "crush test failed with " << r << ": " << err.str();
+ return r;
+ }
+ dout(10) << __func__ << " crush smoke test duration: "
+ << duration << dendl;
+ }
+ unsigned size, min_size;
+ r = prepare_pool_size(pool_type, erasure_code_profile, repl_size,
+ &size, &min_size, ss);
+ if (r) {
+ dout(10) << "prepare_pool_size returns " << r << dendl;
+ return r;
+ }
+ r = check_pg_num(-1, pg_num, size, ss);
+ if (r) {
+ dout(10) << "check_pg_num returns " << r << dendl;
+ return r;
+ }
+
+ if (!osdmap.crush->check_crush_rule(crush_rule, pool_type, size, *ss)) {
+ return -EINVAL;
+ }
+
+ uint32_t stripe_width = 0;
+ r = prepare_pool_stripe_width(pool_type, erasure_code_profile, &stripe_width, ss);
+ if (r) {
+ dout(10) << "prepare_pool_stripe_width returns " << r << dendl;
+ return r;
+ }
+
+ bool fread = false;
+ if (pool_type == pg_pool_t::TYPE_ERASURE) {
+ switch (fast_read) {
+ case FAST_READ_OFF:
+ fread = false;
+ break;
+ case FAST_READ_ON:
+ fread = true;
+ break;
+ case FAST_READ_DEFAULT:
+ fread = g_conf()->osd_pool_default_ec_fast_read;
+ break;
+ default:
+ *ss << "invalid fast_read setting: " << fast_read;
+ return -EINVAL;
+ }
+ }
+
+ for (map<int64_t,string>::iterator p = pending_inc.new_pool_names.begin();
+ p != pending_inc.new_pool_names.end();
+ ++p) {
+ if (p->second == name)
+ return 0;
+ }
+
+ if (-1 == pending_inc.new_pool_max)
+ pending_inc.new_pool_max = osdmap.pool_max;
+ int64_t pool = ++pending_inc.new_pool_max;
+ pg_pool_t empty;
+ pg_pool_t *pi = pending_inc.get_new_pool(pool, &empty);
+ pi->create_time = ceph_clock_now();
+ pi->type = pool_type;
+ pi->fast_read = fread;
+ pi->flags = g_conf()->osd_pool_default_flags;
+ if (bulk) {
+ pi->set_flag(pg_pool_t::FLAG_BULK);
+ } else if (g_conf()->osd_pool_default_flag_bulk) {
+ pi->set_flag(pg_pool_t::FLAG_BULK);
+ }
+ if (g_conf()->osd_pool_default_flag_hashpspool)
+ pi->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
+ if (g_conf()->osd_pool_default_flag_nodelete)
+ pi->set_flag(pg_pool_t::FLAG_NODELETE);
+ if (g_conf()->osd_pool_default_flag_nopgchange)
+ pi->set_flag(pg_pool_t::FLAG_NOPGCHANGE);
+ if (g_conf()->osd_pool_default_flag_nosizechange)
+ pi->set_flag(pg_pool_t::FLAG_NOSIZECHANGE);
+ pi->set_flag(pg_pool_t::FLAG_CREATING);
+ if (g_conf()->osd_pool_use_gmt_hitset)
+ pi->use_gmt_hitset = true;
+ else
+ pi->use_gmt_hitset = false;
+
+ pi->size = size;
+ pi->min_size = min_size;
+ pi->crush_rule = crush_rule;
+ pi->expected_num_objects = expected_num_objects;
+ pi->object_hash = CEPH_STR_HASH_RJENKINS;
+ if (osdmap.stretch_mode_enabled) {
+ pi->peering_crush_bucket_count = osdmap.stretch_bucket_count;
+ pi->peering_crush_bucket_target = osdmap.stretch_bucket_count;
+ pi->peering_crush_bucket_barrier = osdmap.stretch_mode_bucket;
+ pi->peering_crush_mandatory_member = CRUSH_ITEM_NONE;
+ if (osdmap.degraded_stretch_mode) {
+ pi->peering_crush_bucket_count = osdmap.degraded_stretch_mode;
+ pi->peering_crush_bucket_target = osdmap.degraded_stretch_mode;
+ // pi->peering_crush_bucket_mandatory_member = CRUSH_ITEM_NONE;
+ // TODO: drat, we don't record this ^ anywhere, though given that it
+ // necessarily won't exist elsewhere it likely doesn't matter
+ pi->min_size = pi->min_size / 2;
+ pi->size = pi->size / 2; // only support 2 zones now
+ }
+ }
+
+ if (auto m = pg_pool_t::get_pg_autoscale_mode_by_name(
+ g_conf().get_val<string>("osd_pool_default_pg_autoscale_mode"));
+ m != pg_pool_t::pg_autoscale_mode_t::UNKNOWN) {
+ pi->pg_autoscale_mode = m;
+ } else {
+ pi->pg_autoscale_mode = pg_pool_t::pg_autoscale_mode_t::OFF;
+ }
+ auto max = g_conf().get_val<int64_t>("mon_osd_max_initial_pgs");
+ pi->set_pg_num(
+ max > 0 ? std::min<uint64_t>(pg_num, std::max<int64_t>(1, max))
+ : pg_num);
+ pi->set_pg_num_pending(pi->get_pg_num());
+ pi->set_pg_num_target(pg_num);
+ pi->set_pgp_num(pi->get_pg_num());
+ pi->set_pgp_num_target(pgp_num);
+ if (osdmap.require_osd_release >= ceph_release_t::nautilus &&
+ pg_num_min) {
+ pi->opts.set(pool_opts_t::PG_NUM_MIN, static_cast<int64_t>(pg_num_min));
+ }
+ if (osdmap.require_osd_release >= ceph_release_t::pacific &&
+ pg_num_max) {
+ pi->opts.set(pool_opts_t::PG_NUM_MAX, static_cast<int64_t>(pg_num_max));
+ }
+ if (auto m = pg_pool_t::get_pg_autoscale_mode_by_name(
+ pg_autoscale_mode); m != pg_pool_t::pg_autoscale_mode_t::UNKNOWN) {
+ pi->pg_autoscale_mode = m;
+ }
+
+ pi->last_change = pending_inc.epoch;
+ pi->auid = 0;
+
+ if (pool_type == pg_pool_t::TYPE_ERASURE) {
+ pi->erasure_code_profile = erasure_code_profile;
+ } else {
+ pi->erasure_code_profile = "";
+ }
+ pi->stripe_width = stripe_width;
+
+ if (osdmap.require_osd_release >= ceph_release_t::nautilus &&
+ target_size_bytes) {
+ // only store for nautilus+ because TARGET_SIZE_BYTES may be
+ // larger than int32_t max.
+ pi->opts.set(pool_opts_t::TARGET_SIZE_BYTES, static_cast<int64_t>(target_size_bytes));
+ }
+ if (target_size_ratio > 0.0 &&
+ osdmap.require_osd_release >= ceph_release_t::nautilus) {
+ // only store for nautilus+, just to be consistent and tidy.
+ pi->opts.set(pool_opts_t::TARGET_SIZE_RATIO, target_size_ratio);
+ }
+
+ pi->cache_target_dirty_ratio_micro =
+ g_conf()->osd_pool_default_cache_target_dirty_ratio * 1000000;
+ pi->cache_target_dirty_high_ratio_micro =
+ g_conf()->osd_pool_default_cache_target_dirty_high_ratio * 1000000;
+ pi->cache_target_full_ratio_micro =
+ g_conf()->osd_pool_default_cache_target_full_ratio * 1000000;
+ pi->cache_min_flush_age = g_conf()->osd_pool_default_cache_min_flush_age;
+ pi->cache_min_evict_age = g_conf()->osd_pool_default_cache_min_evict_age;
+
+ pending_inc.new_pool_names[pool] = name;
+ return 0;
+}
+
+bool OSDMonitor::prepare_set_flag(MonOpRequestRef op, int flag)
+{
+ op->mark_osdmon_event(__func__);
+ ostringstream ss;
+ if (pending_inc.new_flags < 0)
+ pending_inc.new_flags = osdmap.get_flags();
+ pending_inc.new_flags |= flag;
+ ss << OSDMap::get_flag_string(flag) << " is set";
+ wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
+ get_last_committed() + 1));
+ return true;
+}
+
+bool OSDMonitor::prepare_unset_flag(MonOpRequestRef op, int flag)
+{
+ op->mark_osdmon_event(__func__);
+ ostringstream ss;
+ if (pending_inc.new_flags < 0)
+ pending_inc.new_flags = osdmap.get_flags();
+ pending_inc.new_flags &= ~flag;
+ ss << OSDMap::get_flag_string(flag) << " is unset";
+ wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
+ get_last_committed() + 1));
+ return true;
+}
+
+int OSDMonitor::prepare_command_pool_set(const cmdmap_t& cmdmap,
+ stringstream& ss)
+{
+ string poolstr;
+ cmd_getval(cmdmap, "pool", poolstr);
+ int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
+ if (pool < 0) {
+ ss << "unrecognized pool '" << poolstr << "'";
+ return -ENOENT;
+ }
+ string var;
+ cmd_getval(cmdmap, "var", var);
+
+ pg_pool_t p = *osdmap.get_pg_pool(pool);
+ if (pending_inc.new_pools.count(pool))
+ p = pending_inc.new_pools[pool];
+
+ // accept val as a json string in the normal case (current
+ // generation monitor). parse out int or float values from the
+ // string as needed. however, if it is not a string, try to pull
+ // out an int, in case an older monitor with an older json schema is
+ // forwarding a request.
+ string val;
+ string interr, floaterr;
+ int64_t n = 0;
+ double f = 0;
+ int64_t uf = 0; // micro-f
+ cmd_getval(cmdmap, "val", val);
+
+ auto si_options = {
+ "target_max_objects"
+ };
+ auto iec_options = {
+ "target_max_bytes",
+ "target_size_bytes",
+ "compression_max_blob_size",
+ "compression_min_blob_size",
+ "csum_max_block",
+ "csum_min_block",
+ };
+ if (count(begin(si_options), end(si_options), var)) {
+ n = strict_si_cast<int64_t>(val.c_str(), &interr);
+ } else if (count(begin(iec_options), end(iec_options), var)) {
+ n = strict_iec_cast<int64_t>(val.c_str(), &interr);
+ } else {
+ // parse string as both int and float; different fields use different types.
+ n = strict_strtoll(val.c_str(), 10, &interr);
+ f = strict_strtod(val.c_str(), &floaterr);
+ uf = llrintl(f * (double)1000000.0);
+ }
+
+ if (!p.is_tier() &&
+ (var == "hit_set_type" || var == "hit_set_period" ||
+ var == "hit_set_count" || var == "hit_set_fpp" ||
+ var == "target_max_objects" || var == "target_max_bytes" ||
+ var == "cache_target_full_ratio" || var == "cache_target_dirty_ratio" ||
+ var == "cache_target_dirty_high_ratio" || var == "use_gmt_hitset" ||
+ var == "cache_min_flush_age" || var == "cache_min_evict_age" ||
+ var == "hit_set_grade_decay_rate" || var == "hit_set_search_last_n" ||
+ var == "min_read_recency_for_promote" || var == "min_write_recency_for_promote")) {
+ return -EACCES;
+ }
+
+ if (var == "size") {
+ if (p.has_flag(pg_pool_t::FLAG_NOSIZECHANGE)) {
+ ss << "pool size change is disabled; you must unset nosizechange flag for the pool first";
+ return -EPERM;
+ }
+ if (p.type == pg_pool_t::TYPE_ERASURE) {
+ ss << "can not change the size of an erasure-coded pool";
+ return -ENOTSUP;
+ }
+ if (interr.length()) {
+ ss << "error parsing integer value '" << val << "': " << interr;
+ return -EINVAL;
+ }
+ if (n <= 0 || n > 10) {
+ ss << "pool size must be between 1 and 10";
+ return -EINVAL;
+ }
+ if (n == 1) {
+ if (!g_conf().get_val<bool>("mon_allow_pool_size_one")) {
+ ss << "configuring pool size as 1 is disabled by default.";
+ return -EPERM;
+ }
+ bool sure = false;
+ cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
+ if (!sure) { ss << "WARNING: setting pool size 1 could lead to data loss "
+ "without recovery. If you are *ABSOLUTELY CERTAIN* that is what you want, "
+ "pass the flag --yes-i-really-mean-it.";
+ return -EPERM;
+ }
+ }
+ if (!osdmap.crush->check_crush_rule(p.get_crush_rule(), p.type, n, ss)) {
+ return -EINVAL;
+ }
+ int r = check_pg_num(pool, p.get_pg_num(), n, &ss);
+ if (r < 0) {
+ return r;
+ }
+ p.size = n;
+ p.min_size = g_conf().get_osd_pool_default_min_size(p.size);
+ } else if (var == "min_size") {
+ if (p.has_flag(pg_pool_t::FLAG_NOSIZECHANGE)) {
+ ss << "pool min size change is disabled; you must unset nosizechange flag for the pool first";
+ return -EPERM;
+ }
+ if (interr.length()) {
+ ss << "error parsing integer value '" << val << "': " << interr;
+ return -EINVAL;
+ }
+
+ if (p.type != pg_pool_t::TYPE_ERASURE) {
+ if (n < 1 || n > p.size) {
+ ss << "pool min_size must be between 1 and size, which is set to " << (int)p.size;
+ return -EINVAL;
+ }
+ } else {
+ ErasureCodeInterfaceRef erasure_code;
+ int k;
+ stringstream tmp;
+ int err = get_erasure_code(p.erasure_code_profile, &erasure_code, &tmp);
+ if (err == 0) {
+ k = erasure_code->get_data_chunk_count();
+ } else {
+ ss << __func__ << " get_erasure_code failed: " << tmp.str();
+ return err;
+ }
+
+ if (n < k || n > p.size) {
+ ss << "pool min_size must be between " << k << " and size, which is set to " << (int)p.size;
+ return -EINVAL;
+ }
+ }
+ p.min_size = n;
+ } else if (var == "pg_num_actual") {
+ if (interr.length()) {
+ ss << "error parsing integer value '" << val << "': " << interr;
+ return -EINVAL;
+ }
+ if (n == (int)p.get_pg_num()) {
+ return 0;
+ }
+ if (static_cast<uint64_t>(n) > g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) {
+ ss << "'pg_num' must be greater than 0 and less than or equal to "
+ << g_conf().get_val<uint64_t>("mon_max_pool_pg_num")
+ << " (you may adjust 'mon max pool pg num' for higher values)";
+ return -ERANGE;
+ }
+ if (p.has_flag(pg_pool_t::FLAG_CREATING)) {
+ ss << "cannot adjust pg_num while initial PGs are being created";
+ return -EBUSY;
+ }
+ if (n > (int)p.get_pg_num()) {
+ if (p.get_pg_num() != p.get_pg_num_pending()) {
+ // force pre-nautilus clients to resend their ops, since they
+ // don't understand pg_num_pending changes form a new interval
+ p.last_force_op_resend_prenautilus = pending_inc.epoch;
+ }
+ p.set_pg_num(n);
+ } else {
+ if (osdmap.require_osd_release < ceph_release_t::nautilus) {
+ ss << "nautilus OSDs are required to adjust pg_num_pending";
+ return -EPERM;
+ }
+ if (n < (int)p.get_pgp_num()) {
+ ss << "specified pg_num " << n << " < pgp_num " << p.get_pgp_num();
+ return -EINVAL;
+ }
+ if (n < (int)p.get_pg_num() - 1) {
+ ss << "specified pg_num " << n << " < pg_num (" << p.get_pg_num()
+ << ") - 1; only single pg decrease is currently supported";
+ return -EINVAL;
+ }
+ p.set_pg_num_pending(n);
+ // force pre-nautilus clients to resend their ops, since they
+ // don't understand pg_num_pending changes form a new interval
+ p.last_force_op_resend_prenautilus = pending_inc.epoch;
+ }
+ // force pre-luminous clients to resend their ops, since they
+ // don't understand that split PGs now form a new interval.
+ p.last_force_op_resend_preluminous = pending_inc.epoch;
+ } else if (var == "pg_num") {
+ if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
+ ss << "pool pg_num change is disabled; you must unset nopgchange flag for the pool first";
+ return -EPERM;
+ }
+ if (interr.length()) {
+ ss << "error parsing integer value '" << val << "': " << interr;
+ return -EINVAL;
+ }
+ if (n == (int)p.get_pg_num_target()) {
+ return 0;
+ }
+ if (n <= 0 || static_cast<uint64_t>(n) >
+ g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) {
+ ss << "'pg_num' must be greater than 0 and less than or equal to "
+ << g_conf().get_val<uint64_t>("mon_max_pool_pg_num")
+ << " (you may adjust 'mon max pool pg num' for higher values)";
+ return -ERANGE;
+ }
+ if (n > (int)p.get_pg_num_target()) {
+ int r = check_pg_num(pool, n, p.get_size(), &ss);
+ if (r) {
+ return r;
+ }
+ bool force = false;
+ cmd_getval(cmdmap, "yes_i_really_mean_it", force);
+ if (p.cache_mode != pg_pool_t::CACHEMODE_NONE && !force) {
+ ss << "splits in cache pools must be followed by scrubs and leave sufficient free space to avoid overfilling. use --yes-i-really-mean-it to force.";
+ return -EPERM;
+ }
+ } else {
+ if (osdmap.require_osd_release < ceph_release_t::nautilus) {
+ ss << "nautilus OSDs are required to decrease pg_num";
+ return -EPERM;
+ }
+ }
+ int64_t pg_min = 0, pg_max = 0;
+ p.opts.get(pool_opts_t::PG_NUM_MIN, &pg_min);
+ p.opts.get(pool_opts_t::PG_NUM_MAX, &pg_max);
+ if (pg_min && n < pg_min) {
+ ss << "specified pg_num " << n
+ << " < pg_num_min " << pg_min;
+ return -EINVAL;
+ }
+ if (pg_max && n > pg_max) {
+ ss << "specified pg_num " << n
+ << " < pg_num_max " << pg_max;
+ return -EINVAL;
+ }
+ if (osdmap.require_osd_release < ceph_release_t::nautilus) {
+ // pre-nautilus osdmap format; increase pg_num directly
+ assert(n > (int)p.get_pg_num());
+ // force pre-nautilus clients to resend their ops, since they
+ // don't understand pg_num_target changes form a new interval
+ p.last_force_op_resend_prenautilus = pending_inc.epoch;
+ // force pre-luminous clients to resend their ops, since they
+ // don't understand that split PGs now form a new interval.
+ p.last_force_op_resend_preluminous = pending_inc.epoch;
+ p.set_pg_num(n);
+ } else {
+ // set targets; mgr will adjust pg_num_actual and pgp_num later.
+ // make pgp_num track pg_num if it already matches. if it is set
+ // differently, leave it different and let the user control it
+ // manually.
+ if (p.get_pg_num_target() == p.get_pgp_num_target()) {
+ p.set_pgp_num_target(n);
+ }
+ p.set_pg_num_target(n);
+ }
+ } else if (var == "pgp_num_actual") {
+ if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
+ ss << "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
+ return -EPERM;
+ }
+ if (interr.length()) {
+ ss << "error parsing integer value '" << val << "': " << interr;
+ return -EINVAL;
+ }
+ if (n <= 0) {
+ ss << "specified pgp_num must > 0, but you set to " << n;
+ return -EINVAL;
+ }
+ if (n > (int)p.get_pg_num()) {
+ ss << "specified pgp_num " << n << " > pg_num " << p.get_pg_num();
+ return -EINVAL;
+ }
+ if (n > (int)p.get_pg_num_pending()) {
+ ss << "specified pgp_num " << n
+ << " > pg_num_pending " << p.get_pg_num_pending();
+ return -EINVAL;
+ }
+ p.set_pgp_num(n);
+ } else if (var == "pgp_num") {
+ if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
+ ss << "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
+ return -EPERM;
+ }
+ if (interr.length()) {
+ ss << "error parsing integer value '" << val << "': " << interr;
+ return -EINVAL;
+ }
+ if (n <= 0) {
+ ss << "specified pgp_num must > 0, but you set to " << n;
+ return -EINVAL;
+ }
+ if (n > (int)p.get_pg_num_target()) {
+ ss << "specified pgp_num " << n << " > pg_num " << p.get_pg_num_target();
+ return -EINVAL;
+ }
+ if (osdmap.require_osd_release < ceph_release_t::nautilus) {
+ // pre-nautilus osdmap format; increase pgp_num directly
+ p.set_pgp_num(n);
+ } else {
+ p.set_pgp_num_target(n);
+ }
+ } else if (var == "pg_autoscale_mode") {
+ auto m = pg_pool_t::get_pg_autoscale_mode_by_name(val);
+ if (m == pg_pool_t::pg_autoscale_mode_t::UNKNOWN) {
+ ss << "specified invalid mode " << val;
+ return -EINVAL;
+ }
+ if (osdmap.require_osd_release < ceph_release_t::nautilus) {
+ ss << "must set require_osd_release to nautilus or later before setting pg_autoscale_mode";
+ return -EINVAL;
+ }
+ p.pg_autoscale_mode = m;
+ } else if (var == "crush_rule") {
+ int id = osdmap.crush->get_rule_id(val);
+ if (id == -ENOENT) {
+ ss << "crush rule " << val << " does not exist";
+ return -ENOENT;
+ }
+ if (id < 0) {
+ ss << cpp_strerror(id);
+ return -ENOENT;
+ }
+ if (!osdmap.crush->check_crush_rule(id, p.get_type(), p.get_size(), ss)) {
+ return -EINVAL;
+ }
+ p.crush_rule = id;
+ } else if (var == "nodelete" || var == "nopgchange" ||
+ var == "nosizechange" || var == "write_fadvise_dontneed" ||
+ var == "noscrub" || var == "nodeep-scrub" || var == "bulk") {
+ uint64_t flag = pg_pool_t::get_flag_by_name(var);
+ // make sure we only compare against 'n' if we didn't receive a string
+ if (val == "true" || (interr.empty() && n == 1)) {
+ p.set_flag(flag);
+ } else if (val == "false" || (interr.empty() && n == 0)) {
+ p.unset_flag(flag);
+ } else {
+ ss << "expecting value 'true', 'false', '0', or '1'";
+ return -EINVAL;
+ }
+ } else if (var == "hashpspool") {
+ uint64_t flag = pg_pool_t::get_flag_by_name(var);
+ bool force = false;
+ cmd_getval(cmdmap, "yes_i_really_mean_it", force);
+
+ if (!force) {
+ ss << "are you SURE? this will remap all placement groups in this pool,"
+ " this triggers large data movement,"
+ " pass --yes-i-really-mean-it if you really do.";
+ return -EPERM;
+ }
+ // make sure we only compare against 'n' if we didn't receive a string
+ if (val == "true" || (interr.empty() && n == 1)) {
+ p.set_flag(flag);
+ } else if (val == "false" || (interr.empty() && n == 0)) {
+ p.unset_flag(flag);
+ } else {
+ ss << "expecting value 'true', 'false', '0', or '1'";
+ return -EINVAL;
+ }
+ } else if (var == "hit_set_type") {
+ if (val == "none")
+ p.hit_set_params = HitSet::Params();
+ else {
+ int err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
+ if (err)
+ return err;
+ if (val == "bloom") {
+ BloomHitSet::Params *bsp = new BloomHitSet::Params;
+ bsp->set_fpp(g_conf().get_val<double>("osd_pool_default_hit_set_bloom_fpp"));
+ p.hit_set_params = HitSet::Params(bsp);
+ } else if (val == "explicit_hash")
+ p.hit_set_params = HitSet::Params(new ExplicitHashHitSet::Params);
+ else if (val == "explicit_object")
+ p.hit_set_params = HitSet::Params(new ExplicitObjectHitSet::Params);
+ else {
+ ss << "unrecognized hit_set type '" << val << "'";
+ return -EINVAL;
+ }
+ }
+ } else if (var == "hit_set_period") {
+ if (interr.length()) {
+ ss << "error parsing integer value '" << val << "': " << interr;
+ return -EINVAL;
+ } else if (n < 0) {
+ ss << "hit_set_period should be non-negative";
+ return -EINVAL;
+ }
+ p.hit_set_period = n;
+ } else if (var == "hit_set_count") {
+ if (interr.length()) {
+ ss << "error parsing integer value '" << val << "': " << interr;
+ return -EINVAL;
+ } else if (n < 0) {
+ ss << "hit_set_count should be non-negative";
+ return -EINVAL;
+ }
+ p.hit_set_count = n;
+ } else if (var == "hit_set_fpp") {
+ if (floaterr.length()) {
+ ss << "error parsing floating point value '" << val << "': " << floaterr;
+ return -EINVAL;
+ } else if (f < 0 || f > 1.0) {
+ ss << "hit_set_fpp should be in the range 0..1";
+ return -EINVAL;
+ }
+ if (p.hit_set_params.get_type() != HitSet::TYPE_BLOOM) {
+ ss << "hit set is not of type Bloom; invalid to set a false positive rate!";
+ return -EINVAL;
+ }
+ BloomHitSet::Params *bloomp = static_cast<BloomHitSet::Params*>(p.hit_set_params.impl.get());
+ bloomp->set_fpp(f);
+ } else if (var == "use_gmt_hitset") {
+ if (val == "true" || (interr.empty() && n == 1)) {
+ p.use_gmt_hitset = true;
+ } else {
+ ss << "expecting value 'true' or '1'";
+ return -EINVAL;
+ }
+ } else if (var == "allow_ec_overwrites") {
+ if (!p.is_erasure()) {
+ ss << "ec overwrites can only be enabled for an erasure coded pool";
+ return -EINVAL;
+ }
+ stringstream err;
+ if (!g_conf()->mon_debug_no_require_bluestore_for_ec_overwrites &&
+ !is_pool_currently_all_bluestore(pool, p, &err)) {
+ ss << "pool must only be stored on bluestore for scrubbing to work: " << err.str();
+ return -EINVAL;
+ }
+ if (val == "true" || (interr.empty() && n == 1)) {
+ p.flags |= pg_pool_t::FLAG_EC_OVERWRITES;
+ } else if (val == "false" || (interr.empty() && n == 0)) {
+ ss << "ec overwrites cannot be disabled once enabled";
+ return -EINVAL;
+ } else {
+ ss << "expecting value 'true', 'false', '0', or '1'";
+ return -EINVAL;
+ }
+ } else if (var == "target_max_objects") {
+ if (interr.length()) {
+ ss << "error parsing int '" << val << "': " << interr;
+ return -EINVAL;
+ }
+ p.target_max_objects = n;
+ } else if (var == "target_max_bytes") {
+ if (interr.length()) {
+ ss << "error parsing int '" << val << "': " << interr;
+ return -EINVAL;
+ }
+ p.target_max_bytes = n;
+ } else if (var == "cache_target_dirty_ratio") {
+ if (floaterr.length()) {
+ ss << "error parsing float '" << val << "': " << floaterr;
+ return -EINVAL;
+ }
+ if (f < 0 || f > 1.0) {
+ ss << "value must be in the range 0..1";
+ return -ERANGE;
+ }
+ p.cache_target_dirty_ratio_micro = uf;
+ } else if (var == "cache_target_dirty_high_ratio") {
+ if (floaterr.length()) {
+ ss << "error parsing float '" << val << "': " << floaterr;
+ return -EINVAL;
+ }
+ if (f < 0 || f > 1.0) {
+ ss << "value must be in the range 0..1";
+ return -ERANGE;
+ }
+ p.cache_target_dirty_high_ratio_micro = uf;
+ } else if (var == "cache_target_full_ratio") {
+ if (floaterr.length()) {
+ ss << "error parsing float '" << val << "': " << floaterr;
+ return -EINVAL;
+ }
+ if (f < 0 || f > 1.0) {
+ ss << "value must be in the range 0..1";
+ return -ERANGE;
+ }
+ p.cache_target_full_ratio_micro = uf;
+ } else if (var == "cache_min_flush_age") {
+ if (interr.length()) {
+ ss << "error parsing int '" << val << "': " << interr;
+ return -EINVAL;
+ }
+ p.cache_min_flush_age = n;
+ } else if (var == "cache_min_evict_age") {
+ if (interr.length()) {
+ ss << "error parsing int '" << val << "': " << interr;
+ return -EINVAL;
+ }
+ p.cache_min_evict_age = n;
+ } else if (var == "min_read_recency_for_promote") {
+ if (interr.length()) {
+ ss << "error parsing integer value '" << val << "': " << interr;
+ return -EINVAL;
+ }
+ p.min_read_recency_for_promote = n;
+ } else if (var == "hit_set_grade_decay_rate") {
+ if (interr.length()) {
+ ss << "error parsing integer value '" << val << "': " << interr;
+ return -EINVAL;
+ }
+ if (n > 100 || n < 0) {
+ ss << "value out of range,valid range is 0 - 100";
+ return -EINVAL;
+ }
+ p.hit_set_grade_decay_rate = n;
+ } else if (var == "hit_set_search_last_n") {
+ if (interr.length()) {
+ ss << "error parsing integer value '" << val << "': " << interr;
+ return -EINVAL;
+ }
+ if (n > p.hit_set_count || n < 0) {
+ ss << "value out of range,valid range is 0 - hit_set_count";
+ return -EINVAL;
+ }
+ p.hit_set_search_last_n = n;
+ } else if (var == "min_write_recency_for_promote") {
+ if (interr.length()) {
+ ss << "error parsing integer value '" << val << "': " << interr;
+ return -EINVAL;
+ }
+ p.min_write_recency_for_promote = n;
+ } else if (var == "fast_read") {
+ if (p.is_replicated()) {
+ ss << "fast read is not supported in replication pool";
+ return -EINVAL;
+ }
+ if (val == "true" || (interr.empty() && n == 1)) {
+ p.fast_read = true;
+ } else if (val == "false" || (interr.empty() && n == 0)) {
+ p.fast_read = false;
+ } else {
+ ss << "expecting value 'true', 'false', '0', or '1'";
+ return -EINVAL;
+ }
+ } else if (pool_opts_t::is_opt_name(var)) {
+ bool unset = val == "unset";
+ if (var == "compression_mode") {
+ if (!unset) {
+ auto cmode = Compressor::get_comp_mode_type(val);
+ if (!cmode) {
+ ss << "unrecognized compression mode '" << val << "'";
+ return -EINVAL;
+ }
+ }
+ } else if (var == "compression_algorithm") {
+ if (!unset) {
+ auto alg = Compressor::get_comp_alg_type(val);
+ if (!alg) {
+ ss << "unrecognized compression_algorithm '" << val << "'";
+ return -EINVAL;
+ }
+ }
+ } else if (var == "compression_required_ratio") {
+ if (floaterr.length()) {
+ ss << "error parsing float value '" << val << "': " << floaterr;
+ return -EINVAL;
+ }
+ if (f < 0 || f > 1) {
+ ss << "compression_required_ratio is out of range (0-1): '" << val << "'";
+ return -EINVAL;
+ }
+ } else if (var == "csum_type") {
+ auto t = unset ? 0 : Checksummer::get_csum_string_type(val);
+ if (t < 0 ) {
+ ss << "unrecognized csum_type '" << val << "'";
+ return -EINVAL;
+ }
+ //preserve csum_type numeric value
+ n = t;
+ interr.clear();
+ } else if (var == "compression_max_blob_size" ||
+ var == "compression_min_blob_size" ||
+ var == "csum_max_block" ||
+ var == "csum_min_block") {
+ if (interr.length()) {
+ ss << "error parsing int value '" << val << "': " << interr;
+ return -EINVAL;
+ }
+ } else if (var == "fingerprint_algorithm") {
+ if (!unset) {
+ auto alg = pg_pool_t::get_fingerprint_from_str(val);
+ if (!alg) {
+ ss << "unrecognized fingerprint_algorithm '" << val << "'";
+ return -EINVAL;
+ }
+ }
+ } else if (var == "target_size_bytes") {
+ if (interr.length()) {
+ ss << "error parsing unit value '" << val << "': " << interr;
+ return -EINVAL;
+ }
+ if (osdmap.require_osd_release < ceph_release_t::nautilus) {
+ ss << "must set require_osd_release to nautilus or "
+ << "later before setting target_size_bytes";
+ return -EINVAL;
+ }
+ } else if (var == "target_size_ratio") {
+ if (f < 0.0) {
+ ss << "target_size_ratio cannot be negative";
+ return -EINVAL;
+ }
+ } else if (var == "pg_num_min") {
+ if (interr.length()) {
+ ss << "error parsing int value '" << val << "': " << interr;
+ return -EINVAL;
+ }
+ if (n > (int)p.get_pg_num_target()) {
+ ss << "specified pg_num_min " << n
+ << " > pg_num " << p.get_pg_num_target();
+ return -EINVAL;
+ }
+ } else if (var == "pg_num_max") {
+ if (interr.length()) {
+ ss << "error parsing int value '" << val << "': " << interr;
+ return -EINVAL;
+ }
+ if (n && n < (int)p.get_pg_num_target()) {
+ ss << "specified pg_num_max " << n
+ << " < pg_num " << p.get_pg_num_target();
+ return -EINVAL;
+ }
+ } else if (var == "recovery_priority") {
+ if (interr.length()) {
+ ss << "error parsing int value '" << val << "': " << interr;
+ return -EINVAL;
+ }
+ if (!g_conf()->debug_allow_any_pool_priority) {
+ if (n > OSD_POOL_PRIORITY_MAX || n < OSD_POOL_PRIORITY_MIN) {
+ ss << "pool recovery_priority must be between " << OSD_POOL_PRIORITY_MIN
+ << " and " << OSD_POOL_PRIORITY_MAX;
+ return -EINVAL;
+ }
+ }
+ } else if (var == "pg_autoscale_bias") {
+ if (f < 0.0 || f > 1000.0) {
+ ss << "pg_autoscale_bias must be between 0 and 1000";
+ return -EINVAL;
+ }
+ } else if (var == "dedup_tier") {
+ if (interr.empty()) {
+ ss << "expecting value 'pool name'";
+ return -EINVAL;
+ }
+ // Current base tier in dedup does not support ec pool
+ if (p.is_erasure()) {
+ ss << "pool '" << poolstr
+ << "' is an ec pool, which cannot be a base tier";
+ return -ENOTSUP;
+ }
+ int64_t lowtierpool_id = osdmap.lookup_pg_pool_name(val);
+ if (lowtierpool_id < 0) {
+ ss << "unrecognized pool '" << val << "'";
+ return -ENOENT;
+ }
+ const pg_pool_t *tp = osdmap.get_pg_pool(lowtierpool_id);
+ ceph_assert(tp);
+ n = lowtierpool_id;
+ // The original input is string (pool name), but we convert it to int64_t.
+ // So, clear interr
+ interr.clear();
+ } else if (var == "dedup_chunk_algorithm") {
+ if (!unset) {
+ auto alg = pg_pool_t::get_dedup_chunk_algorithm_from_str(val);
+ if (!alg) {
+ ss << "unrecognized fingerprint_algorithm '" << val << "'";
+ return -EINVAL;
+ }
+ }
+ } else if (var == "dedup_cdc_chunk_size") {
+ if (interr.length()) {
+ ss << "error parsing int value '" << val << "': " << interr;
+ return -EINVAL;
+ }
+ }
+
+ pool_opts_t::opt_desc_t desc = pool_opts_t::get_opt_desc(var);
+ switch (desc.type) {
+ case pool_opts_t::STR:
+ if (unset) {
+ p.opts.unset(desc.key);
+ } else {
+ p.opts.set(desc.key, static_cast<std::string>(val));
+ }
+ break;
+ case pool_opts_t::INT:
+ if (interr.length()) {
+ ss << "error parsing integer value '" << val << "': " << interr;
+ return -EINVAL;
+ }
+ if (n == 0) {
+ p.opts.unset(desc.key);
+ } else {
+ p.opts.set(desc.key, static_cast<int64_t>(n));
+ }
+ break;
+ case pool_opts_t::DOUBLE:
+ if (floaterr.length()) {
+ ss << "error parsing floating point value '" << val << "': " << floaterr;
+ return -EINVAL;
+ }
+ if (f == 0) {
+ p.opts.unset(desc.key);
+ } else {
+ p.opts.set(desc.key, static_cast<double>(f));
+ }
+ break;
+ default:
+ ceph_assert(!"unknown type");
+ }
+ } else {
+ ss << "unrecognized variable '" << var << "'";
+ return -EINVAL;
+ }
+ if (val != "unset") {
+ ss << "set pool " << pool << " " << var << " to " << val;
+ } else {
+ ss << "unset pool " << pool << " " << var;
+ }
+ p.last_change = pending_inc.epoch;
+ pending_inc.new_pools[pool] = p;
+ return 0;
+}
+
+int OSDMonitor::prepare_command_pool_application(const string &prefix,
+ const cmdmap_t& cmdmap,
+ stringstream& ss)
+{
+ return _command_pool_application(prefix, cmdmap, ss, nullptr, true);
+}
+
+int OSDMonitor::preprocess_command_pool_application(const string &prefix,
+ const cmdmap_t& cmdmap,
+ stringstream& ss,
+ bool *modified)
+{
+ return _command_pool_application(prefix, cmdmap, ss, modified, false);
+}
+
+
+/**
+ * Common logic for preprocess and prepare phases of pool application
+ * tag commands. In preprocess mode we're only detecting invalid
+ * commands, and determining whether it was a modification or a no-op.
+ * In prepare mode we're actually updating the pending state.
+ */
+int OSDMonitor::_command_pool_application(const string &prefix,
+ const cmdmap_t& cmdmap,
+ stringstream& ss,
+ bool *modified,
+ bool preparing)
+{
+ string pool_name;
+ cmd_getval(cmdmap, "pool", pool_name);
+ int64_t pool = osdmap.lookup_pg_pool_name(pool_name.c_str());
+ if (pool < 0) {
+ ss << "unrecognized pool '" << pool_name << "'";
+ return -ENOENT;
+ }
+
+ pg_pool_t p = *osdmap.get_pg_pool(pool);
+ if (preparing) {
+ if (pending_inc.new_pools.count(pool)) {
+ p = pending_inc.new_pools[pool];
+ }
+ }
+
+ string app;
+ cmd_getval(cmdmap, "app", app);
+ bool app_exists = (p.application_metadata.count(app) > 0);
+
+ string key;
+ cmd_getval(cmdmap, "key", key);
+ if (key == "all") {
+ ss << "key cannot be 'all'";
+ return -EINVAL;
+ }
+
+ string value;
+ cmd_getval(cmdmap, "value", value);
+ if (value == "all") {
+ ss << "value cannot be 'all'";
+ return -EINVAL;
+ }
+
+ if (boost::algorithm::ends_with(prefix, "enable")) {
+ if (app.empty()) {
+ ss << "application name must be provided";
+ return -EINVAL;
+ }
+
+ if (p.is_tier()) {
+ ss << "application must be enabled on base tier";
+ return -EINVAL;
+ }
+
+ bool force = false;
+ cmd_getval(cmdmap, "yes_i_really_mean_it", force);
+
+ if (!app_exists && !p.application_metadata.empty() && !force) {
+ ss << "Are you SURE? Pool '" << pool_name << "' already has an enabled "
+ << "application; pass --yes-i-really-mean-it to proceed anyway";
+ return -EPERM;
+ }
+
+ if (!app_exists && p.application_metadata.size() >= MAX_POOL_APPLICATIONS) {
+ ss << "too many enabled applications on pool '" << pool_name << "'; "
+ << "max " << MAX_POOL_APPLICATIONS;
+ return -EINVAL;
+ }
+
+ if (app.length() > MAX_POOL_APPLICATION_LENGTH) {
+ ss << "application name '" << app << "' too long; max length "
+ << MAX_POOL_APPLICATION_LENGTH;
+ return -EINVAL;
+ }
+
+ if (!app_exists) {
+ p.application_metadata[app] = {};
+ }
+ ss << "enabled application '" << app << "' on pool '" << pool_name << "'";
+
+ } else if (boost::algorithm::ends_with(prefix, "disable")) {
+ bool force = false;
+ cmd_getval(cmdmap, "yes_i_really_mean_it", force);
+
+ if (!force) {
+ ss << "Are you SURE? Disabling an application within a pool might result "
+ << "in loss of application functionality; pass "
+ << "--yes-i-really-mean-it to proceed anyway";
+ return -EPERM;
+ }
+
+ if (!app_exists) {
+ ss << "application '" << app << "' is not enabled on pool '" << pool_name
+ << "'";
+ return 0; // idempotent
+ }
+
+ p.application_metadata.erase(app);
+ ss << "disable application '" << app << "' on pool '" << pool_name << "'";
+
+ } else if (boost::algorithm::ends_with(prefix, "set")) {
+ if (p.is_tier()) {
+ ss << "application metadata must be set on base tier";
+ return -EINVAL;
+ }
+
+ if (!app_exists) {
+ ss << "application '" << app << "' is not enabled on pool '" << pool_name
+ << "'";
+ return -ENOENT;
+ }
+
+ string key;
+ cmd_getval(cmdmap, "key", key);
+
+ if (key.empty()) {
+ ss << "key must be provided";
+ return -EINVAL;
+ }
+
+ auto &app_keys = p.application_metadata[app];
+ if (app_keys.count(key) == 0 &&
+ app_keys.size() >= MAX_POOL_APPLICATION_KEYS) {
+ ss << "too many keys set for application '" << app << "' on pool '"
+ << pool_name << "'; max " << MAX_POOL_APPLICATION_KEYS;
+ return -EINVAL;
+ }
+
+ if (key.length() > MAX_POOL_APPLICATION_LENGTH) {
+ ss << "key '" << app << "' too long; max length "
+ << MAX_POOL_APPLICATION_LENGTH;
+ return -EINVAL;
+ }
+
+ string value;
+ cmd_getval(cmdmap, "value", value);
+ if (value.length() > MAX_POOL_APPLICATION_LENGTH) {
+ ss << "value '" << value << "' too long; max length "
+ << MAX_POOL_APPLICATION_LENGTH;
+ return -EINVAL;
+ }
+
+ p.application_metadata[app][key] = value;
+ ss << "set application '" << app << "' key '" << key << "' to '"
+ << value << "' on pool '" << pool_name << "'";
+ } else if (boost::algorithm::ends_with(prefix, "rm")) {
+ if (!app_exists) {
+ ss << "application '" << app << "' is not enabled on pool '" << pool_name
+ << "'";
+ return -ENOENT;
+ }
+
+ string key;
+ cmd_getval(cmdmap, "key", key);
+ auto it = p.application_metadata[app].find(key);
+ if (it == p.application_metadata[app].end()) {
+ ss << "application '" << app << "' on pool '" << pool_name
+ << "' does not have key '" << key << "'";
+ return 0; // idempotent
+ }
+
+ p.application_metadata[app].erase(it);
+ ss << "removed application '" << app << "' key '" << key << "' on pool '"
+ << pool_name << "'";
+ } else {
+ ceph_abort();
+ }
+
+ if (preparing) {
+ p.last_change = pending_inc.epoch;
+ pending_inc.new_pools[pool] = p;
+ }
+
+ // Because we fell through this far, we didn't hit no-op cases,
+ // so pool was definitely modified
+ if (modified != nullptr) {
+ *modified = true;
+ }
+
+ return 0;
+}
+
+int OSDMonitor::_prepare_command_osd_crush_remove(
+ CrushWrapper &newcrush,
+ int32_t id,
+ int32_t ancestor,
+ bool has_ancestor,
+ bool unlink_only)
+{
+ int err = 0;
+
+ if (has_ancestor) {
+ err = newcrush.remove_item_under(cct, id, ancestor,
+ unlink_only);
+ } else {
+ err = newcrush.remove_item(cct, id, unlink_only);
+ }
+ return err;
+}
+
+void OSDMonitor::do_osd_crush_remove(CrushWrapper& newcrush)
+{
+ pending_inc.crush.clear();
+ newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
+}
+
+int OSDMonitor::prepare_command_osd_crush_remove(
+ CrushWrapper &newcrush,
+ int32_t id,
+ int32_t ancestor,
+ bool has_ancestor,
+ bool unlink_only)
+{
+ int err = _prepare_command_osd_crush_remove(
+ newcrush, id, ancestor,
+ has_ancestor, unlink_only);
+
+ if (err < 0)
+ return err;
+
+ ceph_assert(err == 0);
+ do_osd_crush_remove(newcrush);
+
+ return 0;
+}
+
+int OSDMonitor::prepare_command_osd_remove(int32_t id)
+{
+ if (osdmap.is_up(id)) {
+ return -EBUSY;
+ }
+
+ pending_inc.new_state[id] = osdmap.get_state(id);
+ pending_inc.new_uuid[id] = uuid_d();
+ pending_metadata_rm.insert(id);
+ pending_metadata.erase(id);
+
+ return 0;
+}
+
+int32_t OSDMonitor::_allocate_osd_id(int32_t* existing_id)
+{
+ ceph_assert(existing_id);
+ *existing_id = -1;
+
+ for (int32_t i = 0; i < osdmap.get_max_osd(); ++i) {
+ if (!osdmap.exists(i) &&
+ pending_inc.new_up_client.count(i) == 0 &&
+ (pending_inc.new_state.count(i) == 0 ||
+ (pending_inc.new_state[i] & CEPH_OSD_EXISTS) == 0)) {
+ *existing_id = i;
+ return -1;
+ }
+ }
+
+ if (pending_inc.new_max_osd < 0) {
+ return osdmap.get_max_osd();
+ }
+ return pending_inc.new_max_osd;
+}
+
+void OSDMonitor::do_osd_create(
+ const int32_t id,
+ const uuid_d& uuid,
+ const string& device_class,
+ int32_t* new_id)
+{
+ dout(10) << __func__ << " uuid " << uuid << dendl;
+ ceph_assert(new_id);
+
+ // We presume validation has been performed prior to calling this
+ // function. We assert with prejudice.
+
+ int32_t allocated_id = -1; // declare here so we can jump
+ int32_t existing_id = -1;
+ if (!uuid.is_zero()) {
+ existing_id = osdmap.identify_osd(uuid);
+ if (existing_id >= 0) {
+ ceph_assert(id < 0 || id == existing_id);
+ *new_id = existing_id;
+ goto out;
+ } else if (id >= 0) {
+ // uuid does not exist, and id has been provided, so just create
+ // the new osd.id
+ *new_id = id;
+ goto out;
+ }
+ }
+
+ // allocate a new id
+ allocated_id = _allocate_osd_id(&existing_id);
+ dout(10) << __func__ << " allocated id " << allocated_id
+ << " existing id " << existing_id << dendl;
+ if (existing_id >= 0) {
+ ceph_assert(existing_id < osdmap.get_max_osd());
+ ceph_assert(allocated_id < 0);
+ *new_id = existing_id;
+ } else if (allocated_id >= 0) {
+ ceph_assert(existing_id < 0);
+ // raise max_osd
+ if (pending_inc.new_max_osd < 0) {
+ pending_inc.new_max_osd = osdmap.get_max_osd() + 1;
+ } else {
+ ++pending_inc.new_max_osd;
+ }
+ *new_id = pending_inc.new_max_osd - 1;
+ ceph_assert(*new_id == allocated_id);
+ } else {
+ ceph_abort_msg("unexpected condition");
+ }
+
+out:
+ if (device_class.size()) {
+ CrushWrapper newcrush;
+ _get_pending_crush(newcrush);
+ if (newcrush.get_max_devices() < *new_id + 1) {
+ newcrush.set_max_devices(*new_id + 1);
+ }
+ string name = string("osd.") + stringify(*new_id);
+ if (!newcrush.item_exists(*new_id)) {
+ newcrush.set_item_name(*new_id, name);
+ }
+ ostringstream ss;
+ int r = newcrush.update_device_class(*new_id, device_class, name, &ss);
+ if (r < 0) {
+ derr << __func__ << " failed to set " << name << " device_class "
+ << device_class << ": " << cpp_strerror(r) << " - " << ss.str()
+ << dendl;
+ // non-fatal... this might be a replay and we want to be idempotent.
+ } else {
+ dout(20) << __func__ << " set " << name << " device_class " << device_class
+ << dendl;
+ pending_inc.crush.clear();
+ newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
+ }
+ } else {
+ dout(20) << __func__ << " no device_class" << dendl;
+ }
+
+ dout(10) << __func__ << " using id " << *new_id << dendl;
+ if (osdmap.get_max_osd() <= *new_id && pending_inc.new_max_osd <= *new_id) {
+ pending_inc.new_max_osd = *new_id + 1;
+ }
+
+ pending_inc.new_weight[*new_id] = CEPH_OSD_IN;
+ // do not set EXISTS; OSDMap::set_weight, called by apply_incremental, will
+ // set it for us. (ugh.)
+ pending_inc.new_state[*new_id] |= CEPH_OSD_NEW;
+ if (!uuid.is_zero())
+ pending_inc.new_uuid[*new_id] = uuid;
+}
+
+int OSDMonitor::validate_osd_create(
+ const int32_t id,
+ const uuid_d& uuid,
+ const bool check_osd_exists,
+ int32_t* existing_id,
+ stringstream& ss)
+{
+
+ dout(10) << __func__ << " id " << id << " uuid " << uuid
+ << " check_osd_exists " << check_osd_exists << dendl;
+
+ ceph_assert(existing_id);
+
+ if (id < 0 && uuid.is_zero()) {
+ // we have nothing to validate
+ *existing_id = -1;
+ return 0;
+ } else if (uuid.is_zero()) {
+ // we have an id but we will ignore it - because that's what
+ // `osd create` does.
+ return 0;
+ }
+
+ /*
+ * This function will be used to validate whether we are able to
+ * create a new osd when the `uuid` is specified.
+ *
+ * It will be used by both `osd create` and `osd new`, as the checks
+ * are basically the same when it pertains to osd id and uuid validation.
+ * However, `osd create` presumes an `uuid` is optional, for legacy
+ * reasons, while `osd new` requires the `uuid` to be provided. This
+ * means that `osd create` will not be idempotent if an `uuid` is not
+ * provided, but we will always guarantee the idempotency of `osd new`.
+ */
+
+ ceph_assert(!uuid.is_zero());
+ if (pending_inc.identify_osd(uuid) >= 0) {
+ // osd is about to exist
+ return -EAGAIN;
+ }
+
+ int32_t i = osdmap.identify_osd(uuid);
+ if (i >= 0) {
+ // osd already exists
+ if (id >= 0 && i != id) {
+ ss << "uuid " << uuid << " already in use for different id " << i;
+ return -EEXIST;
+ }
+ // return a positive errno to distinguish between a blocking error
+ // and an error we consider to not be a problem (i.e., this would be
+ // an idempotent operation).
+ *existing_id = i;
+ return EEXIST;
+ }
+ // i < 0
+ if (id >= 0) {
+ if (pending_inc.new_state.count(id)) {
+ // osd is about to exist
+ return -EAGAIN;
+ }
+ // we may not care if an osd exists if we are recreating a previously
+ // destroyed osd.
+ if (check_osd_exists && osdmap.exists(id)) {
+ ss << "id " << id << " already in use and does not match uuid "
+ << uuid;
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
+
+int OSDMonitor::prepare_command_osd_create(
+ const int32_t id,
+ const uuid_d& uuid,
+ int32_t* existing_id,
+ stringstream& ss)
+{
+ dout(10) << __func__ << " id " << id << " uuid " << uuid << dendl;
+ ceph_assert(existing_id);
+ if (osdmap.is_destroyed(id)) {
+ ss << "ceph osd create has been deprecated. Please use ceph osd new "
+ "instead.";
+ return -EINVAL;
+ }
+
+ if (uuid.is_zero()) {
+ dout(10) << __func__ << " no uuid; assuming legacy `osd create`" << dendl;
+ }
+
+ return validate_osd_create(id, uuid, true, existing_id, ss);
+}
+
+int OSDMonitor::prepare_command_osd_new(
+ MonOpRequestRef op,
+ const cmdmap_t& cmdmap,
+ const map<string,string>& params,
+ stringstream &ss,
+ Formatter *f)
+{
+ uuid_d uuid;
+ string uuidstr;
+ int64_t id = -1;
+
+ ceph_assert(paxos.is_plugged());
+
+ dout(10) << __func__ << " " << op << dendl;
+
+ /* validate command. abort now if something's wrong. */
+
+ /* `osd new` will expect a `uuid` to be supplied; `id` is optional.
+ *
+ * If `id` is not specified, we will identify any existing osd based
+ * on `uuid`. Operation will be idempotent iff secrets match.
+ *
+ * If `id` is specified, we will identify any existing osd based on
+ * `uuid` and match against `id`. If they match, operation will be
+ * idempotent iff secrets match.
+ *
+ * `-i secrets.json` will be optional. If supplied, will be used
+ * to check for idempotency when `id` and `uuid` match.
+ *
+ * If `id` is not specified, and `uuid` does not exist, an id will
+ * be found or allocated for the osd.
+ *
+ * If `id` is specified, and the osd has been previously marked
+ * as destroyed, then the `id` will be reused.
+ */
+ if (!cmd_getval(cmdmap, "uuid", uuidstr)) {
+ ss << "requires the OSD's UUID to be specified.";
+ return -EINVAL;
+ } else if (!uuid.parse(uuidstr.c_str())) {
+ ss << "invalid UUID value '" << uuidstr << "'.";
+ return -EINVAL;
+ }
+
+ if (cmd_getval(cmdmap, "id", id) &&
+ (id < 0)) {
+ ss << "invalid OSD id; must be greater or equal than zero.";
+ return -EINVAL;
+ }
+
+ // are we running an `osd create`-like command, or recreating
+ // a previously destroyed osd?
+
+ bool is_recreate_destroyed = (id >= 0 && osdmap.is_destroyed(id));
+
+ // we will care about `id` to assess whether osd is `destroyed`, or
+ // to create a new osd.
+ // we will need an `id` by the time we reach auth.
+
+ int32_t existing_id = -1;
+ int err = validate_osd_create(id, uuid, !is_recreate_destroyed,
+ &existing_id, ss);
+
+ bool may_be_idempotent = false;
+ if (err == EEXIST) {
+ // this is idempotent from the osdmon's point-of-view
+ may_be_idempotent = true;
+ ceph_assert(existing_id >= 0);
+ id = existing_id;
+ } else if (err < 0) {
+ return err;
+ }
+
+ if (!may_be_idempotent) {
+ // idempotency is out of the window. We are either creating a new
+ // osd or recreating a destroyed osd.
+ //
+ // We now need to figure out if we have an `id` (and if it's valid),
+ // of find an `id` if we don't have one.
+
+ // NOTE: we need to consider the case where the `id` is specified for
+ // `osd create`, and we must honor it. So this means checking if
+ // the `id` is destroyed, and if so assume the destroy; otherwise,
+ // check if it `exists` - in which case we complain about not being
+ // `destroyed`. In the end, if nothing fails, we must allow the
+ // creation, so that we are compatible with `create`.
+ if (id >= 0 && osdmap.exists(id) && !osdmap.is_destroyed(id)) {
+ dout(10) << __func__ << " osd." << id << " isn't destroyed" << dendl;
+ ss << "OSD " << id << " has not yet been destroyed";
+ return -EINVAL;
+ } else if (id < 0) {
+ // find an `id`
+ id = _allocate_osd_id(&existing_id);
+ if (id < 0) {
+ ceph_assert(existing_id >= 0);
+ id = existing_id;
+ }
+ dout(10) << __func__ << " found id " << id << " to use" << dendl;
+ } else if (id >= 0 && osdmap.is_destroyed(id)) {
+ dout(10) << __func__ << " recreating osd." << id << dendl;
+ } else {
+ dout(10) << __func__ << " creating new osd." << id << dendl;
+ }
+ } else {
+ ceph_assert(id >= 0);
+ ceph_assert(osdmap.exists(id));
+ }
+
+ // we are now able to either create a brand new osd or reuse an existing
+ // osd that has been previously destroyed.
+
+ dout(10) << __func__ << " id " << id << " uuid " << uuid << dendl;
+
+ if (may_be_idempotent && params.empty()) {
+ // nothing to do, really.
+ dout(10) << __func__ << " idempotent and no params -- no op." << dendl;
+ ceph_assert(id >= 0);
+ if (f) {
+ f->open_object_section("created_osd");
+ f->dump_int("osdid", id);
+ f->close_section();
+ } else {
+ ss << id;
+ }
+ return EEXIST;
+ }
+
+ string device_class;
+ auto p = params.find("crush_device_class");
+ if (p != params.end()) {
+ device_class = p->second;
+ dout(20) << __func__ << " device_class will be " << device_class << dendl;
+ }
+ string cephx_secret, lockbox_secret, dmcrypt_key;
+ bool has_lockbox = false;
+ bool has_secrets = params.count("cephx_secret")
+ || params.count("cephx_lockbox_secret")
+ || params.count("dmcrypt_key");
+
+ KVMonitor *svc = nullptr;
+ AuthMonitor::auth_entity_t cephx_entity, lockbox_entity;
+
+ if (has_secrets) {
+ if (params.count("cephx_secret") == 0) {
+ ss << "requires a cephx secret.";
+ return -EINVAL;
+ }
+ cephx_secret = params.at("cephx_secret");
+
+ bool has_lockbox_secret = (params.count("cephx_lockbox_secret") > 0);
+ bool has_dmcrypt_key = (params.count("dmcrypt_key") > 0);
+
+ dout(10) << __func__ << " has lockbox " << has_lockbox_secret
+ << " dmcrypt " << has_dmcrypt_key << dendl;
+
+ if (has_lockbox_secret && has_dmcrypt_key) {
+ has_lockbox = true;
+ lockbox_secret = params.at("cephx_lockbox_secret");
+ dmcrypt_key = params.at("dmcrypt_key");
+ } else if (!has_lockbox_secret != !has_dmcrypt_key) {
+ ss << "requires both a cephx lockbox secret and a dm-crypt key.";
+ return -EINVAL;
+ }
+
+ dout(10) << __func__ << " validate secrets using osd id " << id << dendl;
+
+ err = mon.authmon()->validate_osd_new(id, uuid,
+ cephx_secret,
+ lockbox_secret,
+ cephx_entity,
+ lockbox_entity,
+ ss);
+ if (err < 0) {
+ return err;
+ } else if (may_be_idempotent && err != EEXIST) {
+ // for this to be idempotent, `id` should already be >= 0; no need
+ // to use validate_id.
+ ceph_assert(id >= 0);
+ ss << "osd." << id << " exists but secrets do not match";
+ return -EEXIST;
+ }
+
+ if (has_lockbox) {
+ svc = mon.kvmon();
+ err = svc->validate_osd_new(uuid, dmcrypt_key, ss);
+ if (err < 0) {
+ return err;
+ } else if (may_be_idempotent && err != EEXIST) {
+ ceph_assert(id >= 0);
+ ss << "osd." << id << " exists but dm-crypt key does not match.";
+ return -EEXIST;
+ }
+ }
+ }
+ ceph_assert(!has_secrets || !cephx_secret.empty());
+ ceph_assert(!has_lockbox || !lockbox_secret.empty());
+
+ if (may_be_idempotent) {
+ // we have nothing to do for either the osdmon or the authmon,
+ // and we have no lockbox - so the config key service will not be
+ // touched. This is therefore an idempotent operation, and we can
+ // just return right away.
+ dout(10) << __func__ << " idempotent -- no op." << dendl;
+ ceph_assert(id >= 0);
+ if (f) {
+ f->open_object_section("created_osd");
+ f->dump_int("osdid", id);
+ f->close_section();
+ } else {
+ ss << id;
+ }
+ return EEXIST;
+ }
+ ceph_assert(!may_be_idempotent);
+
+ // perform updates.
+ if (has_secrets) {
+ ceph_assert(!cephx_secret.empty());
+ ceph_assert((lockbox_secret.empty() && dmcrypt_key.empty()) ||
+ (!lockbox_secret.empty() && !dmcrypt_key.empty()));
+
+ err = mon.authmon()->do_osd_new(cephx_entity,
+ lockbox_entity,
+ has_lockbox);
+ ceph_assert(0 == err);
+
+ if (has_lockbox) {
+ ceph_assert(nullptr != svc);
+ svc->do_osd_new(uuid, dmcrypt_key);
+ }
+ }
+
+ if (is_recreate_destroyed) {
+ ceph_assert(id >= 0);
+ ceph_assert(osdmap.is_destroyed(id));
+ pending_inc.new_state[id] |= CEPH_OSD_DESTROYED;
+ if ((osdmap.get_state(id) & CEPH_OSD_NEW) == 0) {
+ pending_inc.new_state[id] |= CEPH_OSD_NEW;
+ }
+ if (osdmap.get_state(id) & CEPH_OSD_UP) {
+ // due to http://tracker.ceph.com/issues/20751 some clusters may
+ // have UP set for non-existent OSDs; make sure it is cleared
+ // for a newly created osd.
+ pending_inc.new_state[id] |= CEPH_OSD_UP;
+ }
+ pending_inc.new_uuid[id] = uuid;
+ } else {
+ ceph_assert(id >= 0);
+ int32_t new_id = -1;
+ do_osd_create(id, uuid, device_class, &new_id);
+ ceph_assert(new_id >= 0);
+ ceph_assert(id == new_id);
+ }
+
+ if (f) {
+ f->open_object_section("created_osd");
+ f->dump_int("osdid", id);
+ f->close_section();
+ } else {
+ ss << id;
+ }
+
+ return 0;
+}
+
+bool OSDMonitor::prepare_command(MonOpRequestRef op)
+{
+ op->mark_osdmon_event(__func__);
+ auto m = op->get_req<MMonCommand>();
+ stringstream ss;
+ cmdmap_t cmdmap;
+ if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
+ string rs = ss.str();
+ mon.reply_command(op, -EINVAL, rs, get_last_committed());
+ return true;
+ }
+
+ MonSession *session = op->get_session();
+ if (!session) {
+ derr << __func__ << " no session" << dendl;
+ mon.reply_command(op, -EACCES, "access denied", get_last_committed());
+ return true;
+ }
+
+ return prepare_command_impl(op, cmdmap);
+}
+
+static int parse_reweights(CephContext *cct,
+ const cmdmap_t& cmdmap,
+ const OSDMap& osdmap,
+ map<int32_t, uint32_t>* weights)
+{
+ string weights_str;
+ if (!cmd_getval(cmdmap, "weights", weights_str)) {
+ return -EINVAL;
+ }
+ std::replace(begin(weights_str), end(weights_str), '\'', '"');
+ json_spirit::mValue json_value;
+ if (!json_spirit::read(weights_str, json_value)) {
+ return -EINVAL;
+ }
+ if (json_value.type() != json_spirit::obj_type) {
+ return -EINVAL;
+ }
+ const auto obj = json_value.get_obj();
+ try {
+ for (auto& osd_weight : obj) {
+ auto osd_id = std::stoi(osd_weight.first);
+ if (!osdmap.exists(osd_id)) {
+ return -ENOENT;
+ }
+ if (osd_weight.second.type() != json_spirit::str_type) {
+ return -EINVAL;
+ }
+ auto weight = std::stoul(osd_weight.second.get_str());
+ weights->insert({osd_id, weight});
+ }
+ } catch (const std::logic_error& e) {
+ return -EINVAL;
+ }
+ return 0;
+}
+
+int OSDMonitor::prepare_command_osd_destroy(
+ int32_t id,
+ stringstream& ss)
+{
+ ceph_assert(paxos.is_plugged());
+
+ // we check if the osd exists for the benefit of `osd purge`, which may
+ // have previously removed the osd. If the osd does not exist, return
+ // -ENOENT to convey this, and let the caller deal with it.
+ //
+ // we presume that all auth secrets and config keys were removed prior
+ // to this command being called. if they exist by now, we also assume
+ // they must have been created by some other command and do not pertain
+ // to this non-existent osd.
+ if (!osdmap.exists(id)) {
+ dout(10) << __func__ << " osd." << id << " does not exist." << dendl;
+ return -ENOENT;
+ }
+
+ uuid_d uuid = osdmap.get_uuid(id);
+ dout(10) << __func__ << " destroying osd." << id
+ << " uuid " << uuid << dendl;
+
+ // if it has been destroyed, we assume our work here is done.
+ if (osdmap.is_destroyed(id)) {
+ ss << "destroyed osd." << id;
+ return 0;
+ }
+
+ EntityName cephx_entity, lockbox_entity;
+ bool idempotent_auth = false, idempotent_cks = false;
+
+ int err = mon.authmon()->validate_osd_destroy(id, uuid,
+ cephx_entity,
+ lockbox_entity,
+ ss);
+ if (err < 0) {
+ if (err == -ENOENT) {
+ idempotent_auth = true;
+ } else {
+ return err;
+ }
+ }
+
+ auto svc = mon.kvmon();
+ err = svc->validate_osd_destroy(id, uuid);
+ if (err < 0) {
+ ceph_assert(err == -ENOENT);
+ err = 0;
+ idempotent_cks = true;
+ }
+
+ if (!idempotent_auth) {
+ err = mon.authmon()->do_osd_destroy(cephx_entity, lockbox_entity);
+ ceph_assert(0 == err);
+ }
+
+ if (!idempotent_cks) {
+ svc->do_osd_destroy(id, uuid);
+ }
+
+ pending_inc.new_state[id] = CEPH_OSD_DESTROYED;
+ pending_inc.new_uuid[id] = uuid_d();
+
+ // we can only propose_pending() once per service, otherwise we'll be
+ // defying PaxosService and all laws of nature. Therefore, as we may
+ // be used during 'osd purge', let's keep the caller responsible for
+ // proposing.
+ ceph_assert(err == 0);
+ return 0;
+}
+
+int OSDMonitor::prepare_command_osd_purge(
+ int32_t id,
+ stringstream& ss)
+{
+ ceph_assert(paxos.is_plugged());
+ dout(10) << __func__ << " purging osd." << id << dendl;
+
+ ceph_assert(!osdmap.is_up(id));
+
+ /*
+ * This may look a bit weird, but this is what's going to happen:
+ *
+ * 1. we make sure that removing from crush works
+ * 2. we call `prepare_command_osd_destroy()`. If it returns an
+ * error, then we abort the whole operation, as no updates
+ * have been made. However, we this function will have
+ * side-effects, thus we need to make sure that all operations
+ * performed henceforth will *always* succeed.
+ * 3. we call `prepare_command_osd_remove()`. Although this
+ * function can return an error, it currently only checks if the
+ * osd is up - and we have made sure that it is not so, so there
+ * is no conflict, and it is effectively an update.
+ * 4. finally, we call `do_osd_crush_remove()`, which will perform
+ * the crush update we delayed from before.
+ */
+
+ CrushWrapper newcrush;
+ _get_pending_crush(newcrush);
+
+ bool may_be_idempotent = false;
+
+ int err = _prepare_command_osd_crush_remove(newcrush, id, 0, false, false);
+ if (err == -ENOENT) {
+ err = 0;
+ may_be_idempotent = true;
+ } else if (err < 0) {
+ ss << "error removing osd." << id << " from crush";
+ return err;
+ }
+
+ // no point destroying the osd again if it has already been marked destroyed
+ if (!osdmap.is_destroyed(id)) {
+ err = prepare_command_osd_destroy(id, ss);
+ if (err < 0) {
+ if (err == -ENOENT) {
+ err = 0;
+ } else {
+ return err;
+ }
+ } else {
+ may_be_idempotent = false;
+ }
+ }
+ ceph_assert(0 == err);
+
+ if (may_be_idempotent && !osdmap.exists(id)) {
+ dout(10) << __func__ << " osd." << id << " does not exist and "
+ << "we are idempotent." << dendl;
+ return -ENOENT;
+ }
+
+ err = prepare_command_osd_remove(id);
+ // we should not be busy, as we should have made sure this id is not up.
+ ceph_assert(0 == err);
+
+ do_osd_crush_remove(newcrush);
+ return 0;
+}
+
+bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
+ const cmdmap_t& cmdmap)
+{
+ op->mark_osdmon_event(__func__);
+ auto m = op->get_req<MMonCommand>();
+ bool ret = false;
+ stringstream ss;
+ string rs;
+ bufferlist rdata;
+ int err = 0;
+
+ string format;
+ cmd_getval(cmdmap, "format", format, string("plain"));
+ boost::scoped_ptr<Formatter> f(Formatter::create(format));
+
+ string prefix;
+ cmd_getval(cmdmap, "prefix", prefix);
+
+ int64_t osdid;
+ string osd_name;
+ bool osdid_present = false;
+ if (prefix != "osd pg-temp" &&
+ prefix != "osd pg-upmap" &&
+ prefix != "osd pg-upmap-items") { // avoid commands with non-int id arg
+ osdid_present = cmd_getval(cmdmap, "id", osdid);
+ }
+ if (osdid_present) {
+ ostringstream oss;
+ oss << "osd." << osdid;
+ osd_name = oss.str();
+ }
+
+ // Even if there's a pending state with changes that could affect
+ // a command, considering that said state isn't yet committed, we
+ // just don't care about those changes if the command currently being
+ // handled acts as a no-op against the current committed state.
+ // In a nutshell, we assume this command happens *before*.
+ //
+ // Let me make this clearer:
+ //
+ // - If we have only one client, and that client issues some
+ // operation that would conflict with this operation but is
+ // still on the pending state, then we would be sure that said
+ // operation wouldn't have returned yet, so the client wouldn't
+ // issue this operation (unless the client didn't wait for the
+ // operation to finish, and that would be the client's own fault).
+ //
+ // - If we have more than one client, each client will observe
+ // whatever is the state at the moment of the commit. So, if we
+ // have two clients, one issuing an unlink and another issuing a
+ // link, and if the link happens while the unlink is still on the
+ // pending state, from the link's point-of-view this is a no-op.
+ // If different clients are issuing conflicting operations and
+ // they care about that, then the clients should make sure they
+ // enforce some kind of concurrency mechanism -- from our
+ // perspective that's what Douglas Adams would call an SEP.
+ //
+ // This should be used as a general guideline for most commands handled
+ // in this function. Adapt as you see fit, but please bear in mind that
+ // this is the expected behavior.
+
+
+ if (prefix == "osd setcrushmap" ||
+ (prefix == "osd crush set" && !osdid_present)) {
+ if (pending_inc.crush.length()) {
+ dout(10) << __func__ << " waiting for pending crush update " << dendl;
+ wait_for_finished_proposal(op, new C_RetryMessage(this, op));
+ return true;
+ }
+ dout(10) << "prepare_command setting new crush map" << dendl;
+ bufferlist data(m->get_data());
+ CrushWrapper crush;
+ try {
+ auto bl = data.cbegin();
+ crush.decode(bl);
+ }
+ catch (const std::exception &e) {
+ err = -EINVAL;
+ ss << "Failed to parse crushmap: " << e.what();
+ goto reply;
+ }
+
+ int64_t prior_version = 0;
+ if (cmd_getval(cmdmap, "prior_version", prior_version)) {
+ if (prior_version == osdmap.get_crush_version() - 1) {
+ // see if we are a resend of the last update. this is imperfect
+ // (multiple racing updaters may not both get reliable success)
+ // but we expect crush updaters (via this interface) to be rare-ish.
+ bufferlist current, proposed;
+ osdmap.crush->encode(current, mon.get_quorum_con_features());
+ crush.encode(proposed, mon.get_quorum_con_features());
+ if (current.contents_equal(proposed)) {
+ dout(10) << __func__
+ << " proposed matches current and version equals previous"
+ << dendl;
+ err = 0;
+ ss << osdmap.get_crush_version();
+ goto reply;
+ }
+ }
+ if (prior_version != osdmap.get_crush_version()) {
+ err = -EPERM;
+ ss << "prior_version " << prior_version << " != crush version "
+ << osdmap.get_crush_version();
+ goto reply;
+ }
+ }
+
+ if (crush.has_legacy_rule_ids()) {
+ err = -EINVAL;
+ ss << "crush maps with ruleset != ruleid are no longer allowed";
+ goto reply;
+ }
+ if (!validate_crush_against_features(&crush, ss)) {
+ err = -EINVAL;
+ goto reply;
+ }
+
+ err = osdmap.validate_crush_rules(&crush, &ss);
+ if (err < 0) {
+ goto reply;
+ }
+
+ if (g_conf()->mon_osd_crush_smoke_test) {
+ // sanity check: test some inputs to make sure this map isn't
+ // totally broken
+ dout(10) << " testing map" << dendl;
+ stringstream ess;
+ CrushTester tester(crush, ess);
+ tester.set_min_x(0);
+ tester.set_max_x(50);
+ auto start = ceph::coarse_mono_clock::now();
+ int r = tester.test_with_fork(g_conf()->mon_lease);
+ auto duration = ceph::coarse_mono_clock::now() - start;
+ if (r < 0) {
+ dout(10) << " tester.test_with_fork returns " << r
+ << ": " << ess.str() << dendl;
+ ss << "crush smoke test failed with " << r << ": " << ess.str();
+ err = r;
+ goto reply;
+ }
+ dout(10) << __func__ << " crush somke test duration: "
+ << duration << ", result: " << ess.str() << dendl;
+ }
+
+ pending_inc.crush = data;
+ ss << osdmap.get_crush_version() + 1;
+ goto update;
+
+ } else if (prefix == "osd crush set-all-straw-buckets-to-straw2") {
+ CrushWrapper newcrush;
+ _get_pending_crush(newcrush);
+ for (int b = 0; b < newcrush.get_max_buckets(); ++b) {
+ int bid = -1 - b;
+ if (newcrush.bucket_exists(bid) &&
+ newcrush.get_bucket_alg(bid) == CRUSH_BUCKET_STRAW) {
+ dout(20) << " bucket " << bid << " is straw, can convert" << dendl;
+ newcrush.bucket_set_alg(bid, CRUSH_BUCKET_STRAW2);
+ }
+ }
+ if (!validate_crush_against_features(&newcrush, ss)) {
+ err = -EINVAL;
+ goto reply;
+ }
+ pending_inc.crush.clear();
+ newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
+ wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
+ get_last_committed() + 1));
+ return true;
+ } else if (prefix == "osd crush set-device-class") {
+ string device_class;
+ if (!cmd_getval(cmdmap, "class", device_class)) {
+ err = -EINVAL; // no value!
+ goto reply;
+ }
+
+ bool stop = false;
+ vector<string> idvec;
+ cmd_getval(cmdmap, "ids", idvec);
+ CrushWrapper newcrush;
+ _get_pending_crush(newcrush);
+ set<int> updated;
+ for (unsigned j = 0; j < idvec.size() && !stop; j++) {
+ set<int> osds;
+ // wildcard?
+ if (j == 0 &&
+ (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
+ osdmap.get_all_osds(osds);
+ stop = true;
+ } else {
+ // try traditional single osd way
+ long osd = parse_osd_id(idvec[j].c_str(), &ss);
+ if (osd < 0) {
+ // ss has reason for failure
+ ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
+ err = -EINVAL;
+ continue;
+ }
+ osds.insert(osd);
+ }
+
+ for (auto &osd : osds) {
+ if (!osdmap.exists(osd)) {
+ ss << "osd." << osd << " does not exist. ";
+ continue;
+ }
+
+ ostringstream oss;
+ oss << "osd." << osd;
+ string name = oss.str();
+
+ if (newcrush.get_max_devices() < osd + 1) {
+ newcrush.set_max_devices(osd + 1);
+ }
+ string action;
+ if (newcrush.item_exists(osd)) {
+ action = "updating";
+ } else {
+ action = "creating";
+ newcrush.set_item_name(osd, name);
+ }
+
+ dout(5) << action << " crush item id " << osd << " name '" << name
+ << "' device_class '" << device_class << "'"
+ << dendl;
+ err = newcrush.update_device_class(osd, device_class, name, &ss);
+ if (err < 0) {
+ goto reply;
+ }
+ if (err == 0 && !_have_pending_crush()) {
+ if (!stop) {
+ // for single osd only, wildcard makes too much noise
+ ss << "set-device-class item id " << osd << " name '" << name
+ << "' device_class '" << device_class << "': no change. ";
+ }
+ } else {
+ updated.insert(osd);
+ }
+ }
+ }
+
+ pending_inc.crush.clear();
+ newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
+ ss << "set osd(s) " << updated << " to class '" << device_class << "'";
+ getline(ss, rs);
+ wait_for_finished_proposal(
+ op,
+ new Monitor::C_Command(mon,op, 0, rs, get_last_committed() + 1));
+ return true;
+ } else if (prefix == "osd crush rm-device-class") {
+ bool stop = false;
+ vector<string> idvec;
+ cmd_getval(cmdmap, "ids", idvec);
+ CrushWrapper newcrush;
+ _get_pending_crush(newcrush);
+ set<int> updated;
+
+ for (unsigned j = 0; j < idvec.size() && !stop; j++) {
+ set<int> osds;
+
+ // wildcard?
+ if (j == 0 &&
+ (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
+ osdmap.get_all_osds(osds);
+ stop = true;
+ } else {
+ // try traditional single osd way
+ long osd = parse_osd_id(idvec[j].c_str(), &ss);
+ if (osd < 0) {
+ // ss has reason for failure
+ ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
+ err = -EINVAL;
+ goto reply;
+ }
+ osds.insert(osd);
+ }
+
+ for (auto &osd : osds) {
+ if (!osdmap.exists(osd)) {
+ ss << "osd." << osd << " does not exist. ";
+ continue;
+ }
+
+ auto class_name = newcrush.get_item_class(osd);
+ if (!class_name) {
+ ss << "osd." << osd << " belongs to no class, ";
+ continue;
+ }
+ // note that we do not verify if class_is_in_use here
+ // in case the device is misclassified and user wants
+ // to overridely reset...
+
+ err = newcrush.remove_device_class(cct, osd, &ss);
+ if (err < 0) {
+ // ss has reason for failure
+ goto reply;
+ }
+ updated.insert(osd);
+ }
+ }
+
+ pending_inc.crush.clear();
+ newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
+ ss << "done removing class of osd(s): " << updated;
+ getline(ss, rs);
+ wait_for_finished_proposal(
+ op,
+ new Monitor::C_Command(mon,op, 0, rs, get_last_committed() + 1));
+ return true;
+ } else if (prefix == "osd crush class create") {
+ string device_class;
+ if (!cmd_getval(cmdmap, "class", device_class)) {
+ err = -EINVAL; // no value!
+ goto reply;
+ }
+ if (osdmap.require_osd_release < ceph_release_t::luminous) {
+ ss << "you must complete the upgrade and 'ceph osd require-osd-release "
+ << "luminous' before using crush device classes";
+ err = -EPERM;
+ goto reply;
+ }
+ if (!_have_pending_crush() &&
+ _get_stable_crush().class_exists(device_class)) {
+ ss << "class '" << device_class << "' already exists";
+ goto reply;
+ }
+ CrushWrapper newcrush;
+ _get_pending_crush(newcrush);
+ if (newcrush.class_exists(device_class)) {
+ ss << "class '" << device_class << "' already exists";
+ goto update;
+ }
+ int class_id = newcrush.get_or_create_class_id(device_class);
+ pending_inc.crush.clear();
+ newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
+ ss << "created class " << device_class << " with id " << class_id
+ << " to crush map";
+ goto update;
+ } else if (prefix == "osd crush class rm") {
+ string device_class;
+ if (!cmd_getval(cmdmap, "class", device_class)) {
+ err = -EINVAL; // no value!
+ goto reply;
+ }
+ if (osdmap.require_osd_release < ceph_release_t::luminous) {
+ ss << "you must complete the upgrade and 'ceph osd require-osd-release "
+ << "luminous' before using crush device classes";
+ err = -EPERM;
+ goto reply;
+ }
+
+ if (!osdmap.crush->class_exists(device_class)) {
+ err = 0;
+ goto reply;
+ }
+
+ CrushWrapper newcrush;
+ _get_pending_crush(newcrush);
+ if (!newcrush.class_exists(device_class)) {
+ err = 0; // make command idempotent
+ goto wait;
+ }
+ int class_id = newcrush.get_class_id(device_class);
+ stringstream ts;
+ if (newcrush.class_is_in_use(class_id, &ts)) {
+ err = -EBUSY;
+ ss << "class '" << device_class << "' " << ts.str();
+ goto reply;
+ }
+
+ // check if class is used by any erasure-code-profiles
+ mempool::osdmap::map<string,map<string,string>> old_ec_profiles =
+ osdmap.get_erasure_code_profiles();
+ auto ec_profiles = pending_inc.get_erasure_code_profiles();
+#ifdef HAVE_STDLIB_MAP_SPLICING
+ ec_profiles.merge(old_ec_profiles);
+#else
+ ec_profiles.insert(make_move_iterator(begin(old_ec_profiles)),
+ make_move_iterator(end(old_ec_profiles)));
+#endif
+ list<string> referenced_by;
+ for (auto &i: ec_profiles) {
+ for (auto &j: i.second) {
+ if ("crush-device-class" == j.first && device_class == j.second) {
+ referenced_by.push_back(i.first);
+ }
+ }
+ }
+ if (!referenced_by.empty()) {
+ err = -EBUSY;
+ ss << "class '" << device_class
+ << "' is still referenced by erasure-code-profile(s): " << referenced_by;
+ goto reply;
+ }
+
+ set<int> osds;
+ newcrush.get_devices_by_class(device_class, &osds);
+ for (auto& p: osds) {
+ err = newcrush.remove_device_class(g_ceph_context, p, &ss);
+ if (err < 0) {
+ // ss has reason for failure
+ goto reply;
+ }
+ }
+
+ if (osds.empty()) {
+ // empty class, remove directly
+ err = newcrush.remove_class_name(device_class);
+ if (err < 0) {
+ ss << "class '" << device_class << "' cannot be removed '"
+ << cpp_strerror(err) << "'";
+ goto reply;
+ }
+ }
+
+ pending_inc.crush.clear();
+ newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
+ ss << "removed class " << device_class << " with id " << class_id
+ << " from crush map";
+ goto update;
+ } else if (prefix == "osd crush class rename") {
+ string srcname, dstname;
+ if (!cmd_getval(cmdmap, "srcname", srcname)) {
+ err = -EINVAL;
+ goto reply;
+ }
+ if (!cmd_getval(cmdmap, "dstname", dstname)) {
+ err = -EINVAL;
+ goto reply;
+ }
+
+ CrushWrapper newcrush;
+ _get_pending_crush(newcrush);
+ if (!newcrush.class_exists(srcname) && newcrush.class_exists(dstname)) {
+ // suppose this is a replay and return success
+ // so command is idempotent
+ ss << "already renamed to '" << dstname << "'";
+ err = 0;
+ goto reply;
+ }
+
+ err = newcrush.rename_class(srcname, dstname);
+ if (err < 0) {
+ ss << "fail to rename '" << srcname << "' to '" << dstname << "' : "
+ << cpp_strerror(err);
+ goto reply;
+ }
+
+ pending_inc.crush.clear();
+ newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
+ ss << "rename class '" << srcname << "' to '" << dstname << "'";
+ goto update;
+ } else if (prefix == "osd crush add-bucket") {
+ // os crush add-bucket <name> <type>
+ string name, typestr;
+ vector<string> argvec;
+ cmd_getval(cmdmap, "name", name);
+ cmd_getval(cmdmap, "type", typestr);
+ cmd_getval(cmdmap, "args", argvec);
+ map<string,string> loc;
+ if (!argvec.empty()) {
+ CrushWrapper::parse_loc_map(argvec, &loc);
+ dout(0) << "will create and move bucket '" << name
+ << "' to location " << loc << dendl;
+ }
+
+ if (!_have_pending_crush() &&
+ _get_stable_crush().name_exists(name)) {
+ ss << "bucket '" << name << "' already exists";
+ goto reply;
+ }
+
+ CrushWrapper newcrush;
+ _get_pending_crush(newcrush);
+
+ if (newcrush.name_exists(name)) {
+ ss << "bucket '" << name << "' already exists";
+ goto update;
+ }
+ int type = newcrush.get_type_id(typestr);
+ if (type < 0) {
+ ss << "type '" << typestr << "' does not exist";
+ err = -EINVAL;
+ goto reply;
+ }
+ if (type == 0) {
+ ss << "type '" << typestr << "' is for devices, not buckets";
+ err = -EINVAL;
+ goto reply;
+ }
+ int bucketno;
+ err = newcrush.add_bucket(0, 0,
+ CRUSH_HASH_DEFAULT, type, 0, NULL,
+ NULL, &bucketno);
+ if (err < 0) {
+ ss << "add_bucket error: '" << cpp_strerror(err) << "'";
+ goto reply;
+ }
+ err = newcrush.set_item_name(bucketno, name);
+ if (err < 0) {
+ ss << "error setting bucket name to '" << name << "'";
+ goto reply;
+ }
+
+ if (!loc.empty()) {
+ if (!newcrush.check_item_loc(cct, bucketno, loc,
+ (int *)NULL)) {
+ err = newcrush.move_bucket(cct, bucketno, loc);
+ if (err < 0) {
+ ss << "error moving bucket '" << name << "' to location " << loc;
+ goto reply;
+ }
+ } else {
+ ss << "no need to move item id " << bucketno << " name '" << name
+ << "' to location " << loc << " in crush map";
+ }
+ }
+
+ pending_inc.crush.clear();
+ newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
+ if (loc.empty()) {
+ ss << "added bucket " << name << " type " << typestr
+ << " to crush map";
+ } else {
+ ss << "added bucket " << name << " type " << typestr
+ << " to location " << loc;
+ }
+ goto update;
+ } else if (prefix == "osd crush rename-bucket") {
+ string srcname, dstname;
+ cmd_getval(cmdmap, "srcname", srcname);
+ cmd_getval(cmdmap, "dstname", dstname);
+
+ err = crush_rename_bucket(srcname, dstname, &ss);
+ if (err == -EALREADY) // equivalent to success for idempotency
+ err = 0;
+ if (err)
+ goto reply;
+ else
+ goto update;
+ } else if (prefix == "osd crush weight-set create" ||
+ prefix == "osd crush weight-set create-compat") {
+ CrushWrapper newcrush;
+ _get_pending_crush(newcrush);
+ int64_t pool;
+ int positions;
+ if (newcrush.has_non_straw2_buckets()) {
+ ss << "crush map contains one or more bucket(s) that are not straw2";
+ err = -EPERM;
+ goto reply;
+ }
+ if (prefix == "osd crush weight-set create") {
+ if (osdmap.require_min_compat_client != ceph_release_t::unknown &&
+ osdmap.require_min_compat_client < ceph_release_t::luminous) {
+ ss << "require_min_compat_client "
+ << osdmap.require_min_compat_client
+ << " < luminous, which is required for per-pool weight-sets. "
+ << "Try 'ceph osd set-require-min-compat-client luminous' "
+ << "before using the new interface";
+ err = -EPERM;
+ goto reply;
+ }
+ string poolname, mode;
+ cmd_getval(cmdmap, "pool", poolname);
+ pool = osdmap.lookup_pg_pool_name(poolname.c_str());
+ if (pool < 0) {
+ ss << "pool '" << poolname << "' not found";
+ err = -ENOENT;
+ goto reply;
+ }
+ cmd_getval(cmdmap, "mode", mode);
+ if (mode != "flat" && mode != "positional") {
+ ss << "unrecognized weight-set mode '" << mode << "'";
+ err = -EINVAL;
+ goto reply;
+ }
+ positions = mode == "flat" ? 1 : osdmap.get_pg_pool(pool)->get_size();
+ } else {
+ pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
+ positions = 1;
+ }
+ if (!newcrush.create_choose_args(pool, positions)) {
+ if (pool == CrushWrapper::DEFAULT_CHOOSE_ARGS) {
+ ss << "compat weight-set already created";
+ } else {
+ ss << "weight-set for pool '" << osdmap.get_pool_name(pool)
+ << "' already created";
+ }
+ goto reply;
+ }
+ pending_inc.crush.clear();
+ newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
+ goto update;
+
+ } else if (prefix == "osd crush weight-set rm" ||
+ prefix == "osd crush weight-set rm-compat") {
+ CrushWrapper newcrush;
+ _get_pending_crush(newcrush);
+ int64_t pool;
+ if (prefix == "osd crush weight-set rm") {
+ string poolname;
+ cmd_getval(cmdmap, "pool", poolname);
+ pool = osdmap.lookup_pg_pool_name(poolname.c_str());
+ if (pool < 0) {
+ ss << "pool '" << poolname << "' not found";
+ err = -ENOENT;
+ goto reply;
+ }
+ } else {
+ pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
+ }
+ newcrush.rm_choose_args(pool);
+ pending_inc.crush.clear();
+ newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
+ goto update;
+
+ } else if (prefix == "osd crush weight-set reweight" ||
+ prefix == "osd crush weight-set reweight-compat") {
+ string poolname, item;
+ vector<double> weight;
+ cmd_getval(cmdmap, "pool", poolname);
+ cmd_getval(cmdmap, "item", item);
+ cmd_getval(cmdmap, "weight", weight);
+ CrushWrapper newcrush;
+ _get_pending_crush(newcrush);
+ int64_t pool;
+ if (prefix == "osd crush weight-set reweight") {
+ pool = osdmap.lookup_pg_pool_name(poolname.c_str());
+ if (pool < 0) {
+ ss << "pool '" << poolname << "' not found";
+ err = -ENOENT;
+ goto reply;
+ }
+ if (!newcrush.have_choose_args(pool)) {
+ ss << "no weight-set for pool '" << poolname << "'";
+ err = -ENOENT;
+ goto reply;
+ }
+ auto arg_map = newcrush.choose_args_get(pool);
+ int positions = newcrush.get_choose_args_positions(arg_map);
+ if (weight.size() != (size_t)positions) {
+ ss << "must specify exact " << positions << " weight values";
+ err = -EINVAL;
+ goto reply;
+ }
+ } else {
+ pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
+ if (!newcrush.have_choose_args(pool)) {
+ ss << "no backward-compatible weight-set";
+ err = -ENOENT;
+ goto reply;
+ }
+ }
+ if (!newcrush.name_exists(item)) {
+ ss << "item '" << item << "' does not exist";
+ err = -ENOENT;
+ goto reply;
+ }
+ err = newcrush.choose_args_adjust_item_weightf(
+ cct,
+ newcrush.choose_args_get(pool),
+ newcrush.get_item_id(item),
+ weight,
+ &ss);
+ if (err < 0) {
+ goto reply;
+ }
+ err = 0;
+ pending_inc.crush.clear();
+ newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
+ goto update;
+ } else if (osdid_present &&
+ (prefix == "osd crush set" || prefix == "osd crush add")) {
+ // <OsdName> is 'osd.<id>' or '<id>', passed as int64_t id
+ // osd crush set <OsdName> <weight> <loc1> [<loc2> ...]
+ // osd crush add <OsdName> <weight> <loc1> [<loc2> ...]
+
+ if (!osdmap.exists(osdid)) {
+ err = -ENOENT;
+ ss << osd_name
+ << " does not exist. Create it before updating the crush map";
+ goto reply;
+ }
+
+ double weight;
+ if (!cmd_getval(cmdmap, "weight", weight)) {
+ ss << "unable to parse weight value '"
+ << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
+ err = -EINVAL;
+ goto reply;
+ }
+
+ string args;
+ vector<string> argvec;
+ cmd_getval(cmdmap, "args", argvec);
+ map<string,string> loc;
+ CrushWrapper::parse_loc_map(argvec, &loc);
+
+ if (prefix == "osd crush set"
+ && !_get_stable_crush().item_exists(osdid)) {
+ err = -ENOENT;
+ ss << "unable to set item id " << osdid << " name '" << osd_name
+ << "' weight " << weight << " at location " << loc
+ << ": does not exist";
+ goto reply;
+ }
+
+ dout(5) << "adding/updating crush item id " << osdid << " name '"
+ << osd_name << "' weight " << weight << " at location "
+ << loc << dendl;
+ CrushWrapper newcrush;
+ _get_pending_crush(newcrush);
+
+ string action;
+ if (prefix == "osd crush set" ||
+ newcrush.check_item_loc(cct, osdid, loc, (int *)NULL)) {
+ action = "set";
+ err = newcrush.update_item(cct, osdid, weight, osd_name, loc);
+ } else {
+ action = "add";
+ err = newcrush.insert_item(cct, osdid, weight, osd_name, loc);
+ if (err == 0)
+ err = 1;
+ }
+
+ if (err < 0)
+ goto reply;
+
+ if (err == 0 && !_have_pending_crush()) {
+ ss << action << " item id " << osdid << " name '" << osd_name
+ << "' weight " << weight << " at location " << loc << ": no change";
+ goto reply;
+ }
+
+ pending_inc.crush.clear();
+ newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
+ ss << action << " item id " << osdid << " name '" << osd_name << "' weight "
+ << weight << " at location " << loc << " to crush map";
+ getline(ss, rs);
+ wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
+ get_last_committed() + 1));
+ return true;
+
+ } else if (prefix == "osd crush create-or-move") {
+ do {
+ // osd crush create-or-move <OsdName> <initial_weight> <loc1> [<loc2> ...]
+ if (!osdmap.exists(osdid)) {
+ err = -ENOENT;
+ ss << osd_name
+ << " does not exist. create it before updating the crush map";
+ goto reply;
+ }
+
+ double weight;
+ if (!cmd_getval(cmdmap, "weight", weight)) {
+ ss << "unable to parse weight value '"
+ << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
+ err = -EINVAL;
+ goto reply;
+ }
+
+ string args;
+ vector<string> argvec;
+ cmd_getval(cmdmap, "args", argvec);
+ map<string,string> loc;
+ CrushWrapper::parse_loc_map(argvec, &loc);
+
+ dout(0) << "create-or-move crush item name '" << osd_name
+ << "' initial_weight " << weight << " at location " << loc
+ << dendl;
+
+ CrushWrapper newcrush;
+ _get_pending_crush(newcrush);
+
+ err = newcrush.create_or_move_item(cct, osdid, weight, osd_name, loc,
+ g_conf()->osd_crush_update_weight_set);
+ if (err == 0) {
+ ss << "create-or-move updated item name '" << osd_name
+ << "' weight " << weight
+ << " at location " << loc << " to crush map";
+ break;
+ }
+ if (err > 0) {
+ pending_inc.crush.clear();
+ newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
+ ss << "create-or-move updating item name '" << osd_name
+ << "' weight " << weight
+ << " at location " << loc << " to crush map";
+ getline(ss, rs);
+ wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
+ get_last_committed() + 1));
+ return true;
+ }
+ } while (false);
+
+ } else if (prefix == "osd crush move") {
+ do {
+ // osd crush move <name> <loc1> [<loc2> ...]
+ string name;
+ vector<string> argvec;
+ cmd_getval(cmdmap, "name", name);
+ cmd_getval(cmdmap, "args", argvec);
+ map<string,string> loc;
+ CrushWrapper::parse_loc_map(argvec, &loc);
+
+ dout(0) << "moving crush item name '" << name << "' to location " << loc << dendl;
+ CrushWrapper newcrush;
+ _get_pending_crush(newcrush);
+
+ if (!newcrush.name_exists(name)) {
+ err = -ENOENT;
+ ss << "item " << name << " does not exist";
+ break;
+ }
+ int id = newcrush.get_item_id(name);
+
+ if (!newcrush.check_item_loc(cct, id, loc, (int *)NULL)) {
+ if (id >= 0) {
+ err = newcrush.create_or_move_item(
+ cct, id, 0, name, loc,
+ g_conf()->osd_crush_update_weight_set);
+ } else {
+ err = newcrush.move_bucket(cct, id, loc);
+ }
+ if (err >= 0) {
+ ss << "moved item id " << id << " name '" << name << "' to location " << loc << " in crush map";
+ pending_inc.crush.clear();
+ newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
+ getline(ss, rs);
+ wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
+ get_last_committed() + 1));
+ return true;
+ }
+ } else {
+ ss << "no need to move item id " << id << " name '" << name << "' to location " << loc << " in crush map";
+ err = 0;
+ }
+ } while (false);
+ } else if (prefix == "osd crush swap-bucket") {
+ string source, dest;
+ cmd_getval(cmdmap, "source", source);
+ cmd_getval(cmdmap, "dest", dest);
+
+ bool force = false;
+ cmd_getval(cmdmap, "yes_i_really_mean_it", force);
+
+ CrushWrapper newcrush;
+ _get_pending_crush(newcrush);
+ if (!newcrush.name_exists(source)) {
+ ss << "source item " << source << " does not exist";
+ err = -ENOENT;
+ goto reply;
+ }
+ if (!newcrush.name_exists(dest)) {
+ ss << "dest item " << dest << " does not exist";
+ err = -ENOENT;
+ goto reply;
+ }
+ int sid = newcrush.get_item_id(source);
+ int did = newcrush.get_item_id(dest);
+ int sparent;
+ if (newcrush.get_immediate_parent_id(sid, &sparent) == 0 && !force) {
+ ss << "source item " << source << " is not an orphan bucket; pass --yes-i-really-mean-it to proceed anyway";
+ err = -EPERM;
+ goto reply;
+ }
+ if (newcrush.get_bucket_alg(sid) != newcrush.get_bucket_alg(did) &&
+ !force) {
+ ss << "source bucket alg " << crush_alg_name(newcrush.get_bucket_alg(sid)) << " != "
+ << "dest bucket alg " << crush_alg_name(newcrush.get_bucket_alg(did))
+ << "; pass --yes-i-really-mean-it to proceed anyway";
+ err = -EPERM;
+ goto reply;
+ }
+ int r = newcrush.swap_bucket(cct, sid, did);
+ if (r < 0) {
+ ss << "failed to swap bucket contents: " << cpp_strerror(r);
+ err = r;
+ goto reply;
+ }
+ ss << "swapped bucket of " << source << " to " << dest;
+ pending_inc.crush.clear();
+ newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
+ wait_for_finished_proposal(op,
+ new Monitor::C_Command(mon, op, err, ss.str(),
+ get_last_committed() + 1));
+ return true;
+ } else if (prefix == "osd crush link") {
+ // osd crush link <name> <loc1> [<loc2> ...]
+ string name;
+ cmd_getval(cmdmap, "name", name);
+ vector<string> argvec;
+ cmd_getval(cmdmap, "args", argvec);
+ map<string,string> loc;
+ CrushWrapper::parse_loc_map(argvec, &loc);
+
+ // Need an explicit check for name_exists because get_item_id returns
+ // 0 on unfound.
+ int id = osdmap.crush->get_item_id(name);
+ if (!osdmap.crush->name_exists(name)) {
+ err = -ENOENT;
+ ss << "item " << name << " does not exist";
+ goto reply;
+ } else {
+ dout(5) << "resolved crush name '" << name << "' to id " << id << dendl;
+ }
+ if (osdmap.crush->check_item_loc(cct, id, loc, (int*) NULL)) {
+ ss << "no need to move item id " << id << " name '" << name
+ << "' to location " << loc << " in crush map";
+ err = 0;
+ goto reply;
+ }
+
+ dout(5) << "linking crush item name '" << name << "' at location " << loc << dendl;
+ CrushWrapper newcrush;
+ _get_pending_crush(newcrush);
+
+ if (!newcrush.name_exists(name)) {
+ err = -ENOENT;
+ ss << "item " << name << " does not exist";
+ goto reply;
+ } else {
+ int id = newcrush.get_item_id(name);
+ if (!newcrush.check_item_loc(cct, id, loc, (int *)NULL)) {
+ err = newcrush.link_bucket(cct, id, loc);
+ if (err >= 0) {
+ ss << "linked item id " << id << " name '" << name
+ << "' to location " << loc << " in crush map";
+ pending_inc.crush.clear();
+ newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
+ } else {
+ ss << "cannot link item id " << id << " name '" << name
+ << "' to location " << loc;
+ goto reply;
+ }
+ } else {
+ ss << "no need to move item id " << id << " name '" << name
+ << "' to location " << loc << " in crush map";
+ err = 0;
+ }
+ }
+ wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, ss.str(),
+ get_last_committed() + 1));
+ return true;
+ } else if (prefix == "osd crush rm" ||
+ prefix == "osd crush remove" ||
+ prefix == "osd crush unlink") {
+ do {
+ // osd crush rm <id> [ancestor]
+ CrushWrapper newcrush;
+ _get_pending_crush(newcrush);
+
+ string name;
+ cmd_getval(cmdmap, "name", name);
+
+ if (!osdmap.crush->name_exists(name)) {
+ err = 0;
+ ss << "device '" << name << "' does not appear in the crush map";
+ break;
+ }
+ if (!newcrush.name_exists(name)) {
+ err = 0;
+ ss << "device '" << name << "' does not appear in the crush map";
+ getline(ss, rs);
+ wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
+ get_last_committed() + 1));
+ return true;
+ }
+ int id = newcrush.get_item_id(name);
+ int ancestor = 0;
+
+ bool unlink_only = prefix == "osd crush unlink";
+ string ancestor_str;
+ if (cmd_getval(cmdmap, "ancestor", ancestor_str)) {
+ if (!newcrush.name_exists(ancestor_str)) {
+ err = -ENOENT;
+ ss << "ancestor item '" << ancestor_str
+ << "' does not appear in the crush map";
+ break;
+ }
+ ancestor = newcrush.get_item_id(ancestor_str);
+ }
+
+ err = prepare_command_osd_crush_remove(
+ newcrush,
+ id, ancestor,
+ (ancestor < 0), unlink_only);
+
+ if (err == -ENOENT) {
+ ss << "item " << id << " does not appear in that position";
+ err = 0;
+ break;
+ }
+ if (err == 0) {
+ if (!unlink_only)
+ pending_inc.new_crush_node_flags[id] = 0;
+ ss << "removed item id " << id << " name '" << name << "' from crush map";
+ getline(ss, rs);
+ wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
+ get_last_committed() + 1));
+ return true;
+ }
+ } while (false);
+
+ } else if (prefix == "osd crush reweight-all") {
+ CrushWrapper newcrush;
+ _get_pending_crush(newcrush);
+
+ newcrush.reweight(cct);
+ pending_inc.crush.clear();
+ newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
+ ss << "reweighted crush hierarchy";
+ getline(ss, rs);
+ wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
+ get_last_committed() + 1));
+ return true;
+ } else if (prefix == "osd crush reweight") {
+ // osd crush reweight <name> <weight>
+ CrushWrapper newcrush;
+ _get_pending_crush(newcrush);
+
+ string name;
+ cmd_getval(cmdmap, "name", name);
+ if (!newcrush.name_exists(name)) {
+ err = -ENOENT;
+ ss << "device '" << name << "' does not appear in the crush map";
+ goto reply;
+ }
+
+ int id = newcrush.get_item_id(name);
+ if (id < 0) {
+ ss << "device '" << name << "' is not a leaf in the crush map";
+ err = -EINVAL;
+ goto reply;
+ }
+ double w;
+ if (!cmd_getval(cmdmap, "weight", w)) {
+ ss << "unable to parse weight value '"
+ << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
+ err = -EINVAL;
+ goto reply;
+ }
+
+ err = newcrush.adjust_item_weightf(cct, id, w,
+ g_conf()->osd_crush_update_weight_set);
+ if (err < 0)
+ goto reply;
+ pending_inc.crush.clear();
+ newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
+ ss << "reweighted item id " << id << " name '" << name << "' to " << w
+ << " in crush map";
+ getline(ss, rs);
+ wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
+ get_last_committed() + 1));
+ return true;
+ } else if (prefix == "osd crush reweight-subtree") {
+ // osd crush reweight <name> <weight>
+ CrushWrapper newcrush;
+ _get_pending_crush(newcrush);
+
+ string name;
+ cmd_getval(cmdmap, "name", name);
+ if (!newcrush.name_exists(name)) {
+ err = -ENOENT;
+ ss << "device '" << name << "' does not appear in the crush map";
+ goto reply;
+ }
+
+ int id = newcrush.get_item_id(name);
+ if (id >= 0) {
+ ss << "device '" << name << "' is not a subtree in the crush map";
+ err = -EINVAL;
+ goto reply;
+ }
+ double w;
+ if (!cmd_getval(cmdmap, "weight", w)) {
+ ss << "unable to parse weight value '"
+ << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
+ err = -EINVAL;
+ goto reply;
+ }
+
+ err = newcrush.adjust_subtree_weightf(cct, id, w,
+ g_conf()->osd_crush_update_weight_set);
+ if (err < 0)
+ goto reply;
+ pending_inc.crush.clear();
+ newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
+ ss << "reweighted subtree id " << id << " name '" << name << "' to " << w
+ << " in crush map";
+ getline(ss, rs);
+ wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
+ get_last_committed() + 1));
+ return true;
+ } else if (prefix == "osd crush tunables") {
+ CrushWrapper newcrush;
+ _get_pending_crush(newcrush);
+
+ err = 0;
+ string profile;
+ cmd_getval(cmdmap, "profile", profile);
+ if (profile == "legacy" || profile == "argonaut") {
+ newcrush.set_tunables_legacy();
+ } else if (profile == "bobtail") {
+ newcrush.set_tunables_bobtail();
+ } else if (profile == "firefly") {
+ newcrush.set_tunables_firefly();
+ } else if (profile == "hammer") {
+ newcrush.set_tunables_hammer();
+ } else if (profile == "jewel") {
+ newcrush.set_tunables_jewel();
+ } else if (profile == "optimal") {
+ newcrush.set_tunables_optimal();
+ } else if (profile == "default") {
+ newcrush.set_tunables_default();
+ } else {
+ ss << "unrecognized profile '" << profile << "'";
+ err = -EINVAL;
+ goto reply;
+ }
+
+ if (!validate_crush_against_features(&newcrush, ss)) {
+ err = -EINVAL;
+ goto reply;
+ }
+
+ pending_inc.crush.clear();
+ newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
+ ss << "adjusted tunables profile to " << profile;
+ getline(ss, rs);
+ wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
+ get_last_committed() + 1));
+ return true;
+ } else if (prefix == "osd crush set-tunable") {
+ CrushWrapper newcrush;
+ _get_pending_crush(newcrush);
+
+ err = 0;
+ string tunable;
+ cmd_getval(cmdmap, "tunable", tunable);
+
+ int64_t value = -1;
+ if (!cmd_getval(cmdmap, "value", value)) {
+ err = -EINVAL;
+ ss << "failed to parse integer value "
+ << cmd_vartype_stringify(cmdmap.at("value"));
+ goto reply;
+ }
+
+ if (tunable == "straw_calc_version") {
+ if (value != 0 && value != 1) {
+ ss << "value must be 0 or 1; got " << value;
+ err = -EINVAL;
+ goto reply;
+ }
+ newcrush.set_straw_calc_version(value);
+ } else {
+ ss << "unrecognized tunable '" << tunable << "'";
+ err = -EINVAL;
+ goto reply;
+ }
+
+ if (!validate_crush_against_features(&newcrush, ss)) {
+ err = -EINVAL;
+ goto reply;
+ }
+
+ pending_inc.crush.clear();
+ newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
+ ss << "adjusted tunable " << tunable << " to " << value;
+ getline(ss, rs);
+ wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
+ get_last_committed() + 1));
+ return true;
+
+ } else if (prefix == "osd crush rule create-simple") {
+ string name, root, type, mode;
+ cmd_getval(cmdmap, "name", name);
+ cmd_getval(cmdmap, "root", root);
+ cmd_getval(cmdmap, "type", type);
+ cmd_getval(cmdmap, "mode", mode);
+ if (mode == "")
+ mode = "firstn";
+
+ if (osdmap.crush->rule_exists(name)) {
+ // The name is uniquely associated to a ruleid and the rule it contains
+ // From the user point of view, the rule is more meaningfull.
+ ss << "rule " << name << " already exists";
+ err = 0;
+ goto reply;
+ }
+
+ CrushWrapper newcrush;
+ _get_pending_crush(newcrush);
+
+ if (newcrush.rule_exists(name)) {
+ // The name is uniquely associated to a ruleid and the rule it contains
+ // From the user point of view, the rule is more meaningfull.
+ ss << "rule " << name << " already exists";
+ err = 0;
+ } else {
+ int ruleno = newcrush.add_simple_rule(name, root, type, "", mode,
+ pg_pool_t::TYPE_REPLICATED, &ss);
+ if (ruleno < 0) {
+ err = ruleno;
+ goto reply;
+ }
+
+ pending_inc.crush.clear();
+ newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
+ }
+ getline(ss, rs);
+ wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
+ get_last_committed() + 1));
+ return true;
+
+ } else if (prefix == "osd crush rule create-replicated") {
+ string name, root, type, device_class;
+ cmd_getval(cmdmap, "name", name);
+ cmd_getval(cmdmap, "root", root);
+ cmd_getval(cmdmap, "type", type);
+ cmd_getval(cmdmap, "class", device_class);
+
+ if (osdmap.crush->rule_exists(name)) {
+ // The name is uniquely associated to a ruleid and the rule it contains
+ // From the user point of view, the rule is more meaningfull.
+ ss << "rule " << name << " already exists";
+ err = 0;
+ goto reply;
+ }
+
+ CrushWrapper newcrush;
+ _get_pending_crush(newcrush);
+
+ if (newcrush.rule_exists(name)) {
+ // The name is uniquely associated to a ruleid and the rule it contains
+ // From the user point of view, the rule is more meaningfull.
+ ss << "rule " << name << " already exists";
+ err = 0;
+ } else {
+ int ruleno = newcrush.add_simple_rule(
+ name, root, type, device_class,
+ "firstn", pg_pool_t::TYPE_REPLICATED, &ss);
+ if (ruleno < 0) {
+ err = ruleno;
+ goto reply;
+ }
+
+ pending_inc.crush.clear();
+ newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
+ }
+ getline(ss, rs);
+ wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
+ get_last_committed() + 1));
+ return true;
+
+ } else if (prefix == "osd erasure-code-profile rm") {
+ string name;
+ cmd_getval(cmdmap, "name", name);
+
+ if (erasure_code_profile_in_use(pending_inc.new_pools, name, &ss))
+ goto wait;
+
+ if (erasure_code_profile_in_use(osdmap.pools, name, &ss)) {
+ err = -EBUSY;
+ goto reply;
+ }
+
+ if (osdmap.has_erasure_code_profile(name) ||
+ pending_inc.new_erasure_code_profiles.count(name)) {
+ if (osdmap.has_erasure_code_profile(name)) {
+ pending_inc.old_erasure_code_profiles.push_back(name);
+ } else {
+ dout(20) << "erasure code profile rm " << name << ": creation canceled" << dendl;
+ pending_inc.new_erasure_code_profiles.erase(name);
+ }
+
+ getline(ss, rs);
+ wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
+ get_last_committed() + 1));
+ return true;
+ } else {
+ ss << "erasure-code-profile " << name << " does not exist";
+ err = 0;
+ goto reply;
+ }
+
+ } else if (prefix == "osd erasure-code-profile set") {
+ string name;
+ cmd_getval(cmdmap, "name", name);
+ vector<string> profile;
+ cmd_getval(cmdmap, "profile", profile);
+
+ bool force = false;
+ cmd_getval(cmdmap, "force", force);
+
+ map<string,string> profile_map;
+ err = parse_erasure_code_profile(profile, &profile_map, &ss);
+ if (err)
+ goto reply;
+ if (auto found = profile_map.find("crush-failure-domain");
+ found != profile_map.end()) {
+ const auto& failure_domain = found->second;
+ int failure_domain_type = osdmap.crush->get_type_id(failure_domain);
+ if (failure_domain_type < 0) {
+ ss << "erasure-code-profile " << profile_map
+ << " contains an invalid failure-domain " << std::quoted(failure_domain);
+ err = -EINVAL;
+ goto reply;
+ }
+ }
+
+ if (profile_map.find("plugin") == profile_map.end()) {
+ ss << "erasure-code-profile " << profile_map
+ << " must contain a plugin entry" << std::endl;
+ err = -EINVAL;
+ goto reply;
+ }
+ string plugin = profile_map["plugin"];
+
+ if (pending_inc.has_erasure_code_profile(name)) {
+ dout(20) << "erasure code profile " << name << " try again" << dendl;
+ goto wait;
+ } else {
+ err = normalize_profile(name, profile_map, force, &ss);
+ if (err)
+ goto reply;
+
+ if (osdmap.has_erasure_code_profile(name)) {
+ ErasureCodeProfile existing_profile_map =
+ osdmap.get_erasure_code_profile(name);
+ err = normalize_profile(name, existing_profile_map, force, &ss);
+ if (err)
+ goto reply;
+
+ if (existing_profile_map == profile_map) {
+ err = 0;
+ goto reply;
+ }
+ if (!force) {
+ err = -EPERM;
+ ss << "will not override erasure code profile " << name
+ << " because the existing profile "
+ << existing_profile_map
+ << " is different from the proposed profile "
+ << profile_map;
+ goto reply;
+ }
+ }
+
+ dout(20) << "erasure code profile set " << name << "="
+ << profile_map << dendl;
+ pending_inc.set_erasure_code_profile(name, profile_map);
+ }
+
+ getline(ss, rs);
+ wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
+ get_last_committed() + 1));
+ return true;
+
+ } else if (prefix == "osd crush rule create-erasure") {
+ err = check_cluster_features(CEPH_FEATURE_CRUSH_V2, ss);
+ if (err == -EAGAIN)
+ goto wait;
+ if (err)
+ goto reply;
+ string name, poolstr;
+ cmd_getval(cmdmap, "name", name);
+ string profile;
+ cmd_getval(cmdmap, "profile", profile);
+ if (profile == "")
+ profile = "default";
+ if (profile == "default") {
+ if (!osdmap.has_erasure_code_profile(profile)) {
+ if (pending_inc.has_erasure_code_profile(profile)) {
+ dout(20) << "erasure code profile " << profile << " already pending" << dendl;
+ goto wait;
+ }
+
+ map<string,string> profile_map;
+ err = osdmap.get_erasure_code_profile_default(cct,
+ profile_map,
+ &ss);
+ if (err)
+ goto reply;
+ err = normalize_profile(name, profile_map, true, &ss);
+ if (err)
+ goto reply;
+ dout(20) << "erasure code profile set " << profile << "="
+ << profile_map << dendl;
+ pending_inc.set_erasure_code_profile(profile, profile_map);
+ goto wait;
+ }
+ }
+
+ int rule;
+ err = crush_rule_create_erasure(name, profile, &rule, &ss);
+ if (err < 0) {
+ switch(err) {
+ case -EEXIST: // return immediately
+ ss << "rule " << name << " already exists";
+ err = 0;
+ goto reply;
+ break;
+ case -EALREADY: // wait for pending to be proposed
+ ss << "rule " << name << " already exists";
+ err = 0;
+ break;
+ default: // non recoverable error
+ goto reply;
+ break;
+ }
+ } else {
+ ss << "created rule " << name << " at " << rule;
+ }
+
+ getline(ss, rs);
+ wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
+ get_last_committed() + 1));
+ return true;
+
+ } else if (prefix == "osd crush rule rm") {
+ string name;
+ cmd_getval(cmdmap, "name", name);
+
+ if (!osdmap.crush->rule_exists(name)) {
+ ss << "rule " << name << " does not exist";
+ err = 0;
+ goto reply;
+ }
+
+ CrushWrapper newcrush;
+ _get_pending_crush(newcrush);
+
+ if (!newcrush.rule_exists(name)) {
+ ss << "rule " << name << " does not exist";
+ err = 0;
+ } else {
+ int ruleno = newcrush.get_rule_id(name);
+ ceph_assert(ruleno >= 0);
+
+ // make sure it is not in use.
+ // FIXME: this is ok in some situations, but let's not bother with that
+ // complexity now.
+ int ruleset = newcrush.get_rule_mask_ruleset(ruleno);
+ if (osdmap.crush_rule_in_use(ruleset)) {
+ ss << "crush ruleset " << name << " " << ruleset << " is in use";
+ err = -EBUSY;
+ goto reply;
+ }
+
+ err = newcrush.remove_rule(ruleno);
+ if (err < 0) {
+ goto reply;
+ }
+
+ pending_inc.crush.clear();
+ newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
+ }
+ getline(ss, rs);
+ wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
+ get_last_committed() + 1));
+ return true;
+
+ } else if (prefix == "osd crush rule rename") {
+ string srcname;
+ string dstname;
+ cmd_getval(cmdmap, "srcname", srcname);
+ cmd_getval(cmdmap, "dstname", dstname);
+ if (srcname.empty() || dstname.empty()) {
+ ss << "must specify both source rule name and destination rule name";
+ err = -EINVAL;
+ goto reply;
+ }
+ if (srcname == dstname) {
+ ss << "destination rule name is equal to source rule name";
+ err = 0;
+ goto reply;
+ }
+
+ CrushWrapper newcrush;
+ _get_pending_crush(newcrush);
+ if (!newcrush.rule_exists(srcname) && newcrush.rule_exists(dstname)) {
+ // srcname does not exist and dstname already exists
+ // suppose this is a replay and return success
+ // (so this command is idempotent)
+ ss << "already renamed to '" << dstname << "'";
+ err = 0;
+ goto reply;
+ }
+
+ err = newcrush.rename_rule(srcname, dstname, &ss);
+ if (err < 0) {
+ // ss has reason for failure
+ goto reply;
+ }
+ pending_inc.crush.clear();
+ newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
+ getline(ss, rs);
+ wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
+ get_last_committed() + 1));
+ return true;
+
+ } else if (prefix == "osd setmaxosd") {
+ int64_t newmax;
+ if (!cmd_getval(cmdmap, "newmax", newmax)) {
+ ss << "unable to parse 'newmax' value '"
+ << cmd_vartype_stringify(cmdmap.at("newmax")) << "'";
+ err = -EINVAL;
+ goto reply;
+ }
+
+ if (newmax > g_conf()->mon_max_osd) {
+ err = -ERANGE;
+ ss << "cannot set max_osd to " << newmax << " which is > conf.mon_max_osd ("
+ << g_conf()->mon_max_osd << ")";
+ goto reply;
+ }
+
+ // Don't allow shrinking OSD number as this will cause data loss
+ // and may cause kernel crashes.
+ // Note: setmaxosd sets the maximum OSD number and not the number of OSDs
+ if (newmax < osdmap.get_max_osd()) {
+ // Check if the OSDs exist between current max and new value.
+ // If there are any OSDs exist, then don't allow shrinking number
+ // of OSDs.
+ for (int i = newmax; i < osdmap.get_max_osd(); i++) {
+ if (osdmap.exists(i)) {
+ err = -EBUSY;
+ ss << "cannot shrink max_osd to " << newmax
+ << " because osd." << i << " (and possibly others) still in use";
+ goto reply;
+ }
+ }
+ }
+
+ pending_inc.new_max_osd = newmax;
+ ss << "set new max_osd = " << pending_inc.new_max_osd;
+ getline(ss, rs);
+ wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
+ get_last_committed() + 1));
+ return true;
+
+ } else if (prefix == "osd set-full-ratio" ||
+ prefix == "osd set-backfillfull-ratio" ||
+ prefix == "osd set-nearfull-ratio") {
+ double n;
+ if (!cmd_getval(cmdmap, "ratio", n)) {
+ ss << "unable to parse 'ratio' value '"
+ << cmd_vartype_stringify(cmdmap.at("ratio")) << "'";
+ err = -EINVAL;
+ goto reply;
+ }
+ if (prefix == "osd set-full-ratio")
+ pending_inc.new_full_ratio = n;
+ else if (prefix == "osd set-backfillfull-ratio")
+ pending_inc.new_backfillfull_ratio = n;
+ else if (prefix == "osd set-nearfull-ratio")
+ pending_inc.new_nearfull_ratio = n;
+ ss << prefix << " " << n;
+ getline(ss, rs);
+ wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
+ get_last_committed() + 1));
+ return true;
+ } else if (prefix == "osd set-require-min-compat-client") {
+ string v;
+ cmd_getval(cmdmap, "version", v);
+ ceph_release_t vno = ceph_release_from_name(v);
+ if (!vno) {
+ ss << "version " << v << " is not recognized";
+ err = -EINVAL;
+ goto reply;
+ }
+ OSDMap newmap;
+ newmap.deepish_copy_from(osdmap);
+ newmap.apply_incremental(pending_inc);
+ newmap.require_min_compat_client = vno;
+ auto mvno = newmap.get_min_compat_client();
+ if (vno < mvno) {
+ ss << "osdmap current utilizes features that require " << mvno
+ << "; cannot set require_min_compat_client below that to " << vno;
+ err = -EPERM;
+ goto reply;
+ }
+ bool sure = false;
+ cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
+ if (!sure) {
+ FeatureMap m;
+ mon.get_combined_feature_map(&m);
+ uint64_t features = ceph_release_features(to_integer<int>(vno));
+ bool first = true;
+ bool ok = true;
+ for (int type : {
+ CEPH_ENTITY_TYPE_CLIENT,
+ CEPH_ENTITY_TYPE_MDS,
+ CEPH_ENTITY_TYPE_MGR }) {
+ auto p = m.m.find(type);
+ if (p == m.m.end()) {
+ continue;
+ }
+ for (auto& q : p->second) {
+ uint64_t missing = ~q.first & features;
+ if (missing) {
+ if (first) {
+ ss << "cannot set require_min_compat_client to " << v << ": ";
+ } else {
+ ss << "; ";
+ }
+ first = false;
+ ss << q.second << " connected " << ceph_entity_type_name(type)
+ << "(s) look like " << ceph_release_name(
+ ceph_release_from_features(q.first))
+ << " (missing 0x" << std::hex << missing << std::dec << ")";
+ ok = false;
+ }
+ }
+ }
+ if (!ok) {
+ ss << "; add --yes-i-really-mean-it to do it anyway";
+ err = -EPERM;
+ goto reply;
+ }
+ }
+ ss << "set require_min_compat_client to " << vno;
+ pending_inc.new_require_min_compat_client = vno;
+ getline(ss, rs);
+ wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
+ get_last_committed() + 1));
+ return true;
+ } else if (prefix == "osd pause") {
+ return prepare_set_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
+
+ } else if (prefix == "osd unpause") {
+ return prepare_unset_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
+
+ } else if (prefix == "osd set") {
+ bool sure = false;
+ cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
+
+ string key;
+ cmd_getval(cmdmap, "key", key);
+ if (key == "pause")
+ return prepare_set_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
+ else if (key == "noup")
+ return prepare_set_flag(op, CEPH_OSDMAP_NOUP);
+ else if (key == "nodown")
+ return prepare_set_flag(op, CEPH_OSDMAP_NODOWN);
+ else if (key == "noout")
+ return prepare_set_flag(op, CEPH_OSDMAP_NOOUT);
+ else if (key == "noin")
+ return prepare_set_flag(op, CEPH_OSDMAP_NOIN);
+ else if (key == "nobackfill")
+ return prepare_set_flag(op, CEPH_OSDMAP_NOBACKFILL);
+ else if (key == "norebalance")
+ return prepare_set_flag(op, CEPH_OSDMAP_NOREBALANCE);
+ else if (key == "norecover")
+ return prepare_set_flag(op, CEPH_OSDMAP_NORECOVER);
+ else if (key == "noscrub")
+ return prepare_set_flag(op, CEPH_OSDMAP_NOSCRUB);
+ else if (key == "nodeep-scrub")
+ return prepare_set_flag(op, CEPH_OSDMAP_NODEEP_SCRUB);
+ else if (key == "notieragent")
+ return prepare_set_flag(op, CEPH_OSDMAP_NOTIERAGENT);
+ else if (key == "nosnaptrim")
+ return prepare_set_flag(op, CEPH_OSDMAP_NOSNAPTRIM);
+ else if (key == "pglog_hardlimit") {
+ if (!osdmap.get_num_up_osds() && !sure) {
+ ss << "Not advisable to continue since no OSDs are up. Pass "
+ << "--yes-i-really-mean-it if you really wish to continue.";
+ err = -EPERM;
+ goto reply;
+ }
+ // The release check here is required because for OSD_PGLOG_HARDLIMIT,
+ // we are reusing a jewel feature bit that was retired in luminous.
+ if (osdmap.require_osd_release >= ceph_release_t::luminous &&
+ (HAVE_FEATURE(osdmap.get_up_osd_features(), OSD_PGLOG_HARDLIMIT)
+ || sure)) {
+ return prepare_set_flag(op, CEPH_OSDMAP_PGLOG_HARDLIMIT);
+ } else {
+ ss << "not all up OSDs have OSD_PGLOG_HARDLIMIT feature";
+ err = -EPERM;
+ goto reply;
+ }
+ } else {
+ ss << "unrecognized flag '" << key << "'";
+ err = -EINVAL;
+ }
+
+ } else if (prefix == "osd unset") {
+ string key;
+ cmd_getval(cmdmap, "key", key);
+ if (key == "pause")
+ return prepare_unset_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
+ else if (key == "noup")
+ return prepare_unset_flag(op, CEPH_OSDMAP_NOUP);
+ else if (key == "nodown")
+ return prepare_unset_flag(op, CEPH_OSDMAP_NODOWN);
+ else if (key == "noout")
+ return prepare_unset_flag(op, CEPH_OSDMAP_NOOUT);
+ else if (key == "noin")
+ return prepare_unset_flag(op, CEPH_OSDMAP_NOIN);
+ else if (key == "nobackfill")
+ return prepare_unset_flag(op, CEPH_OSDMAP_NOBACKFILL);
+ else if (key == "norebalance")
+ return prepare_unset_flag(op, CEPH_OSDMAP_NOREBALANCE);
+ else if (key == "norecover")
+ return prepare_unset_flag(op, CEPH_OSDMAP_NORECOVER);
+ else if (key == "noscrub")
+ return prepare_unset_flag(op, CEPH_OSDMAP_NOSCRUB);
+ else if (key == "nodeep-scrub")
+ return prepare_unset_flag(op, CEPH_OSDMAP_NODEEP_SCRUB);
+ else if (key == "notieragent")
+ return prepare_unset_flag(op, CEPH_OSDMAP_NOTIERAGENT);
+ else if (key == "nosnaptrim")
+ return prepare_unset_flag(op, CEPH_OSDMAP_NOSNAPTRIM);
+ else {
+ ss << "unrecognized flag '" << key << "'";
+ err = -EINVAL;
+ }
+
+ } else if (prefix == "osd require-osd-release") {
+ string release;
+ cmd_getval(cmdmap, "release", release);
+ bool sure = false;
+ cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
+ ceph_release_t rel = ceph_release_from_name(release.c_str());
+ if (!rel) {
+ ss << "unrecognized release " << release;
+ err = -EINVAL;
+ goto reply;
+ }
+ if (rel == osdmap.require_osd_release) {
+ // idempotent
+ err = 0;
+ goto reply;
+ }
+ ceph_assert(osdmap.require_osd_release >= ceph_release_t::luminous);
+ if (!osdmap.get_num_up_osds() && !sure) {
+ ss << "Not advisable to continue since no OSDs are up. Pass "
+ << "--yes-i-really-mean-it if you really wish to continue.";
+ err = -EPERM;
+ goto reply;
+ }
+ if (rel == ceph_release_t::mimic) {
+ if (!mon.monmap->get_required_features().contains_all(
+ ceph::features::mon::FEATURE_MIMIC)) {
+ ss << "not all mons are mimic";
+ err = -EPERM;
+ goto reply;
+ }
+ if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_MIMIC))
+ && !sure) {
+ ss << "not all up OSDs have CEPH_FEATURE_SERVER_MIMIC feature";
+ err = -EPERM;
+ goto reply;
+ }
+ } else if (rel == ceph_release_t::nautilus) {
+ if (!mon.monmap->get_required_features().contains_all(
+ ceph::features::mon::FEATURE_NAUTILUS)) {
+ ss << "not all mons are nautilus";
+ err = -EPERM;
+ goto reply;
+ }
+ if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_NAUTILUS))
+ && !sure) {
+ ss << "not all up OSDs have CEPH_FEATURE_SERVER_NAUTILUS feature";
+ err = -EPERM;
+ goto reply;
+ }
+ } else if (rel == ceph_release_t::octopus) {
+ if (!mon.monmap->get_required_features().contains_all(
+ ceph::features::mon::FEATURE_OCTOPUS)) {
+ ss << "not all mons are octopus";
+ err = -EPERM;
+ goto reply;
+ }
+ if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_OCTOPUS))
+ && !sure) {
+ ss << "not all up OSDs have CEPH_FEATURE_SERVER_OCTOPUS feature";
+ err = -EPERM;
+ goto reply;
+ }
+ } else if (rel == ceph_release_t::pacific) {
+ if (!mon.monmap->get_required_features().contains_all(
+ ceph::features::mon::FEATURE_PACIFIC)) {
+ ss << "not all mons are pacific";
+ err = -EPERM;
+ goto reply;
+ }
+ if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_PACIFIC))
+ && !sure) {
+ ss << "not all up OSDs have CEPH_FEATURE_SERVER_PACIFIC feature";
+ err = -EPERM;
+ goto reply;
+ }
+ } else {
+ ss << "not supported for this release yet";
+ err = -EPERM;
+ goto reply;
+ }
+ if (rel < osdmap.require_osd_release) {
+ ss << "require_osd_release cannot be lowered once it has been set";
+ err = -EPERM;
+ goto reply;
+ }
+ pending_inc.new_require_osd_release = rel;
+ goto update;
+ } else if (prefix == "osd down" ||
+ prefix == "osd out" ||
+ prefix == "osd in" ||
+ prefix == "osd rm" ||
+ prefix == "osd stop") {
+
+ bool any = false;
+ bool stop = false;
+ bool verbose = true;
+ bool definitely_dead = false;
+
+ vector<string> idvec;
+ cmd_getval(cmdmap, "ids", idvec);
+ cmd_getval(cmdmap, "definitely_dead", definitely_dead);
+ derr << "definitely_dead " << (int)definitely_dead << dendl;
+ for (unsigned j = 0; j < idvec.size() && !stop; j++) {
+ set<int> osds;
+
+ // wildcard?
+ if (j == 0 &&
+ (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
+ if (prefix == "osd in") {
+ // touch out osds only
+ osdmap.get_out_existing_osds(osds);
+ } else {
+ osdmap.get_all_osds(osds);
+ }
+ stop = true;
+ verbose = false; // so the output is less noisy.
+ } else {
+ long osd = parse_osd_id(idvec[j].c_str(), &ss);
+ if (osd < 0) {
+ ss << "invalid osd id" << osd;
+ err = -EINVAL;
+ continue;
+ } else if (!osdmap.exists(osd)) {
+ ss << "osd." << osd << " does not exist. ";
+ continue;
+ }
+
+ osds.insert(osd);
+ }
+
+ for (auto &osd : osds) {
+ if (prefix == "osd down") {
+ if (osdmap.is_down(osd)) {
+ if (verbose)
+ ss << "osd." << osd << " is already down. ";
+ } else {
+ pending_inc.pending_osd_state_set(osd, CEPH_OSD_UP);
+ ss << "marked down osd." << osd << ". ";
+ any = true;
+ }
+ if (definitely_dead) {
+ if (!pending_inc.new_xinfo.count(osd)) {
+ pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
+ }
+ if (pending_inc.new_xinfo[osd].dead_epoch < pending_inc.epoch) {
+ any = true;
+ }
+ pending_inc.new_xinfo[osd].dead_epoch = pending_inc.epoch;
+ }
+ } else if (prefix == "osd out") {
+ if (osdmap.is_out(osd)) {
+ if (verbose)
+ ss << "osd." << osd << " is already out. ";
+ } else {
+ pending_inc.new_weight[osd] = CEPH_OSD_OUT;
+ if (osdmap.osd_weight[osd]) {
+ if (pending_inc.new_xinfo.count(osd) == 0) {
+ pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
+ }
+ pending_inc.new_xinfo[osd].old_weight = osdmap.osd_weight[osd];
+ }
+ ss << "marked out osd." << osd << ". ";
+ std::ostringstream msg;
+ msg << "Client " << op->get_session()->entity_name
+ << " marked osd." << osd << " out";
+ if (osdmap.is_up(osd)) {
+ msg << ", while it was still marked up";
+ } else {
+ auto period = ceph_clock_now() - down_pending_out[osd];
+ msg << ", after it was down for " << int(period.sec())
+ << " seconds";
+ }
+
+ mon.clog->info() << msg.str();
+ any = true;
+ }
+ } else if (prefix == "osd in") {
+ if (osdmap.is_in(osd)) {
+ if (verbose)
+ ss << "osd." << osd << " is already in. ";
+ } else {
+ if (osdmap.osd_xinfo[osd].old_weight > 0) {
+ pending_inc.new_weight[osd] = osdmap.osd_xinfo[osd].old_weight;
+ if (pending_inc.new_xinfo.count(osd) == 0) {
+ pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
+ }
+ pending_inc.new_xinfo[osd].old_weight = 0;
+ } else {
+ pending_inc.new_weight[osd] = CEPH_OSD_IN;
+ }
+ ss << "marked in osd." << osd << ". ";
+ any = true;
+ }
+ } else if (prefix == "osd rm") {
+ err = prepare_command_osd_remove(osd);
+
+ if (err == -EBUSY) {
+ if (any)
+ ss << ", ";
+ ss << "osd." << osd << " is still up; must be down before removal. ";
+ } else {
+ ceph_assert(err == 0);
+ if (any) {
+ ss << ", osd." << osd;
+ } else {
+ ss << "removed osd." << osd;
+ }
+ any = true;
+ }
+ } else if (prefix == "osd stop") {
+ if (osdmap.is_stop(osd)) {
+ if (verbose)
+ ss << "osd." << osd << " is already stopped. ";
+ } else if (osdmap.is_down(osd)) {
+ pending_inc.pending_osd_state_set(osd, CEPH_OSD_STOP);
+ ss << "stop down osd." << osd << ". ";
+ any = true;
+ } else {
+ pending_inc.pending_osd_state_set(osd, CEPH_OSD_UP | CEPH_OSD_STOP);
+ ss << "stop osd." << osd << ". ";
+ any = true;
+ }
+ }
+ }
+ }
+ if (any) {
+ getline(ss, rs);
+ wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, rs,
+ get_last_committed() + 1));
+ return true;
+ }
+ } else if (prefix == "osd set-group" ||
+ prefix == "osd unset-group" ||
+ prefix == "osd add-noup" ||
+ prefix == "osd add-nodown" ||
+ prefix == "osd add-noin" ||
+ prefix == "osd add-noout" ||
+ prefix == "osd rm-noup" ||
+ prefix == "osd rm-nodown" ||
+ prefix == "osd rm-noin" ||
+ prefix == "osd rm-noout") {
+ bool do_set = prefix == "osd set-group" ||
+ prefix.find("add") != string::npos;
+ string flag_str;
+ unsigned flags = 0;
+ vector<string> who;
+ if (prefix == "osd set-group" || prefix == "osd unset-group") {
+ cmd_getval(cmdmap, "flags", flag_str);
+ cmd_getval(cmdmap, "who", who);
+ vector<string> raw_flags;
+ boost::split(raw_flags, flag_str, boost::is_any_of(","));
+ for (auto& f : raw_flags) {
+ if (f == "noup")
+ flags |= CEPH_OSD_NOUP;
+ else if (f == "nodown")
+ flags |= CEPH_OSD_NODOWN;
+ else if (f == "noin")
+ flags |= CEPH_OSD_NOIN;
+ else if (f == "noout")
+ flags |= CEPH_OSD_NOOUT;
+ else {
+ ss << "unrecognized flag '" << f << "', must be one of "
+ << "{noup,nodown,noin,noout}";
+ err = -EINVAL;
+ goto reply;
+ }
+ }
+ } else {
+ cmd_getval(cmdmap, "ids", who);
+ if (prefix.find("noup") != string::npos)
+ flags = CEPH_OSD_NOUP;
+ else if (prefix.find("nodown") != string::npos)
+ flags = CEPH_OSD_NODOWN;
+ else if (prefix.find("noin") != string::npos)
+ flags = CEPH_OSD_NOIN;
+ else if (prefix.find("noout") != string::npos)
+ flags = CEPH_OSD_NOOUT;
+ else
+ ceph_assert(0 == "Unreachable!");
+ }
+ if (flags == 0) {
+ ss << "must specify flag(s) {noup,nodwon,noin,noout} to set/unset";
+ err = -EINVAL;
+ goto reply;
+ }
+ if (who.empty()) {
+ ss << "must specify at least one or more targets to set/unset";
+ err = -EINVAL;
+ goto reply;
+ }
+ set<int> osds;
+ set<int> crush_nodes;
+ set<int> device_classes;
+ for (auto& w : who) {
+ if (w == "any" || w == "all" || w == "*") {
+ osdmap.get_all_osds(osds);
+ break;
+ }
+ std::stringstream ts;
+ if (auto osd = parse_osd_id(w.c_str(), &ts); osd >= 0) {
+ osds.insert(osd);
+ } else if (osdmap.crush->name_exists(w)) {
+ crush_nodes.insert(osdmap.crush->get_item_id(w));
+ } else if (osdmap.crush->class_exists(w)) {
+ device_classes.insert(osdmap.crush->get_class_id(w));
+ } else {
+ ss << "unable to parse osd id or crush node or device class: "
+ << "\"" << w << "\". ";
+ }
+ }
+ if (osds.empty() && crush_nodes.empty() && device_classes.empty()) {
+ // ss has reason for failure
+ err = -EINVAL;
+ goto reply;
+ }
+ bool any = false;
+ for (auto osd : osds) {
+ if (!osdmap.exists(osd)) {
+ ss << "osd." << osd << " does not exist. ";
+ continue;
+ }
+ if (do_set) {
+ if (flags & CEPH_OSD_NOUP) {
+ any |= osdmap.is_noup_by_osd(osd) ?
+ pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOUP) :
+ pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOUP);
+ }
+ if (flags & CEPH_OSD_NODOWN) {
+ any |= osdmap.is_nodown_by_osd(osd) ?
+ pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NODOWN) :
+ pending_inc.pending_osd_state_set(osd, CEPH_OSD_NODOWN);
+ }
+ if (flags & CEPH_OSD_NOIN) {
+ any |= osdmap.is_noin_by_osd(osd) ?
+ pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOIN) :
+ pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOIN);
+ }
+ if (flags & CEPH_OSD_NOOUT) {
+ any |= osdmap.is_noout_by_osd(osd) ?
+ pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOOUT) :
+ pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOOUT);
+ }
+ } else {
+ if (flags & CEPH_OSD_NOUP) {
+ any |= osdmap.is_noup_by_osd(osd) ?
+ pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOUP) :
+ pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOUP);
+ }
+ if (flags & CEPH_OSD_NODOWN) {
+ any |= osdmap.is_nodown_by_osd(osd) ?
+ pending_inc.pending_osd_state_set(osd, CEPH_OSD_NODOWN) :
+ pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NODOWN);
+ }
+ if (flags & CEPH_OSD_NOIN) {
+ any |= osdmap.is_noin_by_osd(osd) ?
+ pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOIN) :
+ pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOIN);
+ }
+ if (flags & CEPH_OSD_NOOUT) {
+ any |= osdmap.is_noout_by_osd(osd) ?
+ pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOOUT) :
+ pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOOUT);
+ }
+ }
+ }
+ for (auto& id : crush_nodes) {
+ auto old_flags = osdmap.get_crush_node_flags(id);
+ auto& pending_flags = pending_inc.new_crush_node_flags[id];
+ pending_flags |= old_flags; // adopt existing flags first!
+ if (do_set) {
+ pending_flags |= flags;
+ } else {
+ pending_flags &= ~flags;
+ }
+ any = true;
+ }
+ for (auto& id : device_classes) {
+ auto old_flags = osdmap.get_device_class_flags(id);
+ auto& pending_flags = pending_inc.new_device_class_flags[id];
+ pending_flags |= old_flags;
+ if (do_set) {
+ pending_flags |= flags;
+ } else {
+ pending_flags &= ~flags;
+ }
+ any = true;
+ }
+ if (any) {
+ getline(ss, rs);
+ wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, rs,
+ get_last_committed() + 1));
+ return true;
+ }
+ } else if (prefix == "osd pg-temp") {
+ string pgidstr;
+ if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
+ ss << "unable to parse 'pgid' value '"
+ << cmd_vartype_stringify(cmdmap.at("pgid")) << "'";
+ err = -EINVAL;
+ goto reply;
+ }
+ pg_t pgid;
+ if (!pgid.parse(pgidstr.c_str())) {
+ ss << "invalid pgid '" << pgidstr << "'";
+ err = -EINVAL;
+ goto reply;
+ }
+ if (!osdmap.pg_exists(pgid)) {
+ ss << "pg " << pgid << " does not exist";
+ err = -ENOENT;
+ goto reply;
+ }
+ if (pending_inc.new_pg_temp.count(pgid)) {
+ dout(10) << __func__ << " waiting for pending update on " << pgid << dendl;
+ wait_for_finished_proposal(op, new C_RetryMessage(this, op));
+ return true;
+ }
+
+ vector<int64_t> id_vec;
+ vector<int32_t> new_pg_temp;
+ cmd_getval(cmdmap, "id", id_vec);
+ if (id_vec.empty()) {
+ pending_inc.new_pg_temp[pgid] = mempool::osdmap::vector<int>();
+ ss << "done cleaning up pg_temp of " << pgid;
+ goto update;
+ }
+ for (auto osd : id_vec) {
+ if (!osdmap.exists(osd)) {
+ ss << "osd." << osd << " does not exist";
+ err = -ENOENT;
+ goto reply;
+ }
+ new_pg_temp.push_back(osd);
+ }
+
+ int pool_min_size = osdmap.get_pg_pool_min_size(pgid);
+ if ((int)new_pg_temp.size() < pool_min_size) {
+ ss << "num of osds (" << new_pg_temp.size() <<") < pool min size ("
+ << pool_min_size << ")";
+ err = -EINVAL;
+ goto reply;
+ }
+
+ int pool_size = osdmap.get_pg_pool_size(pgid);
+ if ((int)new_pg_temp.size() > pool_size) {
+ ss << "num of osds (" << new_pg_temp.size() <<") > pool size ("
+ << pool_size << ")";
+ err = -EINVAL;
+ goto reply;
+ }
+
+ pending_inc.new_pg_temp[pgid] = mempool::osdmap::vector<int>(
+ new_pg_temp.begin(), new_pg_temp.end());
+ ss << "set " << pgid << " pg_temp mapping to " << new_pg_temp;
+ goto update;
+ } else if (prefix == "osd primary-temp") {
+ string pgidstr;
+ if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
+ ss << "unable to parse 'pgid' value '"
+ << cmd_vartype_stringify(cmdmap.at("pgid")) << "'";
+ err = -EINVAL;
+ goto reply;
+ }
+ pg_t pgid;
+ if (!pgid.parse(pgidstr.c_str())) {
+ ss << "invalid pgid '" << pgidstr << "'";
+ err = -EINVAL;
+ goto reply;
+ }
+ if (!osdmap.pg_exists(pgid)) {
+ ss << "pg " << pgid << " does not exist";
+ err = -ENOENT;
+ goto reply;
+ }
+
+ int64_t osd;
+ if (!cmd_getval(cmdmap, "id", osd)) {
+ ss << "unable to parse 'id' value '"
+ << cmd_vartype_stringify(cmdmap.at("id")) << "'";
+ err = -EINVAL;
+ goto reply;
+ }
+ if (osd != -1 && !osdmap.exists(osd)) {
+ ss << "osd." << osd << " does not exist";
+ err = -ENOENT;
+ goto reply;
+ }
+
+ if (osdmap.require_min_compat_client != ceph_release_t::unknown &&
+ osdmap.require_min_compat_client < ceph_release_t::firefly) {
+ ss << "require_min_compat_client "
+ << osdmap.require_min_compat_client
+ << " < firefly, which is required for primary-temp";
+ err = -EPERM;
+ goto reply;
+ }
+
+ pending_inc.new_primary_temp[pgid] = osd;
+ ss << "set " << pgid << " primary_temp mapping to " << osd;
+ goto update;
+ } else if (prefix == "pg repeer") {
+ pg_t pgid;
+ string pgidstr;
+ cmd_getval(cmdmap, "pgid", pgidstr);
+ if (!pgid.parse(pgidstr.c_str())) {
+ ss << "invalid pgid '" << pgidstr << "'";
+ err = -EINVAL;
+ goto reply;
+ }
+ if (!osdmap.pg_exists(pgid)) {
+ ss << "pg '" << pgidstr << "' does not exist";
+ err = -ENOENT;
+ goto reply;
+ }
+ vector<int> acting;
+ int primary;
+ osdmap.pg_to_acting_osds(pgid, &acting, &primary);
+ if (primary < 0) {
+ err = -EAGAIN;
+ ss << "pg currently has no primary";
+ goto reply;
+ }
+ if (acting.size() > 1) {
+ // map to just primary; it will map back to what it wants
+ pending_inc.new_pg_temp[pgid] = { primary };
+ } else {
+ // hmm, pick another arbitrary osd to induce a change. Note
+ // that this won't work if there is only one suitable OSD in the cluster.
+ int i;
+ bool done = false;
+ for (i = 0; i < osdmap.get_max_osd(); ++i) {
+ if (i == primary || !osdmap.is_up(i) || !osdmap.exists(i)) {
+ continue;
+ }
+ pending_inc.new_pg_temp[pgid] = { primary, i };
+ done = true;
+ break;
+ }
+ if (!done) {
+ err = -EAGAIN;
+ ss << "not enough up OSDs in the cluster to force repeer";
+ goto reply;
+ }
+ }
+ goto update;
+ } else if (prefix == "osd pg-upmap" ||
+ prefix == "osd rm-pg-upmap" ||
+ prefix == "osd pg-upmap-items" ||
+ prefix == "osd rm-pg-upmap-items") {
+ if (osdmap.require_min_compat_client < ceph_release_t::luminous) {
+ ss << "min_compat_client "
+ << osdmap.require_min_compat_client
+ << " < luminous, which is required for pg-upmap. "
+ << "Try 'ceph osd set-require-min-compat-client luminous' "
+ << "before using the new interface";
+ err = -EPERM;
+ goto reply;
+ }
+ err = check_cluster_features(CEPH_FEATUREMASK_OSDMAP_PG_UPMAP, ss);
+ if (err == -EAGAIN)
+ goto wait;
+ if (err < 0)
+ goto reply;
+ string pgidstr;
+ if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
+ ss << "unable to parse 'pgid' value '"
+ << cmd_vartype_stringify(cmdmap.at("pgid")) << "'";
+ err = -EINVAL;
+ goto reply;
+ }
+ pg_t pgid;
+ if (!pgid.parse(pgidstr.c_str())) {
+ ss << "invalid pgid '" << pgidstr << "'";
+ err = -EINVAL;
+ goto reply;
+ }
+ if (!osdmap.pg_exists(pgid)) {
+ ss << "pg " << pgid << " does not exist";
+ err = -ENOENT;
+ goto reply;
+ }
+ if (pending_inc.old_pools.count(pgid.pool())) {
+ ss << "pool of " << pgid << " is pending removal";
+ err = -ENOENT;
+ getline(ss, rs);
+ wait_for_finished_proposal(op,
+ new Monitor::C_Command(mon, op, err, rs, get_last_committed() + 1));
+ return true;
+ }
+
+ enum {
+ OP_PG_UPMAP,
+ OP_RM_PG_UPMAP,
+ OP_PG_UPMAP_ITEMS,
+ OP_RM_PG_UPMAP_ITEMS,
+ } option;
+
+ if (prefix == "osd pg-upmap") {
+ option = OP_PG_UPMAP;
+ } else if (prefix == "osd rm-pg-upmap") {
+ option = OP_RM_PG_UPMAP;
+ } else if (prefix == "osd pg-upmap-items") {
+ option = OP_PG_UPMAP_ITEMS;
+ } else {
+ option = OP_RM_PG_UPMAP_ITEMS;
+ }
+
+ // check pending upmap changes
+ switch (option) {
+ case OP_PG_UPMAP: // fall through
+ case OP_RM_PG_UPMAP:
+ if (pending_inc.new_pg_upmap.count(pgid) ||
+ pending_inc.old_pg_upmap.count(pgid)) {
+ dout(10) << __func__ << " waiting for pending update on "
+ << pgid << dendl;
+ wait_for_finished_proposal(op, new C_RetryMessage(this, op));
+ return true;
+ }
+ break;
+
+ case OP_PG_UPMAP_ITEMS: // fall through
+ case OP_RM_PG_UPMAP_ITEMS:
+ if (pending_inc.new_pg_upmap_items.count(pgid) ||
+ pending_inc.old_pg_upmap_items.count(pgid)) {
+ dout(10) << __func__ << " waiting for pending update on "
+ << pgid << dendl;
+ wait_for_finished_proposal(op, new C_RetryMessage(this, op));
+ return true;
+ }
+ break;
+
+ default:
+ ceph_abort_msg("invalid option");
+ }
+
+ switch (option) {
+ case OP_PG_UPMAP:
+ {
+ vector<int64_t> id_vec;
+ if (!cmd_getval(cmdmap, "id", id_vec)) {
+ ss << "unable to parse 'id' value(s) '"
+ << cmd_vartype_stringify(cmdmap.at("id")) << "'";
+ err = -EINVAL;
+ goto reply;
+ }
+
+ int pool_min_size = osdmap.get_pg_pool_min_size(pgid);
+ if ((int)id_vec.size() < pool_min_size) {
+ ss << "num of osds (" << id_vec.size() <<") < pool min size ("
+ << pool_min_size << ")";
+ err = -EINVAL;
+ goto reply;
+ }
+
+ int pool_size = osdmap.get_pg_pool_size(pgid);
+ if ((int)id_vec.size() > pool_size) {
+ ss << "num of osds (" << id_vec.size() <<") > pool size ("
+ << pool_size << ")";
+ err = -EINVAL;
+ goto reply;
+ }
+
+ vector<int32_t> new_pg_upmap;
+ for (auto osd : id_vec) {
+ if (osd != CRUSH_ITEM_NONE && !osdmap.exists(osd)) {
+ ss << "osd." << osd << " does not exist";
+ err = -ENOENT;
+ goto reply;
+ }
+ auto it = std::find(new_pg_upmap.begin(), new_pg_upmap.end(), osd);
+ if (it != new_pg_upmap.end()) {
+ ss << "osd." << osd << " already exists, ";
+ continue;
+ }
+ new_pg_upmap.push_back(osd);
+ }
+
+ if (new_pg_upmap.empty()) {
+ ss << "no valid upmap items(pairs) is specified";
+ err = -EINVAL;
+ goto reply;
+ }
+
+ pending_inc.new_pg_upmap[pgid] = mempool::osdmap::vector<int32_t>(
+ new_pg_upmap.begin(), new_pg_upmap.end());
+ ss << "set " << pgid << " pg_upmap mapping to " << new_pg_upmap;
+ }
+ break;
+
+ case OP_RM_PG_UPMAP:
+ {
+ pending_inc.old_pg_upmap.insert(pgid);
+ ss << "clear " << pgid << " pg_upmap mapping";
+ }
+ break;
+
+ case OP_PG_UPMAP_ITEMS:
+ {
+ vector<int64_t> id_vec;
+ if (!cmd_getval(cmdmap, "id", id_vec)) {
+ ss << "unable to parse 'id' value(s) '"
+ << cmd_vartype_stringify(cmdmap.at("id")) << "'";
+ err = -EINVAL;
+ goto reply;
+ }
+
+ if (id_vec.size() % 2) {
+ ss << "you must specify pairs of osd ids to be remapped";
+ err = -EINVAL;
+ goto reply;
+ }
+
+ int pool_size = osdmap.get_pg_pool_size(pgid);
+ if ((int)(id_vec.size() / 2) > pool_size) {
+ ss << "num of osd pairs (" << id_vec.size() / 2 <<") > pool size ("
+ << pool_size << ")";
+ err = -EINVAL;
+ goto reply;
+ }
+
+ vector<pair<int32_t,int32_t>> new_pg_upmap_items;
+ ostringstream items;
+ items << "[";
+ for (auto p = id_vec.begin(); p != id_vec.end(); ++p) {
+ int from = *p++;
+ int to = *p;
+ if (from == to) {
+ ss << "from osd." << from << " == to osd." << to << ", ";
+ continue;
+ }
+ if (!osdmap.exists(from)) {
+ ss << "osd." << from << " does not exist";
+ err = -ENOENT;
+ goto reply;
+ }
+ if (to != CRUSH_ITEM_NONE && !osdmap.exists(to)) {
+ ss << "osd." << to << " does not exist";
+ err = -ENOENT;
+ goto reply;
+ }
+ pair<int32_t,int32_t> entry = make_pair(from, to);
+ auto it = std::find(new_pg_upmap_items.begin(),
+ new_pg_upmap_items.end(), entry);
+ if (it != new_pg_upmap_items.end()) {
+ ss << "osd." << from << " -> osd." << to << " already exists, ";
+ continue;
+ }
+ new_pg_upmap_items.push_back(entry);
+ items << from << "->" << to << ",";
+ }
+ string out(items.str());
+ out.resize(out.size() - 1); // drop last ','
+ out += "]";
+
+ if (new_pg_upmap_items.empty()) {
+ ss << "no valid upmap items(pairs) is specified";
+ err = -EINVAL;
+ goto reply;
+ }
+
+ pending_inc.new_pg_upmap_items[pgid] =
+ mempool::osdmap::vector<pair<int32_t,int32_t>>(
+ new_pg_upmap_items.begin(), new_pg_upmap_items.end());
+ ss << "set " << pgid << " pg_upmap_items mapping to " << out;
+ }
+ break;
+
+ case OP_RM_PG_UPMAP_ITEMS:
+ {
+ pending_inc.old_pg_upmap_items.insert(pgid);
+ ss << "clear " << pgid << " pg_upmap_items mapping";
+ }
+ break;
+
+ default:
+ ceph_abort_msg("invalid option");
+ }
+
+ goto update;
+ } else if (prefix == "osd primary-affinity") {
+ int64_t id;
+ if (!cmd_getval(cmdmap, "id", id)) {
+ ss << "invalid osd id value '"
+ << cmd_vartype_stringify(cmdmap.at("id")) << "'";
+ err = -EINVAL;
+ goto reply;
+ }
+ double w;
+ if (!cmd_getval(cmdmap, "weight", w)) {
+ ss << "unable to parse 'weight' value '"
+ << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
+ err = -EINVAL;
+ goto reply;
+ }
+ long ww = (int)((double)CEPH_OSD_MAX_PRIMARY_AFFINITY*w);
+ if (ww < 0L) {
+ ss << "weight must be >= 0";
+ err = -EINVAL;
+ goto reply;
+ }
+ if (osdmap.require_min_compat_client != ceph_release_t::unknown &&
+ osdmap.require_min_compat_client < ceph_release_t::firefly) {
+ ss << "require_min_compat_client "
+ << osdmap.require_min_compat_client
+ << " < firefly, which is required for primary-affinity";
+ err = -EPERM;
+ goto reply;
+ }
+ if (osdmap.exists(id)) {
+ pending_inc.new_primary_affinity[id] = ww;
+ ss << "set osd." << id << " primary-affinity to " << w << " (" << std::ios::hex << ww << std::ios::dec << ")";
+ getline(ss, rs);
+ wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
+ get_last_committed() + 1));
+ return true;
+ } else {
+ ss << "osd." << id << " does not exist";
+ err = -ENOENT;
+ goto reply;
+ }
+ } else if (prefix == "osd reweight") {
+ int64_t id;
+ if (!cmd_getval(cmdmap, "id", id)) {
+ ss << "unable to parse osd id value '"
+ << cmd_vartype_stringify(cmdmap.at("id")) << "'";
+ err = -EINVAL;
+ goto reply;
+ }
+ double w;
+ if (!cmd_getval(cmdmap, "weight", w)) {
+ ss << "unable to parse weight value '"
+ << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
+ err = -EINVAL;
+ goto reply;
+ }
+ long ww = (int)((double)CEPH_OSD_IN*w);
+ if (ww < 0L) {
+ ss << "weight must be >= 0";
+ err = -EINVAL;
+ goto reply;
+ }
+ if (osdmap.exists(id)) {
+ pending_inc.new_weight[id] = ww;
+ ss << "reweighted osd." << id << " to " << w << " (" << std::hex << ww << std::dec << ")";
+ getline(ss, rs);
+ wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
+ get_last_committed() + 1));
+ return true;
+ } else {
+ ss << "osd." << id << " does not exist";
+ err = -ENOENT;
+ goto reply;
+ }
+ } else if (prefix == "osd reweightn") {
+ map<int32_t, uint32_t> weights;
+ err = parse_reweights(cct, cmdmap, osdmap, &weights);
+ if (err) {
+ ss << "unable to parse 'weights' value '"
+ << cmd_vartype_stringify(cmdmap.at("weights")) << "'";
+ goto reply;
+ }
+ pending_inc.new_weight.insert(weights.begin(), weights.end());
+ wait_for_finished_proposal(
+ op,
+ new Monitor::C_Command(mon, op, 0, rs, rdata, get_last_committed() + 1));
+ return true;
+ } else if (prefix == "osd lost") {
+ int64_t id;
+ if (!cmd_getval(cmdmap, "id", id)) {
+ ss << "unable to parse osd id value '"
+ << cmd_vartype_stringify(cmdmap.at("id")) << "'";
+ err = -EINVAL;
+ goto reply;
+ }
+ bool sure = false;
+ cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
+ if (!sure) {
+ ss << "are you SURE? this might mean real, permanent data loss. pass "
+ "--yes-i-really-mean-it if you really do.";
+ err = -EPERM;
+ goto reply;
+ } else if (!osdmap.exists(id)) {
+ ss << "osd." << id << " does not exist";
+ err = -ENOENT;
+ goto reply;
+ } else if (!osdmap.is_down(id)) {
+ ss << "osd." << id << " is not down";
+ err = -EBUSY;
+ goto reply;
+ } else {
+ epoch_t e = osdmap.get_info(id).down_at;
+ pending_inc.new_lost[id] = e;
+ ss << "marked osd lost in epoch " << e;
+ getline(ss, rs);
+ wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
+ get_last_committed() + 1));
+ return true;
+ }
+
+ } else if (prefix == "osd destroy-actual" ||
+ prefix == "osd purge-actual" ||
+ prefix == "osd purge-new") {
+ /* Destroying an OSD means that we don't expect to further make use of
+ * the OSDs data (which may even become unreadable after this operation),
+ * and that we are okay with scrubbing all its cephx keys and config-key
+ * data (which may include lockbox keys, thus rendering the osd's data
+ * unreadable).
+ *
+ * The OSD will not be removed. Instead, we will mark it as destroyed,
+ * such that a subsequent call to `create` will not reuse the osd id.
+ * This will play into being able to recreate the OSD, at the same
+ * crush location, with minimal data movement.
+ */
+
+ // make sure authmon is writeable.
+ if (!mon.authmon()->is_writeable()) {
+ dout(10) << __func__ << " waiting for auth mon to be writeable for "
+ << "osd destroy" << dendl;
+ mon.authmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
+ return false;
+ }
+
+ int64_t id;
+ if (!cmd_getval(cmdmap, "id", id)) {
+ auto p = cmdmap.find("id");
+ if (p == cmdmap.end()) {
+ ss << "no osd id specified";
+ } else {
+ ss << "unable to parse osd id value '"
+ << cmd_vartype_stringify(cmdmap.at("id")) << "";
+ }
+ err = -EINVAL;
+ goto reply;
+ }
+
+ bool is_destroy = (prefix == "osd destroy-actual");
+ if (!is_destroy) {
+ ceph_assert("osd purge-actual" == prefix ||
+ "osd purge-new" == prefix);
+ }
+
+ bool sure = false;
+ cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
+ if (!sure) {
+ ss << "Are you SURE? Did you verify with 'ceph osd safe-to-destroy'? "
+ << "This will mean real, permanent data loss, as well "
+ << "as deletion of cephx and lockbox keys. "
+ << "Pass --yes-i-really-mean-it if you really do.";
+ err = -EPERM;
+ goto reply;
+ } else if (!osdmap.exists(id)) {
+ ss << "osd." << id << " does not exist";
+ err = 0; // idempotent
+ goto reply;
+ } else if (osdmap.is_up(id)) {
+ ss << "osd." << id << " is not `down`.";
+ err = -EBUSY;
+ goto reply;
+ } else if (is_destroy && osdmap.is_destroyed(id)) {
+ ss << "destroyed osd." << id;
+ err = 0;
+ goto reply;
+ }
+
+ if (prefix == "osd purge-new" &&
+ (osdmap.get_state(id) & CEPH_OSD_NEW) == 0) {
+ ss << "osd." << id << " is not new";
+ err = -EPERM;
+ goto reply;
+ }
+
+ bool goto_reply = false;
+
+ paxos.plug();
+ if (is_destroy) {
+ err = prepare_command_osd_destroy(id, ss);
+ // we checked above that it should exist.
+ ceph_assert(err != -ENOENT);
+ } else {
+ err = prepare_command_osd_purge(id, ss);
+ if (err == -ENOENT) {
+ err = 0;
+ ss << "osd." << id << " does not exist.";
+ goto_reply = true;
+ }
+ }
+ paxos.unplug();
+
+ if (err < 0 || goto_reply) {
+ goto reply;
+ }
+
+ if (is_destroy) {
+ ss << "destroyed osd." << id;
+ } else {
+ ss << "purged osd." << id;
+ }
+
+ getline(ss, rs);
+ wait_for_finished_proposal(op,
+ new Monitor::C_Command(mon, op, 0, rs, get_last_committed() + 1));
+ force_immediate_propose();
+ return true;
+
+ } else if (prefix == "osd new") {
+
+ // make sure authmon is writeable.
+ if (!mon.authmon()->is_writeable()) {
+ dout(10) << __func__ << " waiting for auth mon to be writeable for "
+ << "osd new" << dendl;
+ mon.authmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
+ return false;
+ }
+
+ // make sure kvmon is writeable.
+ if (!mon.kvmon()->is_writeable()) {
+ dout(10) << __func__ << " waiting for kv mon to be writeable for "
+ << "osd new" << dendl;
+ mon.kvmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
+ return false;
+ }
+
+ map<string,string> param_map;
+
+ bufferlist bl = m->get_data();
+ string param_json = bl.to_str();
+ dout(20) << __func__ << " osd new json = " << param_json << dendl;
+
+ err = get_json_str_map(param_json, ss, &param_map);
+ if (err < 0)
+ goto reply;
+
+ dout(20) << __func__ << " osd new params " << param_map << dendl;
+
+ paxos.plug();
+ err = prepare_command_osd_new(op, cmdmap, param_map, ss, f.get());
+ paxos.unplug();
+
+ if (err < 0) {
+ goto reply;
+ }
+
+ if (f) {
+ f->flush(rdata);
+ } else {
+ rdata.append(ss);
+ }
+
+ if (err == EEXIST) {
+ // idempotent operation
+ err = 0;
+ goto reply;
+ }
+
+ wait_for_finished_proposal(op,
+ new Monitor::C_Command(mon, op, 0, rs, rdata,
+ get_last_committed() + 1));
+ force_immediate_propose();
+ return true;
+
+ } else if (prefix == "osd create") {
+
+ // optional id provided?
+ int64_t id = -1, cmd_id = -1;
+ if (cmd_getval(cmdmap, "id", cmd_id)) {
+ if (cmd_id < 0) {
+ ss << "invalid osd id value '" << cmd_id << "'";
+ err = -EINVAL;
+ goto reply;
+ }
+ dout(10) << " osd create got id " << cmd_id << dendl;
+ }
+
+ uuid_d uuid;
+ string uuidstr;
+ if (cmd_getval(cmdmap, "uuid", uuidstr)) {
+ if (!uuid.parse(uuidstr.c_str())) {
+ ss << "invalid uuid value '" << uuidstr << "'";
+ err = -EINVAL;
+ goto reply;
+ }
+ // we only care about the id if we also have the uuid, to
+ // ensure the operation's idempotency.
+ id = cmd_id;
+ }
+
+ int32_t new_id = -1;
+ err = prepare_command_osd_create(id, uuid, &new_id, ss);
+ if (err < 0) {
+ if (err == -EAGAIN) {
+ wait_for_finished_proposal(op, new C_RetryMessage(this, op));
+ return true;
+ }
+ // a check has failed; reply to the user.
+ goto reply;
+
+ } else if (err == EEXIST) {
+ // this is an idempotent operation; we can go ahead and reply.
+ if (f) {
+ f->open_object_section("created_osd");
+ f->dump_int("osdid", new_id);
+ f->close_section();
+ f->flush(rdata);
+ } else {
+ ss << new_id;
+ rdata.append(ss);
+ }
+ err = 0;
+ goto reply;
+ }
+
+ string empty_device_class;
+ do_osd_create(id, uuid, empty_device_class, &new_id);
+
+ if (f) {
+ f->open_object_section("created_osd");
+ f->dump_int("osdid", new_id);
+ f->close_section();
+ f->flush(rdata);
+ } else {
+ ss << new_id;
+ rdata.append(ss);
+ }
+ wait_for_finished_proposal(op,
+ new Monitor::C_Command(mon, op, 0, rs, rdata,
+ get_last_committed() + 1));
+ return true;
+
+ } else if (prefix == "osd blocklist clear" ||
+ prefix == "osd blacklist clear") {
+ pending_inc.new_blocklist.clear();
+ std::list<std::pair<entity_addr_t,utime_t > > blocklist;
+ std::list<std::pair<entity_addr_t,utime_t > > range_b;
+ osdmap.get_blocklist(&blocklist, &range_b);
+ for (const auto &entry : blocklist) {
+ pending_inc.old_blocklist.push_back(entry.first);
+ }
+ for (const auto &entry : range_b) {
+ pending_inc.old_range_blocklist.push_back(entry.first);
+ }
+ ss << " removed all blocklist entries";
+ getline(ss, rs);
+ wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
+ get_last_committed() + 1));
+ return true;
+ } else if (prefix == "osd blocklist" ||
+ prefix == "osd blacklist") {
+ string addrstr, rangestr;
+ bool range = false;
+ cmd_getval(cmdmap, "addr", addrstr);
+ if (cmd_getval(cmdmap, "range", rangestr)) {
+ if (rangestr == "range") {
+ range = true;
+ } else {
+ ss << "Did you mean to specify \"osd blocklist range\"?";
+ err = -EINVAL;
+ goto reply;
+ }
+ }
+ entity_addr_t addr;
+ if (!addr.parse(addrstr.c_str(), 0)) {
+ ss << "unable to parse address " << addrstr;
+ err = -EINVAL;
+ goto reply;
+ }
+ else {
+ if (range) {
+ if (!addr.maybe_cidr()) {
+ ss << "You specified a range command, but " << addr
+ << " does not parse as a CIDR range";
+ err = -EINVAL;
+ goto reply;
+ }
+ addr.type = entity_addr_t::TYPE_CIDR;
+ err = check_cluster_features(CEPH_FEATUREMASK_RANGE_BLOCKLIST, ss);
+ if (err) {
+ goto reply;
+ }
+ if ((addr.is_ipv4() && addr.get_nonce() > 32) ||
+ (addr.is_ipv6() && addr.get_nonce() > 128)) {
+ ss << "Too many bits in range for that protocol!";
+ err = -EINVAL;
+ goto reply;
+ }
+ } else {
+ if (osdmap.require_osd_release >= ceph_release_t::nautilus) {
+ // always blocklist type ANY
+ addr.set_type(entity_addr_t::TYPE_ANY);
+ } else {
+ addr.set_type(entity_addr_t::TYPE_LEGACY);
+ }
+ }
+
+ string blocklistop;
+ if (!cmd_getval(cmdmap, "blocklistop", blocklistop)) {
+ cmd_getval(cmdmap, "blacklistop", blocklistop);
+ }
+ if (blocklistop == "add") {
+ utime_t expires = ceph_clock_now();
+ double d;
+ // default one hour
+ cmd_getval(cmdmap, "expire", d,
+ g_conf()->mon_osd_blocklist_default_expire);
+ expires += d;
+
+ auto add_to_pending_blocklists = [](auto& nb, auto& ob,
+ const auto& addr,
+ const auto& expires) {
+ nb[addr] = expires;
+ // cancel any pending un-blocklisting request too
+ auto it = std::find(ob.begin(),
+ ob.end(), addr);
+ if (it != ob.end()) {
+ ob.erase(it);
+ }
+ };
+ if (range) {
+ add_to_pending_blocklists(pending_inc.new_range_blocklist,
+ pending_inc.old_range_blocklist,
+ addr, expires);
+
+ } else {
+ add_to_pending_blocklists(pending_inc.new_blocklist,
+ pending_inc.old_blocklist,
+ addr, expires);
+ }
+
+ ss << "blocklisting " << addr << " until " << expires << " (" << d << " sec)";
+ getline(ss, rs);
+ wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
+ get_last_committed() + 1));
+ return true;
+ } else if (blocklistop == "rm") {
+ auto rm_from_pending_blocklists = [](const auto& addr,
+ auto& blocklist,
+ auto& ob, auto& pb) {
+ if (blocklist.count(addr)) {
+ ob.push_back(addr);
+ return true;
+ } else if (pb.count(addr)) {
+ pb.erase(addr);
+ return true;
+ }
+ return false;
+ };
+ if ((!range && rm_from_pending_blocklists(addr, osdmap.blocklist,
+ pending_inc.old_blocklist,
+ pending_inc.new_blocklist)) ||
+ (range && rm_from_pending_blocklists(addr, osdmap.range_blocklist,
+ pending_inc.old_range_blocklist,
+ pending_inc.new_range_blocklist))) {
+ ss << "un-blocklisting " << addr;
+ getline(ss, rs);
+ wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
+ get_last_committed() + 1));
+ return true;
+ }
+ ss << addr << " isn't blocklisted";
+ err = 0;
+ goto reply;
+ }
+ }
+ } else if (prefix == "osd pool mksnap") {
+ string poolstr;
+ cmd_getval(cmdmap, "pool", poolstr);
+ int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
+ if (pool < 0) {
+ ss << "unrecognized pool '" << poolstr << "'";
+ err = -ENOENT;
+ goto reply;
+ }
+ string snapname;
+ cmd_getval(cmdmap, "snap", snapname);
+ const pg_pool_t *p = osdmap.get_pg_pool(pool);
+ if (p->is_unmanaged_snaps_mode()) {
+ ss << "pool " << poolstr << " is in unmanaged snaps mode";
+ err = -EINVAL;
+ goto reply;
+ } else if (p->snap_exists(snapname.c_str())) {
+ ss << "pool " << poolstr << " snap " << snapname << " already exists";
+ err = 0;
+ goto reply;
+ } else if (p->is_tier()) {
+ ss << "pool " << poolstr << " is a cache tier";
+ err = -EINVAL;
+ goto reply;
+ }
+ pg_pool_t *pp = 0;
+ if (pending_inc.new_pools.count(pool))
+ pp = &pending_inc.new_pools[pool];
+ if (!pp) {
+ pp = &pending_inc.new_pools[pool];
+ *pp = *p;
+ }
+ if (pp->snap_exists(snapname.c_str())) {
+ ss << "pool " << poolstr << " snap " << snapname << " already exists";
+ } else {
+ pp->add_snap(snapname.c_str(), ceph_clock_now());
+ pp->set_snap_epoch(pending_inc.epoch);
+ ss << "created pool " << poolstr << " snap " << snapname;
+ }
+ getline(ss, rs);
+ wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
+ get_last_committed() + 1));
+ return true;
+ } else if (prefix == "osd pool rmsnap") {
+ string poolstr;
+ cmd_getval(cmdmap, "pool", poolstr);
+ int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
+ if (pool < 0) {
+ ss << "unrecognized pool '" << poolstr << "'";
+ err = -ENOENT;
+ goto reply;
+ }
+ string snapname;
+ cmd_getval(cmdmap, "snap", snapname);
+ const pg_pool_t *p = osdmap.get_pg_pool(pool);
+ if (p->is_unmanaged_snaps_mode()) {
+ ss << "pool " << poolstr << " is in unmanaged snaps mode";
+ err = -EINVAL;
+ goto reply;
+ } else if (!p->snap_exists(snapname.c_str())) {
+ ss << "pool " << poolstr << " snap " << snapname << " does not exist";
+ err = 0;
+ goto reply;
+ }
+ pg_pool_t *pp = 0;
+ if (pending_inc.new_pools.count(pool))
+ pp = &pending_inc.new_pools[pool];
+ if (!pp) {
+ pp = &pending_inc.new_pools[pool];
+ *pp = *p;
+ }
+ snapid_t sn = pp->snap_exists(snapname.c_str());
+ if (sn) {
+ pp->remove_snap(sn);
+ pp->set_snap_epoch(pending_inc.epoch);
+ ss << "removed pool " << poolstr << " snap " << snapname;
+ } else {
+ ss << "already removed pool " << poolstr << " snap " << snapname;
+ }
+ getline(ss, rs);
+ wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
+ get_last_committed() + 1));
+ return true;
+ } else if (prefix == "osd pool create") {
+ int64_t pg_num, pgp_num, pg_num_min, pg_num_max;
+ cmd_getval(cmdmap, "pg_num", pg_num, int64_t(0));
+ cmd_getval(cmdmap, "pg_num_min", pg_num_min, int64_t(0));
+ cmd_getval(cmdmap, "pg_num_max", pg_num_max, int64_t(0));
+ cmd_getval(cmdmap, "pgp_num", pgp_num, int64_t(pg_num));
+ string pool_type_str;
+ cmd_getval(cmdmap, "pool_type", pool_type_str);
+ if (pool_type_str.empty())
+ pool_type_str = g_conf().get_val<string>("osd_pool_default_type");
+
+ string poolstr;
+ cmd_getval(cmdmap, "pool", poolstr);
+ int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
+ if (pool_id >= 0) {
+ const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
+ if (pool_type_str != p->get_type_name()) {
+ ss << "pool '" << poolstr << "' cannot change to type " << pool_type_str;
+ err = -EINVAL;
+ } else {
+ ss << "pool '" << poolstr << "' already exists";
+ err = 0;
+ }
+ goto reply;
+ }
+
+ int pool_type;
+ if (pool_type_str == "replicated") {
+ pool_type = pg_pool_t::TYPE_REPLICATED;
+ } else if (pool_type_str == "erasure") {
+ pool_type = pg_pool_t::TYPE_ERASURE;
+ } else {
+ ss << "unknown pool type '" << pool_type_str << "'";
+ err = -EINVAL;
+ goto reply;
+ }
+
+ bool implicit_rule_creation = false;
+ int64_t expected_num_objects = 0;
+ string rule_name;
+ cmd_getval(cmdmap, "rule", rule_name);
+ string erasure_code_profile;
+ cmd_getval(cmdmap, "erasure_code_profile", erasure_code_profile);
+
+ if (pool_type == pg_pool_t::TYPE_ERASURE) {
+ if (erasure_code_profile == "")
+ erasure_code_profile = "default";
+ //handle the erasure code profile
+ if (erasure_code_profile == "default") {
+ if (!osdmap.has_erasure_code_profile(erasure_code_profile)) {
+ if (pending_inc.has_erasure_code_profile(erasure_code_profile)) {
+ dout(20) << "erasure code profile " << erasure_code_profile << " already pending" << dendl;
+ goto wait;
+ }
+
+ map<string,string> profile_map;
+ err = osdmap.get_erasure_code_profile_default(cct,
+ profile_map,
+ &ss);
+ if (err)
+ goto reply;
+ dout(20) << "erasure code profile " << erasure_code_profile << " set" << dendl;
+ pending_inc.set_erasure_code_profile(erasure_code_profile, profile_map);
+ goto wait;
+ }
+ }
+ if (rule_name == "") {
+ implicit_rule_creation = true;
+ if (erasure_code_profile == "default") {
+ rule_name = "erasure-code";
+ } else {
+ dout(1) << "implicitly use rule named after the pool: "
+ << poolstr << dendl;
+ rule_name = poolstr;
+ }
+ }
+ cmd_getval(cmdmap, "expected_num_objects",
+ expected_num_objects, int64_t(0));
+ } else {
+ //NOTE:for replicated pool,cmd_map will put rule_name to erasure_code_profile field
+ // and put expected_num_objects to rule field
+ if (erasure_code_profile != "") { // cmd is from CLI
+ if (rule_name != "") {
+ string interr;
+ expected_num_objects = strict_strtoll(rule_name.c_str(), 10, &interr);
+ if (interr.length()) {
+ ss << "error parsing integer value '" << rule_name << "': " << interr;
+ err = -EINVAL;
+ goto reply;
+ }
+ }
+ rule_name = erasure_code_profile;
+ } else { // cmd is well-formed
+ cmd_getval(cmdmap, "expected_num_objects",
+ expected_num_objects, int64_t(0));
+ }
+ }
+
+ if (!implicit_rule_creation && rule_name != "") {
+ int rule;
+ err = get_crush_rule(rule_name, &rule, &ss);
+ if (err == -EAGAIN) {
+ wait_for_finished_proposal(op, new C_RetryMessage(this, op));
+ return true;
+ }
+ if (err)
+ goto reply;
+ }
+
+ if (expected_num_objects < 0) {
+ ss << "'expected_num_objects' must be non-negative";
+ err = -EINVAL;
+ goto reply;
+ }
+
+ set<int32_t> osds;
+ osdmap.get_all_osds(osds);
+ bool has_filestore_osd = std::any_of(osds.begin(), osds.end(), [this](int osd) {
+ string type;
+ if (!get_osd_objectstore_type(osd, &type)) {
+ return type == "filestore";
+ } else {
+ return false;
+ }
+ });
+
+ if (has_filestore_osd &&
+ expected_num_objects > 0 &&
+ cct->_conf->filestore_merge_threshold > 0) {
+ ss << "'expected_num_objects' requires 'filestore_merge_threshold < 0'";
+ err = -EINVAL;
+ goto reply;
+ }
+
+ if (has_filestore_osd &&
+ expected_num_objects == 0 &&
+ cct->_conf->filestore_merge_threshold < 0) {
+ int osds = osdmap.get_num_osds();
+ bool sure = false;
+ cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
+ if (!sure && osds && (pg_num >= 1024 || pg_num / osds >= 100)) {
+ ss << "For better initial performance on pools expected to store a "
+ << "large number of objects, consider supplying the "
+ << "expected_num_objects parameter when creating the pool."
+ << " Pass --yes-i-really-mean-it to ignore it";
+ err = -EPERM;
+ goto reply;
+ }
+ }
+
+ int64_t fast_read_param;
+ cmd_getval(cmdmap, "fast_read", fast_read_param, int64_t(-1));
+ FastReadType fast_read = FAST_READ_DEFAULT;
+ if (fast_read_param == 0)
+ fast_read = FAST_READ_OFF;
+ else if (fast_read_param > 0)
+ fast_read = FAST_READ_ON;
+
+ int64_t repl_size = 0;
+ cmd_getval(cmdmap, "size", repl_size);
+ int64_t target_size_bytes = 0;
+ double target_size_ratio = 0.0;
+ cmd_getval(cmdmap, "target_size_bytes", target_size_bytes);
+ cmd_getval(cmdmap, "target_size_ratio", target_size_ratio);
+
+ string pg_autoscale_mode;
+ cmd_getval(cmdmap, "autoscale_mode", pg_autoscale_mode);
+
+ bool bulk = 0;
+ cmd_getval(cmdmap, "bulk", bulk);
+ err = prepare_new_pool(poolstr,
+ -1, // default crush rule
+ rule_name,
+ pg_num, pgp_num, pg_num_min, pg_num_max,
+ repl_size, target_size_bytes, target_size_ratio,
+ erasure_code_profile, pool_type,
+ (uint64_t)expected_num_objects,
+ fast_read,
+ pg_autoscale_mode,
+ bulk,
+ &ss);
+ if (err < 0) {
+ switch(err) {
+ case -EEXIST:
+ ss << "pool '" << poolstr << "' already exists";
+ break;
+ case -EAGAIN:
+ wait_for_finished_proposal(op, new C_RetryMessage(this, op));
+ return true;
+ case -ERANGE:
+ goto reply;
+ default:
+ goto reply;
+ break;
+ }
+ } else {
+ ss << "pool '" << poolstr << "' created";
+ }
+ getline(ss, rs);
+ wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
+ get_last_committed() + 1));
+ return true;
+
+ } else if (prefix == "osd pool delete" ||
+ prefix == "osd pool rm") {
+ // osd pool delete/rm <poolname> <poolname again> --yes-i-really-really-mean-it
+ string poolstr, poolstr2, sure;
+ cmd_getval(cmdmap, "pool", poolstr);
+ cmd_getval(cmdmap, "pool2", poolstr2);
+ int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
+ if (pool < 0) {
+ ss << "pool '" << poolstr << "' does not exist";
+ err = 0;
+ goto reply;
+ }
+
+ bool force_no_fake = false;
+ cmd_getval(cmdmap, "yes_i_really_really_mean_it", force_no_fake);
+ bool force = false;
+ cmd_getval(cmdmap, "yes_i_really_really_mean_it_not_faking", force);
+ if (poolstr2 != poolstr ||
+ (!force && !force_no_fake)) {
+ ss << "WARNING: this will *PERMANENTLY DESTROY* all data stored in pool " << poolstr
+ << ". If you are *ABSOLUTELY CERTAIN* that is what you want, pass the pool name *twice*, "
+ << "followed by --yes-i-really-really-mean-it.";
+ err = -EPERM;
+ goto reply;
+ }
+ err = _prepare_remove_pool(pool, &ss, force_no_fake);
+ if (err == -EAGAIN) {
+ wait_for_finished_proposal(op, new C_RetryMessage(this, op));
+ return true;
+ }
+ if (err < 0)
+ goto reply;
+ goto update;
+ } else if (prefix == "osd pool rename") {
+ string srcpoolstr, destpoolstr;
+ cmd_getval(cmdmap, "srcpool", srcpoolstr);
+ cmd_getval(cmdmap, "destpool", destpoolstr);
+ int64_t pool_src = osdmap.lookup_pg_pool_name(srcpoolstr.c_str());
+ int64_t pool_dst = osdmap.lookup_pg_pool_name(destpoolstr.c_str());
+
+ if (pool_src < 0) {
+ if (pool_dst >= 0) {
+ // src pool doesn't exist, dst pool does exist: to ensure idempotency
+ // of operations, assume this rename succeeded, as it is not changing
+ // the current state. Make sure we output something understandable
+ // for whoever is issuing the command, if they are paying attention,
+ // in case it was not intentional; or to avoid a "wtf?" and a bug
+ // report in case it was intentional, while expecting a failure.
+ ss << "pool '" << srcpoolstr << "' does not exist; pool '"
+ << destpoolstr << "' does -- assuming successful rename";
+ err = 0;
+ } else {
+ ss << "unrecognized pool '" << srcpoolstr << "'";
+ err = -ENOENT;
+ }
+ goto reply;
+ } else if (pool_dst >= 0) {
+ // source pool exists and so does the destination pool
+ ss << "pool '" << destpoolstr << "' already exists";
+ err = -EEXIST;
+ goto reply;
+ }
+
+ int ret = _prepare_rename_pool(pool_src, destpoolstr);
+ if (ret == 0) {
+ ss << "pool '" << srcpoolstr << "' renamed to '" << destpoolstr << "'";
+ } else {
+ ss << "failed to rename pool '" << srcpoolstr << "' to '" << destpoolstr << "': "
+ << cpp_strerror(ret);
+ }
+ getline(ss, rs);
+ wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, ret, rs,
+ get_last_committed() + 1));
+ return true;
+
+ } else if (prefix == "osd pool set") {
+ err = prepare_command_pool_set(cmdmap, ss);
+ if (err == -EAGAIN)
+ goto wait;
+ if (err < 0)
+ goto reply;
+
+ getline(ss, rs);
+ wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
+ get_last_committed() + 1));
+ return true;
+ } else if (prefix == "osd tier add") {
+ err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
+ if (err == -EAGAIN)
+ goto wait;
+ if (err)
+ goto reply;
+ string poolstr;
+ cmd_getval(cmdmap, "pool", poolstr);
+ int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
+ if (pool_id < 0) {
+ ss << "unrecognized pool '" << poolstr << "'";
+ err = -ENOENT;
+ goto reply;
+ }
+ string tierpoolstr;
+ cmd_getval(cmdmap, "tierpool", tierpoolstr);
+ int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
+ if (tierpool_id < 0) {
+ ss << "unrecognized pool '" << tierpoolstr << "'";
+ err = -ENOENT;
+ goto reply;
+ }
+ const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
+ ceph_assert(p);
+ const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
+ ceph_assert(tp);
+
+ if (!_check_become_tier(tierpool_id, tp, pool_id, p, &err, &ss)) {
+ goto reply;
+ }
+
+ // make sure new tier is empty
+ string force_nonempty;
+ cmd_getval(cmdmap, "force_nonempty", force_nonempty);
+ const pool_stat_t *pstats = mon.mgrstatmon()->get_pool_stat(tierpool_id);
+ if (pstats && pstats->stats.sum.num_objects != 0 &&
+ force_nonempty != "--force-nonempty") {
+ ss << "tier pool '" << tierpoolstr << "' is not empty; --force-nonempty to force";
+ err = -ENOTEMPTY;
+ goto reply;
+ }
+ if (tp->is_erasure()) {
+ ss << "tier pool '" << tierpoolstr
+ << "' is an ec pool, which cannot be a tier";
+ err = -ENOTSUP;
+ goto reply;
+ }
+ if ((!tp->removed_snaps.empty() || !tp->snaps.empty()) &&
+ ((force_nonempty != "--force-nonempty") ||
+ (!g_conf()->mon_debug_unsafe_allow_tier_with_nonempty_snaps))) {
+ ss << "tier pool '" << tierpoolstr << "' has snapshot state; it cannot be added as a tier without breaking the pool";
+ err = -ENOTEMPTY;
+ goto reply;
+ }
+ // go
+ pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
+ pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
+ if (np->tiers.count(tierpool_id) || ntp->is_tier()) {
+ wait_for_finished_proposal(op, new C_RetryMessage(this, op));
+ return true;
+ }
+ np->tiers.insert(tierpool_id);
+ np->set_snap_epoch(pending_inc.epoch); // tier will update to our snap info
+ ntp->tier_of = pool_id;
+ ss << "pool '" << tierpoolstr << "' is now (or already was) a tier of '" << poolstr << "'";
+ wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
+ get_last_committed() + 1));
+ return true;
+ } else if (prefix == "osd tier remove" ||
+ prefix == "osd tier rm") {
+ string poolstr;
+ cmd_getval(cmdmap, "pool", poolstr);
+ int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
+ if (pool_id < 0) {
+ ss << "unrecognized pool '" << poolstr << "'";
+ err = -ENOENT;
+ goto reply;
+ }
+ string tierpoolstr;
+ cmd_getval(cmdmap, "tierpool", tierpoolstr);
+ int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
+ if (tierpool_id < 0) {
+ ss << "unrecognized pool '" << tierpoolstr << "'";
+ err = -ENOENT;
+ goto reply;
+ }
+ const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
+ ceph_assert(p);
+ const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
+ ceph_assert(tp);
+
+ if (!_check_remove_tier(pool_id, p, tp, &err, &ss)) {
+ goto reply;
+ }
+
+ if (p->tiers.count(tierpool_id) == 0) {
+ ss << "pool '" << tierpoolstr << "' is now (or already was) not a tier of '" << poolstr << "'";
+ err = 0;
+ goto reply;
+ }
+ if (tp->tier_of != pool_id) {
+ ss << "tier pool '" << tierpoolstr << "' is a tier of '"
+ << osdmap.get_pool_name(tp->tier_of) << "': "
+ // be scary about it; this is an inconsistency and bells must go off
+ << "THIS SHOULD NOT HAVE HAPPENED AT ALL";
+ err = -EINVAL;
+ goto reply;
+ }
+ if (p->read_tier == tierpool_id) {
+ ss << "tier pool '" << tierpoolstr << "' is the overlay for '" << poolstr << "'; please remove-overlay first";
+ err = -EBUSY;
+ goto reply;
+ }
+ // go
+ pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
+ pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
+ if (np->tiers.count(tierpool_id) == 0 ||
+ ntp->tier_of != pool_id ||
+ np->read_tier == tierpool_id) {
+ wait_for_finished_proposal(op, new C_RetryMessage(this, op));
+ return true;
+ }
+ np->tiers.erase(tierpool_id);
+ ntp->clear_tier();
+ ss << "pool '" << tierpoolstr << "' is now (or already was) not a tier of '" << poolstr << "'";
+ wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
+ get_last_committed() + 1));
+ return true;
+ } else if (prefix == "osd tier set-overlay") {
+ err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
+ if (err == -EAGAIN)
+ goto wait;
+ if (err)
+ goto reply;
+ string poolstr;
+ cmd_getval(cmdmap, "pool", poolstr);
+ int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
+ if (pool_id < 0) {
+ ss << "unrecognized pool '" << poolstr << "'";
+ err = -ENOENT;
+ goto reply;
+ }
+ string overlaypoolstr;
+ cmd_getval(cmdmap, "overlaypool", overlaypoolstr);
+ int64_t overlaypool_id = osdmap.lookup_pg_pool_name(overlaypoolstr);
+ if (overlaypool_id < 0) {
+ ss << "unrecognized pool '" << overlaypoolstr << "'";
+ err = -ENOENT;
+ goto reply;
+ }
+ const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
+ ceph_assert(p);
+ const pg_pool_t *overlay_p = osdmap.get_pg_pool(overlaypool_id);
+ ceph_assert(overlay_p);
+ if (p->tiers.count(overlaypool_id) == 0) {
+ ss << "tier pool '" << overlaypoolstr << "' is not a tier of '" << poolstr << "'";
+ err = -EINVAL;
+ goto reply;
+ }
+ if (p->read_tier == overlaypool_id) {
+ err = 0;
+ ss << "overlay for '" << poolstr << "' is now (or already was) '" << overlaypoolstr << "'";
+ goto reply;
+ }
+ if (p->has_read_tier()) {
+ ss << "pool '" << poolstr << "' has overlay '"
+ << osdmap.get_pool_name(p->read_tier)
+ << "'; please remove-overlay first";
+ err = -EINVAL;
+ goto reply;
+ }
+
+ // go
+ pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
+ np->read_tier = overlaypool_id;
+ np->write_tier = overlaypool_id;
+ np->set_last_force_op_resend(pending_inc.epoch);
+ pg_pool_t *noverlay_p = pending_inc.get_new_pool(overlaypool_id, overlay_p);
+ noverlay_p->set_last_force_op_resend(pending_inc.epoch);
+ ss << "overlay for '" << poolstr << "' is now (or already was) '" << overlaypoolstr << "'";
+ if (overlay_p->cache_mode == pg_pool_t::CACHEMODE_NONE)
+ ss <<" (WARNING: overlay pool cache_mode is still NONE)";
+ wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
+ get_last_committed() + 1));
+ return true;
+ } else if (prefix == "osd tier remove-overlay" ||
+ prefix == "osd tier rm-overlay") {
+ string poolstr;
+ cmd_getval(cmdmap, "pool", poolstr);
+ int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
+ if (pool_id < 0) {
+ ss << "unrecognized pool '" << poolstr << "'";
+ err = -ENOENT;
+ goto reply;
+ }
+ const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
+ ceph_assert(p);
+ if (!p->has_read_tier()) {
+ err = 0;
+ ss << "there is now (or already was) no overlay for '" << poolstr << "'";
+ goto reply;
+ }
+
+ if (!_check_remove_tier(pool_id, p, NULL, &err, &ss)) {
+ goto reply;
+ }
+
+ // go
+ pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
+ if (np->has_read_tier()) {
+ const pg_pool_t *op = osdmap.get_pg_pool(np->read_tier);
+ pg_pool_t *nop = pending_inc.get_new_pool(np->read_tier,op);
+ nop->set_last_force_op_resend(pending_inc.epoch);
+ }
+ if (np->has_write_tier()) {
+ const pg_pool_t *op = osdmap.get_pg_pool(np->write_tier);
+ pg_pool_t *nop = pending_inc.get_new_pool(np->write_tier, op);
+ nop->set_last_force_op_resend(pending_inc.epoch);
+ }
+ np->clear_read_tier();
+ np->clear_write_tier();
+ np->set_last_force_op_resend(pending_inc.epoch);
+ ss << "there is now (or already was) no overlay for '" << poolstr << "'";
+ wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
+ get_last_committed() + 1));
+ return true;
+ } else if (prefix == "osd tier cache-mode") {
+ err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
+ if (err == -EAGAIN)
+ goto wait;
+ if (err)
+ goto reply;
+ string poolstr;
+ cmd_getval(cmdmap, "pool", poolstr);
+ int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
+ if (pool_id < 0) {
+ ss << "unrecognized pool '" << poolstr << "'";
+ err = -ENOENT;
+ goto reply;
+ }
+ const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
+ ceph_assert(p);
+ if (!p->is_tier()) {
+ ss << "pool '" << poolstr << "' is not a tier";
+ err = -EINVAL;
+ goto reply;
+ }
+ string modestr;
+ cmd_getval(cmdmap, "mode", modestr);
+ pg_pool_t::cache_mode_t mode = pg_pool_t::get_cache_mode_from_str(modestr);
+ if (int(mode) < 0) {
+ ss << "'" << modestr << "' is not a valid cache mode";
+ err = -EINVAL;
+ goto reply;
+ }
+
+ bool sure = false;
+ cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
+
+ if (mode == pg_pool_t::CACHEMODE_FORWARD ||
+ mode == pg_pool_t::CACHEMODE_READFORWARD) {
+ ss << "'" << modestr << "' is no longer a supported cache mode";
+ err = -EPERM;
+ goto reply;
+ }
+ if ((mode != pg_pool_t::CACHEMODE_WRITEBACK &&
+ mode != pg_pool_t::CACHEMODE_NONE &&
+ mode != pg_pool_t::CACHEMODE_PROXY &&
+ mode != pg_pool_t::CACHEMODE_READPROXY) &&
+ !sure) {
+ ss << "'" << modestr << "' is not a well-supported cache mode and may "
+ << "corrupt your data. pass --yes-i-really-mean-it to force.";
+ err = -EPERM;
+ goto reply;
+ }
+
+ // pool already has this cache-mode set and there are no pending changes
+ if (p->cache_mode == mode &&
+ (pending_inc.new_pools.count(pool_id) == 0 ||
+ pending_inc.new_pools[pool_id].cache_mode == p->cache_mode)) {
+ ss << "set cache-mode for pool '" << poolstr << "'"
+ << " to " << pg_pool_t::get_cache_mode_name(mode);
+ err = 0;
+ goto reply;
+ }
+
+ /* Mode description:
+ *
+ * none: No cache-mode defined
+ * forward: Forward all reads and writes to base pool [removed]
+ * writeback: Cache writes, promote reads from base pool
+ * readonly: Forward writes to base pool
+ * readforward: Writes are in writeback mode, Reads are in forward mode [removed]
+ * proxy: Proxy all reads and writes to base pool
+ * readproxy: Writes are in writeback mode, Reads are in proxy mode
+ *
+ * Hence, these are the allowed transitions:
+ *
+ * none -> any
+ * forward -> proxy || readforward || readproxy || writeback || any IF num_objects_dirty == 0
+ * proxy -> readproxy || writeback || any IF num_objects_dirty == 0
+ * readforward -> forward || proxy || readproxy || writeback || any IF num_objects_dirty == 0
+ * readproxy -> proxy || writeback || any IF num_objects_dirty == 0
+ * writeback -> readproxy || proxy
+ * readonly -> any
+ */
+
+ // We check if the transition is valid against the current pool mode, as
+ // it is the only committed state thus far. We will blantly squash
+ // whatever mode is on the pending state.
+
+ if (p->cache_mode == pg_pool_t::CACHEMODE_WRITEBACK &&
+ (mode != pg_pool_t::CACHEMODE_PROXY &&
+ mode != pg_pool_t::CACHEMODE_READPROXY)) {
+ ss << "unable to set cache-mode '" << pg_pool_t::get_cache_mode_name(mode)
+ << "' on a '" << pg_pool_t::get_cache_mode_name(p->cache_mode)
+ << "' pool; only '"
+ << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_READPROXY)
+ << "' allowed.";
+ err = -EINVAL;
+ goto reply;
+ }
+ if ((p->cache_mode == pg_pool_t::CACHEMODE_READFORWARD &&
+ (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
+ mode != pg_pool_t::CACHEMODE_PROXY &&
+ mode != pg_pool_t::CACHEMODE_READPROXY)) ||
+
+ (p->cache_mode == pg_pool_t::CACHEMODE_READPROXY &&
+ (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
+ mode != pg_pool_t::CACHEMODE_PROXY)) ||
+
+ (p->cache_mode == pg_pool_t::CACHEMODE_PROXY &&
+ (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
+ mode != pg_pool_t::CACHEMODE_READPROXY)) ||
+
+ (p->cache_mode == pg_pool_t::CACHEMODE_FORWARD &&
+ (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
+ mode != pg_pool_t::CACHEMODE_PROXY &&
+ mode != pg_pool_t::CACHEMODE_READPROXY))) {
+
+ const pool_stat_t* pstats =
+ mon.mgrstatmon()->get_pool_stat(pool_id);
+
+ if (pstats && pstats->stats.sum.num_objects_dirty > 0) {
+ ss << "unable to set cache-mode '"
+ << pg_pool_t::get_cache_mode_name(mode) << "' on pool '" << poolstr
+ << "': dirty objects found";
+ err = -EBUSY;
+ goto reply;
+ }
+ }
+ // go
+ pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
+ np->cache_mode = mode;
+ // set this both when moving to and from cache_mode NONE. this is to
+ // capture legacy pools that were set up before this flag existed.
+ np->flags |= pg_pool_t::FLAG_INCOMPLETE_CLONES;
+ ss << "set cache-mode for pool '" << poolstr
+ << "' to " << pg_pool_t::get_cache_mode_name(mode);
+ if (mode == pg_pool_t::CACHEMODE_NONE) {
+ const pg_pool_t *base_pool = osdmap.get_pg_pool(np->tier_of);
+ ceph_assert(base_pool);
+ if (base_pool->read_tier == pool_id ||
+ base_pool->write_tier == pool_id)
+ ss <<" (WARNING: pool is still configured as read or write tier)";
+ }
+ wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
+ get_last_committed() + 1));
+ return true;
+ } else if (prefix == "osd tier add-cache") {
+ err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
+ if (err == -EAGAIN)
+ goto wait;
+ if (err)
+ goto reply;
+ string poolstr;
+ cmd_getval(cmdmap, "pool", poolstr);
+ int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
+ if (pool_id < 0) {
+ ss << "unrecognized pool '" << poolstr << "'";
+ err = -ENOENT;
+ goto reply;
+ }
+ string tierpoolstr;
+ cmd_getval(cmdmap, "tierpool", tierpoolstr);
+ int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
+ if (tierpool_id < 0) {
+ ss << "unrecognized pool '" << tierpoolstr << "'";
+ err = -ENOENT;
+ goto reply;
+ }
+ const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
+ ceph_assert(p);
+ const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
+ ceph_assert(tp);
+
+ if (!_check_become_tier(tierpool_id, tp, pool_id, p, &err, &ss)) {
+ goto reply;
+ }
+
+ int64_t size = 0;
+ if (!cmd_getval(cmdmap, "size", size)) {
+ ss << "unable to parse 'size' value '"
+ << cmd_vartype_stringify(cmdmap.at("size")) << "'";
+ err = -EINVAL;
+ goto reply;
+ }
+ // make sure new tier is empty
+ const pool_stat_t *pstats =
+ mon.mgrstatmon()->get_pool_stat(tierpool_id);
+ if (pstats && pstats->stats.sum.num_objects != 0) {
+ ss << "tier pool '" << tierpoolstr << "' is not empty";
+ err = -ENOTEMPTY;
+ goto reply;
+ }
+ auto& modestr = g_conf().get_val<string>("osd_tier_default_cache_mode");
+ pg_pool_t::cache_mode_t mode = pg_pool_t::get_cache_mode_from_str(modestr);
+ if (int(mode) < 0) {
+ ss << "osd tier cache default mode '" << modestr << "' is not a valid cache mode";
+ err = -EINVAL;
+ goto reply;
+ }
+ HitSet::Params hsp;
+ auto& cache_hit_set_type =
+ g_conf().get_val<string>("osd_tier_default_cache_hit_set_type");
+ if (cache_hit_set_type == "bloom") {
+ BloomHitSet::Params *bsp = new BloomHitSet::Params;
+ bsp->set_fpp(g_conf().get_val<double>("osd_pool_default_hit_set_bloom_fpp"));
+ hsp = HitSet::Params(bsp);
+ } else if (cache_hit_set_type == "explicit_hash") {
+ hsp = HitSet::Params(new ExplicitHashHitSet::Params);
+ } else if (cache_hit_set_type == "explicit_object") {
+ hsp = HitSet::Params(new ExplicitObjectHitSet::Params);
+ } else {
+ ss << "osd tier cache default hit set type '"
+ << cache_hit_set_type << "' is not a known type";
+ err = -EINVAL;
+ goto reply;
+ }
+ // go
+ pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
+ pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
+ if (np->tiers.count(tierpool_id) || ntp->is_tier()) {
+ wait_for_finished_proposal(op, new C_RetryMessage(this, op));
+ return true;
+ }
+ np->tiers.insert(tierpool_id);
+ np->read_tier = np->write_tier = tierpool_id;
+ np->set_snap_epoch(pending_inc.epoch); // tier will update to our snap info
+ np->set_last_force_op_resend(pending_inc.epoch);
+ ntp->set_last_force_op_resend(pending_inc.epoch);
+ ntp->tier_of = pool_id;
+ ntp->cache_mode = mode;
+ ntp->hit_set_count = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_count");
+ ntp->hit_set_period = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_period");
+ ntp->min_read_recency_for_promote = g_conf().get_val<uint64_t>("osd_tier_default_cache_min_read_recency_for_promote");
+ ntp->min_write_recency_for_promote = g_conf().get_val<uint64_t>("osd_tier_default_cache_min_write_recency_for_promote");
+ ntp->hit_set_grade_decay_rate = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_grade_decay_rate");
+ ntp->hit_set_search_last_n = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_search_last_n");
+ ntp->hit_set_params = hsp;
+ ntp->target_max_bytes = size;
+ ss << "pool '" << tierpoolstr << "' is now (or already was) a cache tier of '" << poolstr << "'";
+ wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
+ get_last_committed() + 1));
+ return true;
+ } else if (prefix == "osd pool set-quota") {
+ string poolstr;
+ cmd_getval(cmdmap, "pool", poolstr);
+ int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
+ if (pool_id < 0) {
+ ss << "unrecognized pool '" << poolstr << "'";
+ err = -ENOENT;
+ goto reply;
+ }
+
+ string field;
+ cmd_getval(cmdmap, "field", field);
+ if (field != "max_objects" && field != "max_bytes") {
+ ss << "unrecognized field '" << field << "'; should be 'max_bytes' or 'max_objects'";
+ err = -EINVAL;
+ goto reply;
+ }
+
+ // val could contain unit designations, so we treat as a string
+ string val;
+ cmd_getval(cmdmap, "val", val);
+ string tss;
+ int64_t value;
+ if (field == "max_objects") {
+ value = strict_sistrtoll(val.c_str(), &tss);
+ } else if (field == "max_bytes") {
+ value = strict_iecstrtoll(val.c_str(), &tss);
+ } else {
+ ceph_abort_msg("unrecognized option");
+ }
+ if (!tss.empty()) {
+ ss << "error parsing value '" << val << "': " << tss;
+ err = -EINVAL;
+ goto reply;
+ }
+
+ pg_pool_t *pi = pending_inc.get_new_pool(pool_id, osdmap.get_pg_pool(pool_id));
+ if (field == "max_objects") {
+ pi->quota_max_objects = value;
+ } else if (field == "max_bytes") {
+ pi->quota_max_bytes = value;
+ } else {
+ ceph_abort_msg("unrecognized option");
+ }
+ ss << "set-quota " << field << " = " << value << " for pool " << poolstr;
+ rs = ss.str();
+ wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
+ get_last_committed() + 1));
+ return true;
+ } else if (prefix == "osd pool application enable" ||
+ prefix == "osd pool application disable" ||
+ prefix == "osd pool application set" ||
+ prefix == "osd pool application rm") {
+ err = prepare_command_pool_application(prefix, cmdmap, ss);
+ if (err == -EAGAIN) {
+ goto wait;
+ } else if (err < 0) {
+ goto reply;
+ } else {
+ goto update;
+ }
+ } else if (prefix == "osd force-create-pg") {
+ pg_t pgid;
+ string pgidstr;
+ cmd_getval(cmdmap, "pgid", pgidstr);
+ if (!pgid.parse(pgidstr.c_str())) {
+ ss << "invalid pgid '" << pgidstr << "'";
+ err = -EINVAL;
+ goto reply;
+ }
+ if (!osdmap.pg_exists(pgid)) {
+ ss << "pg " << pgid << " should not exist";
+ err = -ENOENT;
+ goto reply;
+ }
+ bool sure = false;
+ cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
+ if (!sure) {
+ ss << "This command will recreate a lost (as in data lost) PG with data in it, such "
+ << "that the cluster will give up ever trying to recover the lost data. Do this "
+ << "only if you are certain that all copies of the PG are in fact lost and you are "
+ << "willing to accept that the data is permanently destroyed. Pass "
+ << "--yes-i-really-mean-it to proceed.";
+ err = -EPERM;
+ goto reply;
+ }
+ bool creating_now;
+ {
+ std::lock_guard<std::mutex> l(creating_pgs_lock);
+ auto emplaced = creating_pgs.pgs.emplace(
+ pgid,
+ creating_pgs_t::pg_create_info(osdmap.get_epoch(),
+ ceph_clock_now()));
+ creating_now = emplaced.second;
+ }
+ if (creating_now) {
+ ss << "pg " << pgidstr << " now creating, ok";
+ // set the pool's CREATING flag so that (1) the osd won't ignore our
+ // create message and (2) we won't propose any future pg_num changes
+ // until after the PG has been instantiated.
+ if (pending_inc.new_pools.count(pgid.pool()) == 0) {
+ pending_inc.new_pools[pgid.pool()] = *osdmap.get_pg_pool(pgid.pool());
+ }
+ pending_inc.new_pools[pgid.pool()].flags |= pg_pool_t::FLAG_CREATING;
+ err = 0;
+ goto update;
+ } else {
+ ss << "pg " << pgid << " already creating";
+ err = 0;
+ goto reply;
+ }
+ } else if (prefix == "osd force_healthy_stretch_mode") {
+ bool sure = false;
+ cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
+ if (!sure) {
+ ss << "This command will require peering across multiple CRUSH buckets "
+ "(probably two data centers or availability zones?) and may result in PGs "
+ "going inactive until backfilling is complete. Pass --yes-i-really-mean-it to proceed.";
+ err = -EPERM;
+ goto reply;
+ }
+ try_end_recovery_stretch_mode(true);
+ ss << "Triggering healthy stretch mode";
+ err = 0;
+ goto reply;
+ } else if (prefix == "osd force_recovery_stretch_mode") {
+ bool sure = false;
+ cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
+ if (!sure) {
+ ss << "This command will increase pool sizes to try and spread them "
+ "across multiple CRUSH buckets (probably two data centers or "
+ "availability zones?) and should have happened automatically"
+ "Pass --yes-i-really-mean-it to proceed.";
+ err = -EPERM;
+ goto reply;
+ }
+ mon.go_recovery_stretch_mode();
+ ss << "Triggering recovery stretch mode";
+ err = 0;
+ goto reply;
+ } else {
+ err = -EINVAL;
+ }
+
+ reply:
+ getline(ss, rs);
+ if (err < 0 && rs.length() == 0)
+ rs = cpp_strerror(err);
+ mon.reply_command(op, err, rs, rdata, get_last_committed());
+ return ret;
+
+ update:
+ getline(ss, rs);
+ wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
+ get_last_committed() + 1));
+ return true;
+
+ wait:
+ wait_for_finished_proposal(op, new C_RetryMessage(this, op));
+ return true;
+}
+
+bool OSDMonitor::enforce_pool_op_caps(MonOpRequestRef op)
+{
+ op->mark_osdmon_event(__func__);
+
+ auto m = op->get_req<MPoolOp>();
+ MonSession *session = op->get_session();
+ if (!session) {
+ _pool_op_reply(op, -EPERM, osdmap.get_epoch());
+ return true;
+ }
+
+ switch (m->op) {
+ case POOL_OP_CREATE_UNMANAGED_SNAP:
+ case POOL_OP_DELETE_UNMANAGED_SNAP:
+ {
+ const std::string* pool_name = nullptr;
+ const pg_pool_t *pg_pool = osdmap.get_pg_pool(m->pool);
+ if (pg_pool != nullptr) {
+ pool_name = &osdmap.get_pool_name(m->pool);
+ }
+
+ if (!is_unmanaged_snap_op_permitted(cct, mon.key_server,
+ session->entity_name, session->caps,
+ session->get_peer_socket_addr(),
+ pool_name)) {
+ dout(0) << "got unmanaged-snap pool op from entity with insufficient "
+ << "privileges. message: " << *m << std::endl
+ << "caps: " << session->caps << dendl;
+ _pool_op_reply(op, -EPERM, osdmap.get_epoch());
+ return true;
+ }
+ }
+ break;
+ default:
+ if (!session->is_capable("osd", MON_CAP_W)) {
+ dout(0) << "got pool op from entity with insufficient privileges. "
+ << "message: " << *m << std::endl
+ << "caps: " << session->caps << dendl;
+ _pool_op_reply(op, -EPERM, osdmap.get_epoch());
+ return true;
+ }
+ break;
+ }
+
+ return false;
+}
+
+bool OSDMonitor::preprocess_pool_op(MonOpRequestRef op)
+{
+ op->mark_osdmon_event(__func__);
+ auto m = op->get_req<MPoolOp>();
+
+ if (enforce_pool_op_caps(op)) {
+ return true;
+ }
+
+ if (m->fsid != mon.monmap->fsid) {
+ dout(0) << __func__ << " drop message on fsid " << m->fsid
+ << " != " << mon.monmap->fsid << " for " << *m << dendl;
+ _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
+ return true;
+ }
+
+ if (m->op == POOL_OP_CREATE)
+ return preprocess_pool_op_create(op);
+
+ const pg_pool_t *p = osdmap.get_pg_pool(m->pool);
+ if (p == nullptr) {
+ dout(10) << "attempt to operate on non-existent pool id " << m->pool << dendl;
+ if (m->op == POOL_OP_DELETE) {
+ _pool_op_reply(op, 0, osdmap.get_epoch());
+ } else {
+ _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
+ }
+ return true;
+ }
+
+ // check if the snap and snapname exist
+ bool snap_exists = false;
+ if (p->snap_exists(m->name.c_str()))
+ snap_exists = true;
+
+ switch (m->op) {
+ case POOL_OP_CREATE_SNAP:
+ if (p->is_unmanaged_snaps_mode() || p->is_tier()) {
+ _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
+ return true;
+ }
+ if (snap_exists) {
+ _pool_op_reply(op, 0, osdmap.get_epoch());
+ return true;
+ }
+ return false;
+ case POOL_OP_CREATE_UNMANAGED_SNAP:
+ if (p->is_pool_snaps_mode()) {
+ _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
+ return true;
+ }
+ return false;
+ case POOL_OP_DELETE_SNAP:
+ if (p->is_unmanaged_snaps_mode()) {
+ _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
+ return true;
+ }
+ if (!snap_exists) {
+ _pool_op_reply(op, 0, osdmap.get_epoch());
+ return true;
+ }
+ return false;
+ case POOL_OP_DELETE_UNMANAGED_SNAP:
+ if (p->is_pool_snaps_mode()) {
+ _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
+ return true;
+ }
+ if (_is_removed_snap(m->pool, m->snapid)) {
+ _pool_op_reply(op, 0, osdmap.get_epoch());
+ return true;
+ }
+ return false;
+ case POOL_OP_DELETE:
+ if (osdmap.lookup_pg_pool_name(m->name.c_str()) >= 0) {
+ _pool_op_reply(op, 0, osdmap.get_epoch());
+ return true;
+ }
+ return false;
+ case POOL_OP_AUID_CHANGE:
+ return false;
+ default:
+ ceph_abort();
+ break;
+ }
+
+ return false;
+}
+
+bool OSDMonitor::_is_removed_snap(int64_t pool, snapid_t snap)
+{
+ if (!osdmap.have_pg_pool(pool)) {
+ dout(10) << __func__ << " pool " << pool << " snap " << snap
+ << " - pool dne" << dendl;
+ return true;
+ }
+ if (osdmap.in_removed_snaps_queue(pool, snap)) {
+ dout(10) << __func__ << " pool " << pool << " snap " << snap
+ << " - in osdmap removed_snaps_queue" << dendl;
+ return true;
+ }
+ snapid_t begin, end;
+ int r = lookup_purged_snap(pool, snap, &begin, &end);
+ if (r == 0) {
+ dout(10) << __func__ << " pool " << pool << " snap " << snap
+ << " - purged, [" << begin << "," << end << ")" << dendl;
+ return true;
+ }
+ return false;
+}
+
+bool OSDMonitor::_is_pending_removed_snap(int64_t pool, snapid_t snap)
+{
+ if (pending_inc.old_pools.count(pool)) {
+ dout(10) << __func__ << " pool " << pool << " snap " << snap
+ << " - pool pending deletion" << dendl;
+ return true;
+ }
+ if (pending_inc.in_new_removed_snaps(pool, snap)) {
+ dout(10) << __func__ << " pool " << pool << " snap " << snap
+ << " - in pending new_removed_snaps" << dendl;
+ return true;
+ }
+ return false;
+}
+
+bool OSDMonitor::preprocess_pool_op_create(MonOpRequestRef op)
+{
+ op->mark_osdmon_event(__func__);
+ auto m = op->get_req<MPoolOp>();
+ int64_t pool = osdmap.lookup_pg_pool_name(m->name.c_str());
+ if (pool >= 0) {
+ _pool_op_reply(op, 0, osdmap.get_epoch());
+ return true;
+ }
+
+ return false;
+}
+
+bool OSDMonitor::prepare_pool_op(MonOpRequestRef op)
+{
+ op->mark_osdmon_event(__func__);
+ auto m = op->get_req<MPoolOp>();
+ dout(10) << "prepare_pool_op " << *m << dendl;
+ if (m->op == POOL_OP_CREATE) {
+ return prepare_pool_op_create(op);
+ } else if (m->op == POOL_OP_DELETE) {
+ return prepare_pool_op_delete(op);
+ }
+
+ int ret = 0;
+ bool changed = false;
+
+ if (!osdmap.have_pg_pool(m->pool)) {
+ _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
+ return false;
+ }
+
+ const pg_pool_t *pool = osdmap.get_pg_pool(m->pool);
+
+ switch (m->op) {
+ case POOL_OP_CREATE_SNAP:
+ if (pool->is_tier()) {
+ ret = -EINVAL;
+ _pool_op_reply(op, ret, osdmap.get_epoch());
+ return false;
+ } // else, fall through
+ case POOL_OP_DELETE_SNAP:
+ if (!pool->is_unmanaged_snaps_mode()) {
+ bool snap_exists = pool->snap_exists(m->name.c_str());
+ if ((m->op == POOL_OP_CREATE_SNAP && snap_exists)
+ || (m->op == POOL_OP_DELETE_SNAP && !snap_exists)) {
+ ret = 0;
+ } else {
+ break;
+ }
+ } else {
+ ret = -EINVAL;
+ }
+ _pool_op_reply(op, ret, osdmap.get_epoch());
+ return false;
+
+ case POOL_OP_DELETE_UNMANAGED_SNAP:
+ // we won't allow removal of an unmanaged snapshot from a pool
+ // not in unmanaged snaps mode.
+ if (!pool->is_unmanaged_snaps_mode()) {
+ _pool_op_reply(op, -ENOTSUP, osdmap.get_epoch());
+ return false;
+ }
+ /* fall-thru */
+ case POOL_OP_CREATE_UNMANAGED_SNAP:
+ // but we will allow creating an unmanaged snapshot on any pool
+ // as long as it is not in 'pool' snaps mode.
+ if (pool->is_pool_snaps_mode()) {
+ _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
+ return false;
+ }
+ }
+
+ // projected pool info
+ pg_pool_t pp;
+ if (pending_inc.new_pools.count(m->pool))
+ pp = pending_inc.new_pools[m->pool];
+ else
+ pp = *osdmap.get_pg_pool(m->pool);
+
+ bufferlist reply_data;
+
+ // pool snaps vs unmanaged snaps are mutually exclusive
+ switch (m->op) {
+ case POOL_OP_CREATE_SNAP:
+ case POOL_OP_DELETE_SNAP:
+ if (pp.is_unmanaged_snaps_mode()) {
+ ret = -EINVAL;
+ goto out;
+ }
+ break;
+
+ case POOL_OP_CREATE_UNMANAGED_SNAP:
+ case POOL_OP_DELETE_UNMANAGED_SNAP:
+ if (pp.is_pool_snaps_mode()) {
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+
+ switch (m->op) {
+ case POOL_OP_CREATE_SNAP:
+ if (!pp.snap_exists(m->name.c_str())) {
+ pp.add_snap(m->name.c_str(), ceph_clock_now());
+ dout(10) << "create snap in pool " << m->pool << " " << m->name
+ << " seq " << pp.get_snap_epoch() << dendl;
+ changed = true;
+ }
+ break;
+
+ case POOL_OP_DELETE_SNAP:
+ {
+ snapid_t s = pp.snap_exists(m->name.c_str());
+ if (s) {
+ pp.remove_snap(s);
+ pending_inc.new_removed_snaps[m->pool].insert(s);
+ changed = true;
+ }
+ }
+ break;
+
+ case POOL_OP_CREATE_UNMANAGED_SNAP:
+ {
+ uint64_t snapid = pp.add_unmanaged_snap(
+ osdmap.require_osd_release < ceph_release_t::octopus);
+ encode(snapid, reply_data);
+ changed = true;
+ }
+ break;
+
+ case POOL_OP_DELETE_UNMANAGED_SNAP:
+ if (!_is_removed_snap(m->pool, m->snapid) &&
+ !_is_pending_removed_snap(m->pool, m->snapid)) {
+ if (m->snapid > pp.get_snap_seq()) {
+ _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
+ return false;
+ }
+ pp.remove_unmanaged_snap(
+ m->snapid,
+ osdmap.require_osd_release < ceph_release_t::octopus);
+ pending_inc.new_removed_snaps[m->pool].insert(m->snapid);
+ // also record the new seq as purged: this avoids a discontinuity
+ // after all of the snaps have been purged, since the seq assigned
+ // during removal lives in the same namespace as the actual snaps.
+ pending_pseudo_purged_snaps[m->pool].insert(pp.get_snap_seq());
+ changed = true;
+ }
+ break;
+
+ case POOL_OP_AUID_CHANGE:
+ _pool_op_reply(op, -EOPNOTSUPP, osdmap.get_epoch());
+ return false;
+
+ default:
+ ceph_abort();
+ break;
+ }
+
+ if (changed) {
+ pp.set_snap_epoch(pending_inc.epoch);
+ pending_inc.new_pools[m->pool] = pp;
+ }
+
+ out:
+ wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, ret, pending_inc.epoch, &reply_data));
+ return true;
+}
+
+bool OSDMonitor::prepare_pool_op_create(MonOpRequestRef op)
+{
+ op->mark_osdmon_event(__func__);
+ int err = prepare_new_pool(op);
+ wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, err, pending_inc.epoch));
+ return true;
+}
+
+int OSDMonitor::_check_remove_pool(int64_t pool_id, const pg_pool_t& pool,
+ ostream *ss)
+{
+ const string& poolstr = osdmap.get_pool_name(pool_id);
+
+ // If the Pool is in use by CephFS, refuse to delete it
+ FSMap const &pending_fsmap = mon.mdsmon()->get_pending_fsmap();
+ if (pending_fsmap.pool_in_use(pool_id)) {
+ *ss << "pool '" << poolstr << "' is in use by CephFS";
+ return -EBUSY;
+ }
+
+ if (pool.tier_of >= 0) {
+ *ss << "pool '" << poolstr << "' is a tier of '"
+ << osdmap.get_pool_name(pool.tier_of) << "'";
+ return -EBUSY;
+ }
+ if (!pool.tiers.empty()) {
+ *ss << "pool '" << poolstr << "' has tiers";
+ for(auto tier : pool.tiers) {
+ *ss << " " << osdmap.get_pool_name(tier);
+ }
+ return -EBUSY;
+ }
+
+ if (!g_conf()->mon_allow_pool_delete) {
+ *ss << "pool deletion is disabled; you must first set the mon_allow_pool_delete config option to true before you can destroy a pool";
+ return -EPERM;
+ }
+
+ if (pool.has_flag(pg_pool_t::FLAG_NODELETE)) {
+ *ss << "pool deletion is disabled; you must unset nodelete flag for the pool first";
+ return -EPERM;
+ }
+
+ *ss << "pool '" << poolstr << "' removed";
+ return 0;
+}
+
+/**
+ * Check if it is safe to add a tier to a base pool
+ *
+ * @return
+ * True if the operation should proceed, false if we should abort here
+ * (abort doesn't necessarily mean error, could be idempotency)
+ */
+bool OSDMonitor::_check_become_tier(
+ const int64_t tier_pool_id, const pg_pool_t *tier_pool,
+ const int64_t base_pool_id, const pg_pool_t *base_pool,
+ int *err,
+ ostream *ss) const
+{
+ const std::string &tier_pool_name = osdmap.get_pool_name(tier_pool_id);
+ const std::string &base_pool_name = osdmap.get_pool_name(base_pool_id);
+
+ const FSMap &pending_fsmap = mon.mdsmon()->get_pending_fsmap();
+ if (pending_fsmap.pool_in_use(tier_pool_id)) {
+ *ss << "pool '" << tier_pool_name << "' is in use by CephFS";
+ *err = -EBUSY;
+ return false;
+ }
+
+ if (base_pool->tiers.count(tier_pool_id)) {
+ ceph_assert(tier_pool->tier_of == base_pool_id);
+ *err = 0;
+ *ss << "pool '" << tier_pool_name << "' is now (or already was) a tier of '"
+ << base_pool_name << "'";
+ return false;
+ }
+
+ if (base_pool->is_tier()) {
+ *ss << "pool '" << base_pool_name << "' is already a tier of '"
+ << osdmap.get_pool_name(base_pool->tier_of) << "', "
+ << "multiple tiers are not yet supported.";
+ *err = -EINVAL;
+ return false;
+ }
+
+ if (tier_pool->has_tiers()) {
+ *ss << "pool '" << tier_pool_name << "' has following tier(s) already:";
+ for (set<uint64_t>::iterator it = tier_pool->tiers.begin();
+ it != tier_pool->tiers.end(); ++it)
+ *ss << "'" << osdmap.get_pool_name(*it) << "',";
+ *ss << " multiple tiers are not yet supported.";
+ *err = -EINVAL;
+ return false;
+ }
+
+ if (tier_pool->is_tier()) {
+ *ss << "tier pool '" << tier_pool_name << "' is already a tier of '"
+ << osdmap.get_pool_name(tier_pool->tier_of) << "'";
+ *err = -EINVAL;
+ return false;
+ }
+
+ *err = 0;
+ return true;
+}
+
+
+/**
+ * Check if it is safe to remove a tier from this base pool
+ *
+ * @return
+ * True if the operation should proceed, false if we should abort here
+ * (abort doesn't necessarily mean error, could be idempotency)
+ */
+bool OSDMonitor::_check_remove_tier(
+ const int64_t base_pool_id, const pg_pool_t *base_pool,
+ const pg_pool_t *tier_pool,
+ int *err, ostream *ss) const
+{
+ const std::string &base_pool_name = osdmap.get_pool_name(base_pool_id);
+
+ // Apply CephFS-specific checks
+ const FSMap &pending_fsmap = mon.mdsmon()->get_pending_fsmap();
+ if (pending_fsmap.pool_in_use(base_pool_id)) {
+ if (base_pool->is_erasure() && !base_pool->allows_ecoverwrites()) {
+ // If the underlying pool is erasure coded and does not allow EC
+ // overwrites, we can't permit the removal of the replicated tier that
+ // CephFS relies on to access it
+ *ss << "pool '" << base_pool_name <<
+ "' does not allow EC overwrites and is in use by CephFS"
+ " via its tier";
+ *err = -EBUSY;
+ return false;
+ }
+
+ if (tier_pool && tier_pool->cache_mode == pg_pool_t::CACHEMODE_WRITEBACK) {
+ *ss << "pool '" << base_pool_name << "' is in use by CephFS, and this "
+ "tier is still in use as a writeback cache. Change the cache "
+ "mode and flush the cache before removing it";
+ *err = -EBUSY;
+ return false;
+ }
+ }
+
+ *err = 0;
+ return true;
+}
+
+int OSDMonitor::_prepare_remove_pool(
+ int64_t pool, ostream *ss, bool no_fake)
+{
+ dout(10) << __func__ << " " << pool << dendl;
+ const pg_pool_t *p = osdmap.get_pg_pool(pool);
+ int r = _check_remove_pool(pool, *p, ss);
+ if (r < 0)
+ return r;
+
+ auto new_pool = pending_inc.new_pools.find(pool);
+ if (new_pool != pending_inc.new_pools.end()) {
+ // if there is a problem with the pending info, wait and retry
+ // this op.
+ const auto& p = new_pool->second;
+ int r = _check_remove_pool(pool, p, ss);
+ if (r < 0)
+ return -EAGAIN;
+ }
+
+ if (pending_inc.old_pools.count(pool)) {
+ dout(10) << __func__ << " " << pool << " already pending removal"
+ << dendl;
+ return 0;
+ }
+
+ if (g_conf()->mon_fake_pool_delete && !no_fake) {
+ string old_name = osdmap.get_pool_name(pool);
+ string new_name = old_name + "." + stringify(pool) + ".DELETED";
+ dout(1) << __func__ << " faking pool deletion: renaming " << pool << " "
+ << old_name << " -> " << new_name << dendl;
+ pending_inc.new_pool_names[pool] = new_name;
+ return 0;
+ }
+
+ // remove
+ pending_inc.old_pools.insert(pool);
+
+ // remove any pg_temp mappings for this pool
+ for (auto p = osdmap.pg_temp->begin();
+ p != osdmap.pg_temp->end();
+ ++p) {
+ if (p->first.pool() == pool) {
+ dout(10) << __func__ << " " << pool << " removing obsolete pg_temp "
+ << p->first << dendl;
+ pending_inc.new_pg_temp[p->first].clear();
+ }
+ }
+ // remove any primary_temp mappings for this pool
+ for (auto p = osdmap.primary_temp->begin();
+ p != osdmap.primary_temp->end();
+ ++p) {
+ if (p->first.pool() == pool) {
+ dout(10) << __func__ << " " << pool
+ << " removing obsolete primary_temp" << p->first << dendl;
+ pending_inc.new_primary_temp[p->first] = -1;
+ }
+ }
+ // remove any pg_upmap mappings for this pool
+ for (auto& p : osdmap.pg_upmap) {
+ if (p.first.pool() == pool) {
+ dout(10) << __func__ << " " << pool
+ << " removing obsolete pg_upmap "
+ << p.first << dendl;
+ pending_inc.old_pg_upmap.insert(p.first);
+ }
+ }
+ // remove any pending pg_upmap mappings for this pool
+ {
+ auto it = pending_inc.new_pg_upmap.begin();
+ while (it != pending_inc.new_pg_upmap.end()) {
+ if (it->first.pool() == pool) {
+ dout(10) << __func__ << " " << pool
+ << " removing pending pg_upmap "
+ << it->first << dendl;
+ it = pending_inc.new_pg_upmap.erase(it);
+ } else {
+ it++;
+ }
+ }
+ }
+ // remove any pg_upmap_items mappings for this pool
+ for (auto& p : osdmap.pg_upmap_items) {
+ if (p.first.pool() == pool) {
+ dout(10) << __func__ << " " << pool
+ << " removing obsolete pg_upmap_items " << p.first
+ << dendl;
+ pending_inc.old_pg_upmap_items.insert(p.first);
+ }
+ }
+ // remove any pending pg_upmap mappings for this pool
+ {
+ auto it = pending_inc.new_pg_upmap_items.begin();
+ while (it != pending_inc.new_pg_upmap_items.end()) {
+ if (it->first.pool() == pool) {
+ dout(10) << __func__ << " " << pool
+ << " removing pending pg_upmap_items "
+ << it->first << dendl;
+ it = pending_inc.new_pg_upmap_items.erase(it);
+ } else {
+ it++;
+ }
+ }
+ }
+
+ // remove any choose_args for this pool
+ CrushWrapper newcrush;
+ _get_pending_crush(newcrush);
+ if (newcrush.have_choose_args(pool)) {
+ dout(10) << __func__ << " removing choose_args for pool " << pool << dendl;
+ newcrush.rm_choose_args(pool);
+ pending_inc.crush.clear();
+ newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
+ }
+ return 0;
+}
+
+int OSDMonitor::_prepare_rename_pool(int64_t pool, string newname)
+{
+ dout(10) << "_prepare_rename_pool " << pool << dendl;
+ if (pending_inc.old_pools.count(pool)) {
+ dout(10) << "_prepare_rename_pool " << pool << " pending removal" << dendl;
+ return -ENOENT;
+ }
+ for (map<int64_t,string>::iterator p = pending_inc.new_pool_names.begin();
+ p != pending_inc.new_pool_names.end();
+ ++p) {
+ if (p->second == newname && p->first != pool) {
+ return -EEXIST;
+ }
+ }
+
+ pending_inc.new_pool_names[pool] = newname;
+ return 0;
+}
+
+bool OSDMonitor::prepare_pool_op_delete(MonOpRequestRef op)
+{
+ op->mark_osdmon_event(__func__);
+ auto m = op->get_req<MPoolOp>();
+ ostringstream ss;
+ int ret = _prepare_remove_pool(m->pool, &ss, false);
+ if (ret == -EAGAIN) {
+ wait_for_finished_proposal(op, new C_RetryMessage(this, op));
+ return true;
+ }
+ if (ret < 0)
+ dout(10) << __func__ << " got " << ret << " " << ss.str() << dendl;
+ wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, ret,
+ pending_inc.epoch));
+ return true;
+}
+
+void OSDMonitor::_pool_op_reply(MonOpRequestRef op,
+ int ret, epoch_t epoch, bufferlist *blp)
+{
+ op->mark_osdmon_event(__func__);
+ auto m = op->get_req<MPoolOp>();
+ dout(20) << "_pool_op_reply " << ret << dendl;
+ MPoolOpReply *reply = new MPoolOpReply(m->fsid, m->get_tid(),
+ ret, epoch, get_last_committed(), blp);
+ mon.send_reply(op, reply);
+}
+
+void OSDMonitor::convert_pool_priorities(void)
+{
+ pool_opts_t::key_t key = pool_opts_t::get_opt_desc("recovery_priority").key;
+ int64_t max_prio = 0;
+ int64_t min_prio = 0;
+ for (const auto &i : osdmap.get_pools()) {
+ const auto &pool = i.second;
+
+ if (pool.opts.is_set(key)) {
+ int64_t prio = 0;
+ pool.opts.get(key, &prio);
+ if (prio > max_prio)
+ max_prio = prio;
+ if (prio < min_prio)
+ min_prio = prio;
+ }
+ }
+ if (max_prio <= OSD_POOL_PRIORITY_MAX && min_prio >= OSD_POOL_PRIORITY_MIN) {
+ dout(20) << __func__ << " nothing to fix" << dendl;
+ return;
+ }
+ // Current pool priorities exceeds new maximum
+ for (const auto &i : osdmap.get_pools()) {
+ const auto pool_id = i.first;
+ pg_pool_t pool = i.second;
+
+ int64_t prio = 0;
+ pool.opts.get(key, &prio);
+ int64_t n;
+
+ if (prio > 0 && max_prio > OSD_POOL_PRIORITY_MAX) { // Likely scenario
+ // Scaled priority range 0 to OSD_POOL_PRIORITY_MAX
+ n = (float)prio / max_prio * OSD_POOL_PRIORITY_MAX;
+ } else if (prio < 0 && min_prio < OSD_POOL_PRIORITY_MIN) {
+ // Scaled priority range OSD_POOL_PRIORITY_MIN to 0
+ n = (float)prio / min_prio * OSD_POOL_PRIORITY_MIN;
+ } else {
+ continue;
+ }
+ if (n == 0) {
+ pool.opts.unset(key);
+ } else {
+ pool.opts.set(key, static_cast<int64_t>(n));
+ }
+ dout(10) << __func__ << " pool " << pool_id
+ << " recovery_priority adjusted "
+ << prio << " to " << n << dendl;
+ pool.last_change = pending_inc.epoch;
+ pending_inc.new_pools[pool_id] = pool;
+ }
+}
+
+void OSDMonitor::try_enable_stretch_mode_pools(stringstream& ss, bool *okay,
+ int *errcode,
+ set<pg_pool_t*>* pools,
+ const string& new_crush_rule)
+{
+ dout(20) << __func__ << dendl;
+ *okay = false;
+ int new_crush_rule_result = osdmap.crush->get_rule_id(new_crush_rule);
+ if (new_crush_rule_result < 0) {
+ ss << "unrecognized crush rule " << new_crush_rule_result;
+ *errcode = new_crush_rule_result;
+ return;
+ }
+ __u8 new_rule = static_cast<__u8>(new_crush_rule_result);
+ for (const auto& pooli : osdmap.pools) {
+ int64_t poolid = pooli.first;
+ const pg_pool_t *p = &pooli.second;
+ if (!p->is_replicated()) {
+ ss << "stretched pools must be replicated; '" << osdmap.pool_name[poolid] << "' is erasure-coded";
+ *errcode = -EINVAL;
+ return;
+ }
+ uint8_t default_size = g_conf().get_val<uint64_t>("osd_pool_default_size");
+ if ((p->get_size() != default_size ||
+ (p->get_min_size() != g_conf().get_osd_pool_default_min_size(default_size))) &&
+ (p->get_crush_rule() != new_rule)) {
+ ss << "we currently require stretch mode pools start out with the"
+ " default size/min_size, which '" << osdmap.pool_name[poolid] << "' does not";
+ *errcode = -EINVAL;
+ return;
+ }
+ pg_pool_t *pp = pending_inc.get_new_pool(poolid, p);
+ // TODO: The part where we unconditionally copy the pools into pending_inc is bad
+ // the attempt may fail and then we have these pool updates...but they won't do anything
+ // if there is a failure, so if it's hard to change the interface, no need to bother
+ pools->insert(pp);
+ }
+ *okay = true;
+ return;
+}
+
+void OSDMonitor::try_enable_stretch_mode(stringstream& ss, bool *okay,
+ int *errcode, bool commit,
+ const string& dividing_bucket,
+ uint32_t bucket_count,
+ const set<pg_pool_t*>& pools,
+ const string& new_crush_rule)
+{
+ dout(20) << __func__ << dendl;
+ *okay = false;
+ CrushWrapper crush;
+ _get_pending_crush(crush);
+ int dividing_id;
+ int retval = crush.get_validated_type_id(dividing_bucket, &dividing_id);
+ if (retval == -1) {
+ ss << dividing_bucket << " is not a valid crush bucket type";
+ *errcode = -ENOENT;
+ ceph_assert(!commit || retval != -1);
+ return;
+ }
+ vector<int> subtrees;
+ crush.get_subtree_of_type(dividing_id, &subtrees);
+ if (subtrees.size() != 2) {
+ ss << "there are " << subtrees.size() << dividing_bucket
+ << "'s in the cluster but stretch mode currently only works with 2!";
+ *errcode = -EINVAL;
+ ceph_assert(!commit || subtrees.size() == 2);
+ return;
+ }
+
+ int new_crush_rule_result = crush.get_rule_id(new_crush_rule);
+ if (new_crush_rule_result < 0) {
+ ss << "unrecognized crush rule " << new_crush_rule;
+ *errcode = new_crush_rule_result;
+ ceph_assert(!commit || (new_crush_rule_result > 0));
+ return;
+ }
+ __u8 new_rule = static_cast<__u8>(new_crush_rule_result);
+
+ int weight1 = crush.get_item_weight(subtrees[0]);
+ int weight2 = crush.get_item_weight(subtrees[1]);
+ if (weight1 != weight2) {
+ // TODO: I'm really not sure this is a good idea?
+ ss << "the 2 " << dividing_bucket
+ << "instances in the cluster have differing weights "
+ << weight1 << " and " << weight2
+ <<" but stretch mode currently requires they be the same!";
+ *errcode = -EINVAL;
+ ceph_assert(!commit || (weight1 == weight2));
+ return;
+ }
+ if (bucket_count != 2) {
+ ss << "currently we only support 2-site stretch clusters!";
+ *errcode = -EINVAL;
+ ceph_assert(!commit || bucket_count == 2);
+ return;
+ }
+ // TODO: check CRUSH rules for pools so that we are appropriately divided
+ if (commit) {
+ for (auto pool : pools) {
+ pool->crush_rule = new_rule;
+ pool->peering_crush_bucket_count = bucket_count;
+ pool->peering_crush_bucket_target = bucket_count;
+ pool->peering_crush_bucket_barrier = dividing_id;
+ pool->peering_crush_mandatory_member = CRUSH_ITEM_NONE;
+ pool->size = g_conf().get_val<uint64_t>("mon_stretch_pool_size");
+ pool->min_size = g_conf().get_val<uint64_t>("mon_stretch_pool_min_size");
+ }
+ pending_inc.change_stretch_mode = true;
+ pending_inc.stretch_mode_enabled = true;
+ pending_inc.new_stretch_bucket_count = bucket_count;
+ pending_inc.new_degraded_stretch_mode = 0;
+ pending_inc.new_stretch_mode_bucket = dividing_id;
+ }
+ *okay = true;
+ return;
+}
+
+bool OSDMonitor::check_for_dead_crush_zones(const map<string,set<string>>& dead_buckets,
+ set<int> *really_down_buckets,
+ set<string> *really_down_mons)
+{
+ dout(20) << __func__ << " with dead mon zones " << dead_buckets << dendl;
+ ceph_assert(is_readable());
+ if (dead_buckets.empty()) return false;
+ set<int> down_cache;
+ bool really_down = false;
+ for (auto dbi : dead_buckets) {
+ const string& bucket_name = dbi.first;
+ ceph_assert(osdmap.crush->name_exists(bucket_name));
+ int bucket_id = osdmap.crush->get_item_id(bucket_name);
+ dout(20) << "Checking " << bucket_name << " id " << bucket_id
+ << " to see if OSDs are also down" << dendl;
+ bool subtree_down = osdmap.subtree_is_down(bucket_id, &down_cache);
+ if (subtree_down) {
+ dout(20) << "subtree is down!" << dendl;
+ really_down = true;
+ really_down_buckets->insert(bucket_id);
+ really_down_mons->insert(dbi.second.begin(), dbi.second.end());
+ }
+ }
+ dout(10) << "We determined CRUSH buckets " << *really_down_buckets
+ << " and mons " << *really_down_mons << " are really down" << dendl;
+ return really_down;
+}
+
+void OSDMonitor::trigger_degraded_stretch_mode(const set<int>& dead_buckets,
+ const set<string>& live_zones)
+{
+ dout(20) << __func__ << dendl;
+ stretch_recovery_triggered.set_from_double(0); // reset this; we can't go clean now!
+ // update the general OSDMap changes
+ pending_inc.change_stretch_mode = true;
+ pending_inc.stretch_mode_enabled = osdmap.stretch_mode_enabled;
+ pending_inc.new_stretch_bucket_count = osdmap.stretch_bucket_count;
+ int new_site_count = osdmap.stretch_bucket_count - dead_buckets.size();
+ ceph_assert(new_site_count == 1); // stretch count 2!
+ pending_inc.new_degraded_stretch_mode = new_site_count;
+ pending_inc.new_recovering_stretch_mode = 0;
+ pending_inc.new_stretch_mode_bucket = osdmap.stretch_mode_bucket;
+
+ // and then apply them to all the pg_pool_ts
+ ceph_assert(live_zones.size() == 1); // only support 2 zones now
+ const string& remaining_site_name = *(live_zones.begin());
+ ceph_assert(osdmap.crush->name_exists(remaining_site_name));
+ int remaining_site = osdmap.crush->get_item_id(remaining_site_name);
+ for (auto pgi : osdmap.pools) {
+ if (pgi.second.peering_crush_bucket_count) {
+ pg_pool_t& newp = *pending_inc.get_new_pool(pgi.first, &pgi.second);
+ newp.peering_crush_bucket_count = new_site_count;
+ newp.peering_crush_mandatory_member = remaining_site;
+ newp.min_size = pgi.second.min_size / 2; // only support 2 zones now
+ newp.set_last_force_op_resend(pending_inc.epoch);
+ }
+ }
+ propose_pending();
+}
+
+void OSDMonitor::trigger_recovery_stretch_mode()
+{
+ dout(20) << __func__ << dendl;
+ stretch_recovery_triggered.set_from_double(0); // reset this so we don't go full-active prematurely
+ pending_inc.change_stretch_mode = true;
+ pending_inc.stretch_mode_enabled = osdmap.stretch_mode_enabled;
+ pending_inc.new_stretch_bucket_count = osdmap.stretch_bucket_count;
+ pending_inc.new_degraded_stretch_mode = osdmap.degraded_stretch_mode;
+ pending_inc.new_recovering_stretch_mode = 1;
+ pending_inc.new_stretch_mode_bucket = osdmap.stretch_mode_bucket;
+
+ for (auto pgi : osdmap.pools) {
+ if (pgi.second.peering_crush_bucket_count) {
+ pg_pool_t& newp = *pending_inc.get_new_pool(pgi.first, &pgi.second);
+ newp.set_last_force_op_resend(pending_inc.epoch);
+ }
+ }
+ propose_pending();
+}
+
+void OSDMonitor::set_degraded_stretch_mode()
+{
+ stretch_recovery_triggered.set_from_double(0);
+}
+
+void OSDMonitor::set_recovery_stretch_mode()
+{
+ if (stretch_recovery_triggered.is_zero()) {
+ stretch_recovery_triggered = ceph_clock_now();
+ }
+}
+
+void OSDMonitor::set_healthy_stretch_mode()
+{
+ stretch_recovery_triggered.set_from_double(0);
+}
+
+void OSDMonitor::notify_new_pg_digest()
+{
+ dout(20) << __func__ << dendl;
+ if (!stretch_recovery_triggered.is_zero()) {
+ try_end_recovery_stretch_mode(false);
+ }
+}
+
+struct CMonExitRecovery : public Context {
+ OSDMonitor *m;
+ bool force;
+ CMonExitRecovery(OSDMonitor *mon, bool f) : m(mon), force(f) {}
+ void finish(int r) {
+ m->try_end_recovery_stretch_mode(force);
+ }
+};
+
+void OSDMonitor::try_end_recovery_stretch_mode(bool force)
+{
+ dout(20) << __func__ << dendl;
+ if (!mon.is_leader()) return;
+ if (!mon.is_degraded_stretch_mode()) return;
+ if (!mon.is_recovering_stretch_mode()) return;
+ if (!is_readable()) {
+ wait_for_readable_ctx(new CMonExitRecovery(this, force));
+ return;
+ }
+
+ if (osdmap.recovering_stretch_mode &&
+ ((!stretch_recovery_triggered.is_zero() &&
+ ceph_clock_now() - g_conf().get_val<double>("mon_stretch_recovery_min_wait") >
+ stretch_recovery_triggered) ||
+ force)) {
+ if (!mon.mgrstatmon()->is_readable()) {
+ mon.mgrstatmon()->wait_for_readable_ctx(new CMonExitRecovery(this, force));
+ return;
+ }
+ const PGMapDigest& pgd = mon.mgrstatmon()->get_digest();
+ double misplaced, degraded, inactive, unknown;
+ pgd.get_recovery_stats(&misplaced, &degraded, &inactive, &unknown);
+ if (force || (degraded == 0.0 && inactive == 0.0 && unknown == 0.0)) {
+ // we can exit degraded stretch mode!
+ mon.trigger_healthy_stretch_mode();
+ }
+ }
+}
+
+void OSDMonitor::trigger_healthy_stretch_mode()
+{
+ ceph_assert(is_writeable());
+ stretch_recovery_triggered.set_from_double(0);
+ pending_inc.change_stretch_mode = true;
+ pending_inc.stretch_mode_enabled = osdmap.stretch_mode_enabled;
+ pending_inc.new_stretch_bucket_count = osdmap.stretch_bucket_count;
+ pending_inc.new_degraded_stretch_mode = 0; // turn off degraded mode...
+ pending_inc.new_recovering_stretch_mode = 0; //...and recovering mode!
+ pending_inc.new_stretch_mode_bucket = osdmap.stretch_mode_bucket;
+ for (auto pgi : osdmap.pools) {
+ if (pgi.second.peering_crush_bucket_count) {
+ pg_pool_t& newp = *pending_inc.get_new_pool(pgi.first, &pgi.second);
+ newp.peering_crush_bucket_count = osdmap.stretch_bucket_count;
+ newp.peering_crush_mandatory_member = CRUSH_ITEM_NONE;
+ newp.min_size = g_conf().get_val<uint64_t>("mon_stretch_pool_min_size");
+ newp.set_last_force_op_resend(pending_inc.epoch);
+ }
+ }
+ propose_pending();
+}