// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab /* * Ceph - scalable distributed file system * * Copyright (C) 2004-2006 Sage Weil * Copyright (C) 2017 OVH * * This is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License version 2.1, as published by the Free Software * Foundation. See file COPYING. * */ #include "acconfig.h" #include #include #include #include #include #include #include #include #include #include #ifdef HAVE_SYS_PARAM_H #include #endif #ifdef HAVE_SYS_MOUNT_H #include #endif #include "osd/PG.h" #include "include/types.h" #include "include/compat.h" #include "include/random.h" #include "OSD.h" #include "OSDMap.h" #include "Watch.h" #include "osdc/Objecter.h" #include "common/errno.h" #include "common/ceph_argparse.h" #include "common/ceph_time.h" #include "common/version.h" #include "common/pick_address.h" #include "common/blkdev.h" #include "common/numa.h" #include "os/ObjectStore.h" #ifdef HAVE_LIBFUSE #include "os/FuseStore.h" #endif #include "PrimaryLogPG.h" #include "msg/Messenger.h" #include "msg/Message.h" #include "mon/MonClient.h" #include "messages/MLog.h" #include "messages/MGenericMessage.h" #include "messages/MOSDPing.h" #include "messages/MOSDFailure.h" #include "messages/MOSDMarkMeDown.h" #include "messages/MOSDFull.h" #include "messages/MOSDOp.h" #include "messages/MOSDOpReply.h" #include "messages/MOSDBackoff.h" #include "messages/MOSDBeacon.h" #include "messages/MOSDRepOp.h" #include "messages/MOSDRepOpReply.h" #include "messages/MOSDBoot.h" #include "messages/MOSDPGTemp.h" #include "messages/MOSDPGReadyToMerge.h" #include "messages/MOSDMap.h" #include "messages/MMonGetOSDMap.h" #include "messages/MOSDPGNotify.h" #include "messages/MOSDPGQuery.h" #include "messages/MOSDPGLog.h" #include "messages/MOSDPGRemove.h" #include "messages/MOSDPGInfo.h" #include "messages/MOSDPGCreate.h" #include "messages/MOSDPGCreate2.h" #include "messages/MOSDPGTrim.h" #include "messages/MOSDPGScan.h" #include "messages/MBackfillReserve.h" #include "messages/MRecoveryReserve.h" #include "messages/MOSDForceRecovery.h" #include "messages/MOSDECSubOpWrite.h" #include "messages/MOSDECSubOpWriteReply.h" #include "messages/MOSDECSubOpRead.h" #include "messages/MOSDECSubOpReadReply.h" #include "messages/MOSDPGCreated.h" #include "messages/MOSDPGUpdateLogMissing.h" #include "messages/MOSDPGUpdateLogMissingReply.h" #include "messages/MOSDPeeringOp.h" #include "messages/MOSDAlive.h" #include "messages/MOSDScrub.h" #include "messages/MOSDScrub2.h" #include "messages/MOSDRepScrub.h" #include "messages/MMonCommand.h" #include "messages/MCommand.h" #include "messages/MCommandReply.h" #include "messages/MPGStats.h" #include "messages/MPGStatsAck.h" #include "messages/MWatchNotify.h" #include "messages/MOSDPGPush.h" #include "messages/MOSDPGPushReply.h" #include "messages/MOSDPGPull.h" #include "common/perf_counters.h" #include "common/Timer.h" #include "common/LogClient.h" #include "common/AsyncReserver.h" #include "common/HeartbeatMap.h" #include "common/admin_socket.h" #include "common/ceph_context.h" #include "global/signal_handler.h" #include "global/pidfile.h" #include "include/color.h" #include "perfglue/cpu_profiler.h" #include "perfglue/heap_profiler.h" #include "osd/OpRequest.h" #include "auth/AuthAuthorizeHandler.h" #include "auth/RotatingKeyRing.h" #include "objclass/objclass.h" #include "common/cmdparse.h" #include "include/str_list.h" #include "include/util.h" #include "include/ceph_assert.h" #include "common/config.h" #include "common/EventTrace.h" #include "json_spirit/json_spirit_reader.h" #include "json_spirit/json_spirit_writer.h" #ifdef WITH_LTTNG #define TRACEPOINT_DEFINE #define TRACEPOINT_PROBE_DYNAMIC_LINKAGE #include "tracing/osd.h" #undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE #undef TRACEPOINT_DEFINE #else #define tracepoint(...) #endif #define dout_context cct #define dout_subsys ceph_subsys_osd #undef dout_prefix #define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch()) static ostream& _prefix(std::ostream* _dout, int whoami, epoch_t epoch) { return *_dout << "osd." << whoami << " " << epoch << " "; } //Initial features in new superblock. //Features here are also automatically upgraded CompatSet OSD::get_osd_initial_compat_set() { CompatSet::FeatureSet ceph_osd_feature_compat; CompatSet::FeatureSet ceph_osd_feature_ro_compat; CompatSet::FeatureSet ceph_osd_feature_incompat; ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE); ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGINFO); ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_OLOC); ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEC); ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES); ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL); ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BIGINFO); ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO); ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG); ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER); ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HINTS); ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGMETA); ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_MISSING); ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_FASTINFO); ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES); return CompatSet(ceph_osd_feature_compat, ceph_osd_feature_ro_compat, ceph_osd_feature_incompat); } //Features are added here that this OSD supports. CompatSet OSD::get_osd_compat_set() { CompatSet compat = get_osd_initial_compat_set(); //Any features here can be set in code, but not in initial superblock compat.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS); return compat; } OSDService::OSDService(OSD *osd) : osd(osd), cct(osd->cct), whoami(osd->whoami), store(osd->store), log_client(osd->log_client), clog(osd->clog), pg_recovery_stats(osd->pg_recovery_stats), cluster_messenger(osd->cluster_messenger), client_messenger(osd->client_messenger), logger(osd->logger), recoverystate_perf(osd->recoverystate_perf), monc(osd->monc), class_handler(osd->class_handler), osd_max_object_size(cct->_conf, "osd_max_object_size"), osd_skip_data_digest(cct->_conf, "osd_skip_data_digest"), publish_lock{ceph::make_mutex("OSDService::publish_lock")}, pre_publish_lock{ceph::make_mutex("OSDService::pre_publish_lock")}, max_oldest_map(0), peer_map_epoch_lock("OSDService::peer_map_epoch_lock"), sched_scrub_lock("OSDService::sched_scrub_lock"), scrubs_local(0), scrubs_remote(0), agent_lock("OSDService::agent_lock"), agent_valid_iterator(false), agent_ops(0), flush_mode_high_count(0), agent_active(true), agent_thread(this), agent_stop_flag(false), agent_timer_lock("OSDService::agent_timer_lock"), agent_timer(osd->client_messenger->cct, agent_timer_lock), last_recalibrate(ceph_clock_now()), promote_max_objects(0), promote_max_bytes(0), objecter(new Objecter(osd->client_messenger->cct, osd->objecter_messenger, osd->monc, NULL, 0, 0)), m_objecter_finishers(cct->_conf->osd_objecter_finishers), watch_lock("OSDService::watch_lock"), watch_timer(osd->client_messenger->cct, watch_lock), next_notif_id(0), recovery_request_lock("OSDService::recovery_request_lock"), recovery_request_timer(cct, recovery_request_lock, false), sleep_lock("OSDService::sleep_lock"), sleep_timer(cct, sleep_lock, false), reserver_finisher(cct), local_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills, cct->_conf->osd_min_recovery_priority), remote_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills, cct->_conf->osd_min_recovery_priority), pg_temp_lock("OSDService::pg_temp_lock"), snap_reserver(cct, &reserver_finisher, cct->_conf->osd_max_trimming_pgs), recovery_lock("OSDService::recovery_lock"), recovery_ops_active(0), recovery_ops_reserved(0), recovery_paused(false), map_cache_lock("OSDService::map_cache_lock"), map_cache(cct, cct->_conf->osd_map_cache_size), map_bl_cache(cct->_conf->osd_map_cache_size), map_bl_inc_cache(cct->_conf->osd_map_cache_size), stat_lock("OSDService::stat_lock"), full_status_lock("OSDService::full_status_lock"), cur_state(NONE), cur_ratio(0), physical_ratio(0), epoch_lock("OSDService::epoch_lock"), boot_epoch(0), up_epoch(0), bind_epoch(0), is_stopping_lock("OSDService::is_stopping_lock") #ifdef PG_DEBUG_REFS , pgid_lock("OSDService::pgid_lock") #endif { objecter->init(); for (int i = 0; i < m_objecter_finishers; i++) { ostringstream str; str << "objecter-finisher-" << i; Finisher *fin = new Finisher(osd->client_messenger->cct, str.str(), "finisher"); objecter_finishers.push_back(fin); } } OSDService::~OSDService() { delete objecter; for (auto f : objecter_finishers) { delete f; f = NULL; } } #ifdef PG_DEBUG_REFS void OSDService::add_pgid(spg_t pgid, PG *pg){ std::lock_guard l(pgid_lock); if (!pgid_tracker.count(pgid)) { live_pgs[pgid] = pg; } pgid_tracker[pgid]++; } void OSDService::remove_pgid(spg_t pgid, PG *pg) { std::lock_guard l(pgid_lock); ceph_assert(pgid_tracker.count(pgid)); ceph_assert(pgid_tracker[pgid] > 0); pgid_tracker[pgid]--; if (pgid_tracker[pgid] == 0) { pgid_tracker.erase(pgid); live_pgs.erase(pgid); } } void OSDService::dump_live_pgids() { std::lock_guard l(pgid_lock); derr << "live pgids:" << dendl; for (map::const_iterator i = pgid_tracker.cbegin(); i != pgid_tracker.cend(); ++i) { derr << "\t" << *i << dendl; live_pgs[i->first]->dump_live_ids(); } } #endif void OSDService::identify_splits_and_merges( OSDMapRef old_map, OSDMapRef new_map, spg_t pgid, set> *split_children, set> *merge_pgs) { if (!old_map->have_pg_pool(pgid.pool())) { return; } int old_pgnum = old_map->get_pg_num(pgid.pool()); auto p = osd->pg_num_history.pg_nums.find(pgid.pool()); if (p == osd->pg_num_history.pg_nums.end()) { return; } dout(20) << __func__ << " " << pgid << " e" << old_map->get_epoch() << " to e" << new_map->get_epoch() << " pg_nums " << p->second << dendl; deque queue; queue.push_back(pgid); set did; while (!queue.empty()) { auto cur = queue.front(); queue.pop_front(); did.insert(cur); unsigned pgnum = old_pgnum; for (auto q = p->second.lower_bound(old_map->get_epoch()); q != p->second.end() && q->first <= new_map->get_epoch(); ++q) { if (pgnum < q->second) { // split? if (cur.ps() < pgnum) { set children; if (cur.is_split(pgnum, q->second, &children)) { dout(20) << __func__ << " " << cur << " e" << q->first << " pg_num " << pgnum << " -> " << q->second << " children " << children << dendl; for (auto i : children) { split_children->insert(make_pair(i, q->first)); if (!did.count(i)) queue.push_back(i); } } } else if (cur.ps() < q->second) { dout(20) << __func__ << " " << cur << " e" << q->first << " pg_num " << pgnum << " -> " << q->second << " is a child" << dendl; // normally we'd capture this from the parent, but it's // possible the parent doesn't exist yet (it will be // fabricated to allow an intervening merge). note this PG // as a split child here to be sure we catch it. split_children->insert(make_pair(cur, q->first)); } else { dout(20) << __func__ << " " << cur << " e" << q->first << " pg_num " << pgnum << " -> " << q->second << " is post-split, skipping" << dendl; } } else if (merge_pgs) { // merge? if (cur.ps() >= q->second) { if (cur.ps() < pgnum) { spg_t parent; if (cur.is_merge_source(pgnum, q->second, &parent)) { set children; parent.is_split(q->second, pgnum, &children); dout(20) << __func__ << " " << cur << " e" << q->first << " pg_num " << pgnum << " -> " << q->second << " is merge source, target " << parent << ", source(s) " << children << dendl; merge_pgs->insert(make_pair(parent, q->first)); if (!did.count(parent)) { // queue (and re-scan) parent in case it might not exist yet // and there are some future splits pending on it queue.push_back(parent); } for (auto c : children) { merge_pgs->insert(make_pair(c, q->first)); if (!did.count(c)) queue.push_back(c); } } } else { dout(20) << __func__ << " " << cur << " e" << q->first << " pg_num " << pgnum << " -> " << q->second << " is beyond old pgnum, skipping" << dendl; } } else { set children; if (cur.is_split(q->second, pgnum, &children)) { dout(20) << __func__ << " " << cur << " e" << q->first << " pg_num " << pgnum << " -> " << q->second << " is merge target, source " << children << dendl; for (auto c : children) { merge_pgs->insert(make_pair(c, q->first)); if (!did.count(c)) queue.push_back(c); } merge_pgs->insert(make_pair(cur, q->first)); } } } pgnum = q->second; } } } void OSDService::need_heartbeat_peer_update() { osd->need_heartbeat_peer_update(); } void OSDService::start_shutdown() { { std::lock_guard l(agent_timer_lock); agent_timer.shutdown(); } { std::lock_guard l(sleep_lock); sleep_timer.shutdown(); } { std::lock_guard l(recovery_request_lock); recovery_request_timer.shutdown(); } } void OSDService::shutdown_reserver() { reserver_finisher.wait_for_empty(); reserver_finisher.stop(); } void OSDService::shutdown() { { std::lock_guard l(watch_lock); watch_timer.shutdown(); } objecter->shutdown(); for (auto f : objecter_finishers) { f->wait_for_empty(); f->stop(); } publish_map(OSDMapRef()); next_osdmap = OSDMapRef(); } void OSDService::init() { reserver_finisher.start(); for (auto f : objecter_finishers) { f->start(); } objecter->set_client_incarnation(0); // deprioritize objecter in daemonperf output objecter->get_logger()->set_prio_adjust(-3); watch_timer.init(); agent_timer.init(); agent_thread.create("osd_srv_agent"); if (cct->_conf->osd_recovery_delay_start) defer_recovery(cct->_conf->osd_recovery_delay_start); } void OSDService::final_init() { objecter->start(osdmap.get()); } void OSDService::activate_map() { // wake/unwake the tiering agent agent_lock.Lock(); agent_active = !osdmap->test_flag(CEPH_OSDMAP_NOTIERAGENT) && osd->is_active(); agent_cond.Signal(); agent_lock.Unlock(); } void OSDService::request_osdmap_update(epoch_t e) { osd->osdmap_subscribe(e, false); } class AgentTimeoutCB : public Context { PGRef pg; public: explicit AgentTimeoutCB(PGRef _pg) : pg(_pg) {} void finish(int) override { pg->agent_choose_mode_restart(); } }; void OSDService::agent_entry() { dout(10) << __func__ << " start" << dendl; agent_lock.Lock(); while (!agent_stop_flag) { if (agent_queue.empty()) { dout(20) << __func__ << " empty queue" << dendl; agent_cond.Wait(agent_lock); continue; } uint64_t level = agent_queue.rbegin()->first; set& top = agent_queue.rbegin()->second; dout(10) << __func__ << " tiers " << agent_queue.size() << ", top is " << level << " with pgs " << top.size() << ", ops " << agent_ops << "/" << cct->_conf->osd_agent_max_ops << (agent_active ? " active" : " NOT ACTIVE") << dendl; dout(20) << __func__ << " oids " << agent_oids << dendl; int max = cct->_conf->osd_agent_max_ops - agent_ops; int agent_flush_quota = max; if (!flush_mode_high_count) agent_flush_quota = cct->_conf->osd_agent_max_low_ops - agent_ops; if (agent_flush_quota <= 0 || top.empty() || !agent_active) { agent_cond.Wait(agent_lock); continue; } if (!agent_valid_iterator || agent_queue_pos == top.end()) { agent_queue_pos = top.begin(); agent_valid_iterator = true; } PGRef pg = *agent_queue_pos; dout(10) << "high_count " << flush_mode_high_count << " agent_ops " << agent_ops << " flush_quota " << agent_flush_quota << dendl; agent_lock.Unlock(); if (!pg->agent_work(max, agent_flush_quota)) { dout(10) << __func__ << " " << pg->pg_id << " no agent_work, delay for " << cct->_conf->osd_agent_delay_time << " seconds" << dendl; osd->logger->inc(l_osd_tier_delay); // Queue a timer to call agent_choose_mode for this pg in 5 seconds agent_timer_lock.Lock(); Context *cb = new AgentTimeoutCB(pg); agent_timer.add_event_after(cct->_conf->osd_agent_delay_time, cb); agent_timer_lock.Unlock(); } agent_lock.Lock(); } agent_lock.Unlock(); dout(10) << __func__ << " finish" << dendl; } void OSDService::agent_stop() { { std::lock_guard l(agent_lock); // By this time all ops should be cancelled ceph_assert(agent_ops == 0); // By this time all PGs are shutdown and dequeued if (!agent_queue.empty()) { set& top = agent_queue.rbegin()->second; derr << "agent queue not empty, for example " << (*top.begin())->get_pgid() << dendl; ceph_abort_msg("agent queue not empty"); } agent_stop_flag = true; agent_cond.Signal(); } agent_thread.join(); } // ------------------------------------- void OSDService::promote_throttle_recalibrate() { utime_t now = ceph_clock_now(); double dur = now - last_recalibrate; last_recalibrate = now; unsigned prob = promote_probability_millis; uint64_t target_obj_sec = cct->_conf->osd_tier_promote_max_objects_sec; uint64_t target_bytes_sec = cct->_conf->osd_tier_promote_max_bytes_sec; unsigned min_prob = 1; uint64_t attempts, obj, bytes; promote_counter.sample_and_attenuate(&attempts, &obj, &bytes); dout(10) << __func__ << " " << attempts << " attempts, promoted " << obj << " objects and " << byte_u_t(bytes) << "; target " << target_obj_sec << " obj/sec or " << byte_u_t(target_bytes_sec) << "/sec" << dendl; // calculate what the probability *should* be, given the targets unsigned new_prob; if (attempts && dur > 0) { uint64_t avg_size = 1; if (obj) avg_size = std::max(bytes / obj, 1); unsigned po = (double)target_obj_sec * dur * 1000.0 / (double)attempts; unsigned pb = (double)target_bytes_sec / (double)avg_size * dur * 1000.0 / (double)attempts; dout(20) << __func__ << " po " << po << " pb " << pb << " avg_size " << avg_size << dendl; if (target_obj_sec && target_bytes_sec) new_prob = std::min(po, pb); else if (target_obj_sec) new_prob = po; else if (target_bytes_sec) new_prob = pb; else new_prob = 1000; } else { new_prob = 1000; } dout(20) << __func__ << " new_prob " << new_prob << dendl; // correct for persistent skew between target rate and actual rate, adjust double ratio = 1.0; unsigned actual = 0; if (attempts && obj) { actual = obj * 1000 / attempts; ratio = (double)actual / (double)prob; new_prob = (double)new_prob / ratio; } new_prob = std::max(new_prob, min_prob); new_prob = std::min(new_prob, 1000u); // adjust prob = (prob + new_prob) / 2; prob = std::max(prob, min_prob); prob = std::min(prob, 1000u); dout(10) << __func__ << " actual " << actual << ", actual/prob ratio " << ratio << ", adjusted new_prob " << new_prob << ", prob " << promote_probability_millis << " -> " << prob << dendl; promote_probability_millis = prob; // set hard limits for this interval to mitigate stampedes promote_max_objects = target_obj_sec * osd->OSD_TICK_INTERVAL * 2; promote_max_bytes = target_bytes_sec * osd->OSD_TICK_INTERVAL * 2; } // ------------------------------------- float OSDService::get_failsafe_full_ratio() { float full_ratio = cct->_conf->osd_failsafe_full_ratio; if (full_ratio > 1.0) full_ratio /= 100.0; return full_ratio; } OSDService::s_names OSDService::recalc_full_state(float ratio, float pratio, string &inject) { // The OSDMap ratios take precendence. So if the failsafe is .95 and // the admin sets the cluster full to .96, the failsafe moves up to .96 // too. (Not that having failsafe == full is ideal, but it's better than // dropping writes before the clusters appears full.) OSDMapRef osdmap = get_osdmap(); if (!osdmap || osdmap->get_epoch() == 0) { return NONE; } float nearfull_ratio = osdmap->get_nearfull_ratio(); float backfillfull_ratio = std::max(osdmap->get_backfillfull_ratio(), nearfull_ratio); float full_ratio = std::max(osdmap->get_full_ratio(), backfillfull_ratio); float failsafe_ratio = std::max(get_failsafe_full_ratio(), full_ratio); if (osdmap->require_osd_release < CEPH_RELEASE_LUMINOUS) { // use the failsafe for nearfull and full; the mon isn't using the // flags anyway because we're mid-upgrade. full_ratio = failsafe_ratio; backfillfull_ratio = failsafe_ratio; nearfull_ratio = failsafe_ratio; } else if (full_ratio <= 0 || backfillfull_ratio <= 0 || nearfull_ratio <= 0) { derr << __func__ << " full_ratio, backfillfull_ratio or nearfull_ratio is <= 0" << dendl; // use failsafe flag. ick. the monitor did something wrong or the user // did something stupid. full_ratio = failsafe_ratio; backfillfull_ratio = failsafe_ratio; nearfull_ratio = failsafe_ratio; } if (injectfull_state > NONE && injectfull) { inject = "(Injected)"; return injectfull_state; } else if (pratio > failsafe_ratio) { return FAILSAFE; } else if (ratio > full_ratio) { return FULL; } else if (ratio > backfillfull_ratio) { return BACKFILLFULL; } else if (pratio > nearfull_ratio) { return NEARFULL; } return NONE; } void OSDService::check_full_status(float ratio, float pratio) { std::lock_guard l(full_status_lock); cur_ratio = ratio; physical_ratio = pratio; string inject; s_names new_state; new_state = recalc_full_state(ratio, pratio, inject); dout(20) << __func__ << " cur ratio " << ratio << ", physical ratio " << pratio << ", new state " << get_full_state_name(new_state) << " " << inject << dendl; // warn if (cur_state != new_state) { dout(10) << __func__ << " " << get_full_state_name(cur_state) << " -> " << get_full_state_name(new_state) << dendl; if (new_state == FAILSAFE) { clog->error() << "full status failsafe engaged, dropping updates, now " << (int)roundf(ratio * 100) << "% full"; } else if (cur_state == FAILSAFE) { clog->error() << "full status failsafe disengaged, no longer dropping " << "updates, now " << (int)roundf(ratio * 100) << "% full"; } cur_state = new_state; } } bool OSDService::need_fullness_update() { OSDMapRef osdmap = get_osdmap(); s_names cur = NONE; if (osdmap->exists(whoami)) { if (osdmap->get_state(whoami) & CEPH_OSD_FULL) { cur = FULL; } else if (osdmap->get_state(whoami) & CEPH_OSD_BACKFILLFULL) { cur = BACKFILLFULL; } else if (osdmap->get_state(whoami) & CEPH_OSD_NEARFULL) { cur = NEARFULL; } } s_names want = NONE; if (is_full()) want = FULL; else if (is_backfillfull()) want = BACKFILLFULL; else if (is_nearfull()) want = NEARFULL; return want != cur; } bool OSDService::_check_inject_full(DoutPrefixProvider *dpp, s_names type) const { if (injectfull && injectfull_state >= type) { // injectfull is either a count of the number of times to return failsafe full // or if -1 then always return full if (injectfull > 0) --injectfull; ldpp_dout(dpp, 10) << __func__ << " Injected " << get_full_state_name(type) << " OSD (" << (injectfull < 0 ? "set" : std::to_string(injectfull)) << ")" << dendl; return true; } return false; } bool OSDService::_check_full(DoutPrefixProvider *dpp, s_names type) const { std::lock_guard l(full_status_lock); if (_check_inject_full(dpp, type)) return true; if (cur_state >= type) ldpp_dout(dpp, 10) << __func__ << " current usage is " << cur_ratio << " physical " << physical_ratio << dendl; return cur_state >= type; } bool OSDService::_tentative_full(DoutPrefixProvider *dpp, s_names type, uint64_t adjust_used, osd_stat_t adjusted_stat) { ldpp_dout(dpp, 20) << __func__ << " type " << get_full_state_name(type) << " adjust_used " << (adjust_used >> 10) << "KiB" << dendl; { std::lock_guard l(full_status_lock); if (_check_inject_full(dpp, type)) { return true; } } float pratio; float ratio = compute_adjusted_ratio(adjusted_stat, &pratio, adjust_used); string notused; s_names tentative_state = recalc_full_state(ratio, pratio, notused); if (tentative_state >= type) ldpp_dout(dpp, 10) << __func__ << " tentative usage is " << ratio << dendl; return tentative_state >= type; } bool OSDService::check_failsafe_full(DoutPrefixProvider *dpp) const { return _check_full(dpp, FAILSAFE); } bool OSDService::check_full(DoutPrefixProvider *dpp) const { return _check_full(dpp, FULL); } bool OSDService::tentative_backfill_full(DoutPrefixProvider *dpp, uint64_t adjust_used, osd_stat_t stats) { return _tentative_full(dpp, BACKFILLFULL, adjust_used, stats); } bool OSDService::check_backfill_full(DoutPrefixProvider *dpp) const { return _check_full(dpp, BACKFILLFULL); } bool OSDService::check_nearfull(DoutPrefixProvider *dpp) const { return _check_full(dpp, NEARFULL); } bool OSDService::is_failsafe_full() const { std::lock_guard l(full_status_lock); return cur_state == FAILSAFE; } bool OSDService::is_full() const { std::lock_guard l(full_status_lock); return cur_state >= FULL; } bool OSDService::is_backfillfull() const { std::lock_guard l(full_status_lock); return cur_state >= BACKFILLFULL; } bool OSDService::is_nearfull() const { std::lock_guard l(full_status_lock); return cur_state >= NEARFULL; } void OSDService::set_injectfull(s_names type, int64_t count) { std::lock_guard l(full_status_lock); injectfull_state = type; injectfull = count; } void OSDService::set_statfs(const struct store_statfs_t &stbuf, osd_alert_list_t& alerts) { uint64_t bytes = stbuf.total; uint64_t avail = stbuf.available; uint64_t used = stbuf.get_used_raw(); // For testing fake statfs values so it doesn't matter if all // OSDs are using the same partition. if (cct->_conf->fake_statfs_for_testing) { uint64_t total_num_bytes = 0; vector pgs; osd->_get_pgs(&pgs); for (auto p : pgs) { total_num_bytes += p->get_stats_num_bytes(); } bytes = cct->_conf->fake_statfs_for_testing; if (total_num_bytes < bytes) avail = bytes - total_num_bytes; else avail = 0; dout(0) << __func__ << " fake total " << cct->_conf->fake_statfs_for_testing << " adjust available " << avail << dendl; used = bytes - avail; } osd->logger->set(l_osd_stat_bytes, bytes); osd->logger->set(l_osd_stat_bytes_used, used); osd->logger->set(l_osd_stat_bytes_avail, avail); std::lock_guard l(stat_lock); osd_stat.statfs = stbuf; osd_stat.os_alerts.clear(); osd_stat.os_alerts[whoami].swap(alerts); if (cct->_conf->fake_statfs_for_testing) { osd_stat.statfs.total = bytes; osd_stat.statfs.available = avail; // For testing don't want used to go negative, so clear reserved osd_stat.statfs.internally_reserved = 0; } } osd_stat_t OSDService::set_osd_stat(vector& hb_peers, int num_pgs) { utime_t now = ceph_clock_now(); auto stale_time = g_conf().get_val("osd_mon_heartbeat_stat_stale"); std::lock_guard l(stat_lock); osd_stat.hb_peers.swap(hb_peers); osd->op_tracker.get_age_ms_histogram(&osd_stat.op_queue_age_hist); osd_stat.num_pgs = num_pgs; // Clean entries that aren't updated // This is called often enough that we can just remove 1 at a time for (auto i: osd_stat.hb_pingtime) { if (i.second.last_update == 0) continue; if (stale_time && now.sec() - i.second.last_update > stale_time) { dout(20) << __func__ << " time out heartbeat for osd " << i.first << " last_update " << i.second.last_update << dendl; osd_stat.hb_pingtime.erase(i.first); break; } } return osd_stat; } void OSDService::inc_osd_stat_repaired() { std::lock_guard l(stat_lock); osd_stat.num_shards_repaired++; return; } void OSDService::set_osd_stat_repaired(int64_t count) { std::lock_guard l(stat_lock); osd_stat.num_shards_repaired = count; return; } float OSDService::compute_adjusted_ratio(osd_stat_t new_stat, float *pratio, uint64_t adjust_used) { *pratio = ((float)new_stat.statfs.get_used()) / ((float)new_stat.statfs.total); if (adjust_used) { dout(20) << __func__ << " Before kb_used() " << new_stat.statfs.kb_used() << dendl; if (new_stat.statfs.available > adjust_used) new_stat.statfs.available -= adjust_used; else new_stat.statfs.available = 0; dout(20) << __func__ << " After kb_used() " << new_stat.statfs.kb_used() << dendl; } // Check all pgs and adjust kb_used to include all pending backfill data int backfill_adjusted = 0; vector pgs; osd->_get_pgs(&pgs); for (auto p : pgs) { backfill_adjusted += p->pg_stat_adjust(&new_stat); } if (backfill_adjusted) { dout(20) << __func__ << " backfill adjusted " << new_stat << dendl; } return ((float)new_stat.statfs.get_used()) / ((float)new_stat.statfs.total); } bool OSDService::check_osdmap_full(const set &missing_on) { OSDMapRef osdmap = get_osdmap(); for (auto shard : missing_on) { if (osdmap->get_state(shard.osd) & CEPH_OSD_FULL) return true; } return false; } void OSDService::send_message_osd_cluster(int peer, Message *m, epoch_t from_epoch) { OSDMapRef next_map = get_nextmap_reserved(); // service map is always newer/newest ceph_assert(from_epoch <= next_map->get_epoch()); if (next_map->is_down(peer) || next_map->get_info(peer).up_from > from_epoch) { m->put(); release_map(next_map); return; } ConnectionRef peer_con = osd->cluster_messenger->connect_to_osd( next_map->get_cluster_addrs(peer)); share_map_peer(peer, peer_con.get(), next_map); peer_con->send_message(m); release_map(next_map); } ConnectionRef OSDService::get_con_osd_cluster(int peer, epoch_t from_epoch) { OSDMapRef next_map = get_nextmap_reserved(); // service map is always newer/newest ceph_assert(from_epoch <= next_map->get_epoch()); if (next_map->is_down(peer) || next_map->get_info(peer).up_from > from_epoch) { release_map(next_map); return NULL; } ConnectionRef con = osd->cluster_messenger->connect_to_osd( next_map->get_cluster_addrs(peer)); release_map(next_map); return con; } pair OSDService::get_con_osd_hb(int peer, epoch_t from_epoch) { OSDMapRef next_map = get_nextmap_reserved(); // service map is always newer/newest ceph_assert(from_epoch <= next_map->get_epoch()); pair ret; if (next_map->is_down(peer) || next_map->get_info(peer).up_from > from_epoch) { release_map(next_map); return ret; } ret.first = osd->hb_back_client_messenger->connect_to_osd( next_map->get_hb_back_addrs(peer)); ret.second = osd->hb_front_client_messenger->connect_to_osd( next_map->get_hb_front_addrs(peer)); release_map(next_map); return ret; } entity_name_t OSDService::get_cluster_msgr_name() const { return cluster_messenger->get_myname(); } void OSDService::queue_want_pg_temp(pg_t pgid, const vector& want, bool forced) { std::lock_guard l(pg_temp_lock); auto p = pg_temp_pending.find(pgid); if (p == pg_temp_pending.end() || p->second.acting != want || forced) { pg_temp_wanted[pgid] = {want, forced}; } } void OSDService::remove_want_pg_temp(pg_t pgid) { std::lock_guard l(pg_temp_lock); pg_temp_wanted.erase(pgid); pg_temp_pending.erase(pgid); } void OSDService::_sent_pg_temp() { #ifdef HAVE_STDLIB_MAP_SPLICING pg_temp_pending.merge(pg_temp_wanted); #else pg_temp_pending.insert(make_move_iterator(begin(pg_temp_wanted)), make_move_iterator(end(pg_temp_wanted))); #endif pg_temp_wanted.clear(); } void OSDService::requeue_pg_temp() { std::lock_guard l(pg_temp_lock); // wanted overrides pending. note that remove_want_pg_temp // clears the item out of both. unsigned old_wanted = pg_temp_wanted.size(); unsigned old_pending = pg_temp_pending.size(); _sent_pg_temp(); pg_temp_wanted.swap(pg_temp_pending); dout(10) << __func__ << " " << old_wanted << " + " << old_pending << " -> " << pg_temp_wanted.size() << dendl; } std::ostream& operator<<(std::ostream& out, const OSDService::pg_temp_t& pg_temp) { out << pg_temp.acting; if (pg_temp.forced) { out << " (forced)"; } return out; } void OSDService::send_pg_temp() { std::lock_guard l(pg_temp_lock); if (pg_temp_wanted.empty()) return; dout(10) << "send_pg_temp " << pg_temp_wanted << dendl; MOSDPGTemp *ms[2] = {nullptr, nullptr}; for (auto& [pgid, pg_temp] : pg_temp_wanted) { auto& m = ms[pg_temp.forced]; if (!m) { m = new MOSDPGTemp(osdmap->get_epoch()); m->forced = pg_temp.forced; } m->pg_temp.emplace(pgid, pg_temp.acting); } for (auto m : ms) { if (m) { monc->send_mon_message(m); } } _sent_pg_temp(); } void OSDService::send_pg_created(pg_t pgid) { std::lock_guard l(pg_created_lock); dout(20) << __func__ << dendl; auto o = get_osdmap(); if (o->require_osd_release >= CEPH_RELEASE_LUMINOUS) { pg_created.insert(pgid); monc->send_mon_message(new MOSDPGCreated(pgid)); } } void OSDService::send_pg_created() { std::lock_guard l(pg_created_lock); dout(20) << __func__ << dendl; auto o = get_osdmap(); if (o->require_osd_release >= CEPH_RELEASE_LUMINOUS) { for (auto pgid : pg_created) { monc->send_mon_message(new MOSDPGCreated(pgid)); } } } void OSDService::prune_pg_created() { std::lock_guard l(pg_created_lock); dout(20) << __func__ << dendl; auto o = get_osdmap(); auto i = pg_created.begin(); while (i != pg_created.end()) { auto p = o->get_pg_pool(i->pool()); if (!p || !p->has_flag(pg_pool_t::FLAG_CREATING)) { dout(20) << __func__ << " pruning " << *i << dendl; i = pg_created.erase(i); } else { dout(20) << __func__ << " keeping " << *i << dendl; ++i; } } } // -------------------------------------- // dispatch epoch_t OSDService::get_peer_epoch(int peer) { std::lock_guard l(peer_map_epoch_lock); map::iterator p = peer_map_epoch.find(peer); if (p == peer_map_epoch.end()) return 0; return p->second; } epoch_t OSDService::note_peer_epoch(int peer, epoch_t e) { std::lock_guard l(peer_map_epoch_lock); map::iterator p = peer_map_epoch.find(peer); if (p != peer_map_epoch.end()) { if (p->second < e) { dout(10) << "note_peer_epoch osd." << peer << " has " << e << dendl; p->second = e; } else { dout(30) << "note_peer_epoch osd." << peer << " has " << p->second << " >= " << e << dendl; } return p->second; } else { dout(10) << "note_peer_epoch osd." << peer << " now has " << e << dendl; peer_map_epoch[peer] = e; return e; } } void OSDService::forget_peer_epoch(int peer, epoch_t as_of) { std::lock_guard l(peer_map_epoch_lock); map::iterator p = peer_map_epoch.find(peer); if (p != peer_map_epoch.end()) { if (p->second <= as_of) { dout(10) << "forget_peer_epoch osd." << peer << " as_of " << as_of << " had " << p->second << dendl; peer_map_epoch.erase(p); } else { dout(10) << "forget_peer_epoch osd." << peer << " as_of " << as_of << " has " << p->second << " - not forgetting" << dendl; } } } bool OSDService::should_share_map(entity_name_t name, Connection *con, epoch_t epoch, const OSDMapRef& osdmap, const epoch_t *sent_epoch_p) { dout(20) << "should_share_map " << name << " " << con->get_peer_addr() << " " << epoch << dendl; // does client have old map? if (name.is_client()) { bool message_sendmap = epoch < osdmap->get_epoch(); if (message_sendmap && sent_epoch_p) { dout(20) << "client session last_sent_epoch: " << *sent_epoch_p << " versus osdmap epoch " << osdmap->get_epoch() << dendl; if (*sent_epoch_p < osdmap->get_epoch()) { return true; } // else we don't need to send it out again } } if (con->get_messenger() == osd->cluster_messenger && con != osd->cluster_messenger->get_loopback_connection() && osdmap->is_up(name.num()) && (osdmap->get_cluster_addrs(name.num()) == con->get_peer_addrs() || osdmap->get_hb_back_addrs(name.num()) == con->get_peer_addrs())) { // remember epoch_t has = std::max(get_peer_epoch(name.num()), epoch); // share? if (has < osdmap->get_epoch()) { dout(10) << name << " " << con->get_peer_addr() << " has old map " << epoch << " < " << osdmap->get_epoch() << dendl; return true; } } return false; } void OSDService::share_map( entity_name_t name, Connection *con, epoch_t epoch, OSDMapRef& osdmap, epoch_t *sent_epoch_p) { dout(20) << "share_map " << name << " " << con->get_peer_addr() << " " << epoch << dendl; if (!osd->is_active()) { /*It is safe not to proceed as OSD is not in healthy state*/ return; } bool want_shared = should_share_map(name, con, epoch, osdmap, sent_epoch_p); if (want_shared){ if (name.is_client()) { dout(10) << name << " has old map " << epoch << " < " << osdmap->get_epoch() << dendl; // we know the Session is valid or we wouldn't be sending if (sent_epoch_p) { *sent_epoch_p = osdmap->get_epoch(); } send_incremental_map(epoch, con, osdmap); } else if (con->get_messenger() == osd->cluster_messenger && osdmap->is_up(name.num()) && (osdmap->get_cluster_addrs(name.num()) == con->get_peer_addrs() || osdmap->get_hb_back_addrs(name.num()) == con->get_peer_addrs())) { dout(10) << name << " " << con->get_peer_addrs() << " has old map " << epoch << " < " << osdmap->get_epoch() << dendl; note_peer_epoch(name.num(), osdmap->get_epoch()); send_incremental_map(epoch, con, osdmap); } } } void OSDService::share_map_peer(int peer, Connection *con, OSDMapRef map) { if (!map) map = get_osdmap(); // send map? epoch_t pe = get_peer_epoch(peer); if (pe) { if (pe < map->get_epoch()) { send_incremental_map(pe, con, map); note_peer_epoch(peer, map->get_epoch()); } else dout(20) << "share_map_peer " << con << " already has epoch " << pe << dendl; } else { dout(20) << "share_map_peer " << con << " don't know epoch, doing nothing" << dendl; // no idea about peer's epoch. // ??? send recent ??? // do nothing. } } bool OSDService::can_inc_scrubs() { bool can_inc = false; std::lock_guard l(sched_scrub_lock); if (scrubs_local + scrubs_remote < cct->_conf->osd_max_scrubs) { dout(20) << __func__ << " == true " << scrubs_local << " local + " << scrubs_remote << " remote < max " << cct->_conf->osd_max_scrubs << dendl; can_inc = true; } else { dout(20) << __func__ << " == false " << scrubs_local << " local + " << scrubs_remote << " remote >= max " << cct->_conf->osd_max_scrubs << dendl; } return can_inc; } bool OSDService::inc_scrubs_local() { bool result = false; std::lock_guard l{sched_scrub_lock}; if (scrubs_local + scrubs_remote < cct->_conf->osd_max_scrubs) { dout(20) << __func__ << " " << scrubs_local << " -> " << (scrubs_local+1) << " (max " << cct->_conf->osd_max_scrubs << ", remote " << scrubs_remote << ")" << dendl; result = true; ++scrubs_local; } else { dout(20) << __func__ << " " << scrubs_local << " local + " << scrubs_remote << " remote >= max " << cct->_conf->osd_max_scrubs << dendl; } return result; } void OSDService::dec_scrubs_local() { std::lock_guard l{sched_scrub_lock}; dout(20) << __func__ << " " << scrubs_local << " -> " << (scrubs_local-1) << " (max " << cct->_conf->osd_max_scrubs << ", remote " << scrubs_remote << ")" << dendl; --scrubs_local; ceph_assert(scrubs_local >= 0); } bool OSDService::inc_scrubs_remote() { bool result = false; std::lock_guard l{sched_scrub_lock}; if (scrubs_local + scrubs_remote < cct->_conf->osd_max_scrubs) { dout(20) << __func__ << " " << scrubs_remote << " -> " << (scrubs_remote+1) << " (max " << cct->_conf->osd_max_scrubs << ", local " << scrubs_local << ")" << dendl; result = true; ++scrubs_remote; } else { dout(20) << __func__ << " " << scrubs_local << " local + " << scrubs_remote << " remote >= max " << cct->_conf->osd_max_scrubs << dendl; } return result; } void OSDService::dec_scrubs_remote() { std::lock_guard l{sched_scrub_lock}; dout(20) << __func__ << " " << scrubs_remote << " -> " << (scrubs_remote-1) << " (max " << cct->_conf->osd_max_scrubs << ", local " << scrubs_local << ")" << dendl; --scrubs_remote; ceph_assert(scrubs_remote >= 0); } void OSDService::dump_scrub_reservations(Formatter *f) { std::lock_guard l{sched_scrub_lock}; f->dump_int("scrubs_local", scrubs_local); f->dump_int("scrubs_remote", scrubs_remote); f->dump_int("osd_max_scrubs", cct->_conf->osd_max_scrubs); } void OSDService::retrieve_epochs(epoch_t *_boot_epoch, epoch_t *_up_epoch, epoch_t *_bind_epoch) const { std::lock_guard l(epoch_lock); if (_boot_epoch) *_boot_epoch = boot_epoch; if (_up_epoch) *_up_epoch = up_epoch; if (_bind_epoch) *_bind_epoch = bind_epoch; } void OSDService::set_epochs(const epoch_t *_boot_epoch, const epoch_t *_up_epoch, const epoch_t *_bind_epoch) { std::lock_guard l(epoch_lock); if (_boot_epoch) { ceph_assert(*_boot_epoch == 0 || *_boot_epoch >= boot_epoch); boot_epoch = *_boot_epoch; } if (_up_epoch) { ceph_assert(*_up_epoch == 0 || *_up_epoch >= up_epoch); up_epoch = *_up_epoch; } if (_bind_epoch) { ceph_assert(*_bind_epoch == 0 || *_bind_epoch >= bind_epoch); bind_epoch = *_bind_epoch; } } bool OSDService::prepare_to_stop() { std::lock_guard l(is_stopping_lock); if (get_state() != NOT_STOPPING) return false; OSDMapRef osdmap = get_osdmap(); if (osdmap && osdmap->is_up(whoami)) { dout(0) << __func__ << " telling mon we are shutting down" << dendl; set_state(PREPARING_TO_STOP); monc->send_mon_message( new MOSDMarkMeDown( monc->get_fsid(), whoami, osdmap->get_addrs(whoami), osdmap->get_epoch(), true // request ack )); utime_t now = ceph_clock_now(); utime_t timeout; timeout.set_from_double(now + cct->_conf->osd_mon_shutdown_timeout); while ((ceph_clock_now() < timeout) && (get_state() != STOPPING)) { is_stopping_cond.WaitUntil(is_stopping_lock, timeout); } } dout(0) << __func__ << " starting shutdown" << dendl; set_state(STOPPING); return true; } void OSDService::got_stop_ack() { std::lock_guard l(is_stopping_lock); if (get_state() == PREPARING_TO_STOP) { dout(0) << __func__ << " starting shutdown" << dendl; set_state(STOPPING); is_stopping_cond.Signal(); } else { dout(10) << __func__ << " ignoring msg" << dendl; } } MOSDMap *OSDService::build_incremental_map_msg(epoch_t since, epoch_t to, OSDSuperblock& sblock) { MOSDMap *m = new MOSDMap(monc->get_fsid(), osdmap->get_encoding_features()); m->oldest_map = max_oldest_map; m->newest_map = sblock.newest_map; int max = cct->_conf->osd_map_message_max; ssize_t max_bytes = cct->_conf->osd_map_message_max_bytes; if (since < m->oldest_map) { // we don't have the next map the target wants, so start with a // full map. bufferlist bl; dout(10) << __func__ << " oldest map " << max_oldest_map << " > since " << since << ", starting with full map" << dendl; since = m->oldest_map; if (!get_map_bl(since, bl)) { derr << __func__ << " missing full map " << since << dendl; goto panic; } max--; max_bytes -= bl.length(); m->maps[since].claim(bl); } for (epoch_t e = since + 1; e <= to; ++e) { bufferlist bl; if (get_inc_map_bl(e, bl)) { m->incremental_maps[e].claim(bl); } else { dout(10) << __func__ << " missing incremental map " << e << dendl; if (!get_map_bl(e, bl)) { derr << __func__ << " also missing full map " << e << dendl; goto panic; } m->maps[e].claim(bl); } max--; max_bytes -= bl.length(); if (max <= 0 || max_bytes <= 0) { break; } } return m; panic: if (!m->maps.empty() || !m->incremental_maps.empty()) { // send what we have so far return m; } // send something bufferlist bl; if (get_inc_map_bl(m->newest_map, bl)) { m->incremental_maps[m->newest_map].claim(bl); } else { derr << __func__ << " unable to load latest map " << m->newest_map << dendl; if (!get_map_bl(m->newest_map, bl)) { derr << __func__ << " unable to load latest full map " << m->newest_map << dendl; ceph_abort(); } m->maps[m->newest_map].claim(bl); } return m; } void OSDService::send_map(MOSDMap *m, Connection *con) { con->send_message(m); } void OSDService::send_incremental_map(epoch_t since, Connection *con, const OSDMapRef& osdmap) { epoch_t to = osdmap->get_epoch(); dout(10) << "send_incremental_map " << since << " -> " << to << " to " << con << " " << con->get_peer_addr() << dendl; MOSDMap *m = NULL; while (!m) { OSDSuperblock sblock(get_superblock()); if (since < sblock.oldest_map) { // just send latest full map MOSDMap *m = new MOSDMap(monc->get_fsid(), osdmap->get_encoding_features()); m->oldest_map = max_oldest_map; m->newest_map = sblock.newest_map; get_map_bl(to, m->maps[to]); send_map(m, con); return; } if (to > since && (int64_t)(to - since) > cct->_conf->osd_map_share_max_epochs) { dout(10) << " " << (to - since) << " > max " << cct->_conf->osd_map_share_max_epochs << ", only sending most recent" << dendl; since = to - cct->_conf->osd_map_share_max_epochs; } m = build_incremental_map_msg(since, to, sblock); } send_map(m, con); } bool OSDService::_get_map_bl(epoch_t e, bufferlist& bl) { bool found = map_bl_cache.lookup(e, &bl); if (found) { if (logger) logger->inc(l_osd_map_bl_cache_hit); return true; } if (logger) logger->inc(l_osd_map_bl_cache_miss); found = store->read(meta_ch, OSD::get_osdmap_pobject_name(e), 0, 0, bl, CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0; if (found) { _add_map_bl(e, bl); } return found; } bool OSDService::get_inc_map_bl(epoch_t e, bufferlist& bl) { std::lock_guard l(map_cache_lock); bool found = map_bl_inc_cache.lookup(e, &bl); if (found) { if (logger) logger->inc(l_osd_map_bl_cache_hit); return true; } if (logger) logger->inc(l_osd_map_bl_cache_miss); found = store->read(meta_ch, OSD::get_inc_osdmap_pobject_name(e), 0, 0, bl, CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0; if (found) { _add_map_inc_bl(e, bl); } return found; } void OSDService::_add_map_bl(epoch_t e, bufferlist& bl) { dout(10) << "add_map_bl " << e << " " << bl.length() << " bytes" << dendl; // cache a contiguous buffer if (bl.get_num_buffers() > 1) { bl.rebuild(); } bl.try_assign_to_mempool(mempool::mempool_osd_mapbl); map_bl_cache.add(e, bl); } void OSDService::_add_map_inc_bl(epoch_t e, bufferlist& bl) { dout(10) << "add_map_inc_bl " << e << " " << bl.length() << " bytes" << dendl; // cache a contiguous buffer if (bl.get_num_buffers() > 1) { bl.rebuild(); } bl.try_assign_to_mempool(mempool::mempool_osd_mapbl); map_bl_inc_cache.add(e, bl); } int OSDService::get_deleted_pool_pg_num(int64_t pool) { std::lock_guard l(map_cache_lock); auto p = deleted_pool_pg_nums.find(pool); if (p != deleted_pool_pg_nums.end()) { return p->second; } dout(20) << __func__ << " " << pool << " loading" << dendl; ghobject_t oid = OSD::make_final_pool_info_oid(pool); bufferlist bl; int r = store->read(meta_ch, oid, 0, 0, bl); ceph_assert(r >= 0); auto blp = bl.cbegin(); pg_pool_t pi; ::decode(pi, blp); deleted_pool_pg_nums[pool] = pi.get_pg_num(); dout(20) << __func__ << " " << pool << " got " << pi.get_pg_num() << dendl; return pi.get_pg_num(); } OSDMapRef OSDService::_add_map(OSDMap *o) { epoch_t e = o->get_epoch(); if (cct->_conf->osd_map_dedup) { // Dedup against an existing map at a nearby epoch OSDMapRef for_dedup = map_cache.lower_bound(e); if (for_dedup) { OSDMap::dedup(for_dedup.get(), o); } } bool existed; OSDMapRef l = map_cache.add(e, o, &existed); if (existed) { delete o; } return l; } OSDMapRef OSDService::try_get_map(epoch_t epoch) { std::lock_guard l(map_cache_lock); OSDMapRef retval = map_cache.lookup(epoch); if (retval) { dout(30) << "get_map " << epoch << " -cached" << dendl; if (logger) { logger->inc(l_osd_map_cache_hit); } return retval; } if (logger) { logger->inc(l_osd_map_cache_miss); epoch_t lb = map_cache.cached_key_lower_bound(); if (epoch < lb) { dout(30) << "get_map " << epoch << " - miss, below lower bound" << dendl; logger->inc(l_osd_map_cache_miss_low); logger->inc(l_osd_map_cache_miss_low_avg, lb - epoch); } } OSDMap *map = new OSDMap; if (epoch > 0) { dout(20) << "get_map " << epoch << " - loading and decoding " << map << dendl; bufferlist bl; if (!_get_map_bl(epoch, bl) || bl.length() == 0) { derr << "failed to load OSD map for epoch " << epoch << ", got " << bl.length() << " bytes" << dendl; delete map; return OSDMapRef(); } map->decode(bl); } else { dout(20) << "get_map " << epoch << " - return initial " << map << dendl; } return _add_map(map); } // ops void OSDService::reply_op_error(OpRequestRef op, int err) { reply_op_error(op, err, eversion_t(), 0); } void OSDService::reply_op_error(OpRequestRef op, int err, eversion_t v, version_t uv) { const MOSDOp *m = static_cast(op->get_req()); ceph_assert(m->get_type() == CEPH_MSG_OSD_OP); int flags; flags = m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK); MOSDOpReply *reply = new MOSDOpReply(m, err, osdmap->get_epoch(), flags, true); reply->set_reply_versions(v, uv); m->get_connection()->send_message(reply); } void OSDService::handle_misdirected_op(PG *pg, OpRequestRef op) { if (!cct->_conf->osd_debug_misdirected_ops) { return; } const MOSDOp *m = static_cast(op->get_req()); ceph_assert(m->get_type() == CEPH_MSG_OSD_OP); ceph_assert(m->get_map_epoch() >= pg->get_history().same_primary_since); if (pg->is_ec_pg()) { /** * OSD recomputes op target based on current OSDMap. With an EC pg, we * can get this result: * 1) client at map 512 sends an op to osd 3, pg_t 3.9 based on mapping * [CRUSH_ITEM_NONE, 2, 3]/3 * 2) OSD 3 at map 513 remaps op to osd 3, spg_t 3.9s0 based on mapping * [3, 2, 3]/3 * 3) PG 3.9s0 dequeues the op at epoch 512 and notices that it isn't primary * -- misdirected op * 4) client resends and this time PG 3.9s0 having caught up to 513 gets * it and fulfils it * * We can't compute the op target based on the sending map epoch due to * splitting. The simplest thing is to detect such cases here and drop * them without an error (the client will resend anyway). */ ceph_assert(m->get_map_epoch() <= superblock.newest_map); OSDMapRef opmap = try_get_map(m->get_map_epoch()); if (!opmap) { dout(7) << __func__ << ": " << *pg << " no longer have map for " << m->get_map_epoch() << ", dropping" << dendl; return; } pg_t _pgid = m->get_raw_pg(); spg_t pgid; if ((m->get_flags() & CEPH_OSD_FLAG_PGOP) == 0) _pgid = opmap->raw_pg_to_pg(_pgid); if (opmap->get_primary_shard(_pgid, &pgid) && pgid.shard != pg->pg_id.shard) { dout(7) << __func__ << ": " << *pg << " primary changed since " << m->get_map_epoch() << ", dropping" << dendl; return; } } dout(7) << *pg << " misdirected op in " << m->get_map_epoch() << dendl; clog->warn() << m->get_source_inst() << " misdirected " << m->get_reqid() << " pg " << m->get_raw_pg() << " to osd." << whoami << " not " << pg->get_acting() << " in e" << m->get_map_epoch() << "/" << osdmap->get_epoch(); } void OSDService::enqueue_back(OpQueueItem&& qi) { osd->op_shardedwq.queue(std::move(qi)); } void OSDService::enqueue_front(OpQueueItem&& qi) { osd->op_shardedwq.queue_front(std::move(qi)); } void OSDService::queue_recovery_context( PG *pg, GenContext *c) { epoch_t e = get_osdmap_epoch(); enqueue_back( OpQueueItem( unique_ptr( new PGRecoveryContext(pg->get_pgid(), c, e)), cct->_conf->osd_recovery_cost, cct->_conf->osd_recovery_priority, ceph_clock_now(), 0, e)); } void OSDService::queue_for_snap_trim(PG *pg) { dout(10) << "queueing " << *pg << " for snaptrim" << dendl; enqueue_back( OpQueueItem( unique_ptr( new PGSnapTrim(pg->get_pgid(), pg->get_osdmap_epoch())), cct->_conf->osd_snap_trim_cost, cct->_conf->osd_snap_trim_priority, ceph_clock_now(), 0, pg->get_osdmap_epoch())); } void OSDService::queue_for_scrub(PG *pg, bool with_high_priority) { unsigned scrub_queue_priority = pg->scrubber.priority; if (with_high_priority && scrub_queue_priority < cct->_conf->osd_client_op_priority) { scrub_queue_priority = cct->_conf->osd_client_op_priority; } const auto epoch = pg->get_osdmap_epoch(); enqueue_back( OpQueueItem( unique_ptr(new PGScrub(pg->get_pgid(), epoch)), cct->_conf->osd_scrub_cost, scrub_queue_priority, ceph_clock_now(), 0, epoch)); } void OSDService::queue_for_pg_delete(spg_t pgid, epoch_t e) { dout(10) << __func__ << " on " << pgid << " e " << e << dendl; enqueue_back( OpQueueItem( unique_ptr( new PGDelete(pgid, e)), cct->_conf->osd_pg_delete_cost, cct->_conf->osd_pg_delete_priority, ceph_clock_now(), 0, e)); } bool OSDService::try_finish_pg_delete(PG *pg, unsigned old_pg_num) { return osd->try_finish_pg_delete(pg, old_pg_num); } // --- void OSDService::set_ready_to_merge_source(PG *pg, eversion_t version) { std::lock_guard l(merge_lock); dout(10) << __func__ << " " << pg->pg_id << dendl; ready_to_merge_source[pg->pg_id.pgid] = version; assert(not_ready_to_merge_source.count(pg->pg_id.pgid) == 0); _send_ready_to_merge(); } void OSDService::set_ready_to_merge_target(PG *pg, eversion_t version, epoch_t last_epoch_started, epoch_t last_epoch_clean) { std::lock_guard l(merge_lock); dout(10) << __func__ << " " << pg->pg_id << dendl; ready_to_merge_target.insert(make_pair(pg->pg_id.pgid, make_tuple(version, last_epoch_started, last_epoch_clean))); assert(not_ready_to_merge_target.count(pg->pg_id.pgid) == 0); _send_ready_to_merge(); } void OSDService::set_not_ready_to_merge_source(pg_t source) { std::lock_guard l(merge_lock); dout(10) << __func__ << " " << source << dendl; not_ready_to_merge_source.insert(source); assert(ready_to_merge_source.count(source) == 0); _send_ready_to_merge(); } void OSDService::set_not_ready_to_merge_target(pg_t target, pg_t source) { std::lock_guard l(merge_lock); dout(10) << __func__ << " " << target << " source " << source << dendl; not_ready_to_merge_target[target] = source; assert(ready_to_merge_target.count(target) == 0); _send_ready_to_merge(); } void OSDService::send_ready_to_merge() { std::lock_guard l(merge_lock); _send_ready_to_merge(); } void OSDService::_send_ready_to_merge() { dout(20) << __func__ << " ready_to_merge_source " << ready_to_merge_source << " not_ready_to_merge_source " << not_ready_to_merge_source << " ready_to_merge_target " << ready_to_merge_target << " not_ready_to_merge_target " << not_ready_to_merge_target << " sent_ready_to_merge_source " << sent_ready_to_merge_source << dendl; for (auto src : not_ready_to_merge_source) { if (sent_ready_to_merge_source.count(src) == 0) { monc->send_mon_message(new MOSDPGReadyToMerge( src, {}, {}, 0, 0, false, osdmap->get_epoch())); sent_ready_to_merge_source.insert(src); } } for (auto p : not_ready_to_merge_target) { if (sent_ready_to_merge_source.count(p.second) == 0) { monc->send_mon_message(new MOSDPGReadyToMerge( p.second, {}, {}, 0, 0, false, osdmap->get_epoch())); sent_ready_to_merge_source.insert(p.second); } } for (auto src : ready_to_merge_source) { if (not_ready_to_merge_source.count(src.first) || not_ready_to_merge_target.count(src.first.get_parent())) { continue; } auto p = ready_to_merge_target.find(src.first.get_parent()); if (p != ready_to_merge_target.end() && sent_ready_to_merge_source.count(src.first) == 0) { monc->send_mon_message(new MOSDPGReadyToMerge( src.first, // source pgid src.second, // src version std::get<0>(p->second), // target version std::get<1>(p->second), // PG's last_epoch_started std::get<2>(p->second), // PG's last_epoch_clean true, osdmap->get_epoch())); sent_ready_to_merge_source.insert(src.first); } } } void OSDService::clear_ready_to_merge(PG *pg) { std::lock_guard l(merge_lock); dout(10) << __func__ << " " << pg->pg_id << dendl; ready_to_merge_source.erase(pg->pg_id.pgid); ready_to_merge_target.erase(pg->pg_id.pgid); not_ready_to_merge_source.erase(pg->pg_id.pgid); not_ready_to_merge_target.erase(pg->pg_id.pgid); sent_ready_to_merge_source.erase(pg->pg_id.pgid); } void OSDService::clear_sent_ready_to_merge() { std::lock_guard l(merge_lock); sent_ready_to_merge_source.clear(); } void OSDService::prune_sent_ready_to_merge(const OSDMapRef& osdmap) { std::lock_guard l(merge_lock); auto i = sent_ready_to_merge_source.begin(); while (i != sent_ready_to_merge_source.end()) { if (!osdmap->pg_exists(*i)) { dout(10) << __func__ << " " << *i << dendl; i = sent_ready_to_merge_source.erase(i); } else { ++i; } } } // --- void OSDService::_queue_for_recovery( std::pair p, uint64_t reserved_pushes) { ceph_assert(recovery_lock.is_locked_by_me()); enqueue_back( OpQueueItem( unique_ptr( new PGRecovery( p.second->get_pgid(), p.first, reserved_pushes)), cct->_conf->osd_recovery_cost, cct->_conf->osd_recovery_priority, ceph_clock_now(), 0, p.first)); } // ==================================================================== // OSD #undef dout_prefix #define dout_prefix *_dout // Commands shared between OSD's console and admin console: namespace ceph { namespace osd_cmds { int heap(CephContext& cct, const cmdmap_t& cmdmap, Formatter& f, std::ostream& os); }} // namespace ceph::osd_cmds int OSD::mkfs(CephContext *cct, ObjectStore *store, uuid_d fsid, int whoami) { int ret; OSDSuperblock sb; bufferlist sbbl; ObjectStore::CollectionHandle ch; // if we are fed a uuid for this osd, use it. store->set_fsid(cct->_conf->osd_uuid); ret = store->mkfs(); if (ret) { derr << "OSD::mkfs: ObjectStore::mkfs failed with error " << cpp_strerror(ret) << dendl; goto free_store; } store->set_cache_shards(1); // doesn't matter for mkfs! ret = store->mount(); if (ret) { derr << "OSD::mkfs: couldn't mount ObjectStore: error " << cpp_strerror(ret) << dendl; goto free_store; } ch = store->open_collection(coll_t::meta()); if (ch) { ret = store->read(ch, OSD_SUPERBLOCK_GOBJECT, 0, 0, sbbl); if (ret < 0) { derr << "OSD::mkfs: have meta collection but no superblock" << dendl; goto free_store; } /* if we already have superblock, check content of superblock */ dout(0) << " have superblock" << dendl; auto p = sbbl.cbegin(); decode(sb, p); if (whoami != sb.whoami) { derr << "provided osd id " << whoami << " != superblock's " << sb.whoami << dendl; ret = -EINVAL; goto umount_store; } if (fsid != sb.cluster_fsid) { derr << "provided cluster fsid " << fsid << " != superblock's " << sb.cluster_fsid << dendl; ret = -EINVAL; goto umount_store; } } else { // create superblock sb.cluster_fsid = fsid; sb.osd_fsid = store->get_fsid(); sb.whoami = whoami; sb.compat_features = get_osd_initial_compat_set(); bufferlist bl; encode(sb, bl); ObjectStore::CollectionHandle ch = store->create_new_collection( coll_t::meta()); ObjectStore::Transaction t; t.create_collection(coll_t::meta(), 0); t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl); ret = store->queue_transaction(ch, std::move(t)); if (ret) { derr << "OSD::mkfs: error while writing OSD_SUPERBLOCK_GOBJECT: " << "queue_transaction returned " << cpp_strerror(ret) << dendl; goto umount_store; } } ret = write_meta(cct, store, sb.cluster_fsid, sb.osd_fsid, whoami); if (ret) { derr << "OSD::mkfs: failed to write fsid file: error " << cpp_strerror(ret) << dendl; goto umount_store; } umount_store: if (ch) { ch.reset(); } store->umount(); free_store: delete store; return ret; } int OSD::write_meta(CephContext *cct, ObjectStore *store, uuid_d& cluster_fsid, uuid_d& osd_fsid, int whoami) { char val[80]; int r; snprintf(val, sizeof(val), "%s", CEPH_OSD_ONDISK_MAGIC); r = store->write_meta("magic", val); if (r < 0) return r; snprintf(val, sizeof(val), "%d", whoami); r = store->write_meta("whoami", val); if (r < 0) return r; cluster_fsid.print(val); r = store->write_meta("ceph_fsid", val); if (r < 0) return r; string key = cct->_conf.get_val("key"); if (key.size()) { r = store->write_meta("osd_key", key); if (r < 0) return r; } else { string keyfile = cct->_conf.get_val("keyfile"); if (!keyfile.empty()) { bufferlist keybl; string err; r = keybl.read_file(keyfile.c_str(), &err); if (r < 0) { derr << __func__ << " failed to read keyfile " << keyfile << ": " << err << ": " << cpp_strerror(r) << dendl; return r; } r = store->write_meta("osd_key", keybl.to_str()); if (r < 0) return r; } } r = store->write_meta("ready", "ready"); if (r < 0) return r; return 0; } int OSD::peek_meta(ObjectStore *store, std::string *magic, uuid_d *cluster_fsid, uuid_d *osd_fsid, int *whoami, int *require_osd_release) { string val; int r = store->read_meta("magic", &val); if (r < 0) return r; *magic = val; r = store->read_meta("whoami", &val); if (r < 0) return r; *whoami = atoi(val.c_str()); r = store->read_meta("ceph_fsid", &val); if (r < 0) return r; r = cluster_fsid->parse(val.c_str()); if (!r) return -EINVAL; r = store->read_meta("fsid", &val); if (r < 0) { *osd_fsid = uuid_d(); } else { r = osd_fsid->parse(val.c_str()); if (!r) return -EINVAL; } r = store->read_meta("require_osd_release", &val); if (r >= 0) { *require_osd_release = atoi(val.c_str()); } return 0; } #undef dout_prefix #define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch()) // cons/des OSD::OSD(CephContext *cct_, ObjectStore *store_, int id, Messenger *internal_messenger, Messenger *external_messenger, Messenger *hb_client_front, Messenger *hb_client_back, Messenger *hb_front_serverm, Messenger *hb_back_serverm, Messenger *osdc_messenger, MonClient *mc, const std::string &dev, const std::string &jdev) : Dispatcher(cct_), osd_lock("OSD::osd_lock"), tick_timer(cct, osd_lock), tick_timer_lock("OSD::tick_timer_lock"), tick_timer_without_osd_lock(cct, tick_timer_lock), gss_ktfile_client(cct->_conf.get_val("gss_ktab_client_file")), cluster_messenger(internal_messenger), client_messenger(external_messenger), objecter_messenger(osdc_messenger), monc(mc), mgrc(cct_, client_messenger), logger(NULL), recoverystate_perf(NULL), store(store_), log_client(cct, client_messenger, &mc->monmap, LogClient::NO_FLAGS), clog(log_client.create_channel()), whoami(id), dev_path(dev), journal_path(jdev), store_is_rotational(store->is_rotational()), trace_endpoint("0.0.0.0", 0, "osd"), asok_hook(NULL), m_osd_pg_epoch_max_lag_factor(cct->_conf.get_val( "osd_pg_epoch_max_lag_factor")), osd_compat(get_osd_compat_set()), osd_op_tp(cct, "OSD::osd_op_tp", "tp_osd_tp", get_num_op_threads()), command_tp(cct, "OSD::command_tp", "tp_osd_cmd", 1), session_waiting_lock("OSD::session_waiting_lock"), osdmap_subscribe_lock("OSD::osdmap_subscribe_lock"), heartbeat_lock("OSD::heartbeat_lock"), heartbeat_stop(false), heartbeat_need_update(true), hb_front_client_messenger(hb_client_front), hb_back_client_messenger(hb_client_back), hb_front_server_messenger(hb_front_serverm), hb_back_server_messenger(hb_back_serverm), daily_loadavg(0.0), heartbeat_thread(this), heartbeat_dispatcher(this), op_tracker(cct, cct->_conf->osd_enable_op_tracker, cct->_conf->osd_num_op_tracker_shard), test_ops_hook(NULL), op_queue(get_io_queue()), op_prio_cutoff(get_io_prio_cut()), op_shardedwq( this, cct->_conf->osd_op_thread_timeout, cct->_conf->osd_op_thread_suicide_timeout, &osd_op_tp), map_lock("OSD::map_lock"), last_pg_create_epoch(0), mon_report_lock("OSD::mon_report_lock"), boot_finisher(cct), up_thru_wanted(0), requested_full_first(0), requested_full_last(0), command_wq( this, cct->_conf->osd_command_thread_timeout, cct->_conf->osd_command_thread_suicide_timeout, &command_tp), service(this) { if (!gss_ktfile_client.empty()) { // Assert we can export environment variable /* The default client keytab is used, if it is present and readable, to automatically obtain initial credentials for GSSAPI client applications. The principal name of the first entry in the client keytab is used by default when obtaining initial credentials. 1. The KRB5_CLIENT_KTNAME environment variable. 2. The default_client_keytab_name profile variable in [libdefaults]. 3. The hardcoded default, DEFCKTNAME. */ const int32_t set_result(setenv("KRB5_CLIENT_KTNAME", gss_ktfile_client.c_str(), 1)); ceph_assert(set_result == 0); } monc->set_messenger(client_messenger); op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time, cct->_conf->osd_op_log_threshold); op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size, cct->_conf->osd_op_history_duration); op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size, cct->_conf->osd_op_history_slow_op_threshold); #ifdef WITH_BLKIN std::stringstream ss; ss << "osd." << whoami; trace_endpoint.copy_name(ss.str()); #endif // initialize shards num_shards = get_num_op_shards(); for (uint32_t i = 0; i < num_shards; i++) { OSDShard *one_shard = new OSDShard( i, cct, this, cct->_conf->osd_op_pq_max_tokens_per_priority, cct->_conf->osd_op_pq_min_cost, op_queue); shards.push_back(one_shard); } } OSD::~OSD() { while (!shards.empty()) { delete shards.back(); shards.pop_back(); } delete class_handler; cct->get_perfcounters_collection()->remove(recoverystate_perf); cct->get_perfcounters_collection()->remove(logger); delete recoverystate_perf; delete logger; delete store; } double OSD::get_tick_interval() const { // vary +/- 5% to avoid scrub scheduling livelocks constexpr auto delta = 0.05; return (OSD_TICK_INTERVAL * ceph::util::generate_random_number(1.0 - delta, 1.0 + delta)); } void cls_initialize(ClassHandler *ch); void OSD::handle_signal(int signum) { ceph_assert(signum == SIGINT || signum == SIGTERM); derr << "*** Got signal " << sig_str(signum) << " ***" << dendl; shutdown(); } int OSD::pre_init() { std::lock_guard lock(osd_lock); if (is_stopping()) return 0; if (store->test_mount_in_use()) { derr << "OSD::pre_init: object store '" << dev_path << "' is " << "currently in use. (Is ceph-osd already running?)" << dendl; return -EBUSY; } cct->_conf.add_observer(this); return 0; } int OSD::set_numa_affinity() { // storage numa node int store_node = -1; store->get_numa_node(&store_node, nullptr, nullptr); if (store_node >= 0) { dout(1) << __func__ << " storage numa node " << store_node << dendl; } // check network numa node(s) int front_node = -1, back_node = -1; string front_iface = pick_iface( cct, client_messenger->get_myaddrs().front().get_sockaddr_storage()); string back_iface = pick_iface( cct, cluster_messenger->get_myaddrs().front().get_sockaddr_storage()); int r = get_iface_numa_node(front_iface, &front_node); if (r >= 0 && front_node >= 0) { dout(1) << __func__ << " public network " << front_iface << " numa node " << front_node << dendl; r = get_iface_numa_node(back_iface, &back_node); if (r >= 0 && back_node >= 0) { dout(1) << __func__ << " cluster network " << back_iface << " numa node " << back_node << dendl; if (front_node == back_node && front_node == store_node) { dout(1) << " objectstore and network numa nodes all match" << dendl; if (g_conf().get_val("osd_numa_auto_affinity")) { numa_node = front_node; } } else if (front_node != back_node) { dout(1) << __func__ << " public and cluster network numa nodes do not match" << dendl; } else { dout(1) << __func__ << " objectstore and network numa nodes do not match" << dendl; } } else if (back_node == -2) { dout(1) << __func__ << " cluster network " << back_iface << " ports numa nodes do not match" << dendl; } else { derr << __func__ << " unable to identify cluster interface '" << back_iface << "' numa node: " << cpp_strerror(r) << dendl; } } else if (front_node == -2) { dout(1) << __func__ << " public network " << front_iface << " ports numa nodes do not match" << dendl; } else { derr << __func__ << " unable to identify public interface '" << front_iface << "' numa node: " << cpp_strerror(r) << dendl; } if (int node = g_conf().get_val("osd_numa_node"); node >= 0) { // this takes precedence over the automagic logic above numa_node = node; } if (numa_node >= 0) { int r = get_numa_node_cpu_set(numa_node, &numa_cpu_set_size, &numa_cpu_set); if (r < 0) { dout(1) << __func__ << " unable to determine numa node " << numa_node << " CPUs" << dendl; numa_node = -1; } else { dout(1) << __func__ << " setting numa affinity to node " << numa_node << " cpus " << cpu_set_to_str_list(numa_cpu_set_size, &numa_cpu_set) << dendl; r = set_cpu_affinity_all_threads(numa_cpu_set_size, &numa_cpu_set); if (r < 0) { r = -errno; derr << __func__ << " failed to set numa affinity: " << cpp_strerror(r) << dendl; numa_node = -1; } } } else { dout(1) << __func__ << " not setting numa affinity" << dendl; } return 0; } // asok class OSDSocketHook : public AdminSocketHook { OSD *osd; public: explicit OSDSocketHook(OSD *o) : osd(o) {} bool call(std::string_view admin_command, const cmdmap_t& cmdmap, std::string_view format, bufferlist& out) override { stringstream ss; bool r = true; try { r = osd->asok_command(admin_command, cmdmap, format, ss); } catch (const bad_cmd_get& e) { ss << e.what(); r = true; } out.append(ss); return r; } }; std::set OSD::get_mapped_pools() { std::set pools; std::vector pgids; _get_pgids(&pgids); for (const auto &pgid : pgids) { pools.insert(pgid.pool()); } return pools; } bool OSD::asok_command(std::string_view admin_command, const cmdmap_t& cmdmap, std::string_view format, ostream& ss) { Formatter *f = Formatter::create(format, "json-pretty", "json-pretty"); if (admin_command == "status") { f->open_object_section("status"); f->dump_stream("cluster_fsid") << superblock.cluster_fsid; f->dump_stream("osd_fsid") << superblock.osd_fsid; f->dump_unsigned("whoami", superblock.whoami); f->dump_string("state", get_state_name(get_state())); f->dump_unsigned("oldest_map", superblock.oldest_map); f->dump_unsigned("newest_map", superblock.newest_map); f->dump_unsigned("num_pgs", num_pgs); f->close_section(); } else if (admin_command == "flush_journal") { store->flush_journal(); } else if (admin_command == "dump_ops_in_flight" || admin_command == "ops" || admin_command == "dump_blocked_ops" || admin_command == "dump_historic_ops" || admin_command == "dump_historic_ops_by_duration" || admin_command == "dump_historic_slow_ops") { const string error_str = "op_tracker tracking is not enabled now, so no ops are tracked currently, \ even those get stuck. Please enable \"osd_enable_op_tracker\", and the tracker \ will start to track new ops received afterwards."; set filters; vector filter_str; if (cmd_getval(cct, cmdmap, "filterstr", filter_str)) { copy(filter_str.begin(), filter_str.end(), inserter(filters, filters.end())); } if (admin_command == "dump_ops_in_flight" || admin_command == "ops") { if (!op_tracker.dump_ops_in_flight(f, false, filters)) { ss << error_str; } } if (admin_command == "dump_blocked_ops") { if (!op_tracker.dump_ops_in_flight(f, true, filters)) { ss << error_str; } } if (admin_command == "dump_historic_ops") { if (!op_tracker.dump_historic_ops(f, false, filters)) { ss << error_str; } } if (admin_command == "dump_historic_ops_by_duration") { if (!op_tracker.dump_historic_ops(f, true, filters)) { ss << error_str; } } if (admin_command == "dump_historic_slow_ops") { if (!op_tracker.dump_historic_slow_ops(f, filters)) { ss << error_str; } } } else if (admin_command == "dump_op_pq_state") { f->open_object_section("pq"); op_shardedwq.dump(f); f->close_section(); } else if (admin_command == "dump_blacklist") { list > bl; OSDMapRef curmap = service.get_osdmap(); f->open_array_section("blacklist"); curmap->get_blacklist(&bl); for (list >::iterator it = bl.begin(); it != bl.end(); ++it) { f->open_object_section("entry"); f->open_object_section("entity_addr_t"); it->first.dump(f); f->close_section(); //entity_addr_t it->second.localtime(f->dump_stream("expire_time")); f->close_section(); //entry } f->close_section(); //blacklist } else if (admin_command == "dump_watchers") { list watchers; // scan pg's vector pgs; _get_pgs(&pgs); for (auto& pg : pgs) { list pg_watchers; pg->get_watchers(&pg_watchers); watchers.splice(watchers.end(), pg_watchers); } f->open_array_section("watchers"); for (list::iterator it = watchers.begin(); it != watchers.end(); ++it) { f->open_object_section("watch"); f->dump_string("namespace", it->obj.nspace); f->dump_string("object", it->obj.oid.name); f->open_object_section("entity_name"); it->wi.name.dump(f); f->close_section(); //entity_name_t f->dump_unsigned("cookie", it->wi.cookie); f->dump_unsigned("timeout", it->wi.timeout_seconds); f->open_object_section("entity_addr_t"); it->wi.addr.dump(f); f->close_section(); //entity_addr_t f->close_section(); //watch } f->close_section(); //watchers } else if (admin_command == "dump_recovery_reservations") { f->open_object_section("reservations"); f->open_object_section("local_reservations"); service.local_reserver.dump(f); f->close_section(); f->open_object_section("remote_reservations"); service.remote_reserver.dump(f); f->close_section(); f->close_section(); } else if (admin_command == "dump_scrub_reservations") { f->open_object_section("scrub_reservations"); service.dump_scrub_reservations(f); f->close_section(); } else if (admin_command == "get_latest_osdmap") { get_latest_osdmap(); } else if (admin_command == "heap") { auto result = ceph::osd_cmds::heap(*cct, cmdmap, *f, ss); // Note: Failed heap profile commands won't necessarily trigger an error: f->open_object_section("result"); f->dump_string("error", cpp_strerror(result)); f->dump_bool("success", result >= 0); f->close_section(); } else if (admin_command == "set_heap_property") { string property; int64_t value = 0; string error; bool success = false; if (!cmd_getval(cct, cmdmap, "property", property)) { error = "unable to get property"; success = false; } else if (!cmd_getval(cct, cmdmap, "value", value)) { error = "unable to get value"; success = false; } else if (value < 0) { error = "negative value not allowed"; success = false; } else if (!ceph_heap_set_numeric_property(property.c_str(), (size_t)value)) { error = "invalid property"; success = false; } else { success = true; } f->open_object_section("result"); f->dump_string("error", error); f->dump_bool("success", success); f->close_section(); } else if (admin_command == "get_heap_property") { string property; size_t value = 0; string error; bool success = false; if (!cmd_getval(cct, cmdmap, "property", property)) { error = "unable to get property"; success = false; } else if (!ceph_heap_get_numeric_property(property.c_str(), &value)) { error = "invalid property"; success = false; } else { success = true; } f->open_object_section("result"); f->dump_string("error", error); f->dump_bool("success", success); f->dump_int("value", value); f->close_section(); } else if (admin_command == "dump_objectstore_kv_stats") { store->get_db_statistics(f); } else if (admin_command == "dump_scrubs") { service.dumps_scrub(f); } else if (admin_command == "calc_objectstore_db_histogram") { store->generate_db_histogram(f); } else if (admin_command == "flush_store_cache") { store->flush_cache(&ss); } else if (admin_command == "dump_pgstate_history") { f->open_object_section("pgstate_history"); vector pgs; _get_pgs(&pgs); for (auto& pg : pgs) { f->dump_stream("pg") << pg->pg_id; pg->dump_pgstate_history(f); } f->close_section(); } else if (admin_command == "compact") { dout(1) << "triggering manual compaction" << dendl; auto start = ceph::coarse_mono_clock::now(); store->compact(); auto end = ceph::coarse_mono_clock::now(); double duration = std::chrono::duration(end-start).count(); dout(1) << "finished manual compaction in " << duration << " seconds" << dendl; f->open_object_section("compact_result"); f->dump_float("elapsed_time", duration); f->close_section(); } else if (admin_command == "get_mapped_pools") { f->open_array_section("mapped_pools"); set poollist = get_mapped_pools(); for (auto pool : poollist) { f->dump_int("pool_id", pool); } f->close_section(); } else if (admin_command == "smart") { string devid; cmd_getval(cct, cmdmap, "devid", devid); probe_smart(devid, ss); } else if (admin_command == "list_devices") { set devnames; store->get_devices(&devnames); f->open_object_section("list_devices"); for (auto dev : devnames) { if (dev.find("dm-") == 0) { continue; } f->dump_string("device", "/dev/" + dev); } f->close_section(); } else if (admin_command == "send_beacon") { if (is_active()) { send_beacon(ceph::coarse_mono_clock::now()); } } else if (admin_command == "dump_osd_network") { int64_t value = 0; if (!(cmd_getval(cct, cmdmap, "value", value))) { // Convert milliseconds to microseconds value = static_cast(g_conf().get_val("mon_warn_on_slow_ping_time")) * 1000; if (value == 0) { double ratio = g_conf().get_val("mon_warn_on_slow_ping_ratio"); value = g_conf().get_val("osd_heartbeat_grace"); value *= 1000000 * ratio; // Seconds of grace to microseconds at ratio } } else { // Convert user input to microseconds value *= 1000; } if (value < 0) value = 0; struct osd_ping_time_t { uint32_t pingtime; int to; bool back; std::array times; std::array min; std::array max; uint32_t last; uint32_t last_update; bool operator<(const osd_ping_time_t& rhs) const { if (pingtime < rhs.pingtime) return true; if (pingtime > rhs.pingtime) return false; if (to < rhs.to) return true; if (to > rhs.to) return false; return back; } }; set sorted; // Get pingtimes under lock and not on the stack map *pingtimes = new map; service.get_hb_pingtime(pingtimes); for (auto j : *pingtimes) { if (j.second.last_update == 0) continue; osd_ping_time_t item; item.pingtime = std::max(j.second.back_pingtime[0], j.second.back_pingtime[1]); item.pingtime = std::max(item.pingtime, j.second.back_pingtime[2]); if (item.pingtime >= value) { item.to = j.first; item.times[0] = j.second.back_pingtime[0]; item.times[1] = j.second.back_pingtime[1]; item.times[2] = j.second.back_pingtime[2]; item.min[0] = j.second.back_min[0]; item.min[1] = j.second.back_min[1]; item.min[2] = j.second.back_min[2]; item.max[0] = j.second.back_max[0]; item.max[1] = j.second.back_max[1]; item.max[2] = j.second.back_max[2]; item.last = j.second.back_last; item.back = true; item.last_update = j.second.last_update; sorted.emplace(item); } if (j.second.front_last == 0) continue; item.pingtime = std::max(j.second.front_pingtime[0], j.second.front_pingtime[1]); item.pingtime = std::max(item.pingtime, j.second.front_pingtime[2]); if (item.pingtime >= value) { item.to = j.first; item.times[0] = j.second.front_pingtime[0]; item.times[1] = j.second.front_pingtime[1]; item.times[2] = j.second.front_pingtime[2]; item.min[0] = j.second.front_min[0]; item.min[1] = j.second.front_min[1]; item.min[2] = j.second.front_min[2]; item.max[0] = j.second.front_max[0]; item.max[1] = j.second.front_max[1]; item.max[2] = j.second.front_max[2]; item.last = j.second.front_last; item.last_update = j.second.last_update; item.back = false; sorted.emplace(item); } } delete pingtimes; // // Network ping times (1min 5min 15min) f->open_object_section("network_ping_times"); f->dump_int("threshold", value / 1000); f->open_array_section("entries"); for (auto &sitem : boost::adaptors::reverse(sorted)) { ceph_assert(sitem.pingtime >= value); f->open_object_section("entry"); const time_t lu(sitem.last_update); char buffer[26]; string lustr(ctime_r(&lu, buffer)); lustr.pop_back(); // Remove trailing \n auto stale = cct->_conf.get_val("osd_heartbeat_stale"); f->dump_string("last update", lustr); f->dump_bool("stale", ceph_clock_now().sec() - sitem.last_update > stale); f->dump_int("from osd", whoami); f->dump_int("to osd", sitem.to); f->dump_string("interface", (sitem.back ? "back" : "front")); f->open_object_section("average"); f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.times[0],3).c_str()); f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.times[1],3).c_str()); f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.times[2],3).c_str()); f->close_section(); // average f->open_object_section("min"); f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.max[0],3).c_str()); f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.max[1],3).c_str()); f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.max[2],3).c_str()); f->close_section(); // min f->open_object_section("max"); f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.max[0],3).c_str()); f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.max[1],3).c_str()); f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.max[2],3).c_str()); f->close_section(); // max f->dump_format_unquoted("last", "%s", fixed_u_to_string(sitem.last,3).c_str()); f->close_section(); // entry } f->close_section(); // entries f->close_section(); // network_ping_times } else { ceph_abort_msg("broken asok registration"); } f->flush(ss); delete f; return true; } class TestOpsSocketHook : public AdminSocketHook { OSDService *service; ObjectStore *store; public: TestOpsSocketHook(OSDService *s, ObjectStore *st) : service(s), store(st) {} bool call(std::string_view command, const cmdmap_t& cmdmap, std::string_view format, bufferlist& out) override { stringstream ss; try { test_ops(service, store, command, cmdmap, ss); } catch (const bad_cmd_get& e) { ss << e.what(); } out.append(ss); return true; } void test_ops(OSDService *service, ObjectStore *store, std::string_view command, const cmdmap_t& cmdmap, ostream &ss); }; class OSD::C_Tick : public Context { OSD *osd; public: explicit C_Tick(OSD *o) : osd(o) {} void finish(int r) override { osd->tick(); } }; class OSD::C_Tick_WithoutOSDLock : public Context { OSD *osd; public: explicit C_Tick_WithoutOSDLock(OSD *o) : osd(o) {} void finish(int r) override { osd->tick_without_osd_lock(); } }; int OSD::enable_disable_fuse(bool stop) { #ifdef HAVE_LIBFUSE int r; string mntpath = cct->_conf->osd_data + "/fuse"; if (fuse_store && (stop || !cct->_conf->osd_objectstore_fuse)) { dout(1) << __func__ << " disabling" << dendl; fuse_store->stop(); delete fuse_store; fuse_store = NULL; r = ::rmdir(mntpath.c_str()); if (r < 0) { r = -errno; derr << __func__ << " failed to rmdir " << mntpath << ": " << cpp_strerror(r) << dendl; return r; } return 0; } if (!fuse_store && cct->_conf->osd_objectstore_fuse) { dout(1) << __func__ << " enabling" << dendl; r = ::mkdir(mntpath.c_str(), 0700); if (r < 0) r = -errno; if (r < 0 && r != -EEXIST) { derr << __func__ << " unable to create " << mntpath << ": " << cpp_strerror(r) << dendl; return r; } fuse_store = new FuseStore(store, mntpath); r = fuse_store->start(); if (r < 0) { derr << __func__ << " unable to start fuse: " << cpp_strerror(r) << dendl; delete fuse_store; fuse_store = NULL; return r; } } #endif // HAVE_LIBFUSE return 0; } int OSD::get_num_op_shards() { if (cct->_conf->osd_op_num_shards) return cct->_conf->osd_op_num_shards; if (store_is_rotational) return cct->_conf->osd_op_num_shards_hdd; else return cct->_conf->osd_op_num_shards_ssd; } int OSD::get_num_op_threads() { if (cct->_conf->osd_op_num_threads_per_shard) return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard; if (store_is_rotational) return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_hdd; else return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_ssd; } float OSD::get_osd_recovery_sleep() { if (cct->_conf->osd_recovery_sleep) return cct->_conf->osd_recovery_sleep; if (!store_is_rotational && !journal_is_rotational) return cct->_conf->osd_recovery_sleep_ssd; else if (store_is_rotational && !journal_is_rotational) return cct->_conf.get_val("osd_recovery_sleep_hybrid"); else return cct->_conf->osd_recovery_sleep_hdd; } float OSD::get_osd_delete_sleep() { float osd_delete_sleep = cct->_conf.get_val("osd_delete_sleep"); if (osd_delete_sleep > 0) return osd_delete_sleep; if (!store_is_rotational && !journal_is_rotational) return cct->_conf.get_val("osd_delete_sleep_ssd"); if (store_is_rotational && !journal_is_rotational) return cct->_conf.get_val("osd_delete_sleep_hybrid"); return cct->_conf.get_val("osd_delete_sleep_hdd"); } float OSD::get_osd_snap_trim_sleep() { float osd_snap_trim_sleep = cct->_conf.get_val("osd_snap_trim_sleep"); if (osd_snap_trim_sleep > 0) return osd_snap_trim_sleep; if (!store_is_rotational && !journal_is_rotational) return cct->_conf.get_val("osd_snap_trim_sleep_ssd"); if (store_is_rotational && !journal_is_rotational) return cct->_conf.get_val("osd_snap_trim_sleep_hybrid"); return cct->_conf.get_val("osd_snap_trim_sleep_hdd"); } int OSD::init() { OSDMapRef osdmap; CompatSet initial, diff; std::lock_guard lock(osd_lock); if (is_stopping()) return 0; tick_timer.init(); tick_timer_without_osd_lock.init(); service.recovery_request_timer.init(); service.sleep_timer.init(); boot_finisher.start(); { string val; store->read_meta("require_osd_release", &val); last_require_osd_release = atoi(val.c_str()); } // mount. dout(2) << "init " << dev_path << " (looks like " << (store_is_rotational ? "hdd" : "ssd") << ")" << dendl; dout(2) << "journal " << journal_path << dendl; ceph_assert(store); // call pre_init() first! store->set_cache_shards(get_num_op_shards()); int r = store->mount(); if (r < 0) { derr << "OSD:init: unable to mount object store" << dendl; return r; } journal_is_rotational = store->is_journal_rotational(); dout(2) << "journal looks like " << (journal_is_rotational ? "hdd" : "ssd") << dendl; enable_disable_fuse(false); dout(2) << "boot" << dendl; service.meta_ch = store->open_collection(coll_t::meta()); // initialize the daily loadavg with current 15min loadavg double loadavgs[3]; if (getloadavg(loadavgs, 3) == 3) { daily_loadavg = loadavgs[2]; } else { derr << "OSD::init() : couldn't read loadavgs\n" << dendl; daily_loadavg = 1.0; } int rotating_auth_attempts = 0; auto rotating_auth_timeout = g_conf().get_val("rotating_keys_bootstrap_timeout"); // sanity check long object name handling { hobject_t l; l.oid.name = string(cct->_conf->osd_max_object_name_len, 'n'); l.set_key(string(cct->_conf->osd_max_object_name_len, 'k')); l.nspace = string(cct->_conf->osd_max_object_namespace_len, 's'); r = store->validate_hobject_key(l); if (r < 0) { derr << "backend (" << store->get_type() << ") is unable to support max " << "object name[space] len" << dendl; derr << " osd max object name len = " << cct->_conf->osd_max_object_name_len << dendl; derr << " osd max object namespace len = " << cct->_conf->osd_max_object_namespace_len << dendl; derr << cpp_strerror(r) << dendl; if (cct->_conf->osd_check_max_object_name_len_on_startup) { goto out; } derr << "osd_check_max_object_name_len_on_startup = false, starting anyway" << dendl; } else { dout(20) << "configured osd_max_object_name[space]_len looks ok" << dendl; } } // read superblock r = read_superblock(); if (r < 0) { derr << "OSD::init() : unable to read osd superblock" << dendl; r = -EINVAL; goto out; } if (osd_compat.compare(superblock.compat_features) < 0) { derr << "The disk uses features unsupported by the executable." << dendl; derr << " ondisk features " << superblock.compat_features << dendl; derr << " daemon features " << osd_compat << dendl; if (osd_compat.writeable(superblock.compat_features)) { CompatSet diff = osd_compat.unsupported(superblock.compat_features); derr << "it is still writeable, though. Missing features: " << diff << dendl; r = -EOPNOTSUPP; goto out; } else { CompatSet diff = osd_compat.unsupported(superblock.compat_features); derr << "Cannot write to disk! Missing features: " << diff << dendl; r = -EOPNOTSUPP; goto out; } } assert_warn(whoami == superblock.whoami); if (whoami != superblock.whoami) { derr << "OSD::init: superblock says osd" << superblock.whoami << " but I am osd." << whoami << dendl; r = -EINVAL; goto out; } // load up "current" osdmap assert_warn(!get_osdmap()); if (get_osdmap()) { derr << "OSD::init: unable to read current osdmap" << dendl; r = -EINVAL; goto out; } osdmap = get_map(superblock.current_epoch); set_osdmap(osdmap); // make sure we don't have legacy pgs deleting { vector ls; int r = store->list_collections(ls); ceph_assert(r >= 0); for (auto c : ls) { spg_t pgid; if (c.is_pg(&pgid) && !osdmap->have_pg_pool(pgid.pool())) { ghobject_t oid = make_final_pool_info_oid(pgid.pool()); if (!store->exists(service.meta_ch, oid)) { derr << __func__ << " missing pg_pool_t for deleted pool " << pgid.pool() << " for pg " << pgid << "; please downgrade to luminous and allow " << "pg deletion to complete before upgrading" << dendl; ceph_abort(); } } } } initial = get_osd_initial_compat_set(); diff = superblock.compat_features.unsupported(initial); if (superblock.compat_features.merge(initial)) { // We need to persist the new compat_set before we // do anything else dout(5) << "Upgrading superblock adding: " << diff << dendl; ObjectStore::Transaction t; write_superblock(t); r = store->queue_transaction(service.meta_ch, std::move(t)); if (r < 0) goto out; } // make sure snap mapper object exists if (!store->exists(service.meta_ch, OSD::make_snapmapper_oid())) { dout(10) << "init creating/touching snapmapper object" << dendl; ObjectStore::Transaction t; t.touch(coll_t::meta(), OSD::make_snapmapper_oid()); r = store->queue_transaction(service.meta_ch, std::move(t)); if (r < 0) goto out; } class_handler = new ClassHandler(cct); cls_initialize(class_handler); if (cct->_conf->osd_open_classes_on_start) { int r = class_handler->open_all_classes(); if (r) dout(1) << "warning: got an error loading one or more classes: " << cpp_strerror(r) << dendl; } check_osdmap_features(); create_recoverystate_perf(); { epoch_t bind_epoch = osdmap->get_epoch(); service.set_epochs(NULL, NULL, &bind_epoch); } clear_temp_objects(); // initialize osdmap references in sharded wq for (auto& shard : shards) { std::lock_guard l(shard->osdmap_lock); shard->shard_osdmap = osdmap; } // load up pgs (as they previously existed) load_pgs(); dout(2) << "superblock: I am osd." << superblock.whoami << dendl; dout(0) << "using " << op_queue << " op queue with priority op cut off at " << op_prio_cutoff << "." << dendl; create_logger(); // prime osd stats { struct store_statfs_t stbuf; osd_alert_list_t alerts; int r = store->statfs(&stbuf, &alerts); ceph_assert(r == 0); service.set_statfs(stbuf, alerts); } // client_messenger auth_client is already set up by monc. for (auto m : { cluster_messenger, objecter_messenger, hb_front_client_messenger, hb_back_client_messenger, hb_front_server_messenger, hb_back_server_messenger } ) { m->set_auth_client(monc); } for (auto m : { client_messenger, cluster_messenger, hb_front_server_messenger, hb_back_server_messenger }) { m->set_auth_server(monc); } monc->set_handle_authentication_dispatcher(this); monc->set_want_keys(CEPH_ENTITY_TYPE_MON | CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MGR); r = monc->init(); if (r < 0) goto out; mgrc.set_pgstats_cb([this](){ return collect_pg_stats(); }); mgrc.set_perf_metric_query_cb( [this](const std::map &queries) { set_perf_queries(queries); }, [this](std::map *reports) { get_perf_reports(reports); }); mgrc.init(); // tell monc about log_client so it will know about mon session resets monc->set_log_client(&log_client); update_log_config(); // i'm ready! client_messenger->add_dispatcher_tail(&mgrc); client_messenger->add_dispatcher_tail(this); cluster_messenger->add_dispatcher_head(this); hb_front_client_messenger->add_dispatcher_head(&heartbeat_dispatcher); hb_back_client_messenger->add_dispatcher_head(&heartbeat_dispatcher); hb_front_server_messenger->add_dispatcher_head(&heartbeat_dispatcher); hb_back_server_messenger->add_dispatcher_head(&heartbeat_dispatcher); objecter_messenger->add_dispatcher_head(service.objecter); service.init(); service.publish_map(osdmap); service.publish_superblock(superblock); service.max_oldest_map = superblock.oldest_map; for (auto& shard : shards) { // put PGs in a temporary set because we may modify pg_slots // unordered_map below. set pgs; for (auto& i : shard->pg_slots) { PGRef pg = i.second->pg; if (!pg) { continue; } pgs.insert(pg); } for (auto pg : pgs) { pg->lock(); set> new_children; set> merge_pgs; service.identify_splits_and_merges(pg->get_osdmap(), osdmap, pg->pg_id, &new_children, &merge_pgs); if (!new_children.empty()) { for (auto shard : shards) { shard->prime_splits(osdmap, &new_children); } assert(new_children.empty()); } if (!merge_pgs.empty()) { for (auto shard : shards) { shard->prime_merges(osdmap, &merge_pgs); } assert(merge_pgs.empty()); } pg->unlock(); } } osd_op_tp.start(); command_tp.start(); // start the heartbeat heartbeat_thread.create("osd_srv_heartbt"); // tick tick_timer.add_event_after(get_tick_interval(), new C_Tick(this)); { std::lock_guard l(tick_timer_lock); tick_timer_without_osd_lock.add_event_after(get_tick_interval(), new C_Tick_WithoutOSDLock(this)); } osd_lock.Unlock(); r = monc->authenticate(); if (r < 0) { derr << __func__ << " authentication failed: " << cpp_strerror(r) << dendl; exit(1); } while (monc->wait_auth_rotating(rotating_auth_timeout) < 0) { derr << "unable to obtain rotating service keys; retrying" << dendl; ++rotating_auth_attempts; if (rotating_auth_attempts > g_conf()->max_rotating_auth_attempts) { derr << __func__ << " wait_auth_rotating timed out" << dendl; exit(1); } } r = update_crush_device_class(); if (r < 0) { derr << __func__ << " unable to update_crush_device_class: " << cpp_strerror(r) << dendl; exit(1); } r = update_crush_location(); if (r < 0) { derr << __func__ << " unable to update_crush_location: " << cpp_strerror(r) << dendl; exit(1); } osd_lock.Lock(); if (is_stopping()) return 0; // start objecter *after* we have authenticated, so that we don't ignore // the OSDMaps it requests. service.final_init(); check_config(); dout(10) << "ensuring pgs have consumed prior maps" << dendl; consume_map(); dout(0) << "done with init, starting boot process" << dendl; // subscribe to any pg creations monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0); // MgrClient needs this (it doesn't have MonClient reference itself) monc->sub_want("mgrmap", 0, 0); // we don't need to ask for an osdmap here; objecter will //monc->sub_want("osdmap", osdmap->get_epoch(), CEPH_SUBSCRIBE_ONETIME); monc->renew_subs(); start_boot(); return 0; out: enable_disable_fuse(true); store->umount(); delete store; store = NULL; return r; } void OSD::final_init() { AdminSocket *admin_socket = cct->get_admin_socket(); asok_hook = new OSDSocketHook(this); int r = admin_socket->register_command("status", "status", asok_hook, "high-level status of OSD"); ceph_assert(r == 0); r = admin_socket->register_command("flush_journal", "flush_journal", asok_hook, "flush the journal to permanent store"); ceph_assert(r == 0); r = admin_socket->register_command("dump_ops_in_flight", "dump_ops_in_flight " \ "name=filterstr,type=CephString,n=N,req=false", asok_hook, "show the ops currently in flight"); ceph_assert(r == 0); r = admin_socket->register_command("ops", "ops " \ "name=filterstr,type=CephString,n=N,req=false", asok_hook, "show the ops currently in flight"); ceph_assert(r == 0); r = admin_socket->register_command("dump_blocked_ops", "dump_blocked_ops " \ "name=filterstr,type=CephString,n=N,req=false", asok_hook, "show the blocked ops currently in flight"); ceph_assert(r == 0); r = admin_socket->register_command("dump_historic_ops", "dump_historic_ops " \ "name=filterstr,type=CephString,n=N,req=false", asok_hook, "show recent ops"); ceph_assert(r == 0); r = admin_socket->register_command("dump_historic_slow_ops", "dump_historic_slow_ops " \ "name=filterstr,type=CephString,n=N,req=false", asok_hook, "show slowest recent ops"); ceph_assert(r == 0); r = admin_socket->register_command("dump_historic_ops_by_duration", "dump_historic_ops_by_duration " \ "name=filterstr,type=CephString,n=N,req=false", asok_hook, "show slowest recent ops, sorted by duration"); ceph_assert(r == 0); r = admin_socket->register_command("dump_op_pq_state", "dump_op_pq_state", asok_hook, "dump op priority queue state"); ceph_assert(r == 0); r = admin_socket->register_command("dump_blacklist", "dump_blacklist", asok_hook, "dump blacklisted clients and times"); ceph_assert(r == 0); r = admin_socket->register_command("dump_watchers", "dump_watchers", asok_hook, "show clients which have active watches," " and on which objects"); ceph_assert(r == 0); r = admin_socket->register_command("dump_recovery_reservations", "dump_recovery_reservations", asok_hook, "show recovery reservations"); ceph_assert(r == 0); r = admin_socket->register_command("dump_scrub_reservations", "dump_scrub_reservations", asok_hook, "show scrub reservations"); ceph_assert(r == 0); r = admin_socket->register_command("get_latest_osdmap", "get_latest_osdmap", asok_hook, "force osd to update the latest map from " "the mon"); ceph_assert(r == 0); r = admin_socket->register_command( "heap", "heap " \ "name=heapcmd,type=CephString " \ "name=value,type=CephString,req=false", asok_hook, "show heap usage info (available only if " "compiled with tcmalloc)"); ceph_assert(r == 0); r = admin_socket->register_command("set_heap_property", "set_heap_property " \ "name=property,type=CephString " \ "name=value,type=CephInt", asok_hook, "update malloc extension heap property"); ceph_assert(r == 0); r = admin_socket->register_command("get_heap_property", "get_heap_property " \ "name=property,type=CephString", asok_hook, "get malloc extension heap property"); ceph_assert(r == 0); r = admin_socket->register_command("dump_objectstore_kv_stats", "dump_objectstore_kv_stats", asok_hook, "print statistics of kvdb which used by bluestore"); ceph_assert(r == 0); r = admin_socket->register_command("dump_scrubs", "dump_scrubs", asok_hook, "print scheduled scrubs"); ceph_assert(r == 0); r = admin_socket->register_command("calc_objectstore_db_histogram", "calc_objectstore_db_histogram", asok_hook, "Generate key value histogram of kvdb(rocksdb) which used by bluestore"); ceph_assert(r == 0); r = admin_socket->register_command("flush_store_cache", "flush_store_cache", asok_hook, "Flush bluestore internal cache"); ceph_assert(r == 0); r = admin_socket->register_command("dump_pgstate_history", "dump_pgstate_history", asok_hook, "show recent state history"); ceph_assert(r == 0); r = admin_socket->register_command("compact", "compact", asok_hook, "Commpact object store's omap." " WARNING: Compaction probably slows your requests"); ceph_assert(r == 0); r = admin_socket->register_command("get_mapped_pools", "get_mapped_pools", asok_hook, "dump pools whose PG(s) are mapped to this OSD."); ceph_assert(r == 0); r = admin_socket->register_command("smart", "smart name=devid,type=CephString,req=False", asok_hook, "probe OSD devices for SMART data."); ceph_assert(r == 0); r = admin_socket->register_command("list_devices", "list_devices", asok_hook, "list OSD devices."); r = admin_socket->register_command("send_beacon", "send_beacon", asok_hook, "send OSD beacon to mon immediately"); r = admin_socket->register_command("dump_osd_network", "dump_osd_network name=value,type=CephInt,req=false", asok_hook, "Dump osd heartbeat network ping times"); ceph_assert(r == 0); test_ops_hook = new TestOpsSocketHook(&(this->service), this->store); // Note: pools are CephString instead of CephPoolname because // these commands traditionally support both pool names and numbers r = admin_socket->register_command( "setomapval", "setomapval " \ "name=pool,type=CephString " \ "name=objname,type=CephObjectname " \ "name=key,type=CephString "\ "name=val,type=CephString", test_ops_hook, "set omap key"); ceph_assert(r == 0); r = admin_socket->register_command( "rmomapkey", "rmomapkey " \ "name=pool,type=CephString " \ "name=objname,type=CephObjectname " \ "name=key,type=CephString", test_ops_hook, "remove omap key"); ceph_assert(r == 0); r = admin_socket->register_command( "setomapheader", "setomapheader " \ "name=pool,type=CephString " \ "name=objname,type=CephObjectname " \ "name=header,type=CephString", test_ops_hook, "set omap header"); ceph_assert(r == 0); r = admin_socket->register_command( "getomap", "getomap " \ "name=pool,type=CephString " \ "name=objname,type=CephObjectname", test_ops_hook, "output entire object map"); ceph_assert(r == 0); r = admin_socket->register_command( "truncobj", "truncobj " \ "name=pool,type=CephString " \ "name=objname,type=CephObjectname " \ "name=len,type=CephInt", test_ops_hook, "truncate object to length"); ceph_assert(r == 0); r = admin_socket->register_command( "injectdataerr", "injectdataerr " \ "name=pool,type=CephString " \ "name=objname,type=CephObjectname " \ "name=shardid,type=CephInt,req=false,range=0|255", test_ops_hook, "inject data error to an object"); ceph_assert(r == 0); r = admin_socket->register_command( "injectmdataerr", "injectmdataerr " \ "name=pool,type=CephString " \ "name=objname,type=CephObjectname " \ "name=shardid,type=CephInt,req=false,range=0|255", test_ops_hook, "inject metadata error to an object"); ceph_assert(r == 0); r = admin_socket->register_command( "set_recovery_delay", "set_recovery_delay " \ "name=utime,type=CephInt,req=false", test_ops_hook, "Delay osd recovery by specified seconds"); ceph_assert(r == 0); r = admin_socket->register_command( "trigger_scrub", "trigger_scrub " \ "name=pgid,type=CephString " \ "name=time,type=CephInt,req=false", test_ops_hook, "Trigger a scheduled scrub "); ceph_assert(r == 0); r = admin_socket->register_command( "trigger_deep_scrub", "trigger_deep_scrub " \ "name=pgid,type=CephString " \ "name=time,type=CephInt,req=false", test_ops_hook, "Trigger a scheduled deep scrub "); ceph_assert(r == 0); r = admin_socket->register_command( "injectfull", "injectfull " \ "name=type,type=CephString,req=false " \ "name=count,type=CephInt,req=false ", test_ops_hook, "Inject a full disk (optional count times)"); ceph_assert(r == 0); } void OSD::create_logger() { dout(10) << "create_logger" << dendl; PerfCountersBuilder osd_plb(cct, "osd", l_osd_first, l_osd_last); // Latency axis configuration for op histograms, values are in nanoseconds PerfHistogramCommon::axis_config_d op_hist_x_axis_config{ "Latency (usec)", PerfHistogramCommon::SCALE_LOG2, ///< Latency in logarithmic scale 0, ///< Start at 0 100000, ///< Quantization unit is 100usec 32, ///< Enough to cover much longer than slow requests }; // Op size axis configuration for op histograms, values are in bytes PerfHistogramCommon::axis_config_d op_hist_y_axis_config{ "Request size (bytes)", PerfHistogramCommon::SCALE_LOG2, ///< Request size in logarithmic scale 0, ///< Start at 0 512, ///< Quantization unit is 512 bytes 32, ///< Enough to cover requests larger than GB }; // All the basic OSD operation stats are to be considered useful osd_plb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL); osd_plb.add_u64( l_osd_op_wip, "op_wip", "Replication operations currently being processed (primary)"); osd_plb.add_u64_counter( l_osd_op, "op", "Client operations", "ops", PerfCountersBuilder::PRIO_CRITICAL); osd_plb.add_u64_counter( l_osd_op_inb, "op_in_bytes", "Client operations total write size", "wr", PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES)); osd_plb.add_u64_counter( l_osd_op_outb, "op_out_bytes", "Client operations total read size", "rd", PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES)); osd_plb.add_time_avg( l_osd_op_lat, "op_latency", "Latency of client operations (including queue time)", "l", 9); osd_plb.add_time_avg( l_osd_op_process_lat, "op_process_latency", "Latency of client operations (excluding queue time)"); osd_plb.add_time_avg( l_osd_op_prepare_lat, "op_prepare_latency", "Latency of client operations (excluding queue time and wait for finished)"); osd_plb.add_u64_counter( l_osd_op_r, "op_r", "Client read operations"); osd_plb.add_u64_counter( l_osd_op_r_outb, "op_r_out_bytes", "Client data read", NULL, PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); osd_plb.add_time_avg( l_osd_op_r_lat, "op_r_latency", "Latency of read operation (including queue time)"); osd_plb.add_u64_counter_histogram( l_osd_op_r_lat_outb_hist, "op_r_latency_out_bytes_histogram", op_hist_x_axis_config, op_hist_y_axis_config, "Histogram of operation latency (including queue time) + data read"); osd_plb.add_time_avg( l_osd_op_r_process_lat, "op_r_process_latency", "Latency of read operation (excluding queue time)"); osd_plb.add_time_avg( l_osd_op_r_prepare_lat, "op_r_prepare_latency", "Latency of read operations (excluding queue time and wait for finished)"); osd_plb.add_u64_counter( l_osd_op_w, "op_w", "Client write operations"); osd_plb.add_u64_counter( l_osd_op_w_inb, "op_w_in_bytes", "Client data written"); osd_plb.add_time_avg( l_osd_op_w_lat, "op_w_latency", "Latency of write operation (including queue time)"); osd_plb.add_u64_counter_histogram( l_osd_op_w_lat_inb_hist, "op_w_latency_in_bytes_histogram", op_hist_x_axis_config, op_hist_y_axis_config, "Histogram of operation latency (including queue time) + data written"); osd_plb.add_time_avg( l_osd_op_w_process_lat, "op_w_process_latency", "Latency of write operation (excluding queue time)"); osd_plb.add_time_avg( l_osd_op_w_prepare_lat, "op_w_prepare_latency", "Latency of write operations (excluding queue time and wait for finished)"); osd_plb.add_u64_counter( l_osd_op_rw, "op_rw", "Client read-modify-write operations"); osd_plb.add_u64_counter( l_osd_op_rw_inb, "op_rw_in_bytes", "Client read-modify-write operations write in", NULL, PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); osd_plb.add_u64_counter( l_osd_op_rw_outb,"op_rw_out_bytes", "Client read-modify-write operations read out ", NULL, PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); osd_plb.add_time_avg( l_osd_op_rw_lat, "op_rw_latency", "Latency of read-modify-write operation (including queue time)"); osd_plb.add_u64_counter_histogram( l_osd_op_rw_lat_inb_hist, "op_rw_latency_in_bytes_histogram", op_hist_x_axis_config, op_hist_y_axis_config, "Histogram of rw operation latency (including queue time) + data written"); osd_plb.add_u64_counter_histogram( l_osd_op_rw_lat_outb_hist, "op_rw_latency_out_bytes_histogram", op_hist_x_axis_config, op_hist_y_axis_config, "Histogram of rw operation latency (including queue time) + data read"); osd_plb.add_time_avg( l_osd_op_rw_process_lat, "op_rw_process_latency", "Latency of read-modify-write operation (excluding queue time)"); osd_plb.add_time_avg( l_osd_op_rw_prepare_lat, "op_rw_prepare_latency", "Latency of read-modify-write operations (excluding queue time and wait for finished)"); // Now we move on to some more obscure stats, revert to assuming things // are low priority unless otherwise specified. osd_plb.set_prio_default(PerfCountersBuilder::PRIO_DEBUGONLY); osd_plb.add_time_avg(l_osd_op_before_queue_op_lat, "op_before_queue_op_lat", "Latency of IO before calling queue(before really queue into ShardedOpWq)"); // client io before queue op_wq latency osd_plb.add_time_avg(l_osd_op_before_dequeue_op_lat, "op_before_dequeue_op_lat", "Latency of IO before calling dequeue_op(already dequeued and get PG lock)"); // client io before dequeue_op latency osd_plb.add_u64_counter( l_osd_sop, "subop", "Suboperations"); osd_plb.add_u64_counter( l_osd_sop_inb, "subop_in_bytes", "Suboperations total size", NULL, 0, unit_t(UNIT_BYTES)); osd_plb.add_time_avg(l_osd_sop_lat, "subop_latency", "Suboperations latency"); osd_plb.add_u64_counter(l_osd_sop_w, "subop_w", "Replicated writes"); osd_plb.add_u64_counter( l_osd_sop_w_inb, "subop_w_in_bytes", "Replicated written data size", NULL, 0, unit_t(UNIT_BYTES)); osd_plb.add_time_avg( l_osd_sop_w_lat, "subop_w_latency", "Replicated writes latency"); osd_plb.add_u64_counter( l_osd_sop_pull, "subop_pull", "Suboperations pull requests"); osd_plb.add_time_avg( l_osd_sop_pull_lat, "subop_pull_latency", "Suboperations pull latency"); osd_plb.add_u64_counter( l_osd_sop_push, "subop_push", "Suboperations push messages"); osd_plb.add_u64_counter( l_osd_sop_push_inb, "subop_push_in_bytes", "Suboperations pushed size", NULL, 0, unit_t(UNIT_BYTES)); osd_plb.add_time_avg( l_osd_sop_push_lat, "subop_push_latency", "Suboperations push latency"); osd_plb.add_u64_counter(l_osd_pull, "pull", "Pull requests sent"); osd_plb.add_u64_counter(l_osd_push, "push", "Push messages sent"); osd_plb.add_u64_counter(l_osd_push_outb, "push_out_bytes", "Pushed size", NULL, 0, unit_t(UNIT_BYTES)); osd_plb.add_u64_counter( l_osd_rop, "recovery_ops", "Started recovery operations", "rop", PerfCountersBuilder::PRIO_INTERESTING); osd_plb.add_u64_counter( l_osd_rbytes, "recovery_bytes", "recovery bytes", "rbt", PerfCountersBuilder::PRIO_INTERESTING); osd_plb.add_u64(l_osd_loadavg, "loadavg", "CPU load"); osd_plb.add_u64( l_osd_cached_crc, "cached_crc", "Total number getting crc from crc_cache"); osd_plb.add_u64( l_osd_cached_crc_adjusted, "cached_crc_adjusted", "Total number getting crc from crc_cache with adjusting"); osd_plb.add_u64(l_osd_missed_crc, "missed_crc", "Total number of crc cache misses"); osd_plb.add_u64(l_osd_pg, "numpg", "Placement groups", "pgs", PerfCountersBuilder::PRIO_USEFUL); osd_plb.add_u64( l_osd_pg_primary, "numpg_primary", "Placement groups for which this osd is primary"); osd_plb.add_u64( l_osd_pg_replica, "numpg_replica", "Placement groups for which this osd is replica"); osd_plb.add_u64( l_osd_pg_stray, "numpg_stray", "Placement groups ready to be deleted from this osd"); osd_plb.add_u64( l_osd_pg_removing, "numpg_removing", "Placement groups queued for local deletion", "pgsr", PerfCountersBuilder::PRIO_USEFUL); osd_plb.add_u64( l_osd_hb_to, "heartbeat_to_peers", "Heartbeat (ping) peers we send to"); osd_plb.add_u64_counter(l_osd_map, "map_messages", "OSD map messages"); osd_plb.add_u64_counter(l_osd_mape, "map_message_epochs", "OSD map epochs"); osd_plb.add_u64_counter( l_osd_mape_dup, "map_message_epoch_dups", "OSD map duplicates"); osd_plb.add_u64_counter( l_osd_waiting_for_map, "messages_delayed_for_map", "Operations waiting for OSD map"); osd_plb.add_u64_counter( l_osd_map_cache_hit, "osd_map_cache_hit", "osdmap cache hit"); osd_plb.add_u64_counter( l_osd_map_cache_miss, "osd_map_cache_miss", "osdmap cache miss"); osd_plb.add_u64_counter( l_osd_map_cache_miss_low, "osd_map_cache_miss_low", "osdmap cache miss below cache lower bound"); osd_plb.add_u64_avg( l_osd_map_cache_miss_low_avg, "osd_map_cache_miss_low_avg", "osdmap cache miss, avg distance below cache lower bound"); osd_plb.add_u64_counter( l_osd_map_bl_cache_hit, "osd_map_bl_cache_hit", "OSDMap buffer cache hits"); osd_plb.add_u64_counter( l_osd_map_bl_cache_miss, "osd_map_bl_cache_miss", "OSDMap buffer cache misses"); osd_plb.add_u64( l_osd_stat_bytes, "stat_bytes", "OSD size", "size", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); osd_plb.add_u64( l_osd_stat_bytes_used, "stat_bytes_used", "Used space", "used", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); osd_plb.add_u64(l_osd_stat_bytes_avail, "stat_bytes_avail", "Available space", NULL, 0, unit_t(UNIT_BYTES)); osd_plb.add_u64_counter( l_osd_copyfrom, "copyfrom", "Rados \"copy-from\" operations"); osd_plb.add_u64_counter(l_osd_tier_promote, "tier_promote", "Tier promotions"); osd_plb.add_u64_counter(l_osd_tier_flush, "tier_flush", "Tier flushes"); osd_plb.add_u64_counter( l_osd_tier_flush_fail, "tier_flush_fail", "Failed tier flushes"); osd_plb.add_u64_counter( l_osd_tier_try_flush, "tier_try_flush", "Tier flush attempts"); osd_plb.add_u64_counter( l_osd_tier_try_flush_fail, "tier_try_flush_fail", "Failed tier flush attempts"); osd_plb.add_u64_counter( l_osd_tier_evict, "tier_evict", "Tier evictions"); osd_plb.add_u64_counter( l_osd_tier_whiteout, "tier_whiteout", "Tier whiteouts"); osd_plb.add_u64_counter( l_osd_tier_dirty, "tier_dirty", "Dirty tier flag set"); osd_plb.add_u64_counter( l_osd_tier_clean, "tier_clean", "Dirty tier flag cleaned"); osd_plb.add_u64_counter( l_osd_tier_delay, "tier_delay", "Tier delays (agent waiting)"); osd_plb.add_u64_counter( l_osd_tier_proxy_read, "tier_proxy_read", "Tier proxy reads"); osd_plb.add_u64_counter( l_osd_tier_proxy_write, "tier_proxy_write", "Tier proxy writes"); osd_plb.add_u64_counter( l_osd_agent_wake, "agent_wake", "Tiering agent wake up"); osd_plb.add_u64_counter( l_osd_agent_skip, "agent_skip", "Objects skipped by agent"); osd_plb.add_u64_counter( l_osd_agent_flush, "agent_flush", "Tiering agent flushes"); osd_plb.add_u64_counter( l_osd_agent_evict, "agent_evict", "Tiering agent evictions"); osd_plb.add_u64_counter( l_osd_object_ctx_cache_hit, "object_ctx_cache_hit", "Object context cache hits"); osd_plb.add_u64_counter( l_osd_object_ctx_cache_total, "object_ctx_cache_total", "Object context cache lookups"); osd_plb.add_u64_counter(l_osd_op_cache_hit, "op_cache_hit"); osd_plb.add_time_avg( l_osd_tier_flush_lat, "osd_tier_flush_lat", "Object flush latency"); osd_plb.add_time_avg( l_osd_tier_promote_lat, "osd_tier_promote_lat", "Object promote latency"); osd_plb.add_time_avg( l_osd_tier_r_lat, "osd_tier_r_lat", "Object proxy read latency"); osd_plb.add_u64_counter( l_osd_pg_info, "osd_pg_info", "PG updated its info (using any method)"); osd_plb.add_u64_counter( l_osd_pg_fastinfo, "osd_pg_fastinfo", "PG updated its info using fastinfo attr"); osd_plb.add_u64_counter( l_osd_pg_biginfo, "osd_pg_biginfo", "PG updated its biginfo attr"); logger = osd_plb.create_perf_counters(); cct->get_perfcounters_collection()->add(logger); } void OSD::create_recoverystate_perf() { dout(10) << "create_recoverystate_perf" << dendl; PerfCountersBuilder rs_perf(cct, "recoverystate_perf", rs_first, rs_last); rs_perf.add_time_avg(rs_initial_latency, "initial_latency", "Initial recovery state latency"); rs_perf.add_time_avg(rs_started_latency, "started_latency", "Started recovery state latency"); rs_perf.add_time_avg(rs_reset_latency, "reset_latency", "Reset recovery state latency"); rs_perf.add_time_avg(rs_start_latency, "start_latency", "Start recovery state latency"); rs_perf.add_time_avg(rs_primary_latency, "primary_latency", "Primary recovery state latency"); rs_perf.add_time_avg(rs_peering_latency, "peering_latency", "Peering recovery state latency"); rs_perf.add_time_avg(rs_backfilling_latency, "backfilling_latency", "Backfilling recovery state latency"); rs_perf.add_time_avg(rs_waitremotebackfillreserved_latency, "waitremotebackfillreserved_latency", "Wait remote backfill reserved recovery state latency"); rs_perf.add_time_avg(rs_waitlocalbackfillreserved_latency, "waitlocalbackfillreserved_latency", "Wait local backfill reserved recovery state latency"); rs_perf.add_time_avg(rs_notbackfilling_latency, "notbackfilling_latency", "Notbackfilling recovery state latency"); rs_perf.add_time_avg(rs_repnotrecovering_latency, "repnotrecovering_latency", "Repnotrecovering recovery state latency"); rs_perf.add_time_avg(rs_repwaitrecoveryreserved_latency, "repwaitrecoveryreserved_latency", "Rep wait recovery reserved recovery state latency"); rs_perf.add_time_avg(rs_repwaitbackfillreserved_latency, "repwaitbackfillreserved_latency", "Rep wait backfill reserved recovery state latency"); rs_perf.add_time_avg(rs_reprecovering_latency, "reprecovering_latency", "RepRecovering recovery state latency"); rs_perf.add_time_avg(rs_activating_latency, "activating_latency", "Activating recovery state latency"); rs_perf.add_time_avg(rs_waitlocalrecoveryreserved_latency, "waitlocalrecoveryreserved_latency", "Wait local recovery reserved recovery state latency"); rs_perf.add_time_avg(rs_waitremoterecoveryreserved_latency, "waitremoterecoveryreserved_latency", "Wait remote recovery reserved recovery state latency"); rs_perf.add_time_avg(rs_recovering_latency, "recovering_latency", "Recovering recovery state latency"); rs_perf.add_time_avg(rs_recovered_latency, "recovered_latency", "Recovered recovery state latency"); rs_perf.add_time_avg(rs_clean_latency, "clean_latency", "Clean recovery state latency"); rs_perf.add_time_avg(rs_active_latency, "active_latency", "Active recovery state latency"); rs_perf.add_time_avg(rs_replicaactive_latency, "replicaactive_latency", "Replicaactive recovery state latency"); rs_perf.add_time_avg(rs_stray_latency, "stray_latency", "Stray recovery state latency"); rs_perf.add_time_avg(rs_getinfo_latency, "getinfo_latency", "Getinfo recovery state latency"); rs_perf.add_time_avg(rs_getlog_latency, "getlog_latency", "Getlog recovery state latency"); rs_perf.add_time_avg(rs_waitactingchange_latency, "waitactingchange_latency", "Waitactingchange recovery state latency"); rs_perf.add_time_avg(rs_incomplete_latency, "incomplete_latency", "Incomplete recovery state latency"); rs_perf.add_time_avg(rs_down_latency, "down_latency", "Down recovery state latency"); rs_perf.add_time_avg(rs_getmissing_latency, "getmissing_latency", "Getmissing recovery state latency"); rs_perf.add_time_avg(rs_waitupthru_latency, "waitupthru_latency", "Waitupthru recovery state latency"); rs_perf.add_time_avg(rs_notrecovering_latency, "notrecovering_latency", "Notrecovering recovery state latency"); recoverystate_perf = rs_perf.create_perf_counters(); cct->get_perfcounters_collection()->add(recoverystate_perf); } int OSD::shutdown() { if (cct->_conf->osd_fast_shutdown) { derr << "*** Immediate shutdown (osd_fast_shutdown=true) ***" << dendl; cct->_log->flush(); _exit(0); } if (!service.prepare_to_stop()) return 0; // already shutting down osd_lock.Lock(); if (is_stopping()) { osd_lock.Unlock(); return 0; } dout(0) << "shutdown" << dendl; set_state(STATE_STOPPING); // Debugging if (cct->_conf.get_val("osd_debug_shutdown")) { cct->_conf.set_val("debug_osd", "100"); cct->_conf.set_val("debug_journal", "100"); cct->_conf.set_val("debug_filestore", "100"); cct->_conf.set_val("debug_bluestore", "100"); cct->_conf.set_val("debug_ms", "100"); cct->_conf.apply_changes(nullptr); } // stop MgrClient earlier as it's more like an internal consumer of OSD mgrc.shutdown(); service.start_shutdown(); // stop sending work to pgs. this just prevents any new work in _process // from racing with on_shutdown and potentially entering the pg after. op_shardedwq.drain(); // Shutdown PGs { vector pgs; _get_pgs(&pgs); for (auto pg : pgs) { pg->shutdown(); } } // drain op queue again (in case PGs requeued something) op_shardedwq.drain(); { finished.clear(); // zap waiters (bleh, this is messy) waiting_for_osdmap.clear(); } // unregister commands cct->get_admin_socket()->unregister_commands(asok_hook); delete asok_hook; asok_hook = NULL; cct->get_admin_socket()->unregister_commands(test_ops_hook); delete test_ops_hook; test_ops_hook = NULL; osd_lock.Unlock(); heartbeat_lock.Lock(); heartbeat_stop = true; heartbeat_cond.Signal(); heartbeat_lock.Unlock(); heartbeat_thread.join(); osd_op_tp.drain(); osd_op_tp.stop(); dout(10) << "op sharded tp stopped" << dendl; command_tp.drain(); command_tp.stop(); dout(10) << "command tp stopped" << dendl; dout(10) << "stopping agent" << dendl; service.agent_stop(); boot_finisher.wait_for_empty(); osd_lock.Lock(); boot_finisher.stop(); reset_heartbeat_peers(true); tick_timer.shutdown(); { std::lock_guard l(tick_timer_lock); tick_timer_without_osd_lock.shutdown(); } // note unmount epoch dout(10) << "noting clean unmount in epoch " << get_osdmap_epoch() << dendl; superblock.mounted = service.get_boot_epoch(); superblock.clean_thru = get_osdmap_epoch(); ObjectStore::Transaction t; write_superblock(t); int r = store->queue_transaction(service.meta_ch, std::move(t)); if (r) { derr << "OSD::shutdown: error writing superblock: " << cpp_strerror(r) << dendl; } service.shutdown_reserver(); // Remove PGs #ifdef PG_DEBUG_REFS service.dump_live_pgids(); #endif while (true) { vector pgs; _get_pgs(&pgs, true); if (pgs.empty()) { break; } for (auto& pg : pgs) { if (pg->is_deleted()) { continue; } dout(20) << " kicking pg " << pg << dendl; pg->lock(); if (pg->get_num_ref() != 1) { derr << "pgid " << pg->get_pgid() << " has ref count of " << pg->get_num_ref() << dendl; #ifdef PG_DEBUG_REFS pg->dump_live_ids(); #endif if (cct->_conf->osd_shutdown_pgref_assert) { ceph_abort(); } } pg->ch.reset(); pg->unlock(); } } #ifdef PG_DEBUG_REFS service.dump_live_pgids(); #endif osd_lock.Unlock(); cct->_conf.remove_observer(this); osd_lock.Lock(); service.meta_ch.reset(); dout(10) << "syncing store" << dendl; enable_disable_fuse(true); if (cct->_conf->osd_journal_flush_on_shutdown) { dout(10) << "flushing journal" << dendl; store->flush_journal(); } monc->shutdown(); osd_lock.Unlock(); map_lock.get_write(); set_osdmap(OSDMapRef()); map_lock.put_write(); for (auto s : shards) { std::lock_guard l(s->osdmap_lock); s->shard_osdmap = OSDMapRef(); } service.shutdown(); std::lock_guard lock(osd_lock); store->umount(); delete store; store = nullptr; dout(10) << "Store synced" << dendl; op_tracker.on_shutdown(); class_handler->shutdown(); client_messenger->shutdown(); cluster_messenger->shutdown(); hb_front_client_messenger->shutdown(); hb_back_client_messenger->shutdown(); objecter_messenger->shutdown(); hb_front_server_messenger->shutdown(); hb_back_server_messenger->shutdown(); return r; } int OSD::mon_cmd_maybe_osd_create(string &cmd) { bool created = false; while (true) { dout(10) << __func__ << " cmd: " << cmd << dendl; vector vcmd{cmd}; bufferlist inbl; C_SaferCond w; string outs; monc->start_mon_command(vcmd, inbl, NULL, &outs, &w); int r = w.wait(); if (r < 0) { if (r == -ENOENT && !created) { string newcmd = "{\"prefix\": \"osd create\", \"id\": " + stringify(whoami) + ", \"uuid\": \"" + stringify(superblock.osd_fsid) + "\"}"; vector vnewcmd{newcmd}; bufferlist inbl; C_SaferCond w; string outs; monc->start_mon_command(vnewcmd, inbl, NULL, &outs, &w); int r = w.wait(); if (r < 0) { derr << __func__ << " fail: osd does not exist and created failed: " << cpp_strerror(r) << dendl; return r; } created = true; continue; } derr << __func__ << " fail: '" << outs << "': " << cpp_strerror(r) << dendl; return r; } break; } return 0; } int OSD::update_crush_location() { if (!cct->_conf->osd_crush_update_on_start) { dout(10) << __func__ << " osd_crush_update_on_start = false" << dendl; return 0; } char weight[32]; if (cct->_conf->osd_crush_initial_weight >= 0) { snprintf(weight, sizeof(weight), "%.4lf", cct->_conf->osd_crush_initial_weight); } else { struct store_statfs_t st; osd_alert_list_t alerts; int r = store->statfs(&st, &alerts); if (r < 0) { derr << "statfs: " << cpp_strerror(r) << dendl; return r; } snprintf(weight, sizeof(weight), "%.4lf", std::max(.00001, double(st.total) / double(1ull << 40 /* TB */))); } std::multimap loc = cct->crush_location.get_location(); dout(10) << __func__ << " crush location is " << loc << dendl; string cmd = string("{\"prefix\": \"osd crush create-or-move\", ") + string("\"id\": ") + stringify(whoami) + string(", ") + string("\"weight\":") + weight + string(", ") + string("\"args\": ["); for (multimap::iterator p = loc.begin(); p != loc.end(); ++p) { if (p != loc.begin()) cmd += ", "; cmd += "\"" + p->first + "=" + p->second + "\""; } cmd += "]}"; return mon_cmd_maybe_osd_create(cmd); } int OSD::update_crush_device_class() { if (!cct->_conf->osd_class_update_on_start) { dout(10) << __func__ << " osd_class_update_on_start = false" << dendl; return 0; } string device_class; int r = store->read_meta("crush_device_class", &device_class); if (r < 0 || device_class.empty()) { device_class = store->get_default_device_class(); } if (device_class.empty()) { dout(20) << __func__ << " no device class stored locally" << dendl; return 0; } string cmd = string("{\"prefix\": \"osd crush set-device-class\", ") + string("\"class\": \"") + device_class + string("\", ") + string("\"ids\": [\"") + stringify(whoami) + string("\"]}"); r = mon_cmd_maybe_osd_create(cmd); if (r == -EBUSY) { // good, already bound to a device-class return 0; } else { return r; } } void OSD::write_superblock(ObjectStore::Transaction& t) { dout(10) << "write_superblock " << superblock << dendl; //hack: at minimum it's using the baseline feature set if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_BASE)) superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE); bufferlist bl; encode(superblock, bl); t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl); } int OSD::read_superblock() { bufferlist bl; int r = store->read(service.meta_ch, OSD_SUPERBLOCK_GOBJECT, 0, 0, bl); if (r < 0) return r; auto p = bl.cbegin(); decode(superblock, p); dout(10) << "read_superblock " << superblock << dendl; return 0; } void OSD::clear_temp_objects() { dout(10) << __func__ << dendl; vector ls; store->list_collections(ls); for (vector::iterator p = ls.begin(); p != ls.end(); ++p) { spg_t pgid; if (!p->is_pg(&pgid)) continue; // list temp objects dout(20) << " clearing temps in " << *p << " pgid " << pgid << dendl; vector temps; ghobject_t next; while (1) { vector objects; auto ch = store->open_collection(*p); ceph_assert(ch); store->collection_list(ch, next, ghobject_t::get_max(), store->get_ideal_list_max(), &objects, &next); if (objects.empty()) break; vector::iterator q; for (q = objects.begin(); q != objects.end(); ++q) { // Hammer set pool for temps to -1, so check for clean-up if (q->hobj.is_temp() || (q->hobj.pool == -1)) { temps.push_back(*q); } else { break; } } // If we saw a non-temp object and hit the break above we can // break out of the while loop too. if (q != objects.end()) break; } if (!temps.empty()) { ObjectStore::Transaction t; int removed = 0; for (vector::iterator q = temps.begin(); q != temps.end(); ++q) { dout(20) << " removing " << *p << " object " << *q << dendl; t.remove(*p, *q); if (++removed > cct->_conf->osd_target_transaction_size) { store->queue_transaction(service.meta_ch, std::move(t)); t = ObjectStore::Transaction(); removed = 0; } } if (removed) { store->queue_transaction(service.meta_ch, std::move(t)); } } } } void OSD::recursive_remove_collection(CephContext* cct, ObjectStore *store, spg_t pgid, coll_t tmp) { OSDriver driver( store, coll_t(), make_snapmapper_oid()); ObjectStore::CollectionHandle ch = store->open_collection(tmp); ObjectStore::Transaction t; SnapMapper mapper(cct, &driver, 0, 0, 0, pgid.shard); ghobject_t next; int max = cct->_conf->osd_target_transaction_size; vector objects; objects.reserve(max); while (true) { objects.clear(); store->collection_list(ch, next, ghobject_t::get_max(), max, &objects, &next); generic_dout(10) << __func__ << " " << objects << dendl; if (objects.empty()) break; for (auto& p: objects) { OSDriver::OSTransaction _t(driver.get_transaction(&t)); int r = mapper.remove_oid(p.hobj, &_t); if (r != 0 && r != -ENOENT) ceph_abort(); t.remove(tmp, p); } int r = store->queue_transaction(ch, std::move(t)); ceph_assert(r == 0); t = ObjectStore::Transaction(); } t.remove_collection(tmp); int r = store->queue_transaction(ch, std::move(t)); ceph_assert(r == 0); C_SaferCond waiter; if (!ch->flush_commit(&waiter)) { waiter.wait(); } } // ====================================================== // PG's PG* OSD::_make_pg( OSDMapRef createmap, spg_t pgid) { dout(10) << __func__ << " " << pgid << dendl; pg_pool_t pi; map ec_profile; string name; if (createmap->have_pg_pool(pgid.pool())) { pi = *createmap->get_pg_pool(pgid.pool()); name = createmap->get_pool_name(pgid.pool()); if (pi.is_erasure()) { ec_profile = createmap->get_erasure_code_profile(pi.erasure_code_profile); } } else { // pool was deleted; grab final pg_pool_t off disk. ghobject_t oid = make_final_pool_info_oid(pgid.pool()); bufferlist bl; int r = store->read(service.meta_ch, oid, 0, 0, bl); if (r < 0) { derr << __func__ << " missing pool " << pgid.pool() << " tombstone" << dendl; return nullptr; } ceph_assert(r >= 0); auto p = bl.cbegin(); decode(pi, p); decode(name, p); if (p.end()) { // dev release v13.0.2 did not include ec_profile derr << __func__ << " missing ec_profile from pool " << pgid.pool() << " tombstone" << dendl; return nullptr; } decode(ec_profile, p); } PGPool pool(cct, createmap, pgid.pool(), pi, name); PG *pg; if (pi.type == pg_pool_t::TYPE_REPLICATED || pi.type == pg_pool_t::TYPE_ERASURE) pg = new PrimaryLogPG(&service, createmap, pool, ec_profile, pgid); else ceph_abort(); return pg; } void OSD::_get_pgs(vector *v, bool clear_too) { v->clear(); v->reserve(get_num_pgs()); for (auto& s : shards) { std::lock_guard l(s->shard_lock); for (auto& j : s->pg_slots) { if (j.second->pg && !j.second->pg->is_deleted()) { v->push_back(j.second->pg); if (clear_too) { s->_detach_pg(j.second.get()); } } } } } void OSD::_get_pgids(vector *v) { v->clear(); v->reserve(get_num_pgs()); for (auto& s : shards) { std::lock_guard l(s->shard_lock); for (auto& j : s->pg_slots) { if (j.second->pg && !j.second->pg->is_deleted()) { v->push_back(j.first); } } } } void OSD::register_pg(PGRef pg) { spg_t pgid = pg->get_pgid(); uint32_t shard_index = pgid.hash_to_shard(num_shards); auto sdata = shards[shard_index]; std::lock_guard l(sdata->shard_lock); auto r = sdata->pg_slots.emplace(pgid, make_unique()); ceph_assert(r.second); auto *slot = r.first->second.get(); dout(20) << __func__ << " " << pgid << " " << pg << dendl; sdata->_attach_pg(slot, pg.get()); } bool OSD::try_finish_pg_delete(PG *pg, unsigned old_pg_num) { auto sdata = pg->osd_shard; ceph_assert(sdata); { std::lock_guard l(sdata->shard_lock); auto p = sdata->pg_slots.find(pg->pg_id); if (p == sdata->pg_slots.end() || !p->second->pg) { dout(20) << __func__ << " " << pg->pg_id << " not found" << dendl; return false; } if (p->second->waiting_for_merge_epoch) { dout(20) << __func__ << " " << pg->pg_id << " waiting for merge" << dendl; return false; } dout(20) << __func__ << " " << pg->pg_id << " " << pg << dendl; sdata->_detach_pg(p->second.get()); } for (auto shard : shards) { shard->unprime_split_children(pg->pg_id, old_pg_num); } // update pg count now since we might not get an osdmap any time soon. if (pg->is_primary()) service.logger->dec(l_osd_pg_primary); else if (pg->is_replica()) service.logger->dec(l_osd_pg_replica); else service.logger->dec(l_osd_pg_stray); return true; } PGRef OSD::_lookup_pg(spg_t pgid) { uint32_t shard_index = pgid.hash_to_shard(num_shards); auto sdata = shards[shard_index]; std::lock_guard l(sdata->shard_lock); auto p = sdata->pg_slots.find(pgid); if (p == sdata->pg_slots.end()) { return nullptr; } return p->second->pg; } PGRef OSD::_lookup_lock_pg(spg_t pgid) { PGRef pg = _lookup_pg(pgid); if (!pg) { return nullptr; } pg->lock(); if (!pg->is_deleted()) { return pg; } pg->unlock(); return nullptr; } PGRef OSD::lookup_lock_pg(spg_t pgid) { return _lookup_lock_pg(pgid); } void OSD::load_pgs() { ceph_assert(osd_lock.is_locked()); dout(0) << "load_pgs" << dendl; { auto pghist = make_pg_num_history_oid(); bufferlist bl; int r = store->read(service.meta_ch, pghist, 0, 0, bl, 0); if (r >= 0 && bl.length() > 0) { auto p = bl.cbegin(); decode(pg_num_history, p); } dout(20) << __func__ << " pg_num_history " << pg_num_history << dendl; } vector ls; int r = store->list_collections(ls); if (r < 0) { derr << "failed to list pgs: " << cpp_strerror(-r) << dendl; } int num = 0; for (vector::iterator it = ls.begin(); it != ls.end(); ++it) { spg_t pgid; if (it->is_temp(&pgid) || (it->is_pg(&pgid) && PG::_has_removal_flag(store, pgid))) { dout(10) << "load_pgs " << *it << " removing, legacy or flagged for removal pg" << dendl; recursive_remove_collection(cct, store, pgid, *it); continue; } if (!it->is_pg(&pgid)) { dout(10) << "load_pgs ignoring unrecognized " << *it << dendl; continue; } dout(10) << "pgid " << pgid << " coll " << coll_t(pgid) << dendl; epoch_t map_epoch = 0; int r = PG::peek_map_epoch(store, pgid, &map_epoch); if (r < 0) { derr << __func__ << " unable to peek at " << pgid << " metadata, skipping" << dendl; continue; } PGRef pg; if (map_epoch > 0) { OSDMapRef pgosdmap = service.try_get_map(map_epoch); if (!pgosdmap) { if (!get_osdmap()->have_pg_pool(pgid.pool())) { derr << __func__ << ": could not find map for epoch " << map_epoch << " on pg " << pgid << ", but the pool is not present in the " << "current map, so this is probably a result of bug 10617. " << "Skipping the pg for now, you can use ceph-objectstore-tool " << "to clean it up later." << dendl; continue; } else { derr << __func__ << ": have pgid " << pgid << " at epoch " << map_epoch << ", but missing map. Crashing." << dendl; ceph_abort_msg("Missing map in load_pgs"); } } pg = _make_pg(pgosdmap, pgid); } else { pg = _make_pg(get_osdmap(), pgid); } if (!pg) { recursive_remove_collection(cct, store, pgid, *it); continue; } // there can be no waiters here, so we don't call _wake_pg_slot pg->lock(); pg->ch = store->open_collection(pg->coll); // read pg state, log pg->read_state(store); if (pg->dne()) { dout(10) << "load_pgs " << *it << " deleting dne" << dendl; pg->ch = nullptr; pg->unlock(); recursive_remove_collection(cct, store, pgid, *it); continue; } { uint32_t shard_index = pgid.hash_to_shard(shards.size()); assert(NULL != shards[shard_index]); store->set_collection_commit_queue(pg->coll, &(shards[shard_index]->context_queue)); } pg->reg_next_scrub(); dout(10) << __func__ << " loaded " << *pg << dendl; pg->unlock(); register_pg(pg); ++num; } dout(0) << __func__ << " opened " << num << " pgs" << dendl; } PGRef OSD::handle_pg_create_info(const OSDMapRef& osdmap, const PGCreateInfo *info) { spg_t pgid = info->pgid; if (maybe_wait_for_max_pg(osdmap, pgid, info->by_mon)) { dout(10) << __func__ << " hit max pg, dropping" << dendl; return nullptr; } PG::RecoveryCtx rctx = create_context(); OSDMapRef startmap = get_map(info->epoch); if (info->by_mon) { int64_t pool_id = pgid.pgid.pool(); const pg_pool_t *pool = osdmap->get_pg_pool(pool_id); if (!pool) { dout(10) << __func__ << " ignoring " << pgid << ", pool dne" << dendl; return nullptr; } if (osdmap->require_osd_release >= CEPH_RELEASE_NAUTILUS && !pool->has_flag(pg_pool_t::FLAG_CREATING)) { // this ensures we do not process old creating messages after the // pool's initial pgs have been created (and pg are subsequently // allowed to split or merge). dout(20) << __func__ << " dropping " << pgid << "create, pool does not have CREATING flag set" << dendl; return nullptr; } } int up_primary, acting_primary; vector up, acting; startmap->pg_to_up_acting_osds( pgid.pgid, &up, &up_primary, &acting, &acting_primary); const pg_pool_t* pp = startmap->get_pg_pool(pgid.pool()); if (pp->has_flag(pg_pool_t::FLAG_EC_OVERWRITES) && store->get_type() != "bluestore") { clog->warn() << "pg " << pgid << " is at risk of silent data corruption: " << "the pool allows ec overwrites but is not stored in " << "bluestore, so deep scrubbing will not detect bitrot"; } PG::_create(*rctx.transaction, pgid, pgid.get_split_bits(pp->get_pg_num())); PG::_init(*rctx.transaction, pgid, pp); int role = startmap->calc_pg_role(whoami, acting, acting.size()); if (!pp->is_replicated() && role != pgid.shard) { role = -1; } PGRef pg = _make_pg(startmap, pgid); pg->ch = store->create_new_collection(pg->coll); { uint32_t shard_index = pgid.hash_to_shard(shards.size()); assert(NULL != shards[shard_index]); store->set_collection_commit_queue(pg->coll, &(shards[shard_index]->context_queue)); } pg->lock(true); // we are holding the shard lock ceph_assert(!pg->is_deleted()); pg->init( role, up, up_primary, acting, acting_primary, info->history, info->past_intervals, false, rctx.transaction); pg->init_collection_pool_opts(); if (pg->is_primary()) { Mutex::Locker locker(m_perf_queries_lock); pg->set_dynamic_perf_stats_queries(m_perf_queries); } pg->handle_initialize(&rctx); pg->handle_activate_map(&rctx); dispatch_context(rctx, pg.get(), osdmap, nullptr); dout(10) << __func__ << " new pg " << *pg << dendl; return pg; } bool OSD::maybe_wait_for_max_pg(const OSDMapRef& osdmap, spg_t pgid, bool is_mon_create) { const auto max_pgs_per_osd = (cct->_conf.get_val("mon_max_pg_per_osd") * cct->_conf.get_val("osd_max_pg_per_osd_hard_ratio")); if (num_pgs < max_pgs_per_osd) { return false; } std::lock_guard l(pending_creates_lock); if (is_mon_create) { pending_creates_from_mon++; } else { bool is_primary = osdmap->get_pg_acting_rank(pgid.pgid, whoami) == 0; pending_creates_from_osd.emplace(pgid.pgid, is_primary); } dout(1) << __func__ << " withhold creation of pg " << pgid << ": " << num_pgs << " >= "<< max_pgs_per_osd << dendl; return true; } // to re-trigger a peering, we have to twiddle the pg mapping a little bit, // see PG::should_restart_peering(). OSDMap::pg_to_up_acting_osds() will turn // to up set if pg_temp is empty. so an empty pg_temp won't work. static vector twiddle(const vector& acting) { if (acting.size() > 1) { return {acting[0]}; } else { vector twiddled(acting.begin(), acting.end()); twiddled.push_back(-1); return twiddled; } } void OSD::resume_creating_pg() { bool do_sub_pg_creates = false; bool have_pending_creates = false; { const auto max_pgs_per_osd = (cct->_conf.get_val("mon_max_pg_per_osd") * cct->_conf.get_val("osd_max_pg_per_osd_hard_ratio")); if (max_pgs_per_osd <= num_pgs) { // this could happen if admin decreases this setting before a PG is removed return; } unsigned spare_pgs = max_pgs_per_osd - num_pgs; std::lock_guard l(pending_creates_lock); if (pending_creates_from_mon > 0) { dout(20) << __func__ << " pending_creates_from_mon " << pending_creates_from_mon << dendl; do_sub_pg_creates = true; if (pending_creates_from_mon >= spare_pgs) { spare_pgs = pending_creates_from_mon = 0; } else { spare_pgs -= pending_creates_from_mon; pending_creates_from_mon = 0; } } auto pg = pending_creates_from_osd.cbegin(); while (spare_pgs > 0 && pg != pending_creates_from_osd.cend()) { dout(20) << __func__ << " pg " << pg->first << dendl; vector acting; get_osdmap()->pg_to_up_acting_osds(pg->first, nullptr, nullptr, &acting, nullptr); service.queue_want_pg_temp(pg->first, twiddle(acting), true); pg = pending_creates_from_osd.erase(pg); do_sub_pg_creates = true; spare_pgs--; } have_pending_creates = (pending_creates_from_mon > 0 || !pending_creates_from_osd.empty()); } bool do_renew_subs = false; if (do_sub_pg_creates) { if (monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0)) { dout(4) << __func__ << ": resolicit pg creates from mon since " << last_pg_create_epoch << dendl; do_renew_subs = true; } } version_t start = get_osdmap_epoch() + 1; if (have_pending_creates) { // don't miss any new osdmap deleting PGs if (monc->sub_want("osdmap", start, 0)) { dout(4) << __func__ << ": resolicit osdmap from mon since " << start << dendl; do_renew_subs = true; } } else if (do_sub_pg_creates) { // no need to subscribe the osdmap continuously anymore // once the pgtemp and/or mon_subscribe(pg_creates) is sent if (monc->sub_want_increment("osdmap", start, CEPH_SUBSCRIBE_ONETIME)) { dout(4) << __func__ << ": re-subscribe osdmap(onetime) since " << start << dendl; do_renew_subs = true; } } if (do_renew_subs) { monc->renew_subs(); } service.send_pg_temp(); } void OSD::build_initial_pg_history( spg_t pgid, epoch_t created, utime_t created_stamp, pg_history_t *h, PastIntervals *pi) { dout(10) << __func__ << " " << pgid << " created " << created << dendl; h->epoch_created = created; h->epoch_pool_created = created; h->same_interval_since = created; h->same_up_since = created; h->same_primary_since = created; h->last_scrub_stamp = created_stamp; h->last_deep_scrub_stamp = created_stamp; h->last_clean_scrub_stamp = created_stamp; OSDMapRef lastmap = service.get_map(created); int up_primary, acting_primary; vector up, acting; lastmap->pg_to_up_acting_osds( pgid.pgid, &up, &up_primary, &acting, &acting_primary); ostringstream debug; for (epoch_t e = created + 1; e <= get_osdmap_epoch(); ++e) { OSDMapRef osdmap = service.get_map(e); int new_up_primary, new_acting_primary; vector new_up, new_acting; osdmap->pg_to_up_acting_osds( pgid.pgid, &new_up, &new_up_primary, &new_acting, &new_acting_primary); // this is a bit imprecise, but sufficient? struct min_size_predicate_t : public IsPGRecoverablePredicate { const pg_pool_t *pi; bool operator()(const set &have) const { return have.size() >= pi->min_size; } explicit min_size_predicate_t(const pg_pool_t *i) : pi(i) {} } min_size_predicate(osdmap->get_pg_pool(pgid.pgid.pool())); bool new_interval = PastIntervals::check_new_interval( acting_primary, new_acting_primary, acting, new_acting, up_primary, new_up_primary, up, new_up, h->same_interval_since, h->last_epoch_clean, osdmap, lastmap, pgid.pgid, &min_size_predicate, pi, &debug); if (new_interval) { h->same_interval_since = e; if (up != new_up) { h->same_up_since = e; } if (acting_primary != new_acting_primary) { h->same_primary_since = e; } if (pgid.pgid.is_split(lastmap->get_pg_num(pgid.pgid.pool()), osdmap->get_pg_num(pgid.pgid.pool()), nullptr)) { h->last_epoch_split = e; } up = new_up; acting = new_acting; up_primary = new_up_primary; acting_primary = new_acting_primary; } lastmap = osdmap; } dout(20) << __func__ << " " << debug.str() << dendl; dout(10) << __func__ << " " << *h << " " << *pi << " [" << (pi->empty() ? pair(0,0) : pi->get_bounds()) << ")" << dendl; } void OSD::_add_heartbeat_peer(int p) { if (p == whoami) return; HeartbeatInfo *hi; map::iterator i = heartbeat_peers.find(p); if (i == heartbeat_peers.end()) { pair cons = service.get_con_osd_hb(p, get_osdmap_epoch()); if (!cons.first) return; hi = &heartbeat_peers[p]; hi->peer = p; RefCountedPtr s{new HeartbeatSession{p}, false}; hi->hb_interval_start = ceph_clock_now(); hi->con_back = cons.first.get(); hi->con_back->set_priv(s); if (cons.second) { hi->con_front = cons.second.get(); hi->con_front->set_priv(s); dout(10) << "_add_heartbeat_peer: new peer osd." << p << " " << hi->con_back->get_peer_addr() << " " << hi->con_front->get_peer_addr() << dendl; } else { hi->con_front.reset(NULL); dout(10) << "_add_heartbeat_peer: new peer osd." << p << " " << hi->con_back->get_peer_addr() << dendl; } } else { hi = &i->second; } hi->epoch = get_osdmap_epoch(); } void OSD::_remove_heartbeat_peer(int n) { map::iterator q = heartbeat_peers.find(n); ceph_assert(q != heartbeat_peers.end()); dout(20) << " removing heartbeat peer osd." << n << " " << q->second.con_back->get_peer_addr() << " " << (q->second.con_front ? q->second.con_front->get_peer_addr() : entity_addr_t()) << dendl; q->second.con_back->mark_down(); if (q->second.con_front) { q->second.con_front->mark_down(); } heartbeat_peers.erase(q); } void OSD::need_heartbeat_peer_update() { if (is_stopping()) return; dout(20) << "need_heartbeat_peer_update" << dendl; heartbeat_set_peers_need_update(); } void OSD::maybe_update_heartbeat_peers() { ceph_assert(osd_lock.is_locked()); if (is_waiting_for_healthy() || is_active()) { utime_t now = ceph_clock_now(); if (last_heartbeat_resample == utime_t()) { last_heartbeat_resample = now; heartbeat_set_peers_need_update(); } else if (!heartbeat_peers_need_update()) { utime_t dur = now - last_heartbeat_resample; if (dur > cct->_conf->osd_heartbeat_grace) { dout(10) << "maybe_update_heartbeat_peers forcing update after " << dur << " seconds" << dendl; heartbeat_set_peers_need_update(); last_heartbeat_resample = now; // automatically clean up any stale heartbeat peers // if we are unhealthy, then clean all reset_heartbeat_peers(is_waiting_for_healthy()); } } } if (!heartbeat_peers_need_update()) return; heartbeat_clear_peers_need_update(); std::lock_guard l(heartbeat_lock); dout(10) << "maybe_update_heartbeat_peers updating" << dendl; // build heartbeat from set if (is_active()) { vector pgs; _get_pgs(&pgs); for (auto& pg : pgs) { pg->with_heartbeat_peers([&](int peer) { if (get_osdmap()->is_up(peer)) { _add_heartbeat_peer(peer); } }); } } // include next and previous up osds to ensure we have a fully-connected set set want, extras; const int next = get_osdmap()->get_next_up_osd_after(whoami); if (next >= 0) want.insert(next); int prev = get_osdmap()->get_previous_up_osd_before(whoami); if (prev >= 0 && prev != next) want.insert(prev); // make sure we have at least **min_down** osds coming from different // subtree level (e.g., hosts) for fast failure detection. auto min_down = cct->_conf.get_val("mon_osd_min_down_reporters"); auto subtree = cct->_conf.get_val("mon_osd_reporter_subtree_level"); get_osdmap()->get_random_up_osds_by_subtree( whoami, subtree, min_down, want, &want); for (set::iterator p = want.begin(); p != want.end(); ++p) { dout(10) << " adding neighbor peer osd." << *p << dendl; extras.insert(*p); _add_heartbeat_peer(*p); } // remove down peers; enumerate extras map::iterator p = heartbeat_peers.begin(); while (p != heartbeat_peers.end()) { if (!get_osdmap()->is_up(p->first)) { int o = p->first; ++p; _remove_heartbeat_peer(o); continue; } if (p->second.epoch < get_osdmap_epoch()) { extras.insert(p->first); } ++p; } // too few? for (int n = next; n >= 0; ) { if ((int)heartbeat_peers.size() >= cct->_conf->osd_heartbeat_min_peers) break; if (!extras.count(n) && !want.count(n) && n != whoami) { dout(10) << " adding random peer osd." << n << dendl; extras.insert(n); _add_heartbeat_peer(n); } n = get_osdmap()->get_next_up_osd_after(n); if (n == next) break; // came full circle; stop } // too many? for (set::iterator p = extras.begin(); (int)heartbeat_peers.size() > cct->_conf->osd_heartbeat_min_peers && p != extras.end(); ++p) { if (want.count(*p)) continue; _remove_heartbeat_peer(*p); } dout(10) << "maybe_update_heartbeat_peers " << heartbeat_peers.size() << " peers, extras " << extras << dendl; } void OSD::reset_heartbeat_peers(bool all) { ceph_assert(osd_lock.is_locked()); dout(10) << "reset_heartbeat_peers" << dendl; utime_t stale = ceph_clock_now(); stale -= cct->_conf.get_val("osd_heartbeat_stale"); std::lock_guard l(heartbeat_lock); for (auto it = heartbeat_peers.begin(); it != heartbeat_peers.end();) { HeartbeatInfo& hi = it->second; if (all || hi.is_stale(stale)) { hi.con_back->mark_down(); if (hi.con_front) { hi.con_front->mark_down(); } // stop sending failure_report to mon too failure_queue.erase(it->first); heartbeat_peers.erase(it++); } else { it++; } } } void OSD::handle_osd_ping(MOSDPing *m) { if (superblock.cluster_fsid != m->fsid) { dout(20) << "handle_osd_ping from " << m->get_source_inst() << " bad fsid " << m->fsid << " != " << superblock.cluster_fsid << dendl; m->put(); return; } int from = m->get_source().num(); heartbeat_lock.Lock(); if (is_stopping()) { heartbeat_lock.Unlock(); m->put(); return; } OSDMapRef curmap = service.get_osdmap(); if (!curmap) { heartbeat_lock.Unlock(); m->put(); return; } switch (m->op) { case MOSDPing::PING: { if (cct->_conf->osd_debug_drop_ping_probability > 0) { auto heartbeat_drop = debug_heartbeat_drops_remaining.find(from); if (heartbeat_drop != debug_heartbeat_drops_remaining.end()) { if (heartbeat_drop->second == 0) { debug_heartbeat_drops_remaining.erase(heartbeat_drop); } else { --heartbeat_drop->second; dout(5) << "Dropping heartbeat from " << from << ", " << heartbeat_drop->second << " remaining to drop" << dendl; break; } } else if (cct->_conf->osd_debug_drop_ping_probability > ((((double)(rand()%100))/100.0))) { heartbeat_drop = debug_heartbeat_drops_remaining.insert(std::make_pair(from, cct->_conf->osd_debug_drop_ping_duration)).first; dout(5) << "Dropping heartbeat from " << from << ", " << heartbeat_drop->second << " remaining to drop" << dendl; break; } } if (!cct->get_heartbeat_map()->is_healthy()) { dout(10) << "internal heartbeat not healthy, dropping ping request" << dendl; break; } Message *r = new MOSDPing(monc->get_fsid(), curmap->get_epoch(), MOSDPing::PING_REPLY, m->stamp, cct->_conf->osd_heartbeat_min_size); m->get_connection()->send_message(r); if (curmap->is_up(from)) { service.note_peer_epoch(from, m->map_epoch); if (is_active()) { ConnectionRef con = service.get_con_osd_cluster(from, curmap->get_epoch()); if (con) { service.share_map_peer(from, con.get()); } } } else if (!curmap->exists(from) || curmap->get_down_at(from) > m->map_epoch) { // tell them they have died Message *r = new MOSDPing(monc->get_fsid(), curmap->get_epoch(), MOSDPing::YOU_DIED, m->stamp, cct->_conf->osd_heartbeat_min_size); m->get_connection()->send_message(r); } } break; case MOSDPing::PING_REPLY: { map::iterator i = heartbeat_peers.find(from); if (i != heartbeat_peers.end()) { auto acked = i->second.ping_history.find(m->stamp); if (acked != i->second.ping_history.end()) { utime_t now = ceph_clock_now(); int &unacknowledged = acked->second.second; if (m->get_connection() == i->second.con_back) { dout(25) << "handle_osd_ping got reply from osd." << from << " first_tx " << i->second.first_tx << " last_tx " << i->second.last_tx << " last_rx_back " << i->second.last_rx_back << " -> " << now << " last_rx_front " << i->second.last_rx_front << dendl; i->second.last_rx_back = now; ceph_assert(unacknowledged > 0); --unacknowledged; // if there is no front con, set both stamps. if (i->second.con_front == NULL) { i->second.last_rx_front = now; ceph_assert(unacknowledged > 0); --unacknowledged; } } else if (m->get_connection() == i->second.con_front) { dout(25) << "handle_osd_ping got reply from osd." << from << " first_tx " << i->second.first_tx << " last_tx " << i->second.last_tx << " last_rx_back " << i->second.last_rx_back << " last_rx_front " << i->second.last_rx_front << " -> " << now << dendl; i->second.last_rx_front = now; ceph_assert(unacknowledged > 0); --unacknowledged; } if (unacknowledged == 0) { // succeeded in getting all replies dout(25) << "handle_osd_ping got all replies from osd." << from << " , erase pending ping(sent at " << m->stamp << ")" << " and older pending ping(s)" << dendl; #define ROUND_S_TO_USEC(sec) (uint32_t)((sec) * 1000 * 1000 + 0.5) ++i->second.hb_average_count; uint32_t back_pingtime = ROUND_S_TO_USEC(i->second.last_rx_back - m->stamp); i->second.hb_total_back += back_pingtime; if (back_pingtime < i->second.hb_min_back) i->second.hb_min_back = back_pingtime; if (back_pingtime > i->second.hb_max_back) i->second.hb_max_back = back_pingtime; uint32_t front_pingtime = ROUND_S_TO_USEC(i->second.last_rx_front - m->stamp); i->second.hb_total_front += front_pingtime; if (front_pingtime < i->second.hb_min_front) i->second.hb_min_front = front_pingtime; if (front_pingtime > i->second.hb_max_front) i->second.hb_max_front = front_pingtime; ceph_assert(i->second.hb_interval_start != utime_t()); if (i->second.hb_interval_start == utime_t()) i->second.hb_interval_start = now; int64_t hb_avg_time_period = 60; if (cct->_conf.get_val("debug_heartbeat_testing_span")) { hb_avg_time_period = cct->_conf.get_val("debug_heartbeat_testing_span"); } if (now - i->second.hb_interval_start >= utime_t(hb_avg_time_period, 0)) { uint32_t back_avg = i->second.hb_total_back / i->second.hb_average_count; uint32_t back_min = i->second.hb_min_back; uint32_t back_max = i->second.hb_max_back; uint32_t front_avg = i->second.hb_total_front / i->second.hb_average_count; uint32_t front_min = i->second.hb_min_front; uint32_t front_max = i->second.hb_max_front; // Reset for new interval i->second.hb_average_count = 0; i->second.hb_interval_start = now; i->second.hb_total_back = i->second.hb_max_back = 0; i->second.hb_min_back = UINT_MAX; i->second.hb_total_front = i->second.hb_max_front = 0; i->second.hb_min_front = UINT_MAX; // Record per osd interace ping times // Based on osd_heartbeat_interval ignoring that it is randomly short than this interval if (i->second.hb_back_pingtime.size() == 0) { ceph_assert(i->second.hb_front_pingtime.size() == 0); for (unsigned k = 0 ; k < hb_vector_size; ++k) { i->second.hb_back_pingtime.push_back(back_avg); i->second.hb_back_min.push_back(back_min); i->second.hb_back_max.push_back(back_max); i->second.hb_front_pingtime.push_back(front_avg); i->second.hb_front_min.push_back(front_min); i->second.hb_front_max.push_back(front_max); ++i->second.hb_index; } } else { int index = i->second.hb_index & (hb_vector_size - 1); i->second.hb_back_pingtime[index] = back_avg; i->second.hb_back_min[index] = back_min; i->second.hb_back_max[index] = back_max; i->second.hb_front_pingtime[index] = front_avg; i->second.hb_front_min[index] = front_min; i->second.hb_front_max[index] = front_max; ++i->second.hb_index; } { std::lock_guard l(service.stat_lock); service.osd_stat.hb_pingtime[from].last_update = now.sec(); service.osd_stat.hb_pingtime[from].back_last = back_pingtime; uint32_t total = 0; uint32_t min = UINT_MAX; uint32_t max = 0; uint32_t count = 0; uint32_t which = 0; uint32_t size = (uint32_t)i->second.hb_back_pingtime.size(); for (int32_t k = size - 1 ; k >= 0; --k) { ++count; int index = (i->second.hb_index + k) % size; total += i->second.hb_back_pingtime[index]; if (i->second.hb_back_min[index] < min) min = i->second.hb_back_min[index]; if (i->second.hb_back_max[index] > max) max = i->second.hb_back_max[index]; if (count == 1 || count == 5 || count == 15) { service.osd_stat.hb_pingtime[from].back_pingtime[which] = total / count; service.osd_stat.hb_pingtime[from].back_min[which] = min; service.osd_stat.hb_pingtime[from].back_max[which] = max; which++; if (count == 15) break; } } if (i->second.con_front != NULL) { service.osd_stat.hb_pingtime[from].front_last = front_pingtime; total = 0; min = UINT_MAX; max = 0; count = 0; which = 0; for (int32_t k = size - 1 ; k >= 0; --k) { ++count; int index = (i->second.hb_index + k) % size; total += i->second.hb_front_pingtime[index]; if (i->second.hb_front_min[index] < min) min = i->second.hb_front_min[index]; if (i->second.hb_front_max[index] > max) max = i->second.hb_front_max[index]; if (count == 1 || count == 5 || count == 15) { service.osd_stat.hb_pingtime[from].front_pingtime[which] = total / count; service.osd_stat.hb_pingtime[from].front_min[which] = min; service.osd_stat.hb_pingtime[from].front_max[which] = max; which++; if (count == 15) break; } } } } } else { std::lock_guard l(service.stat_lock); service.osd_stat.hb_pingtime[from].back_last = back_pingtime; if (i->second.con_front != NULL) service.osd_stat.hb_pingtime[from].front_last = front_pingtime; } i->second.ping_history.erase(i->second.ping_history.begin(), ++acked); } if (i->second.is_healthy(now)) { // Cancel false reports auto failure_queue_entry = failure_queue.find(from); if (failure_queue_entry != failure_queue.end()) { dout(10) << "handle_osd_ping canceling queued " << "failure report for osd." << from << dendl; failure_queue.erase(failure_queue_entry); } auto failure_pending_entry = failure_pending.find(from); if (failure_pending_entry != failure_pending.end()) { dout(10) << "handle_osd_ping canceling in-flight " << "failure report for osd." << from << dendl; send_still_alive(curmap->get_epoch(), from, failure_pending_entry->second.second); failure_pending.erase(failure_pending_entry); } } } else { // old replies, deprecated by newly sent pings. dout(10) << "handle_osd_ping no pending ping(sent at " << m->stamp << ") is found, treat as covered by newly sent pings " << "and ignore" << dendl; } } if (m->map_epoch && curmap->is_up(from)) { service.note_peer_epoch(from, m->map_epoch); if (is_active()) { ConnectionRef con = service.get_con_osd_cluster(from, curmap->get_epoch()); if (con) { service.share_map_peer(from, con.get()); } } } } break; case MOSDPing::YOU_DIED: dout(10) << "handle_osd_ping " << m->get_source_inst() << " says i am down in " << m->map_epoch << dendl; osdmap_subscribe(curmap->get_epoch()+1, false); break; } heartbeat_lock.Unlock(); m->put(); } void OSD::heartbeat_entry() { std::lock_guard l(heartbeat_lock); if (is_stopping()) return; while (!heartbeat_stop) { heartbeat(); double wait; if (cct->_conf.get_val("debug_disable_randomized_ping")) { wait = (float)cct->_conf->osd_heartbeat_interval; } else { wait = .5 + ((float)(rand() % 10)/10.0) * (float)cct->_conf->osd_heartbeat_interval; } utime_t w; w.set_from_double(wait); dout(30) << "heartbeat_entry sleeping for " << wait << dendl; heartbeat_cond.WaitInterval(heartbeat_lock, w); if (is_stopping()) return; dout(30) << "heartbeat_entry woke up" << dendl; } } void OSD::heartbeat_check() { ceph_assert(heartbeat_lock.is_locked()); utime_t now = ceph_clock_now(); // check for incoming heartbeats (move me elsewhere?) for (map::iterator p = heartbeat_peers.begin(); p != heartbeat_peers.end(); ++p) { if (p->second.first_tx == utime_t()) { dout(25) << "heartbeat_check we haven't sent ping to osd." << p->first << " yet, skipping" << dendl; continue; } dout(25) << "heartbeat_check osd." << p->first << " first_tx " << p->second.first_tx << " last_tx " << p->second.last_tx << " last_rx_back " << p->second.last_rx_back << " last_rx_front " << p->second.last_rx_front << dendl; if (p->second.is_unhealthy(now)) { utime_t oldest_deadline = p->second.ping_history.begin()->second.first; if (p->second.last_rx_back == utime_t() || p->second.last_rx_front == utime_t()) { derr << "heartbeat_check: no reply from " << p->second.con_front->get_peer_addr().get_sockaddr() << " osd." << p->first << " ever on either front or back, first ping sent " << p->second.first_tx << " (oldest deadline " << oldest_deadline << ")" << dendl; // fail failure_queue[p->first] = p->second.first_tx; } else { derr << "heartbeat_check: no reply from " << p->second.con_front->get_peer_addr().get_sockaddr() << " osd." << p->first << " since back " << p->second.last_rx_back << " front " << p->second.last_rx_front << " (oldest deadline " << oldest_deadline << ")" << dendl; // fail failure_queue[p->first] = std::min(p->second.last_rx_back, p->second.last_rx_front); } } } } void OSD::heartbeat() { ceph_assert(heartbeat_lock.is_locked_by_me()); dout(30) << "heartbeat" << dendl; // get CPU load avg double loadavgs[1]; int hb_interval = cct->_conf->osd_heartbeat_interval; int n_samples = 86400; if (hb_interval > 1) { n_samples /= hb_interval; if (n_samples < 1) n_samples = 1; } if (getloadavg(loadavgs, 1) == 1) { logger->set(l_osd_loadavg, 100 * loadavgs[0]); daily_loadavg = (daily_loadavg * (n_samples - 1) + loadavgs[0]) / n_samples; dout(30) << "heartbeat: daily_loadavg " << daily_loadavg << dendl; } dout(30) << "heartbeat checking stats" << dendl; // refresh peer list and osd stats vector hb_peers; for (map::iterator p = heartbeat_peers.begin(); p != heartbeat_peers.end(); ++p) hb_peers.push_back(p->first); auto new_stat = service.set_osd_stat(hb_peers, get_num_pgs()); dout(5) << __func__ << " " << new_stat << dendl; ceph_assert(new_stat.statfs.total); float pratio; float ratio = service.compute_adjusted_ratio(new_stat, &pratio); service.check_full_status(ratio, pratio); utime_t now = ceph_clock_now(); utime_t deadline = now; deadline += cct->_conf->osd_heartbeat_grace; // send heartbeats for (map::iterator i = heartbeat_peers.begin(); i != heartbeat_peers.end(); ++i) { int peer = i->first; i->second.last_tx = now; if (i->second.first_tx == utime_t()) i->second.first_tx = now; i->second.ping_history[now] = make_pair(deadline, HeartbeatInfo::HEARTBEAT_MAX_CONN); if (i->second.hb_interval_start == utime_t()) i->second.hb_interval_start = now; dout(30) << "heartbeat sending ping to osd." << peer << dendl; i->second.con_back->send_message(new MOSDPing(monc->get_fsid(), service.get_osdmap_epoch(), MOSDPing::PING, now, cct->_conf->osd_heartbeat_min_size)); if (i->second.con_front) i->second.con_front->send_message(new MOSDPing(monc->get_fsid(), service.get_osdmap_epoch(), MOSDPing::PING, now, cct->_conf->osd_heartbeat_min_size)); } logger->set(l_osd_hb_to, heartbeat_peers.size()); // hmm.. am i all alone? dout(30) << "heartbeat lonely?" << dendl; if (heartbeat_peers.empty()) { if (now - last_mon_heartbeat > cct->_conf->osd_mon_heartbeat_interval && is_active()) { last_mon_heartbeat = now; dout(10) << "i have no heartbeat peers; checking mon for new map" << dendl; osdmap_subscribe(get_osdmap_epoch() + 1, false); } } dout(30) << "heartbeat done" << dendl; } bool OSD::heartbeat_reset(Connection *con) { std::lock_guard l(heartbeat_lock); auto s = con->get_priv(); con->set_priv(nullptr); if (s) { if (is_stopping()) { return true; } auto heartbeat_session = static_cast(s.get()); auto p = heartbeat_peers.find(heartbeat_session->peer); if (p != heartbeat_peers.end() && (p->second.con_back == con || p->second.con_front == con)) { dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer << ", reopening" << dendl; if (con != p->second.con_back) { p->second.con_back->mark_down(); } p->second.con_back.reset(NULL); if (p->second.con_front && con != p->second.con_front) { p->second.con_front->mark_down(); } p->second.con_front.reset(NULL); pair newcon = service.get_con_osd_hb(p->second.peer, p->second.epoch); if (newcon.first) { p->second.con_back = newcon.first.get(); p->second.con_back->set_priv(s); if (newcon.second) { p->second.con_front = newcon.second.get(); p->second.con_front->set_priv(s); } p->second.ping_history.clear(); } else { dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer << ", raced with osdmap update, closing out peer" << dendl; heartbeat_peers.erase(p); } } else { dout(10) << "heartbeat_reset closing (old) failed hb con " << con << dendl; } } return true; } // ========================================= void OSD::tick() { ceph_assert(osd_lock.is_locked()); dout(10) << "tick" << dendl; if (is_active() || is_waiting_for_healthy()) { maybe_update_heartbeat_peers(); } if (is_waiting_for_healthy()) { start_boot(); } if (is_waiting_for_healthy() || is_booting()) { std::lock_guard l(heartbeat_lock); utime_t now = ceph_clock_now(); if (now - last_mon_heartbeat > cct->_conf->osd_mon_heartbeat_interval) { last_mon_heartbeat = now; dout(1) << __func__ << " checking mon for new map" << dendl; osdmap_subscribe(get_osdmap_epoch() + 1, false); } } do_waiters(); tick_timer.add_event_after(get_tick_interval(), new C_Tick(this)); } void OSD::tick_without_osd_lock() { ceph_assert(tick_timer_lock.is_locked()); dout(10) << "tick_without_osd_lock" << dendl; logger->set(l_osd_cached_crc, buffer::get_cached_crc()); logger->set(l_osd_cached_crc_adjusted, buffer::get_cached_crc_adjusted()); logger->set(l_osd_missed_crc, buffer::get_missed_crc()); // refresh osd stats struct store_statfs_t stbuf; osd_alert_list_t alerts; int r = store->statfs(&stbuf, &alerts); ceph_assert(r == 0); service.set_statfs(stbuf, alerts); // osd_lock is not being held, which means the OSD state // might change when doing the monitor report if (is_active() || is_waiting_for_healthy()) { heartbeat_lock.Lock(); heartbeat_check(); heartbeat_lock.Unlock(); map_lock.get_read(); std::lock_guard l(mon_report_lock); // mon report? utime_t now = ceph_clock_now(); if (service.need_fullness_update() || now - last_mon_report > cct->_conf->osd_mon_report_interval) { last_mon_report = now; send_full_update(); send_failures(); } map_lock.put_read(); epoch_t max_waiting_epoch = 0; for (auto s : shards) { max_waiting_epoch = std::max(max_waiting_epoch, s->get_max_waiting_epoch()); } if (max_waiting_epoch > get_osdmap()->get_epoch()) { dout(20) << __func__ << " max_waiting_epoch " << max_waiting_epoch << ", requesting new map" << dendl; osdmap_subscribe(superblock.newest_map + 1, false); } } if (is_active()) { if (!scrub_random_backoff()) { sched_scrub(); } service.promote_throttle_recalibrate(); resume_creating_pg(); bool need_send_beacon = false; const auto now = ceph::coarse_mono_clock::now(); { // borrow lec lock to pretect last_sent_beacon from changing std::lock_guard l{min_last_epoch_clean_lock}; const auto elapsed = now - last_sent_beacon; if (chrono::duration_cast(elapsed).count() > cct->_conf->osd_beacon_report_interval) { need_send_beacon = true; } } if (need_send_beacon) { send_beacon(now); } } mgrc.update_daemon_health(get_health_metrics()); service.kick_recovery_queue(); tick_timer_without_osd_lock.add_event_after(get_tick_interval(), new C_Tick_WithoutOSDLock(this)); } // Usage: // setomapval [namespace/] // rmomapkey [namespace/] // setomapheader [namespace/]
// getomap [namespace/] // truncobj [namespace/] // injectmdataerr [namespace/] [shardid] // injectdataerr [namespace/] [shardid] // // set_recovery_delay [utime] void TestOpsSocketHook::test_ops(OSDService *service, ObjectStore *store, std::string_view command, const cmdmap_t& cmdmap, ostream &ss) { //Test support //Support changing the omap on a single osd by using the Admin Socket to //directly request the osd make a change. if (command == "setomapval" || command == "rmomapkey" || command == "setomapheader" || command == "getomap" || command == "truncobj" || command == "injectmdataerr" || command == "injectdataerr" ) { pg_t rawpg; int64_t pool; OSDMapRef curmap = service->get_osdmap(); int r = -1; string poolstr; cmd_getval(service->cct, cmdmap, "pool", poolstr); pool = curmap->lookup_pg_pool_name(poolstr); //If we can't find it by name then maybe id specified if (pool < 0 && isdigit(poolstr[0])) pool = atoll(poolstr.c_str()); if (pool < 0) { ss << "Invalid pool '" << poolstr << "''"; return; } string objname, nspace; cmd_getval(service->cct, cmdmap, "objname", objname); std::size_t found = objname.find_first_of('/'); if (found != string::npos) { nspace = objname.substr(0, found); objname = objname.substr(found+1); } object_locator_t oloc(pool, nspace); r = curmap->object_locator_to_pg(object_t(objname), oloc, rawpg); if (r < 0) { ss << "Invalid namespace/objname"; return; } int64_t shardid; cmd_getval(service->cct, cmdmap, "shardid", shardid, int64_t(shard_id_t::NO_SHARD)); hobject_t obj(object_t(objname), string(""), CEPH_NOSNAP, rawpg.ps(), pool, nspace); ghobject_t gobj(obj, ghobject_t::NO_GEN, shard_id_t(uint8_t(shardid))); spg_t pgid(curmap->raw_pg_to_pg(rawpg), shard_id_t(shardid)); if (curmap->pg_is_ec(rawpg)) { if ((command != "injectdataerr") && (command != "injectmdataerr")) { ss << "Must not call on ec pool, except injectdataerr or injectmdataerr"; return; } } ObjectStore::Transaction t; if (command == "setomapval") { map newattrs; bufferlist val; string key, valstr; cmd_getval(service->cct, cmdmap, "key", key); cmd_getval(service->cct, cmdmap, "val", valstr); val.append(valstr); newattrs[key] = val; t.omap_setkeys(coll_t(pgid), ghobject_t(obj), newattrs); r = store->queue_transaction(service->meta_ch, std::move(t)); if (r < 0) ss << "error=" << r; else ss << "ok"; } else if (command == "rmomapkey") { string key; set keys; cmd_getval(service->cct, cmdmap, "key", key); keys.insert(key); t.omap_rmkeys(coll_t(pgid), ghobject_t(obj), keys); r = store->queue_transaction(service->meta_ch, std::move(t)); if (r < 0) ss << "error=" << r; else ss << "ok"; } else if (command == "setomapheader") { bufferlist newheader; string headerstr; cmd_getval(service->cct, cmdmap, "header", headerstr); newheader.append(headerstr); t.omap_setheader(coll_t(pgid), ghobject_t(obj), newheader); r = store->queue_transaction(service->meta_ch, std::move(t)); if (r < 0) ss << "error=" << r; else ss << "ok"; } else if (command == "getomap") { //Debug: Output entire omap bufferlist hdrbl; map keyvals; auto ch = store->open_collection(coll_t(pgid)); if (!ch) { ss << "unable to open collection for " << pgid; r = -ENOENT; } else { r = store->omap_get(ch, ghobject_t(obj), &hdrbl, &keyvals); if (r >= 0) { ss << "header=" << string(hdrbl.c_str(), hdrbl.length()); for (map::iterator it = keyvals.begin(); it != keyvals.end(); ++it) ss << " key=" << (*it).first << " val=" << string((*it).second.c_str(), (*it).second.length()); } else { ss << "error=" << r; } } } else if (command == "truncobj") { int64_t trunclen; cmd_getval(service->cct, cmdmap, "len", trunclen); t.truncate(coll_t(pgid), ghobject_t(obj), trunclen); r = store->queue_transaction(service->meta_ch, std::move(t)); if (r < 0) ss << "error=" << r; else ss << "ok"; } else if (command == "injectdataerr") { store->inject_data_error(gobj); ss << "ok"; } else if (command == "injectmdataerr") { store->inject_mdata_error(gobj); ss << "ok"; } return; } if (command == "set_recovery_delay") { int64_t delay; cmd_getval(service->cct, cmdmap, "utime", delay, (int64_t)0); ostringstream oss; oss << delay; int r = service->cct->_conf.set_val("osd_recovery_delay_start", oss.str().c_str()); if (r != 0) { ss << "set_recovery_delay: error setting " << "osd_recovery_delay_start to '" << delay << "': error " << r; return; } service->cct->_conf.apply_changes(nullptr); ss << "set_recovery_delay: set osd_recovery_delay_start " << "to " << service->cct->_conf->osd_recovery_delay_start; return; } if (command == "trigger_scrub" || command == "trigger_deep_scrub") { spg_t pgid; bool deep = (command == "trigger_deep_scrub"); OSDMapRef curmap = service->get_osdmap(); string pgidstr; cmd_getval(service->cct, cmdmap, "pgid", pgidstr); if (!pgid.parse(pgidstr.c_str())) { ss << "Invalid pgid specified"; return; } int64_t time; cmd_getval(service->cct, cmdmap, "time", time, (int64_t)0); PGRef pg = service->osd->_lookup_lock_pg(pgid); if (pg == nullptr) { ss << "Can't find pg " << pgid; return; } if (pg->is_primary()) { pg->unreg_next_scrub(); const pg_pool_t *p = curmap->get_pg_pool(pgid.pool()); double pool_scrub_max_interval = 0; double scrub_max_interval; if (deep) { p->opts.get(pool_opts_t::DEEP_SCRUB_INTERVAL, &pool_scrub_max_interval); scrub_max_interval = pool_scrub_max_interval > 0 ? pool_scrub_max_interval : g_conf()->osd_deep_scrub_interval; } else { p->opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &pool_scrub_max_interval); scrub_max_interval = pool_scrub_max_interval > 0 ? pool_scrub_max_interval : g_conf()->osd_scrub_max_interval; } // Instead of marking must_scrub force a schedule scrub utime_t stamp = ceph_clock_now(); if (time == 0) stamp -= scrub_max_interval; else stamp -= (float)time; stamp -= 100.0; // push back last scrub more for good measure if (deep) { pg->set_last_deep_scrub_stamp(stamp); } else { pg->set_last_scrub_stamp(stamp); } pg->reg_next_scrub(); pg->publish_stats_to_osd(); ss << "ok - set" << (deep ? " deep" : "" ) << " stamp " << stamp; } else { ss << "Not primary"; } pg->unlock(); return; } if (command == "injectfull") { int64_t count; string type; OSDService::s_names state; cmd_getval(service->cct, cmdmap, "type", type, string("full")); cmd_getval(service->cct, cmdmap, "count", count, (int64_t)-1); if (type == "none" || count == 0) { type = "none"; count = 0; } state = service->get_full_state(type); if (state == OSDService::s_names::INVALID) { ss << "Invalid type use (none, nearfull, backfillfull, full, failsafe)"; return; } service->set_injectfull(state, count); return; } ss << "Internal error - command=" << command; } // ========================================= void OSD::ms_handle_connect(Connection *con) { dout(10) << __func__ << " con " << con << dendl; if (con->get_peer_type() == CEPH_ENTITY_TYPE_MON) { std::lock_guard l(osd_lock); if (is_stopping()) return; dout(10) << __func__ << " on mon" << dendl; if (is_preboot()) { start_boot(); } else if (is_booting()) { _send_boot(); // resend boot message } else { map_lock.get_read(); std::lock_guard l2(mon_report_lock); utime_t now = ceph_clock_now(); last_mon_report = now; // resend everything, it's a new session send_full_update(); send_alive(); service.requeue_pg_temp(); service.clear_sent_ready_to_merge(); service.send_pg_temp(); service.send_ready_to_merge(); service.send_pg_created(); requeue_failures(); send_failures(); map_lock.put_read(); if (is_active()) { send_beacon(ceph::coarse_mono_clock::now()); } } // full map requests may happen while active or pre-boot if (requested_full_first) { rerequest_full_maps(); } } } void OSD::ms_handle_fast_connect(Connection *con) { if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON && con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) { auto priv = con->get_priv(); auto s = static_cast(priv.get()); if (!s) { s = new Session{cct, con}; con->set_priv(RefCountedPtr{s, false}); dout(10) << " new session (outgoing) " << s << " con=" << s->con << " addr=" << s->con->get_peer_addr() << dendl; // we don't connect to clients ceph_assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD); s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD); } } } void OSD::ms_handle_fast_accept(Connection *con) { if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON && con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) { auto priv = con->get_priv(); auto s = static_cast(priv.get()); if (!s) { s = new Session{cct, con}; con->set_priv(RefCountedPtr{s, false}); dout(10) << "new session (incoming)" << s << " con=" << con << " addr=" << con->get_peer_addr() << " must have raced with connect" << dendl; ceph_assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD); s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD); } } } bool OSD::ms_handle_reset(Connection *con) { auto s = con->get_priv(); auto session = static_cast(s.get()); dout(2) << "ms_handle_reset con " << con << " session " << session << dendl; if (!session) return false; session->wstate.reset(con); session->con->set_priv(nullptr); session->con.reset(); // break con <-> session ref cycle // note that we break session->con *before* the session_handle_reset // cleanup below. this avoids a race between us and // PG::add_backoff, Session::check_backoff, etc. session_handle_reset(SessionRef{session}); return true; } bool OSD::ms_handle_refused(Connection *con) { if (!cct->_conf->osd_fast_fail_on_connection_refused) return false; auto priv = con->get_priv(); auto session = static_cast(priv.get()); dout(2) << "ms_handle_refused con " << con << " session " << session << dendl; if (!session) return false; int type = con->get_peer_type(); // handle only OSD failures here if (monc && (type == CEPH_ENTITY_TYPE_OSD)) { OSDMapRef osdmap = get_osdmap(); if (osdmap) { int id = osdmap->identify_osd_on_all_channels(con->get_peer_addr()); if (id >= 0 && osdmap->is_up(id)) { // I'm cheating mon heartbeat grace logic, because we know it's not going // to respawn alone. +1 so we won't hit any boundary case. monc->send_mon_message( new MOSDFailure( monc->get_fsid(), id, osdmap->get_addrs(id), cct->_conf->osd_heartbeat_grace + 1, osdmap->get_epoch(), MOSDFailure::FLAG_IMMEDIATE | MOSDFailure::FLAG_FAILED )); } } } return true; } struct C_OSD_GetVersion : public Context { OSD *osd; uint64_t oldest, newest; explicit C_OSD_GetVersion(OSD *o) : osd(o), oldest(0), newest(0) {} void finish(int r) override { if (r >= 0) osd->_got_mon_epochs(oldest, newest); } }; void OSD::start_boot() { if (!_is_healthy()) { // if we are not healthy, do not mark ourselves up (yet) dout(1) << "not healthy; waiting to boot" << dendl; if (!is_waiting_for_healthy()) start_waiting_for_healthy(); // send pings sooner rather than later heartbeat_kick(); return; } dout(1) << __func__ << dendl; set_state(STATE_PREBOOT); dout(10) << "start_boot - have maps " << superblock.oldest_map << ".." << superblock.newest_map << dendl; C_OSD_GetVersion *c = new C_OSD_GetVersion(this); monc->get_version("osdmap", &c->newest, &c->oldest, c); } void OSD::_got_mon_epochs(epoch_t oldest, epoch_t newest) { std::lock_guard l(osd_lock); if (is_preboot()) { _preboot(oldest, newest); } } void OSD::_preboot(epoch_t oldest, epoch_t newest) { ceph_assert(is_preboot()); dout(10) << __func__ << " _preboot mon has osdmaps " << oldest << ".." << newest << dendl; // ensure our local fullness awareness is accurate { std::lock_guard l(heartbeat_lock); heartbeat(); } const auto osdmap = get_osdmap(); // if our map within recent history, try to add ourselves to the osdmap. if (osdmap->get_epoch() == 0) { derr << "waiting for initial osdmap" << dendl; } else if (osdmap->is_destroyed(whoami)) { derr << "osdmap says I am destroyed" << dendl; // provide a small margin so we don't livelock seeing if we // un-destroyed ourselves. if (osdmap->get_epoch() > newest - 1) { exit(0); } } else if (osdmap->is_noup(whoami)) { derr << "osdmap NOUP flag is set, waiting for it to clear" << dendl; } else if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE)) { derr << "osdmap SORTBITWISE OSDMap flag is NOT set; please set it" << dendl; } else if (osdmap->require_osd_release < CEPH_RELEASE_LUMINOUS) { derr << "osdmap require_osd_release < luminous; please upgrade to luminous" << dendl; } else if (service.need_fullness_update()) { derr << "osdmap fullness state needs update" << dendl; send_full_update(); } else if (osdmap->get_epoch() >= oldest - 1 && osdmap->get_epoch() + cct->_conf->osd_map_message_max > newest) { // wait for pgs to fully catch up in a different thread, since // this thread might be required for splitting and merging PGs to // make progress. boot_finisher.queue( new FunctionContext( [this](int r) { std::lock_guard l(osd_lock); if (is_preboot()) { dout(10) << __func__ << " waiting for peering work to drain" << dendl; osd_lock.Unlock(); for (auto shard : shards) { shard->wait_min_pg_epoch(get_osdmap_epoch()); } osd_lock.Lock(); } if (is_preboot()) { _send_boot(); } })); return; } // get all the latest maps if (osdmap->get_epoch() + 1 >= oldest) osdmap_subscribe(osdmap->get_epoch() + 1, false); else osdmap_subscribe(oldest - 1, true); } void OSD::send_full_update() { if (!service.need_fullness_update()) return; unsigned state = 0; if (service.is_full()) { state = CEPH_OSD_FULL; } else if (service.is_backfillfull()) { state = CEPH_OSD_BACKFILLFULL; } else if (service.is_nearfull()) { state = CEPH_OSD_NEARFULL; } set s; OSDMap::calc_state_set(state, s); dout(10) << __func__ << " want state " << s << dendl; monc->send_mon_message(new MOSDFull(get_osdmap_epoch(), state)); } void OSD::start_waiting_for_healthy() { dout(1) << "start_waiting_for_healthy" << dendl; set_state(STATE_WAITING_FOR_HEALTHY); last_heartbeat_resample = utime_t(); // subscribe to osdmap updates, in case our peers really are known to be dead osdmap_subscribe(get_osdmap_epoch() + 1, false); } bool OSD::_is_healthy() { if (!cct->get_heartbeat_map()->is_healthy()) { dout(1) << "is_healthy false -- internal heartbeat failed" << dendl; return false; } if (is_waiting_for_healthy()) { utime_t now = ceph_clock_now(); utime_t grace = utime_t(cct->_conf->osd_max_markdown_period, 0); while (!osd_markdown_log.empty() && osd_markdown_log.front() + grace < now) osd_markdown_log.pop_front(); if (osd_markdown_log.size() <= 1) { dout(5) << __func__ << " first time marked as down," << " try reboot unconditionally" << dendl; return true; } std::lock_guard l(heartbeat_lock); int num = 0, up = 0; for (map::iterator p = heartbeat_peers.begin(); p != heartbeat_peers.end(); ++p) { if (p->second.is_healthy(now)) ++up; ++num; } if ((float)up < (float)num * cct->_conf->osd_heartbeat_min_healthy_ratio) { dout(1) << "is_healthy false -- only " << up << "/" << num << " up peers (less than " << int(cct->_conf->osd_heartbeat_min_healthy_ratio * 100.0) << "%)" << dendl; return false; } } return true; } void OSD::_send_boot() { dout(10) << "_send_boot" << dendl; Connection *local_connection = cluster_messenger->get_loopback_connection().get(); entity_addrvec_t client_addrs = client_messenger->get_myaddrs(); entity_addrvec_t cluster_addrs = cluster_messenger->get_myaddrs(); entity_addrvec_t hb_back_addrs = hb_back_server_messenger->get_myaddrs(); entity_addrvec_t hb_front_addrs = hb_front_server_messenger->get_myaddrs(); dout(20) << " initial client_addrs " << client_addrs << ", cluster_addrs " << cluster_addrs << ", hb_back_addrs " << hb_back_addrs << ", hb_front_addrs " << hb_front_addrs << dendl; if (cluster_messenger->set_addr_unknowns(client_addrs)) { dout(10) << " assuming cluster_addrs match client_addrs " << client_addrs << dendl; cluster_addrs = cluster_messenger->get_myaddrs(); } if (auto session = local_connection->get_priv(); !session) { cluster_messenger->ms_deliver_handle_fast_connect(local_connection); } local_connection = hb_back_server_messenger->get_loopback_connection().get(); if (hb_back_server_messenger->set_addr_unknowns(cluster_addrs)) { dout(10) << " assuming hb_back_addrs match cluster_addrs " << cluster_addrs << dendl; hb_back_addrs = hb_back_server_messenger->get_myaddrs(); } if (auto session = local_connection->get_priv(); !session) { hb_back_server_messenger->ms_deliver_handle_fast_connect(local_connection); } local_connection = hb_front_server_messenger->get_loopback_connection().get(); if (hb_front_server_messenger->set_addr_unknowns(client_addrs)) { dout(10) << " assuming hb_front_addrs match client_addrs " << client_addrs << dendl; hb_front_addrs = hb_front_server_messenger->get_myaddrs(); } if (auto session = local_connection->get_priv(); !session) { hb_front_server_messenger->ms_deliver_handle_fast_connect(local_connection); } // we now know what our front and back addrs will be, and we are // about to tell the mon what our metadata (including numa bindings) // are, so now is a good time! set_numa_affinity(); MOSDBoot *mboot = new MOSDBoot( superblock, get_osdmap_epoch(), service.get_boot_epoch(), hb_back_addrs, hb_front_addrs, cluster_addrs, CEPH_FEATURES_ALL); dout(10) << " final client_addrs " << client_addrs << ", cluster_addrs " << cluster_addrs << ", hb_back_addrs " << hb_back_addrs << ", hb_front_addrs " << hb_front_addrs << dendl; _collect_metadata(&mboot->metadata); monc->send_mon_message(mboot); set_state(STATE_BOOTING); } void OSD::_collect_metadata(map *pm) { // config info (*pm)["osd_data"] = dev_path; if (store->get_type() == "filestore") { // not applicable for bluestore (*pm)["osd_journal"] = journal_path; } (*pm)["front_addr"] = stringify(client_messenger->get_myaddrs()); (*pm)["back_addr"] = stringify(cluster_messenger->get_myaddrs()); (*pm)["hb_front_addr"] = stringify(hb_front_server_messenger->get_myaddrs()); (*pm)["hb_back_addr"] = stringify(hb_back_server_messenger->get_myaddrs()); // backend (*pm)["osd_objectstore"] = store->get_type(); (*pm)["rotational"] = store_is_rotational ? "1" : "0"; (*pm)["journal_rotational"] = journal_is_rotational ? "1" : "0"; (*pm)["default_device_class"] = store->get_default_device_class(); store->collect_metadata(pm); collect_sys_info(pm, cct); (*pm)["front_iface"] = pick_iface( cct, client_messenger->get_myaddrs().front().get_sockaddr_storage()); (*pm)["back_iface"] = pick_iface( cct, cluster_messenger->get_myaddrs().front().get_sockaddr_storage()); // network numa { int node = -1; set nodes; set unknown; for (auto nm : { "front_iface", "back_iface" }) { if (!(*pm)[nm].size()) { unknown.insert(nm); continue; } int n = -1; int r = get_iface_numa_node((*pm)[nm], &n); if (r < 0) { unknown.insert((*pm)[nm]); continue; } nodes.insert(n); if (node < 0) { node = n; } } if (unknown.size()) { (*pm)["network_numa_unknown_ifaces"] = stringify(unknown); } if (!nodes.empty()) { (*pm)["network_numa_nodes"] = stringify(nodes); } if (node >= 0 && nodes.size() == 1 && unknown.empty()) { (*pm)["network_numa_node"] = stringify(node); } } if (numa_node >= 0) { (*pm)["numa_node"] = stringify(numa_node); (*pm)["numa_node_cpus"] = cpu_set_to_str_list(numa_cpu_set_size, &numa_cpu_set); } set devnames; store->get_devices(&devnames); map errs; get_device_metadata(devnames, pm, &errs); for (auto& i : errs) { dout(1) << __func__ << " " << i.first << ": " << i.second << dendl; } dout(10) << __func__ << " " << *pm << dendl; } void OSD::queue_want_up_thru(epoch_t want) { map_lock.get_read(); epoch_t cur = get_osdmap()->get_up_thru(whoami); std::lock_guard l(mon_report_lock); if (want > up_thru_wanted) { dout(10) << "queue_want_up_thru now " << want << " (was " << up_thru_wanted << ")" << ", currently " << cur << dendl; up_thru_wanted = want; send_alive(); } else { dout(10) << "queue_want_up_thru want " << want << " <= queued " << up_thru_wanted << ", currently " << cur << dendl; } map_lock.put_read(); } void OSD::send_alive() { ceph_assert(mon_report_lock.is_locked()); const auto osdmap = get_osdmap(); if (!osdmap->exists(whoami)) return; epoch_t up_thru = osdmap->get_up_thru(whoami); dout(10) << "send_alive up_thru currently " << up_thru << " want " << up_thru_wanted << dendl; if (up_thru_wanted > up_thru) { dout(10) << "send_alive want " << up_thru_wanted << dendl; monc->send_mon_message(new MOSDAlive(osdmap->get_epoch(), up_thru_wanted)); } } void OSD::request_full_map(epoch_t first, epoch_t last) { dout(10) << __func__ << " " << first << ".." << last << ", previously requested " << requested_full_first << ".." << requested_full_last << dendl; ceph_assert(osd_lock.is_locked()); ceph_assert(first > 0 && last > 0); ceph_assert(first <= last); ceph_assert(first >= requested_full_first); // we shouldn't ever ask for older maps if (requested_full_first == 0) { // first request requested_full_first = first; requested_full_last = last; } else if (last <= requested_full_last) { // dup return; } else { // additional request first = requested_full_last + 1; requested_full_last = last; } MMonGetOSDMap *req = new MMonGetOSDMap; req->request_full(first, last); monc->send_mon_message(req); } void OSD::got_full_map(epoch_t e) { ceph_assert(requested_full_first <= requested_full_last); ceph_assert(osd_lock.is_locked()); if (requested_full_first == 0) { dout(20) << __func__ << " " << e << ", nothing requested" << dendl; return; } if (e < requested_full_first) { dout(10) << __func__ << " " << e << ", requested " << requested_full_first << ".." << requested_full_last << ", ignoring" << dendl; return; } if (e >= requested_full_last) { dout(10) << __func__ << " " << e << ", requested " << requested_full_first << ".." << requested_full_last << ", resetting" << dendl; requested_full_first = requested_full_last = 0; return; } requested_full_first = e + 1; dout(10) << __func__ << " " << e << ", requested " << requested_full_first << ".." << requested_full_last << ", still need more" << dendl; } void OSD::requeue_failures() { std::lock_guard l(heartbeat_lock); unsigned old_queue = failure_queue.size(); unsigned old_pending = failure_pending.size(); for (auto p = failure_pending.begin(); p != failure_pending.end(); ) { failure_queue[p->first] = p->second.first; failure_pending.erase(p++); } dout(10) << __func__ << " " << old_queue << " + " << old_pending << " -> " << failure_queue.size() << dendl; } void OSD::send_failures() { ceph_assert(map_lock.is_locked()); ceph_assert(mon_report_lock.is_locked()); std::lock_guard l(heartbeat_lock); utime_t now = ceph_clock_now(); const auto osdmap = get_osdmap(); while (!failure_queue.empty()) { int osd = failure_queue.begin()->first; if (!failure_pending.count(osd)) { int failed_for = (int)(double)(now - failure_queue.begin()->second); monc->send_mon_message( new MOSDFailure( monc->get_fsid(), osd, osdmap->get_addrs(osd), failed_for, osdmap->get_epoch())); failure_pending[osd] = make_pair(failure_queue.begin()->second, osdmap->get_addrs(osd)); } failure_queue.erase(osd); } } void OSD::send_still_alive(epoch_t epoch, int osd, const entity_addrvec_t &addrs) { MOSDFailure *m = new MOSDFailure(monc->get_fsid(), osd, addrs, 0, epoch, MOSDFailure::FLAG_ALIVE); monc->send_mon_message(m); } void OSD::cancel_pending_failures() { std::lock_guard l(heartbeat_lock); auto it = failure_pending.begin(); while (it != failure_pending.end()) { dout(10) << __func__ << " canceling in-flight failure report for osd." << it->first << dendl; send_still_alive(get_osdmap_epoch(), it->first, it->second.second); failure_pending.erase(it++); } } void OSD::send_beacon(const ceph::coarse_mono_clock::time_point& now) { const auto& monmap = monc->monmap; // send beacon to mon even if we are just connected, and the monmap is not // initialized yet by then. if (monmap.epoch > 0 && monmap.get_required_features().contains_all( ceph::features::mon::FEATURE_LUMINOUS)) { dout(20) << __func__ << " sending" << dendl; MOSDBeacon* beacon = nullptr; { std::lock_guard l{min_last_epoch_clean_lock}; beacon = new MOSDBeacon(get_osdmap_epoch(), min_last_epoch_clean); beacon->pgs = min_last_epoch_clean_pgs; last_sent_beacon = now; } monc->send_mon_message(beacon); } else { dout(20) << __func__ << " not sending" << dendl; } } void OSD::handle_command(MMonCommand *m) { if (!require_mon_peer(m)) { m->put(); return; } Command *c = new Command(m->cmd, m->get_tid(), m->get_data(), NULL); command_wq.queue(c); m->put(); } void OSD::handle_command(MCommand *m) { ConnectionRef con = m->get_connection(); auto priv = con->get_priv(); auto session = static_cast(priv.get()); if (!session) { con->send_message(new MCommandReply(m, -EPERM)); m->put(); return; } OSDCap& caps = session->caps; priv.reset(); if (!caps.allow_all() || m->get_source().is_mon()) { con->send_message(new MCommandReply(m, -EPERM)); m->put(); return; } Command *c = new Command(m->cmd, m->get_tid(), m->get_data(), con.get()); command_wq.queue(c); m->put(); } struct OSDCommand { string cmdstring; string helpstring; string module; string perm; } osd_commands[] = { #define COMMAND(parsesig, helptext, module, perm) \ {parsesig, helptext, module, perm}, // yes, these are really pg commands, but there's a limit to how // much work it's worth. The OSD returns all of them. Make this // form (pg ) valid only for the cli. // Rest uses "tell " COMMAND("pg " \ "name=pgid,type=CephPgid " \ "name=cmd,type=CephChoices,strings=query", \ "show details of a specific pg", "osd", "r") COMMAND("pg " \ "name=pgid,type=CephPgid " \ "name=cmd,type=CephChoices,strings=mark_unfound_lost " \ "name=mulcmd,type=CephChoices,strings=revert|delete", \ "mark all unfound objects in this pg as lost, either removing or reverting to a prior version if one is available", "osd", "rw") COMMAND("pg " \ "name=pgid,type=CephPgid " \ "name=cmd,type=CephChoices,strings=list_unfound " \ "name=offset,type=CephString,req=false", "list unfound objects on this pg, perhaps starting at an offset given in JSON", "osd", "r") // new form: tell for both cli and rest COMMAND("query", "show details of a specific pg", "osd", "r") COMMAND("mark_unfound_lost " \ "name=mulcmd,type=CephChoices,strings=revert|delete", \ "mark all unfound objects in this pg as lost, either removing or reverting to a prior version if one is available", "osd", "rw") COMMAND("list_unfound " \ "name=offset,type=CephString,req=false", "list unfound objects on this pg, perhaps starting at an offset given in JSON", "osd", "r") COMMAND("perf histogram dump " "name=logger,type=CephString,req=false " "name=counter,type=CephString,req=false", "Get histogram data", "osd", "r") // tell commands. Validation of osd.n must be special-cased in client COMMAND("version", "report version of OSD", "osd", "r") COMMAND("get_command_descriptions", "list commands descriptions", "osd", "r") COMMAND("injectargs " \ "name=injected_args,type=CephString,n=N", "inject configuration arguments into running OSD", "osd", "rw") COMMAND("config set " \ "name=key,type=CephString name=value,type=CephString", "Set a configuration option at runtime (not persistent)", "osd", "rw") COMMAND("config get " \ "name=key,type=CephString", "Get a configuration option at runtime", "osd", "r") COMMAND("config unset " \ "name=key,type=CephString", "Unset a configuration option at runtime (not persistent)", "osd", "rw") COMMAND("cluster_log " \ "name=level,type=CephChoices,strings=error,warning,info,debug " \ "name=message,type=CephString,n=N", "log a message to the cluster log", "osd", "rw") COMMAND("clear_shards_repaired " \ "name=count,type=CephInt,req=false", "clear num_shards_repaired to clear health warning", "osd", "rw") COMMAND("bench " \ "name=count,type=CephInt,req=false " \ "name=size,type=CephInt,req=false " \ "name=object_size,type=CephInt,req=false " \ "name=object_num,type=CephInt,req=false ", \ "OSD benchmark: write -byte objects(with ), " \ "(default count=1G default size=4MB). Results in log.", "osd", "rw") COMMAND("flush_pg_stats", "flush pg stats", "osd", "rw") COMMAND("heap " \ "name=heapcmd,type=CephChoices,strings="\ "dump|start_profiler|stop_profiler|release|get_release_rate|set_release_rate|stats " \ "name=value,type=CephString,req=false", "show heap usage info (available only if compiled with tcmalloc)", "osd", "rw") COMMAND("debug dump_missing " \ "name=filename,type=CephFilepath", "dump missing objects to a named file", "osd", "r") COMMAND("debug kick_recovery_wq " \ "name=delay,type=CephInt,range=0", "set osd_recovery_delay_start to ", "osd", "rw") COMMAND("cpu_profiler " \ "name=arg,type=CephChoices,strings=status|flush", "run cpu profiling on daemon", "osd", "rw") COMMAND("dump_pg_recovery_stats", "dump pg recovery statistics", "osd", "r") COMMAND("reset_pg_recovery_stats", "reset pg recovery statistics", "osd", "rw") COMMAND("compact", "compact object store's omap. " "WARNING: Compaction probably slows your requests", "osd", "rw") COMMAND("smart name=devid,type=CephString,req=False", "runs smartctl on this osd devices. ", "osd", "rw") COMMAND("cache drop", "Drop all OSD caches", "osd", "rwx") COMMAND("cache status", "Get OSD caches statistics", "osd", "r") COMMAND("send_beacon", "Send OSD beacon to mon immediately", "osd", "r") }; void OSD::do_command( Connection *con, ceph_tid_t tid, vector& cmd, bufferlist& data) { dout(20) << "do_command tid " << tid << " " << cmd << dendl; int r = 0; stringstream ss, ds; bufferlist odata; cmdmap_t cmdmap; if (cmd.empty()) { ss << "no command given"; goto out; } if (!cmdmap_from_json(cmd, &cmdmap, ss)) { r = -EINVAL; goto out; } try { r = _do_command(con, cmdmap, tid, data, odata, ss, ds); } catch (const bad_cmd_get& e) { r = -EINVAL; ss << e.what(); } if (r == -EAGAIN) { return; } out: string rs = ss.str(); odata.append(ds); dout(0) << "do_command r=" << r << " " << rs << dendl; clog->info() << rs; if (con) { MCommandReply *reply = new MCommandReply(r, rs); reply->set_tid(tid); reply->set_data(odata); con->send_message(reply); } } namespace { class unlock_guard { Mutex& m; public: explicit unlock_guard(Mutex& mutex) : m(mutex) { m.unlock(); } unlock_guard(unlock_guard&) = delete; ~unlock_guard() { m.lock(); } }; } int OSD::_do_command( Connection *con, cmdmap_t& cmdmap, ceph_tid_t tid, bufferlist& data, bufferlist& odata, stringstream& ss, stringstream& ds) { int r = 0; string prefix; string format; string pgidstr; boost::scoped_ptr f; cmd_getval(cct, cmdmap, "prefix", prefix); if (prefix == "get_command_descriptions") { int cmdnum = 0; JSONFormatter *f = new JSONFormatter(); f->open_object_section("command_descriptions"); for (OSDCommand *cp = osd_commands; cp < &osd_commands[std::size(osd_commands)]; cp++) { ostringstream secname; secname << "cmd" << setfill('0') << std::setw(3) << cmdnum; dump_cmddesc_to_json(f, con->get_features(), secname.str(), cp->cmdstring, cp->helpstring, cp->module, cp->perm, 0); cmdnum++; } f->close_section(); // command_descriptions f->flush(ds); delete f; goto out; } cmd_getval(cct, cmdmap, "format", format); f.reset(Formatter::create(format)); if (prefix == "version") { if (f) { f->open_object_section("version"); f->dump_string("version", pretty_version_to_str()); f->close_section(); f->flush(ds); } else { ds << pretty_version_to_str(); } goto out; } else if (prefix == "injectargs") { vector argsvec; cmd_getval(cct, cmdmap, "injected_args", argsvec); if (argsvec.empty()) { r = -EINVAL; ss << "ignoring empty injectargs"; goto out; } string args = argsvec.front(); for (vector::iterator a = ++argsvec.begin(); a != argsvec.end(); ++a) args += " " + *a; unlock_guard unlock{osd_lock}; r = cct->_conf.injectargs(args, &ss); } else if (prefix == "config set") { std::string key; std::string val; cmd_getval(cct, cmdmap, "key", key); cmd_getval(cct, cmdmap, "value", val); unlock_guard unlock{osd_lock}; r = cct->_conf.set_val(key, val, &ss); if (r == 0) { cct->_conf.apply_changes(nullptr); } } else if (prefix == "config get") { std::string key; cmd_getval(cct, cmdmap, "key", key); unlock_guard unlock{osd_lock}; std::string val; r = cct->_conf.get_val(key, &val); if (r == 0) { ds << val; } } else if (prefix == "config unset") { std::string key; cmd_getval(cct, cmdmap, "key", key); unlock_guard unlock{osd_lock}; r = cct->_conf.rm_val(key); if (r == 0) { cct->_conf.apply_changes(nullptr); } if (r == -ENOENT) { r = 0; // make command idempotent } } else if (prefix == "cluster_log") { vector msg; cmd_getval(cct, cmdmap, "message", msg); if (msg.empty()) { r = -EINVAL; ss << "ignoring empty log message"; goto out; } string message = msg.front(); for (vector::iterator a = ++msg.begin(); a != msg.end(); ++a) message += " " + *a; string lvl; cmd_getval(cct, cmdmap, "level", lvl); clog_type level = string_to_clog_type(lvl); if (level < 0) { r = -EINVAL; ss << "unknown level '" << lvl << "'"; goto out; } clog->do_log(level, message); } else if (prefix == "clear_shards_repaired") { int64_t count; cmd_getval(cct, cmdmap, "count", count, (int64_t) 0); service.set_osd_stat_repaired(count); } // either 'pg ' or // 'tell ' (which comes in without any of that prefix)? else if (prefix == "pg" || prefix == "query" || prefix == "mark_unfound_lost" || prefix == "list_unfound" ) { pg_t pgid; if (!cmd_getval(cct, cmdmap, "pgid", pgidstr)) { ss << "no pgid specified"; r = -EINVAL; } else if (!pgid.parse(pgidstr.c_str())) { ss << "couldn't parse pgid '" << pgidstr << "'"; r = -EINVAL; } else { spg_t pcand; PGRef pg; if (get_osdmap()->get_primary_shard(pgid, &pcand) && (pg = _lookup_lock_pg(pcand))) { if (pg->is_primary()) { // simulate pg cmd= for pg->do-command if (prefix != "pg") cmd_putval(cct, cmdmap, "cmd", prefix); try { r = pg->do_command(cmdmap, ss, data, odata, con, tid); } catch (const bad_cmd_get& e) { pg->unlock(); ss << e.what(); return -EINVAL; } if (r == -EAGAIN) { pg->unlock(); // don't reply, pg will do so async return -EAGAIN; } } else { ss << "not primary for pgid " << pgid; // send them the latest diff to ensure they realize the mapping // has changed. service.send_incremental_map(get_osdmap_epoch() - 1, con, get_osdmap()); // do not reply; they will get newer maps and realize they // need to resend. pg->unlock(); return -EAGAIN; } pg->unlock(); } else { ss << "i don't have pgid " << pgid; r = -ENOENT; } } } else if (prefix == "bench") { int64_t count; int64_t bsize; int64_t osize, onum; // default count 1G, size 4MB cmd_getval(cct, cmdmap, "count", count, (int64_t)1 << 30); cmd_getval(cct, cmdmap, "size", bsize, (int64_t)4 << 20); cmd_getval(cct, cmdmap, "object_size", osize, (int64_t)0); cmd_getval(cct, cmdmap, "object_num", onum, (int64_t)0); uint32_t duration = cct->_conf->osd_bench_duration; if (bsize > (int64_t) cct->_conf->osd_bench_max_block_size) { // let us limit the block size because the next checks rely on it // having a sane value. If we allow any block size to be set things // can still go sideways. ss << "block 'size' values are capped at " << byte_u_t(cct->_conf->osd_bench_max_block_size) << ". If you wish to use" << " a higher value, please adjust 'osd_bench_max_block_size'"; r = -EINVAL; goto out; } else if (bsize < (int64_t) (1 << 20)) { // entering the realm of small block sizes. // limit the count to a sane value, assuming a configurable amount of // IOPS and duration, so that the OSD doesn't get hung up on this, // preventing timeouts from going off int64_t max_count = bsize * duration * cct->_conf->osd_bench_small_size_max_iops; if (count > max_count) { ss << "'count' values greater than " << max_count << " for a block size of " << byte_u_t(bsize) << ", assuming " << cct->_conf->osd_bench_small_size_max_iops << " IOPS," << " for " << duration << " seconds," << " can cause ill effects on osd. " << " Please adjust 'osd_bench_small_size_max_iops' with a higher" << " value if you wish to use a higher 'count'."; r = -EINVAL; goto out; } } else { // 1MB block sizes are big enough so that we get more stuff done. // However, to avoid the osd from getting hung on this and having // timers being triggered, we are going to limit the count assuming // a configurable throughput and duration. // NOTE: max_count is the total amount of bytes that we believe we // will be able to write during 'duration' for the given // throughput. The block size hardly impacts this unless it's // way too big. Given we already check how big the block size // is, it's safe to assume everything will check out. int64_t max_count = cct->_conf->osd_bench_large_size_max_throughput * duration; if (count > max_count) { ss << "'count' values greater than " << max_count << " for a block size of " << byte_u_t(bsize) << ", assuming " << byte_u_t(cct->_conf->osd_bench_large_size_max_throughput) << "/s," << " for " << duration << " seconds," << " can cause ill effects on osd. " << " Please adjust 'osd_bench_large_size_max_throughput'" << " with a higher value if you wish to use a higher 'count'."; r = -EINVAL; goto out; } } if (osize && bsize > osize) bsize = osize; dout(1) << " bench count " << count << " bsize " << byte_u_t(bsize) << dendl; ObjectStore::Transaction cleanupt; if (osize && onum) { bufferlist bl; bufferptr bp(osize); bp.zero(); bl.push_back(std::move(bp)); bl.rebuild_page_aligned(); for (int i=0; iqueue_transaction(service.meta_ch, std::move(t), NULL); cleanupt.remove(coll_t(), ghobject_t(soid)); } } bufferlist bl; bufferptr bp(bsize); bp.zero(); bl.push_back(std::move(bp)); bl.rebuild_page_aligned(); { C_SaferCond waiter; if (!service.meta_ch->flush_commit(&waiter)) { waiter.wait(); } } utime_t start = ceph_clock_now(); for (int64_t pos = 0; pos < count; pos += bsize) { char nm[30]; unsigned offset = 0; if (onum && osize) { snprintf(nm, sizeof(nm), "disk_bw_test_%d", (int)(rand() % onum)); offset = rand() % (osize / bsize) * bsize; } else { snprintf(nm, sizeof(nm), "disk_bw_test_%lld", (long long)pos); } object_t oid(nm); hobject_t soid(sobject_t(oid, 0)); ObjectStore::Transaction t; t.write(coll_t::meta(), ghobject_t(soid), offset, bsize, bl); store->queue_transaction(service.meta_ch, std::move(t), NULL); if (!onum || !osize) cleanupt.remove(coll_t::meta(), ghobject_t(soid)); } { C_SaferCond waiter; if (!service.meta_ch->flush_commit(&waiter)) { waiter.wait(); } } utime_t end = ceph_clock_now(); // clean up store->queue_transaction(service.meta_ch, std::move(cleanupt), NULL); { C_SaferCond waiter; if (!service.meta_ch->flush_commit(&waiter)) { waiter.wait(); } } double elapsed = end - start; double rate = count / elapsed; double iops = rate / bsize; if (f) { f->open_object_section("osd_bench_results"); f->dump_int("bytes_written", count); f->dump_int("blocksize", bsize); f->dump_float("elapsed_sec", elapsed); f->dump_float("bytes_per_sec", rate); f->dump_float("iops", iops); f->close_section(); f->flush(ds); } else { ds << "bench: wrote " << byte_u_t(count) << " in blocks of " << byte_u_t(bsize) << " in " << elapsed << " sec at " << byte_u_t(rate) << "/sec " << si_u_t(iops) << " IOPS"; } } else if (prefix == "flush_pg_stats") { mgrc.send_pgstats(); ds << service.get_osd_stat_seq() << "\n"; } else if (prefix == "heap") { r = ceph::osd_cmds::heap(*cct, cmdmap, *f, ds); } else if (prefix == "debug dump_missing") { if (!f) { f.reset(new JSONFormatter(true)); } f->open_array_section("pgs"); vector pgs; _get_pgs(&pgs); for (auto& pg : pgs) { string s = stringify(pg->pg_id); f->open_array_section(s.c_str()); pg->lock(); pg->dump_missing(f.get()); pg->unlock(); f->close_section(); } f->close_section(); f->flush(ds); } else if (prefix == "debug kick_recovery_wq") { int64_t delay; cmd_getval(cct, cmdmap, "delay", delay); ostringstream oss; oss << delay; unlock_guard unlock{osd_lock}; r = cct->_conf.set_val("osd_recovery_delay_start", oss.str().c_str()); if (r != 0) { ss << "kick_recovery_wq: error setting " << "osd_recovery_delay_start to '" << delay << "': error " << r; goto out; } cct->_conf.apply_changes(nullptr); ss << "kicking recovery queue. set osd_recovery_delay_start " << "to " << cct->_conf->osd_recovery_delay_start; } else if (prefix == "cpu_profiler") { string arg; cmd_getval(cct, cmdmap, "arg", arg); vector argvec; get_str_vec(arg, argvec); cpu_profiler_handle_command(argvec, ds); } else if (prefix == "dump_pg_recovery_stats") { stringstream s; if (f) { pg_recovery_stats.dump_formatted(f.get()); f->flush(ds); } else { pg_recovery_stats.dump(s); ds << "dump pg recovery stats: " << s.str(); } } else if (prefix == "reset_pg_recovery_stats") { ss << "reset pg recovery stats"; pg_recovery_stats.reset(); } else if (prefix == "perf histogram dump") { std::string logger; std::string counter; cmd_getval(cct, cmdmap, "logger", logger); cmd_getval(cct, cmdmap, "counter", counter); if (f) { cct->get_perfcounters_collection()->dump_formatted_histograms( f.get(), false, logger, counter); f->flush(ds); } } else if (prefix == "compact") { dout(1) << "triggering manual compaction" << dendl; auto start = ceph::coarse_mono_clock::now(); store->compact(); auto end = ceph::coarse_mono_clock::now(); double duration = std::chrono::duration(end-start).count(); dout(1) << "finished manual compaction in " << duration << " seconds" << dendl; ss << "compacted omap in " << duration << " seconds"; } else if (prefix == "smart") { string devid; cmd_getval(cct, cmdmap, "devid", devid); probe_smart(devid, ds); } else if (prefix == "cache drop") { dout(20) << "clearing all caches" << dendl; // Clear the objectstore's cache - onode and buffer for Bluestore, // system's pagecache for Filestore r = store->flush_cache(&ss); if (r < 0) { ds << "Error flushing objectstore cache: " << cpp_strerror(r); goto out; } // Clear the objectcontext cache (per PG) vector pgs; _get_pgs(&pgs); for (auto& pg: pgs) { pg->clear_cache(); } } else if (prefix == "cache status") { int obj_ctx_count = 0; vector pgs; _get_pgs(&pgs); for (auto& pg: pgs) { obj_ctx_count += pg->get_cache_obj_count(); } if (f) { f->open_object_section("cache_status"); f->dump_int("object_ctx", obj_ctx_count); store->dump_cache_stats(f.get()); f->close_section(); f->flush(ds); } else { ds << "object_ctx: " << obj_ctx_count; store->dump_cache_stats(ds); } } else if (prefix == "send_beacon") { if (is_active()) { send_beacon(ceph::coarse_mono_clock::now()); } } else { ss << "unrecognized command '" << prefix << "'"; r = -EINVAL; } out: return r; } void OSD::probe_smart(const string& only_devid, ostream& ss) { set devnames; store->get_devices(&devnames); uint64_t smart_timeout = cct->_conf.get_val( "osd_smart_report_timeout"); // == typedef std::map mObject; json_spirit::mObject json_map; for (auto dev : devnames) { // smartctl works only on physical devices; filter out any logical device if (dev.find("dm-") == 0) { continue; } string err; string devid = get_device_id(dev, &err); if (devid.size() == 0) { dout(10) << __func__ << " no unique id for dev " << dev << " (" << err << "), skipping" << dendl; continue; } if (only_devid.size() && devid != only_devid) { continue; } json_spirit::mValue smart_json; if (block_device_get_metrics(dev, smart_timeout, &smart_json)) { dout(10) << "block_device_get_metrics failed for /dev/" << dev << dendl; continue; } json_map[devid] = smart_json; } json_spirit::write(json_map, ss, json_spirit::pretty_print); } bool OSD::heartbeat_dispatch(Message *m) { dout(30) << "heartbeat_dispatch " << m << dendl; switch (m->get_type()) { case CEPH_MSG_PING: dout(10) << "ping from " << m->get_source_inst() << dendl; m->put(); break; case MSG_OSD_PING: handle_osd_ping(static_cast(m)); break; default: dout(0) << "dropping unexpected message " << *m << " from " << m->get_source_inst() << dendl; m->put(); } return true; } bool OSD::ms_dispatch(Message *m) { dout(20) << "OSD::ms_dispatch: " << *m << dendl; if (m->get_type() == MSG_OSD_MARK_ME_DOWN) { service.got_stop_ack(); m->put(); return true; } // lock! osd_lock.Lock(); if (is_stopping()) { osd_lock.Unlock(); m->put(); return true; } do_waiters(); _dispatch(m); osd_lock.Unlock(); return true; } void OSD::maybe_share_map( Session *session, OpRequestRef op, OSDMapRef osdmap) { if (!op->check_send_map) { return; } epoch_t last_sent_epoch = 0; session->sent_epoch_lock.lock(); last_sent_epoch = session->last_sent_epoch; session->sent_epoch_lock.unlock(); // assume the peer has the newer of the op's sent_epoch and what // we think we sent them. epoch_t from = std::max(last_sent_epoch, op->sent_epoch); const Message *m = op->get_req(); service.share_map( m->get_source(), m->get_connection().get(), from, osdmap, session ? &last_sent_epoch : NULL); session->sent_epoch_lock.lock(); if (session->last_sent_epoch < last_sent_epoch) { session->last_sent_epoch = last_sent_epoch; } session->sent_epoch_lock.unlock(); op->check_send_map = false; } void OSD::dispatch_session_waiting(SessionRef session, OSDMapRef osdmap) { ceph_assert(session->session_dispatch_lock.is_locked()); auto i = session->waiting_on_map.begin(); while (i != session->waiting_on_map.end()) { OpRequestRef op = &(*i); ceph_assert(ms_can_fast_dispatch(op->get_req())); const MOSDFastDispatchOp *m = static_cast( op->get_req()); if (m->get_min_epoch() > osdmap->get_epoch()) { break; } session->waiting_on_map.erase(i++); op->put(); spg_t pgid; if (m->get_type() == CEPH_MSG_OSD_OP) { pg_t actual_pgid = osdmap->raw_pg_to_pg( static_cast(m)->get_pg()); if (!osdmap->get_primary_shard(actual_pgid, &pgid)) { continue; } } else { pgid = m->get_spg(); } enqueue_op(pgid, std::move(op), m->get_map_epoch()); } if (session->waiting_on_map.empty()) { clear_session_waiting_on_map(session); } else { register_session_waiting_on_map(session); } } void OSD::ms_fast_dispatch(Message *m) { FUNCTRACE(cct); if (service.is_stopping()) { m->put(); return; } // peering event? switch (m->get_type()) { case CEPH_MSG_PING: dout(10) << "ping from " << m->get_source() << dendl; m->put(); return; case MSG_MON_COMMAND: handle_command(static_cast(m)); return; case MSG_OSD_FORCE_RECOVERY: handle_fast_force_recovery(static_cast(m)); return; case MSG_OSD_SCRUB2: handle_fast_scrub(static_cast(m)); return; case MSG_OSD_PG_CREATE2: return handle_fast_pg_create(static_cast(m)); case MSG_OSD_PG_QUERY: return handle_fast_pg_query(static_cast(m)); case MSG_OSD_PG_NOTIFY: return handle_fast_pg_notify(static_cast(m)); case MSG_OSD_PG_INFO: return handle_fast_pg_info(static_cast(m)); case MSG_OSD_PG_REMOVE: return handle_fast_pg_remove(static_cast(m)); // these are single-pg messages that handle themselves case MSG_OSD_PG_LOG: case MSG_OSD_PG_TRIM: case MSG_OSD_BACKFILL_RESERVE: case MSG_OSD_RECOVERY_RESERVE: { MOSDPeeringOp *pm = static_cast(m); if (require_osd_peer(pm)) { enqueue_peering_evt( pm->get_spg(), PGPeeringEventRef(pm->get_event())); } pm->put(); return; } } OpRequestRef op = op_tracker.create_request(m); { #ifdef WITH_LTTNG osd_reqid_t reqid = op->get_reqid(); #endif tracepoint(osd, ms_fast_dispatch, reqid.name._type, reqid.name._num, reqid.tid, reqid.inc); } if (m->trace) op->osd_trace.init("osd op", &trace_endpoint, &m->trace); // note sender epoch, min req's epoch op->sent_epoch = static_cast(m)->get_map_epoch(); op->min_epoch = static_cast(m)->get_min_epoch(); ceph_assert(op->min_epoch <= op->sent_epoch); // sanity check! service.maybe_inject_dispatch_delay(); if (m->get_connection()->has_features(CEPH_FEATUREMASK_RESEND_ON_SPLIT) || m->get_type() != CEPH_MSG_OSD_OP) { // queue it directly enqueue_op( static_cast(m)->get_spg(), std::move(op), static_cast(m)->get_map_epoch()); } else { // legacy client, and this is an MOSDOp (the *only* fast dispatch // message that didn't have an explicit spg_t); we need to map // them to an spg_t while preserving delivery order. auto priv = m->get_connection()->get_priv(); if (auto session = static_cast(priv.get()); session) { std::lock_guard l{session->session_dispatch_lock}; op->get(); session->waiting_on_map.push_back(*op); OSDMapRef nextmap = service.get_nextmap_reserved(); dispatch_session_waiting(session, nextmap); service.release_map(nextmap); } } OID_EVENT_TRACE_WITH_MSG(m, "MS_FAST_DISPATCH_END", false); } bool OSD::ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer) { dout(10) << "OSD::ms_get_authorizer type=" << ceph_entity_type_name(dest_type) << dendl; if (is_stopping()) { dout(10) << __func__ << " bailing, we are shutting down" << dendl; return false; } if (dest_type == CEPH_ENTITY_TYPE_MON) return true; *authorizer = monc->build_authorizer(dest_type); return *authorizer != NULL; } KeyStore *OSD::ms_get_auth1_authorizer_keystore() { return monc->rotating_secrets.get(); } int OSD::ms_handle_authentication(Connection *con) { int ret = 0; auto priv = con->get_priv(); Session *s = static_cast(priv.get()); if (!s) { s = new Session(cct, con); con->set_priv(RefCountedPtr{s, false}); s->entity_name = con->get_peer_entity_name(); dout(10) << __func__ << " new session " << s << " con " << s->con << " entity " << s->entity_name << " addr " << con->get_peer_addrs() << dendl; } else { dout(10) << __func__ << " existing session " << s << " con " << s->con << " entity " << s->entity_name << " addr " << con->get_peer_addrs() << dendl; } AuthCapsInfo &caps_info = con->get_peer_caps_info(); if (caps_info.allow_all) s->caps.set_allow_all(); if (caps_info.caps.length() > 0) { bufferlist::const_iterator p = caps_info.caps.cbegin(); string str; try { decode(str, p); } catch (buffer::error& e) { dout(10) << __func__ << " session " << s << " " << s->entity_name << " failed to decode caps string" << dendl; ret = -EPERM; } if (!ret) { bool success = s->caps.parse(str); if (success) { dout(10) << __func__ << " session " << s << " " << s->entity_name << " has caps " << s->caps << " '" << str << "'" << dendl; ret = 1; } else { dout(10) << __func__ << " session " << s << " " << s->entity_name << " failed to parse caps '" << str << "'" << dendl; ret = -EPERM; } } } return ret; } void OSD::do_waiters() { ceph_assert(osd_lock.is_locked()); dout(10) << "do_waiters -- start" << dendl; while (!finished.empty()) { OpRequestRef next = finished.front(); finished.pop_front(); dispatch_op(next); } dout(10) << "do_waiters -- finish" << dendl; } void OSD::dispatch_op(OpRequestRef op) { switch (op->get_req()->get_type()) { case MSG_OSD_PG_CREATE: handle_pg_create(op); break; } } void OSD::_dispatch(Message *m) { ceph_assert(osd_lock.is_locked()); dout(20) << "_dispatch " << m << " " << *m << dendl; switch (m->get_type()) { // -- don't need OSDMap -- // map and replication case CEPH_MSG_OSD_MAP: handle_osd_map(static_cast(m)); break; // osd case MSG_OSD_SCRUB: handle_scrub(static_cast(m)); break; case MSG_COMMAND: handle_command(static_cast(m)); return; // -- need OSDMap -- case MSG_OSD_PG_CREATE: { OpRequestRef op = op_tracker.create_request(m); if (m->trace) op->osd_trace.init("osd op", &trace_endpoint, &m->trace); // no map? starting up? if (!get_osdmap()) { dout(7) << "no OSDMap, not booted" << dendl; logger->inc(l_osd_waiting_for_map); waiting_for_osdmap.push_back(op); op->mark_delayed("no osdmap"); break; } // need OSDMap dispatch_op(op); } } } // remove me post-nautilus void OSD::handle_scrub(MOSDScrub *m) { dout(10) << "handle_scrub " << *m << dendl; if (!require_mon_or_mgr_peer(m)) { m->put(); return; } if (m->fsid != monc->get_fsid()) { dout(0) << "handle_scrub fsid " << m->fsid << " != " << monc->get_fsid() << dendl; m->put(); return; } vector spgs; _get_pgids(&spgs); if (!m->scrub_pgs.empty()) { vector v; for (auto pgid : m->scrub_pgs) { spg_t pcand; if (get_osdmap()->get_primary_shard(pgid, &pcand) && std::find(spgs.begin(), spgs.end(), pcand) != spgs.end()) { v.push_back(pcand); } } spgs.swap(v); } for (auto pgid : spgs) { enqueue_peering_evt( pgid, PGPeeringEventRef( std::make_shared( get_osdmap_epoch(), get_osdmap_epoch(), PG::RequestScrub(m->deep, m->repair)))); } m->put(); } void OSD::handle_fast_scrub(MOSDScrub2 *m) { dout(10) << __func__ << " " << *m << dendl; if (!require_mon_or_mgr_peer(m)) { m->put(); return; } if (m->fsid != monc->get_fsid()) { dout(0) << __func__ << " fsid " << m->fsid << " != " << monc->get_fsid() << dendl; m->put(); return; } for (auto pgid : m->scrub_pgs) { enqueue_peering_evt( pgid, PGPeeringEventRef( std::make_shared( m->epoch, m->epoch, PG::RequestScrub(m->deep, m->repair)))); } m->put(); } bool OSD::scrub_random_backoff() { bool coin_flip = (rand() / (double)RAND_MAX >= cct->_conf->osd_scrub_backoff_ratio); if (!coin_flip) { dout(20) << "scrub_random_backoff lost coin flip, randomly backing off" << dendl; return true; } return false; } OSDService::ScrubJob::ScrubJob(CephContext* cct, const spg_t& pg, const utime_t& timestamp, double pool_scrub_min_interval, double pool_scrub_max_interval, bool must) : cct(cct), pgid(pg), sched_time(timestamp), deadline(timestamp) { // if not explicitly requested, postpone the scrub with a random delay if (!must) { double scrub_min_interval = pool_scrub_min_interval > 0 ? pool_scrub_min_interval : cct->_conf->osd_scrub_min_interval; double scrub_max_interval = pool_scrub_max_interval > 0 ? pool_scrub_max_interval : cct->_conf->osd_scrub_max_interval; sched_time += scrub_min_interval; double r = rand() / (double)RAND_MAX; sched_time += scrub_min_interval * cct->_conf->osd_scrub_interval_randomize_ratio * r; if (scrub_max_interval == 0) { deadline = utime_t(); } else { deadline += scrub_max_interval; } } } bool OSDService::ScrubJob::ScrubJob::operator<(const OSDService::ScrubJob& rhs) const { if (sched_time < rhs.sched_time) return true; if (sched_time > rhs.sched_time) return false; return pgid < rhs.pgid; } bool OSD::scrub_time_permit(utime_t now) { struct tm bdt; time_t tt = now.sec(); localtime_r(&tt, &bdt); bool day_permit = false; if (cct->_conf->osd_scrub_begin_week_day < cct->_conf->osd_scrub_end_week_day) { if (bdt.tm_wday >= cct->_conf->osd_scrub_begin_week_day && bdt.tm_wday < cct->_conf->osd_scrub_end_week_day) { day_permit = true; } } else { if (bdt.tm_wday >= cct->_conf->osd_scrub_begin_week_day || bdt.tm_wday < cct->_conf->osd_scrub_end_week_day) { day_permit = true; } } if (!day_permit) { dout(20) << __func__ << " should run between week day " << cct->_conf->osd_scrub_begin_week_day << " - " << cct->_conf->osd_scrub_end_week_day << " now " << bdt.tm_wday << " = no" << dendl; return false; } bool time_permit = false; if (cct->_conf->osd_scrub_begin_hour < cct->_conf->osd_scrub_end_hour) { if (bdt.tm_hour >= cct->_conf->osd_scrub_begin_hour && bdt.tm_hour < cct->_conf->osd_scrub_end_hour) { time_permit = true; } } else { if (bdt.tm_hour >= cct->_conf->osd_scrub_begin_hour || bdt.tm_hour < cct->_conf->osd_scrub_end_hour) { time_permit = true; } } if (!time_permit) { dout(20) << __func__ << " should run between " << cct->_conf->osd_scrub_begin_hour << " - " << cct->_conf->osd_scrub_end_hour << " now " << bdt.tm_hour << " = no" << dendl; } else { dout(20) << __func__ << " should run between " << cct->_conf->osd_scrub_begin_hour << " - " << cct->_conf->osd_scrub_end_hour << " now " << bdt.tm_hour << " = yes" << dendl; } return time_permit; } bool OSD::scrub_load_below_threshold() { double loadavgs[3]; if (getloadavg(loadavgs, 3) != 3) { dout(10) << __func__ << " couldn't read loadavgs\n" << dendl; return false; } // allow scrub if below configured threshold long cpus = sysconf(_SC_NPROCESSORS_ONLN); double loadavg_per_cpu = cpus > 0 ? loadavgs[0] / cpus : loadavgs[0]; if (loadavg_per_cpu < cct->_conf->osd_scrub_load_threshold) { dout(20) << __func__ << " loadavg per cpu " << loadavg_per_cpu << " < max " << cct->_conf->osd_scrub_load_threshold << " = yes" << dendl; return true; } // allow scrub if below daily avg and currently decreasing if (loadavgs[0] < daily_loadavg && loadavgs[0] < loadavgs[2]) { dout(20) << __func__ << " loadavg " << loadavgs[0] << " < daily_loadavg " << daily_loadavg << " and < 15m avg " << loadavgs[2] << " = yes" << dendl; return true; } dout(20) << __func__ << " loadavg " << loadavgs[0] << " >= max " << cct->_conf->osd_scrub_load_threshold << " and ( >= daily_loadavg " << daily_loadavg << " or >= 15m avg " << loadavgs[2] << ") = no" << dendl; return false; } void OSD::sched_scrub() { // if not permitted, fail fast if (!service.can_inc_scrubs()) { return; } bool allow_requested_repair_only = false; if (service.is_recovery_active() && !cct->_conf->osd_scrub_during_recovery) { if (!cct->_conf->osd_repair_during_recovery) { dout(20) << __func__ << " not scheduling scrubs due to active recovery" << dendl; return; } dout(10) << __func__ << " will only schedule explicitly requested repair due to active recovery" << dendl; allow_requested_repair_only = true; } utime_t now = ceph_clock_now(); bool time_permit = scrub_time_permit(now); bool load_is_low = scrub_load_below_threshold(); dout(20) << "sched_scrub load_is_low=" << (int)load_is_low << dendl; OSDService::ScrubJob scrub; if (service.first_scrub_stamp(&scrub)) { do { dout(30) << "sched_scrub examine " << scrub.pgid << " at " << scrub.sched_time << dendl; if (scrub.sched_time > now) { // save ourselves some effort dout(10) << "sched_scrub " << scrub.pgid << " scheduled at " << scrub.sched_time << " > " << now << dendl; break; } if ((scrub.deadline.is_zero() || scrub.deadline >= now) && !(time_permit && load_is_low)) { dout(10) << __func__ << " not scheduling scrub for " << scrub.pgid << " due to " << (!time_permit ? "time not permit" : "high load") << dendl; continue; } PGRef pg = _lookup_lock_pg(scrub.pgid); if (!pg) continue; // This has already started, so go on to the next scrub job if (pg->scrubber.active) { pg->unlock(); dout(30) << __func__ << ": already in progress pgid " << scrub.pgid << dendl; continue; } // Skip other kinds of scrubing if only explicitly requested repairing is allowed if (allow_requested_repair_only && !pg->scrubber.must_repair) { pg->unlock(); dout(10) << __func__ << " skip " << scrub.pgid << " because repairing is not explicitly requested on it" << dendl; continue; } // If it is reserving, let it resolve before going to the next scrub job if (pg->scrubber.local_reserved && !pg->scrubber.active) { pg->unlock(); dout(30) << __func__ << ": reserve in progress pgid " << scrub.pgid << dendl; break; } dout(10) << "sched_scrub scrubbing " << scrub.pgid << " at " << scrub.sched_time << (pg->get_must_scrub() ? ", explicitly requested" : (load_is_low ? ", load_is_low" : " deadline < now")) << dendl; if (pg->sched_scrub()) { pg->unlock(); break; } pg->unlock(); } while (service.next_scrub_stamp(scrub, &scrub)); } dout(20) << "sched_scrub done" << dendl; } void OSD::resched_all_scrubs() { dout(10) << __func__ << ": start" << dendl; OSDService::ScrubJob scrub; if (service.first_scrub_stamp(&scrub)) { do { dout(20) << __func__ << ": examine " << scrub.pgid << dendl; PGRef pg = _lookup_lock_pg(scrub.pgid); if (!pg) continue; if (!pg->scrubber.must_scrub && !pg->scrubber.need_auto) { dout(20) << __func__ << ": reschedule " << scrub.pgid << dendl; pg->on_info_history_change(); } pg->unlock(); } while (service.next_scrub_stamp(scrub, &scrub)); } dout(10) << __func__ << ": done" << dendl; } MPGStats* OSD::collect_pg_stats() { // This implementation unconditionally sends every is_primary PG's // stats every time we're called. This has equivalent cost to the // previous implementation's worst case where all PGs are busy and // their stats are always enqueued for sending. RWLock::RLocker l(map_lock); utime_t had_for = ceph_clock_now() - had_map_since; osd_stat_t cur_stat = service.get_osd_stat(); cur_stat.os_perf_stat = store->get_cur_stats(); auto m = new MPGStats(monc->get_fsid(), get_osdmap_epoch(), had_for); m->osd_stat = cur_stat; std::lock_guard lec{min_last_epoch_clean_lock}; min_last_epoch_clean = get_osdmap_epoch(); min_last_epoch_clean_pgs.clear(); std::set pool_set; vector pgs; _get_pgs(&pgs); for (auto& pg : pgs) { auto pool = pg->pg_id.pgid.pool(); pool_set.emplace((int64_t)pool); if (!pg->is_primary()) { continue; } pg->get_pg_stats([&](const pg_stat_t& s, epoch_t lec) { m->pg_stat[pg->pg_id.pgid] = s; min_last_epoch_clean = min(min_last_epoch_clean, lec); min_last_epoch_clean_pgs.push_back(pg->pg_id.pgid); }); } store_statfs_t st; bool per_pool_stats = false; for (auto p : pool_set) { int r = store->pool_statfs(p, &st); if (r == -ENOTSUP) { break; } else { assert(r >= 0); m->pool_stat[p] = st; per_pool_stats = true; } } // indicate whether we are reporting per-pool stats m->osd_stat.num_osds = 1; m->osd_stat.num_per_pool_osds = per_pool_stats ? 1 : 0; return m; } vector OSD::get_health_metrics() { vector metrics; { utime_t oldest_secs; const utime_t now = ceph_clock_now(); auto too_old = now; too_old -= cct->_conf.get_val("osd_op_complaint_time"); int slow = 0; TrackedOpRef oldest_op; auto count_slow_ops = [&](TrackedOp& op) { if (op.get_initiated() < too_old) { stringstream ss; ss << "slow request " << op.get_desc() << " initiated " << op.get_initiated() << " currently " << op.state_string(); lgeneric_subdout(cct,osd,20) << ss.str() << dendl; clog->warn() << ss.str(); slow++; if (!oldest_op || op.get_initiated() < oldest_op->get_initiated()) { oldest_op = &op; } return true; } else { return false; } }; if (op_tracker.visit_ops_in_flight(&oldest_secs, count_slow_ops)) { if (slow) { derr << __func__ << " reporting " << slow << " slow ops, oldest is " << oldest_op->get_desc() << dendl; } metrics.emplace_back(daemon_metric::SLOW_OPS, slow, oldest_secs); } else { // no news is not good news. metrics.emplace_back(daemon_metric::SLOW_OPS, 0, 0); } } { std::lock_guard l(pending_creates_lock); auto n_primaries = pending_creates_from_mon; for (const auto& create : pending_creates_from_osd) { if (create.second) { n_primaries++; } } metrics.emplace_back(daemon_metric::PENDING_CREATING_PGS, n_primaries); } return metrics; } // ===================================================== // MAP void OSD::wait_for_new_map(OpRequestRef op) { // ask? if (waiting_for_osdmap.empty()) { osdmap_subscribe(get_osdmap_epoch() + 1, false); } logger->inc(l_osd_waiting_for_map); waiting_for_osdmap.push_back(op); op->mark_delayed("wait for new map"); } /** update_map * assimilate new OSDMap(s). scan pgs, etc. */ void OSD::note_down_osd(int peer) { ceph_assert(osd_lock.is_locked()); cluster_messenger->mark_down_addrs(get_osdmap()->get_cluster_addrs(peer)); heartbeat_lock.Lock(); failure_queue.erase(peer); failure_pending.erase(peer); map::iterator p = heartbeat_peers.find(peer); if (p != heartbeat_peers.end()) { p->second.con_back->mark_down(); if (p->second.con_front) { p->second.con_front->mark_down(); } heartbeat_peers.erase(p); } heartbeat_lock.Unlock(); } void OSD::note_up_osd(int peer) { service.forget_peer_epoch(peer, get_osdmap_epoch() - 1); heartbeat_set_peers_need_update(); } struct C_OnMapCommit : public Context { OSD *osd; epoch_t first, last; MOSDMap *msg; C_OnMapCommit(OSD *o, epoch_t f, epoch_t l, MOSDMap *m) : osd(o), first(f), last(l), msg(m) {} void finish(int r) override { osd->_committed_osd_maps(first, last, msg); msg->put(); } }; void OSD::osdmap_subscribe(version_t epoch, bool force_request) { std::lock_guard l(osdmap_subscribe_lock); if (latest_subscribed_epoch >= epoch && !force_request) return; latest_subscribed_epoch = std::max(epoch, latest_subscribed_epoch); if (monc->sub_want_increment("osdmap", epoch, CEPH_SUBSCRIBE_ONETIME) || force_request) { monc->renew_subs(); } } void OSD::trim_maps(epoch_t oldest, int nreceived, bool skip_maps) { epoch_t min = std::min(oldest, service.map_cache.cached_key_lower_bound()); if (min <= superblock.oldest_map) return; int num = 0; ObjectStore::Transaction t; for (epoch_t e = superblock.oldest_map; e < min; ++e) { dout(20) << " removing old osdmap epoch " << e << dendl; t.remove(coll_t::meta(), get_osdmap_pobject_name(e)); t.remove(coll_t::meta(), get_inc_osdmap_pobject_name(e)); superblock.oldest_map = e + 1; num++; if (num >= cct->_conf->osd_target_transaction_size && num >= nreceived) { service.publish_superblock(superblock); write_superblock(t); int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr); ceph_assert(tr == 0); num = 0; if (!skip_maps) { // skip_maps leaves us with a range of old maps if we fail to remove all // of them before moving superblock.oldest_map forward to the first map // in the incoming MOSDMap msg. so we should continue removing them in // this case, even we could do huge series of delete transactions all at // once. break; } } } if (num > 0) { service.publish_superblock(superblock); write_superblock(t); int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr); ceph_assert(tr == 0); } // we should not remove the cached maps ceph_assert(min <= service.map_cache.cached_key_lower_bound()); } void OSD::handle_osd_map(MOSDMap *m) { // wait for pgs to catch up { // we extend the map cache pins to accomodate pgs slow to consume maps // for some period, until we hit the max_lag_factor bound, at which point // we block here to stop injesting more maps than they are able to keep // up with. epoch_t max_lag = cct->_conf->osd_map_cache_size * m_osd_pg_epoch_max_lag_factor; ceph_assert(max_lag > 0); epoch_t osd_min = 0; for (auto shard : shards) { epoch_t min = shard->get_min_pg_epoch(); if (osd_min == 0 || min < osd_min) { osd_min = min; } } epoch_t osdmap_epoch = get_osdmap_epoch(); if (osd_min > 0 && osdmap_epoch > max_lag && osdmap_epoch - max_lag > osd_min) { epoch_t need = osdmap_epoch - max_lag; dout(10) << __func__ << " waiting for pgs to catch up (need " << need << " max_lag " << max_lag << ")" << dendl; for (auto shard : shards) { epoch_t min = shard->get_min_pg_epoch(); if (need > min) { dout(10) << __func__ << " waiting for pgs to consume " << need << " (shard " << shard->shard_id << " min " << min << ", map cache is " << cct->_conf->osd_map_cache_size << ", max_lag_factor " << m_osd_pg_epoch_max_lag_factor << ")" << dendl; unlock_guard unlock{osd_lock}; shard->wait_min_pg_epoch(need); } } } } ceph_assert(osd_lock.is_locked()); map added_maps; map added_maps_bl; if (m->fsid != monc->get_fsid()) { dout(0) << "handle_osd_map fsid " << m->fsid << " != " << monc->get_fsid() << dendl; m->put(); return; } if (is_initializing()) { dout(0) << "ignoring osdmap until we have initialized" << dendl; m->put(); return; } auto priv = m->get_connection()->get_priv(); if (auto session = static_cast(priv.get()); session && !(session->entity_name.is_mon() || session->entity_name.is_osd())) { //not enough perms! dout(10) << "got osd map from Session " << session << " which we can't take maps from (not a mon or osd)" << dendl; m->put(); return; } // share with the objecter if (!is_preboot()) service.objecter->handle_osd_map(m); epoch_t first = m->get_first(); epoch_t last = m->get_last(); dout(3) << "handle_osd_map epochs [" << first << "," << last << "], i have " << superblock.newest_map << ", src has [" << m->oldest_map << "," << m->newest_map << "]" << dendl; logger->inc(l_osd_map); logger->inc(l_osd_mape, last - first + 1); if (first <= superblock.newest_map) logger->inc(l_osd_mape_dup, superblock.newest_map - first + 1); if (service.max_oldest_map < m->oldest_map) { service.max_oldest_map = m->oldest_map; ceph_assert(service.max_oldest_map >= superblock.oldest_map); } // make sure there is something new, here, before we bother flushing // the queues and such if (last <= superblock.newest_map) { dout(10) << " no new maps here, dropping" << dendl; m->put(); return; } // missing some? bool skip_maps = false; if (first > superblock.newest_map + 1) { dout(10) << "handle_osd_map message skips epochs " << superblock.newest_map + 1 << ".." << (first-1) << dendl; if (m->oldest_map <= superblock.newest_map + 1) { osdmap_subscribe(superblock.newest_map + 1, false); m->put(); return; } // always try to get the full range of maps--as many as we can. this // 1- is good to have // 2- is at present the only way to ensure that we get a *full* map as // the first map! if (m->oldest_map < first) { osdmap_subscribe(m->oldest_map - 1, true); m->put(); return; } skip_maps = true; } ObjectStore::Transaction t; uint64_t txn_size = 0; // store new maps: queue for disk and put in the osdmap cache epoch_t start = std::max(superblock.newest_map + 1, first); for (epoch_t e = start; e <= last; e++) { if (txn_size >= t.get_num_bytes()) { derr << __func__ << " transaction size overflowed" << dendl; ceph_assert(txn_size < t.get_num_bytes()); } txn_size = t.get_num_bytes(); map::iterator p; p = m->maps.find(e); if (p != m->maps.end()) { dout(10) << "handle_osd_map got full map for epoch " << e << dendl; OSDMap *o = new OSDMap; bufferlist& bl = p->second; o->decode(bl); ghobject_t fulloid = get_osdmap_pobject_name(e); t.write(coll_t::meta(), fulloid, 0, bl.length(), bl); added_maps[e] = add_map(o); added_maps_bl[e] = bl; got_full_map(e); continue; } p = m->incremental_maps.find(e); if (p != m->incremental_maps.end()) { dout(10) << "handle_osd_map got inc map for epoch " << e << dendl; bufferlist& bl = p->second; ghobject_t oid = get_inc_osdmap_pobject_name(e); t.write(coll_t::meta(), oid, 0, bl.length(), bl); OSDMap *o = new OSDMap; if (e > 1) { bufferlist obl; bool got = get_map_bl(e - 1, obl); if (!got) { auto p = added_maps_bl.find(e - 1); ceph_assert(p != added_maps_bl.end()); obl = p->second; } o->decode(obl); } OSDMap::Incremental inc; auto p = bl.cbegin(); inc.decode(p); if (o->apply_incremental(inc) < 0) { derr << "ERROR: bad fsid? i have " << get_osdmap()->get_fsid() << " and inc has " << inc.fsid << dendl; ceph_abort_msg("bad fsid"); } bufferlist fbl; o->encode(fbl, inc.encode_features | CEPH_FEATURE_RESERVED); bool injected_failure = false; if (cct->_conf->osd_inject_bad_map_crc_probability > 0 && (rand() % 10000) < cct->_conf->osd_inject_bad_map_crc_probability*10000.0) { derr << __func__ << " injecting map crc failure" << dendl; injected_failure = true; } if ((inc.have_crc && o->get_crc() != inc.full_crc) || injected_failure) { dout(2) << "got incremental " << e << " but failed to encode full with correct crc; requesting" << dendl; clog->warn() << "failed to encode map e" << e << " with expected crc"; dout(20) << "my encoded map was:\n"; fbl.hexdump(*_dout); *_dout << dendl; delete o; request_full_map(e, last); last = e - 1; // don't continue committing if we failed to enc the first inc map if (last < start) { dout(10) << __func__ << " bailing because last < start (" << last << "<" << start << ")" << dendl; m->put(); return; } break; } got_full_map(e); ghobject_t fulloid = get_osdmap_pobject_name(e); t.write(coll_t::meta(), fulloid, 0, fbl.length(), fbl); added_maps[e] = add_map(o); added_maps_bl[e] = fbl; continue; } ceph_abort_msg("MOSDMap lied about what maps it had?"); } // even if this map isn't from a mon, we may have satisfied our subscription monc->sub_got("osdmap", last); if (!m->maps.empty() && requested_full_first) { dout(10) << __func__ << " still missing full maps " << requested_full_first << ".." << requested_full_last << dendl; rerequest_full_maps(); } if (superblock.oldest_map) { // make sure we at least keep pace with incoming maps trim_maps(m->oldest_map, last - first + 1, skip_maps); pg_num_history.prune(superblock.oldest_map); } if (!superblock.oldest_map || skip_maps) superblock.oldest_map = first; superblock.newest_map = last; superblock.current_epoch = last; // note in the superblock that we were clean thru the prior epoch epoch_t boot_epoch = service.get_boot_epoch(); if (boot_epoch && boot_epoch >= superblock.mounted) { superblock.mounted = boot_epoch; superblock.clean_thru = last; } // check for pg_num changes and deleted pools OSDMapRef lastmap; for (auto& i : added_maps) { if (!lastmap) { if (!(lastmap = service.try_get_map(i.first - 1))) { dout(10) << __func__ << " can't get previous map " << i.first - 1 << " probably first start of this osd" << dendl; continue; } } ceph_assert(lastmap->get_epoch() + 1 == i.second->get_epoch()); for (auto& j : lastmap->get_pools()) { if (!i.second->have_pg_pool(j.first)) { pg_num_history.log_pool_delete(i.first, j.first); dout(10) << __func__ << " recording final pg_pool_t for pool " << j.first << dendl; // this information is needed by _make_pg() if have to restart before // the pool is deleted and need to instantiate a new (zombie) PG[Pool]. ghobject_t obj = make_final_pool_info_oid(j.first); bufferlist bl; encode(j.second, bl, CEPH_FEATURES_ALL); string name = lastmap->get_pool_name(j.first); encode(name, bl); map profile; if (lastmap->get_pg_pool(j.first)->is_erasure()) { profile = lastmap->get_erasure_code_profile( lastmap->get_pg_pool(j.first)->erasure_code_profile); } encode(profile, bl); t.write(coll_t::meta(), obj, 0, bl.length(), bl); service.store_deleted_pool_pg_num(j.first, j.second.get_pg_num()); } else if (unsigned new_pg_num = i.second->get_pg_num(j.first); new_pg_num != j.second.get_pg_num()) { dout(10) << __func__ << " recording pool " << j.first << " pg_num " << j.second.get_pg_num() << " -> " << new_pg_num << dendl; pg_num_history.log_pg_num_change(i.first, j.first, new_pg_num); } } for (auto& j : i.second->get_pools()) { if (!lastmap->have_pg_pool(j.first)) { dout(10) << __func__ << " recording new pool " << j.first << " pg_num " << j.second.get_pg_num() << dendl; pg_num_history.log_pg_num_change(i.first, j.first, j.second.get_pg_num()); } } lastmap = i.second; } pg_num_history.epoch = last; { bufferlist bl; ::encode(pg_num_history, bl); t.write(coll_t::meta(), make_pg_num_history_oid(), 0, bl.length(), bl); dout(20) << __func__ << " pg_num_history " << pg_num_history << dendl; } // superblock and commit write_superblock(t); t.register_on_commit(new C_OnMapCommit(this, start, last, m)); store->queue_transaction( service.meta_ch, std::move(t)); service.publish_superblock(superblock); } void OSD::_committed_osd_maps(epoch_t first, epoch_t last, MOSDMap *m) { dout(10) << __func__ << " " << first << ".." << last << dendl; if (is_stopping()) { dout(10) << __func__ << " bailing, we are shutting down" << dendl; return; } std::lock_guard l(osd_lock); if (is_stopping()) { dout(10) << __func__ << " bailing, we are shutting down" << dendl; return; } map_lock.get_write(); ceph_assert(first <= last); bool do_shutdown = false; bool do_restart = false; bool network_error = false; OSDMapRef osdmap = get_osdmap(); // advance through the new maps for (epoch_t cur = first; cur <= last; cur++) { dout(10) << " advance to epoch " << cur << " (<= last " << last << " <= newest_map " << superblock.newest_map << ")" << dendl; OSDMapRef newmap = get_map(cur); ceph_assert(newmap); // we just cached it above! // start blacklisting messages sent to peers that go down. service.pre_publish_map(newmap); // kill connections to newly down osds bool waited_for_reservations = false; set old; osdmap = get_osdmap(); osdmap->get_all_osds(old); for (set::iterator p = old.begin(); p != old.end(); ++p) { if (*p != whoami && osdmap->is_up(*p) && // in old map newmap->is_down(*p)) { // but not the new one if (!waited_for_reservations) { service.await_reserved_maps(); waited_for_reservations = true; } note_down_osd(*p); } else if (*p != whoami && osdmap->is_down(*p) && newmap->is_up(*p)) { note_up_osd(*p); } } if (osdmap->is_noup(whoami) != newmap->is_noup(whoami)) { dout(10) << __func__ << " NOUP flag changed in " << newmap->get_epoch() << dendl; if (is_booting()) { // this captures the case where we sent the boot message while // NOUP was being set on the mon and our boot request was // dropped, and then later it is cleared. it imperfectly // handles the case where our original boot message was not // dropped and we restart even though we might have booted, but // that is harmless (boot will just take slightly longer). do_restart = true; } } osdmap = std::move(newmap); set_osdmap(osdmap); epoch_t up_epoch; epoch_t boot_epoch; service.retrieve_epochs(&boot_epoch, &up_epoch, NULL); if (!up_epoch && osdmap->is_up(whoami) && osdmap->get_addrs(whoami) == client_messenger->get_myaddrs()) { up_epoch = osdmap->get_epoch(); dout(10) << "up_epoch is " << up_epoch << dendl; if (!boot_epoch) { boot_epoch = osdmap->get_epoch(); dout(10) << "boot_epoch is " << boot_epoch << dendl; } service.set_epochs(&boot_epoch, &up_epoch, NULL); } } had_map_since = ceph_clock_now(); epoch_t _bind_epoch = service.get_bind_epoch(); if (osdmap->is_up(whoami) && osdmap->get_addrs(whoami).legacy_equals( client_messenger->get_myaddrs()) && _bind_epoch < osdmap->get_up_from(whoami)) { if (is_booting()) { dout(1) << "state: booting -> active" << dendl; set_state(STATE_ACTIVE); do_restart = false; // set incarnation so that osd_reqid_t's we generate for our // objecter requests are unique across restarts. service.objecter->set_client_incarnation(osdmap->get_epoch()); cancel_pending_failures(); } } if (osdmap->get_epoch() > 0 && is_active()) { if (!osdmap->exists(whoami)) { dout(0) << "map says i do not exist. shutting down." << dendl; do_shutdown = true; // don't call shutdown() while we have // everything paused } else if (!osdmap->is_up(whoami) || !osdmap->get_addrs(whoami).legacy_equals( client_messenger->get_myaddrs()) || !osdmap->get_cluster_addrs(whoami).legacy_equals( cluster_messenger->get_myaddrs()) || !osdmap->get_hb_back_addrs(whoami).legacy_equals( hb_back_server_messenger->get_myaddrs()) || !osdmap->get_hb_front_addrs(whoami).legacy_equals( hb_front_server_messenger->get_myaddrs())) { if (!osdmap->is_up(whoami)) { if (service.is_preparing_to_stop() || service.is_stopping()) { service.got_stop_ack(); } else { clog->warn() << "Monitor daemon marked osd." << whoami << " down, " "but it is still running"; clog->debug() << "map e" << osdmap->get_epoch() << " wrongly marked me down at e" << osdmap->get_down_at(whoami); } } else if (!osdmap->get_addrs(whoami).legacy_equals( client_messenger->get_myaddrs())) { clog->error() << "map e" << osdmap->get_epoch() << " had wrong client addr (" << osdmap->get_addrs(whoami) << " != my " << client_messenger->get_myaddrs() << ")"; } else if (!osdmap->get_cluster_addrs(whoami).legacy_equals( cluster_messenger->get_myaddrs())) { clog->error() << "map e" << osdmap->get_epoch() << " had wrong cluster addr (" << osdmap->get_cluster_addrs(whoami) << " != my " << cluster_messenger->get_myaddrs() << ")"; } else if (!osdmap->get_hb_back_addrs(whoami).legacy_equals( hb_back_server_messenger->get_myaddrs())) { clog->error() << "map e" << osdmap->get_epoch() << " had wrong heartbeat back addr (" << osdmap->get_hb_back_addrs(whoami) << " != my " << hb_back_server_messenger->get_myaddrs() << ")"; } else if (!osdmap->get_hb_front_addrs(whoami).legacy_equals( hb_front_server_messenger->get_myaddrs())) { clog->error() << "map e" << osdmap->get_epoch() << " had wrong heartbeat front addr (" << osdmap->get_hb_front_addrs(whoami) << " != my " << hb_front_server_messenger->get_myaddrs() << ")"; } if (!service.is_stopping()) { epoch_t up_epoch = 0; epoch_t bind_epoch = osdmap->get_epoch(); service.set_epochs(NULL,&up_epoch, &bind_epoch); do_restart = true; //add markdown log utime_t now = ceph_clock_now(); utime_t grace = utime_t(cct->_conf->osd_max_markdown_period, 0); osd_markdown_log.push_back(now); //clear all out-of-date log while (!osd_markdown_log.empty() && osd_markdown_log.front() + grace < now) osd_markdown_log.pop_front(); if ((int)osd_markdown_log.size() > cct->_conf->osd_max_markdown_count) { dout(0) << __func__ << " marked down " << osd_markdown_log.size() << " > osd_max_markdown_count " << cct->_conf->osd_max_markdown_count << " in last " << grace << " seconds, shutting down" << dendl; do_restart = false; do_shutdown = true; } start_waiting_for_healthy(); set avoid_ports; #if defined(__FreeBSD__) // prevent FreeBSD from grabbing the client_messenger port during // rebinding. In which case a cluster_meesneger will connect also // to the same port client_messenger->get_myaddrs().get_ports(&avoid_ports); #endif cluster_messenger->get_myaddrs().get_ports(&avoid_ports); hb_back_server_messenger->get_myaddrs().get_ports(&avoid_ports); hb_front_server_messenger->get_myaddrs().get_ports(&avoid_ports); int r = cluster_messenger->rebind(avoid_ports); if (r != 0) { do_shutdown = true; // FIXME: do_restart? network_error = true; dout(0) << __func__ << " marked down:" << " rebind cluster_messenger failed" << dendl; } r = hb_back_server_messenger->rebind(avoid_ports); if (r != 0) { do_shutdown = true; // FIXME: do_restart? network_error = true; dout(0) << __func__ << " marked down:" << " rebind hb_back_server_messenger failed" << dendl; } r = hb_front_server_messenger->rebind(avoid_ports); if (r != 0) { do_shutdown = true; // FIXME: do_restart? network_error = true; dout(0) << __func__ << " marked down:" << " rebind hb_front_server_messenger failed" << dendl; } hb_front_client_messenger->mark_down_all(); hb_back_client_messenger->mark_down_all(); reset_heartbeat_peers(true); } } } map_lock.put_write(); check_osdmap_features(); // yay! consume_map(); if (is_active() || is_waiting_for_healthy()) maybe_update_heartbeat_peers(); if (is_active()) { activate_map(); } if (do_shutdown) { if (network_error) { cancel_pending_failures(); } // trigger shutdown in a different thread dout(0) << __func__ << " shutdown OSD via async signal" << dendl; queue_async_signal(SIGINT); } else if (m->newest_map && m->newest_map > last) { dout(10) << " msg say newest map is " << m->newest_map << ", requesting more" << dendl; osdmap_subscribe(osdmap->get_epoch()+1, false); } else if (is_preboot()) { if (m->get_source().is_mon()) _preboot(m->oldest_map, m->newest_map); else start_boot(); } else if (do_restart) start_boot(); } void OSD::check_osdmap_features() { // adjust required feature bits? // we have to be a bit careful here, because we are accessing the // Policy structures without taking any lock. in particular, only // modify integer values that can safely be read by a racing CPU. // since we are only accessing existing Policy structures a their // current memory location, and setting or clearing bits in integer // fields, and we are the only writer, this is not a problem. const auto osdmap = get_osdmap(); { Messenger::Policy p = client_messenger->get_default_policy(); uint64_t mask; uint64_t features = osdmap->get_features(entity_name_t::TYPE_CLIENT, &mask); if ((p.features_required & mask) != features) { dout(0) << "crush map has features " << features << ", adjusting msgr requires for clients" << dendl; p.features_required = (p.features_required & ~mask) | features; client_messenger->set_default_policy(p); } } { Messenger::Policy p = client_messenger->get_policy(entity_name_t::TYPE_MON); uint64_t mask; uint64_t features = osdmap->get_features(entity_name_t::TYPE_MON, &mask); if ((p.features_required & mask) != features) { dout(0) << "crush map has features " << features << " was " << p.features_required << ", adjusting msgr requires for mons" << dendl; p.features_required = (p.features_required & ~mask) | features; client_messenger->set_policy(entity_name_t::TYPE_MON, p); } } { Messenger::Policy p = cluster_messenger->get_policy(entity_name_t::TYPE_OSD); uint64_t mask; uint64_t features = osdmap->get_features(entity_name_t::TYPE_OSD, &mask); if ((p.features_required & mask) != features) { dout(0) << "crush map has features " << features << ", adjusting msgr requires for osds" << dendl; p.features_required = (p.features_required & ~mask) | features; cluster_messenger->set_policy(entity_name_t::TYPE_OSD, p); } if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_SHARDS)) { dout(0) << __func__ << " enabling on-disk ERASURE CODES compat feature" << dendl; superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS); ObjectStore::Transaction t; write_superblock(t); int err = store->queue_transaction(service.meta_ch, std::move(t), NULL); ceph_assert(err == 0); } } if (osdmap->require_osd_release < CEPH_RELEASE_NAUTILUS) { heartbeat_dispatcher.ms_set_require_authorizer(false); } if (osdmap->require_osd_release != last_require_osd_release) { dout(1) << __func__ << " require_osd_release " << last_require_osd_release << " -> " << to_string(osdmap->require_osd_release) << dendl; store->write_meta("require_osd_release", stringify((int)osdmap->require_osd_release)); last_require_osd_release = osdmap->require_osd_release; } } struct C_FinishSplits : public Context { OSD *osd; set pgs; C_FinishSplits(OSD *osd, const set &in) : osd(osd), pgs(in) {} void finish(int r) override { osd->_finish_splits(pgs); } }; void OSD::_finish_splits(set& pgs) { dout(10) << __func__ << " " << pgs << dendl; if (is_stopping()) return; PG::RecoveryCtx rctx = create_context(); for (set::iterator i = pgs.begin(); i != pgs.end(); ++i) { PG *pg = i->get(); pg->lock(); dout(10) << __func__ << " " << *pg << dendl; epoch_t e = pg->get_osdmap_epoch(); pg->handle_initialize(&rctx); pg->queue_null(e, e); dispatch_context_transaction(rctx, pg); pg->unlock(); unsigned shard_index = pg->pg_id.hash_to_shard(num_shards); shards[shard_index]->register_and_wake_split_child(pg); } dispatch_context(rctx, 0, service.get_osdmap()); }; bool OSD::add_merge_waiter(OSDMapRef nextmap, spg_t target, PGRef src, unsigned need) { std::lock_guard l(merge_lock); auto& p = merge_waiters[nextmap->get_epoch()][target]; p[src->pg_id] = src; dout(10) << __func__ << " added merge_waiter " << src->pg_id << " for " << target << ", have " << p.size() << "/" << need << dendl; return p.size() == need; } bool OSD::advance_pg( epoch_t osd_epoch, PG *pg, ThreadPool::TPHandle &handle, PG::RecoveryCtx *rctx) { if (osd_epoch <= pg->get_osdmap_epoch()) { return true; } ceph_assert(pg->is_locked()); OSDMapRef lastmap = pg->get_osdmap(); ceph_assert(lastmap->get_epoch() < osd_epoch); set new_pgs; // any split children bool ret = true; unsigned old_pg_num = lastmap->have_pg_pool(pg->pg_id.pool()) ? lastmap->get_pg_num(pg->pg_id.pool()) : 0; for (epoch_t next_epoch = pg->get_osdmap_epoch() + 1; next_epoch <= osd_epoch; ++next_epoch) { OSDMapRef nextmap = service.try_get_map(next_epoch); if (!nextmap) { dout(20) << __func__ << " missing map " << next_epoch << dendl; continue; } unsigned new_pg_num = (old_pg_num && nextmap->have_pg_pool(pg->pg_id.pool())) ? nextmap->get_pg_num(pg->pg_id.pool()) : 0; if (old_pg_num && new_pg_num && old_pg_num != new_pg_num) { // check for merge if (nextmap->have_pg_pool(pg->pg_id.pool())) { spg_t parent; if (pg->pg_id.is_merge_source( old_pg_num, new_pg_num, &parent)) { // we are merge source PGRef spg = pg; // carry a ref dout(1) << __func__ << " " << pg->pg_id << " is merge source, target is " << parent << dendl; pg->write_if_dirty(rctx); if (!new_pgs.empty()) { rctx->transaction->register_on_applied(new C_FinishSplits(this, new_pgs)); new_pgs.clear(); } dispatch_context_transaction(*rctx, pg, &handle); pg->ch->flush(); // release backoffs explicitly, since the on_shutdown path // aggressively tears down backoff state. if (pg->is_primary()) { pg->release_pg_backoffs(); } pg->on_shutdown(); OSDShard *sdata = pg->osd_shard; { std::lock_guard l(sdata->shard_lock); if (pg->pg_slot) { sdata->_detach_pg(pg->pg_slot); // update pg count now since we might not get an osdmap // any time soon. if (pg->is_primary()) logger->dec(l_osd_pg_primary); else if (pg->is_replica()) logger->dec(l_osd_pg_replica); else logger->dec(l_osd_pg_stray); } } pg->unlock(); set children; parent.is_split(new_pg_num, old_pg_num, &children); if (add_merge_waiter(nextmap, parent, pg, children.size())) { enqueue_peering_evt( parent, PGPeeringEventRef( std::make_shared( nextmap->get_epoch(), nextmap->get_epoch(), NullEvt()))); } ret = false; goto out; } else if (pg->pg_id.is_merge_target(old_pg_num, new_pg_num)) { // we are merge target set children; pg->pg_id.is_split(new_pg_num, old_pg_num, &children); dout(20) << __func__ << " " << pg->pg_id << " is merge target, sources are " << children << dendl; map sources; { std::lock_guard l(merge_lock); auto& s = merge_waiters[nextmap->get_epoch()][pg->pg_id]; unsigned need = children.size(); dout(20) << __func__ << " have " << s.size() << "/" << need << dendl; if (s.size() == need) { sources.swap(s); merge_waiters[nextmap->get_epoch()].erase(pg->pg_id); if (merge_waiters[nextmap->get_epoch()].empty()) { merge_waiters.erase(nextmap->get_epoch()); } } } if (!sources.empty()) { unsigned new_pg_num = nextmap->get_pg_num(pg->pg_id.pool()); unsigned split_bits = pg->pg_id.get_split_bits(new_pg_num); dout(1) << __func__ << " merging " << pg->pg_id << dendl; pg->merge_from( sources, rctx, split_bits, nextmap->get_pg_pool( pg->pg_id.pool())->last_pg_merge_meta); pg->pg_slot->waiting_for_merge_epoch = 0; } else { dout(20) << __func__ << " not ready to merge yet" << dendl; pg->write_if_dirty(rctx); if (!new_pgs.empty()) { rctx->transaction->register_on_applied(new C_FinishSplits(this, new_pgs)); new_pgs.clear(); } dispatch_context_transaction(*rctx, pg, &handle); pg->unlock(); // kick source(s) to get them ready for (auto& i : children) { dout(20) << __func__ << " kicking source " << i << dendl; enqueue_peering_evt( i, PGPeeringEventRef( std::make_shared( nextmap->get_epoch(), nextmap->get_epoch(), NullEvt()))); } ret = false; goto out; } } } } vector newup, newacting; int up_primary, acting_primary; nextmap->pg_to_up_acting_osds( pg->pg_id.pgid, &newup, &up_primary, &newacting, &acting_primary); pg->handle_advance_map( nextmap, lastmap, newup, up_primary, newacting, acting_primary, rctx); auto oldpool = lastmap->get_pools().find(pg->pg_id.pool()); auto newpool = nextmap->get_pools().find(pg->pg_id.pool()); if (oldpool != lastmap->get_pools().end() && newpool != nextmap->get_pools().end()) { dout(20) << __func__ << " new pool opts " << newpool->second.opts << " old pool opts " << oldpool->second.opts << dendl; double old_min_interval = 0, new_min_interval = 0; oldpool->second.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &old_min_interval); newpool->second.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &new_min_interval); double old_max_interval = 0, new_max_interval = 0; oldpool->second.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &old_max_interval); newpool->second.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &new_max_interval); // Assume if an interval is change from set to unset or vice versa the actual config // is different. Keep it simple even if it is possible to call resched_all_scrub() // unnecessarily. if (old_min_interval != new_min_interval || old_max_interval != new_max_interval) { pg->on_info_history_change(); } } if (new_pg_num && old_pg_num != new_pg_num) { // check for split set children; if (pg->pg_id.is_split( old_pg_num, new_pg_num, &children)) { split_pgs( pg, children, &new_pgs, lastmap, nextmap, rctx); } } lastmap = nextmap; old_pg_num = new_pg_num; handle.reset_tp_timeout(); } pg->handle_activate_map(rctx); ret = true; out: if (!new_pgs.empty()) { rctx->transaction->register_on_applied(new C_FinishSplits(this, new_pgs)); } return ret; } void OSD::consume_map() { ceph_assert(osd_lock.is_locked()); auto osdmap = get_osdmap(); dout(7) << "consume_map version " << osdmap->get_epoch() << dendl; /** make sure the cluster is speaking in SORTBITWISE, because we don't * speak the older sorting version any more. Be careful not to force * a shutdown if we are merely processing old maps, though. */ if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE) && is_active()) { derr << __func__ << " SORTBITWISE flag is not set" << dendl; ceph_abort(); } service.pre_publish_map(osdmap); service.await_reserved_maps(); service.publish_map(osdmap); // prime splits and merges set> newly_split; // splits, and when set> merge_pgs; // merge participants, and when for (auto& shard : shards) { shard->identify_splits_and_merges(osdmap, &newly_split, &merge_pgs); } if (!newly_split.empty()) { for (auto& shard : shards) { shard->prime_splits(osdmap, &newly_split); } ceph_assert(newly_split.empty()); } // prune sent_ready_to_merge service.prune_sent_ready_to_merge(osdmap); // FIXME, maybe: We could race against an incoming peering message // that instantiates a merge PG after identify_merges() below and // never set up its peer to complete the merge. An OSD restart // would clear it up. This is a hard race to resolve, // extraordinarily rare (we only merge PGs that are stable and // clean, so it'd have to be an imported PG to an OSD with a // slightly stale OSDMap...), so I'm ignoring it for now. We plan to // replace all of this with a seastar-based code soon anyway. if (!merge_pgs.empty()) { // mark the pgs we already have, or create new and empty merge // participants for those we are missing. do this all under the // shard lock so we don't have to worry about racing pg creates // via _process. for (auto& shard : shards) { shard->prime_merges(osdmap, &merge_pgs); } ceph_assert(merge_pgs.empty()); } service.prune_pg_created(); unsigned pushes_to_free = 0; for (auto& shard : shards) { shard->consume_map(osdmap, &pushes_to_free); } vector pgids; _get_pgids(&pgids); // count (FIXME, probably during seastar rewrite) int num_pg_primary = 0, num_pg_replica = 0, num_pg_stray = 0; vector pgs; _get_pgs(&pgs); for (auto& pg : pgs) { // FIXME (probably during seastar rewrite): this is lockless and // racy, but we don't want to take pg lock here. if (pg->is_primary()) num_pg_primary++; else if (pg->is_replica()) num_pg_replica++; else num_pg_stray++; } { // FIXME (as part of seastar rewrite): move to OSDShard std::lock_guard l(pending_creates_lock); for (auto pg = pending_creates_from_osd.begin(); pg != pending_creates_from_osd.end();) { if (osdmap->get_pg_acting_rank(pg->first, whoami) < 0) { dout(10) << __func__ << " pg " << pg->first << " doesn't map here, " << "discarding pending_create_from_osd" << dendl; pg = pending_creates_from_osd.erase(pg); } else { ++pg; } } } service.maybe_inject_dispatch_delay(); dispatch_sessions_waiting_on_map(); service.maybe_inject_dispatch_delay(); service.release_reserved_pushes(pushes_to_free); // queue null events to push maps down to individual PGs for (auto pgid : pgids) { enqueue_peering_evt( pgid, PGPeeringEventRef( std::make_shared( osdmap->get_epoch(), osdmap->get_epoch(), NullEvt()))); } logger->set(l_osd_pg, pgids.size()); logger->set(l_osd_pg_primary, num_pg_primary); logger->set(l_osd_pg_replica, num_pg_replica); logger->set(l_osd_pg_stray, num_pg_stray); } void OSD::activate_map() { ceph_assert(osd_lock.is_locked()); auto osdmap = get_osdmap(); dout(7) << "activate_map version " << osdmap->get_epoch() << dendl; if (osdmap->test_flag(CEPH_OSDMAP_FULL)) { dout(10) << " osdmap flagged full, doing onetime osdmap subscribe" << dendl; osdmap_subscribe(osdmap->get_epoch() + 1, false); } // norecover? if (osdmap->test_flag(CEPH_OSDMAP_NORECOVER)) { if (!service.recovery_is_paused()) { dout(1) << "pausing recovery (NORECOVER flag set)" << dendl; service.pause_recovery(); } } else { if (service.recovery_is_paused()) { dout(1) << "unpausing recovery (NORECOVER flag unset)" << dendl; service.unpause_recovery(); } } service.activate_map(); // process waiters take_waiters(waiting_for_osdmap); } bool OSD::require_mon_peer(const Message *m) { if (!m->get_connection()->peer_is_mon()) { dout(0) << "require_mon_peer received from non-mon " << m->get_connection()->get_peer_addr() << " " << *m << dendl; return false; } return true; } bool OSD::require_mon_or_mgr_peer(const Message *m) { if (!m->get_connection()->peer_is_mon() && !m->get_connection()->peer_is_mgr()) { dout(0) << "require_mon_or_mgr_peer received from non-mon, non-mgr " << m->get_connection()->get_peer_addr() << " " << *m << dendl; return false; } return true; } bool OSD::require_osd_peer(const Message *m) { if (!m->get_connection()->peer_is_osd()) { dout(0) << "require_osd_peer received from non-osd " << m->get_connection()->get_peer_addr() << " " << *m << dendl; return false; } return true; } bool OSD::require_self_aliveness(const Message *m, epoch_t epoch) { epoch_t up_epoch = service.get_up_epoch(); if (epoch < up_epoch) { dout(7) << "from pre-up epoch " << epoch << " < " << up_epoch << dendl; return false; } if (!is_active()) { dout(7) << "still in boot state, dropping message " << *m << dendl; return false; } return true; } bool OSD::require_same_peer_instance(const Message *m, const OSDMapRef& map, bool is_fast_dispatch) { int from = m->get_source().num(); if (map->is_down(from) || (map->get_cluster_addrs(from) != m->get_source_addrs())) { dout(5) << "from dead osd." << from << ", marking down, " << " msg was " << m->get_source_inst().addr << " expected " << (map->is_up(from) ? map->get_cluster_addrs(from) : entity_addrvec_t()) << dendl; ConnectionRef con = m->get_connection(); con->mark_down(); auto priv = con->get_priv(); if (auto s = static_cast(priv.get()); s) { if (!is_fast_dispatch) s->session_dispatch_lock.Lock(); clear_session_waiting_on_map(s); con->set_priv(nullptr); // break ref <-> session cycle, if any s->con.reset(); if (!is_fast_dispatch) s->session_dispatch_lock.Unlock(); } return false; } return true; } /* * require that we have same (or newer) map, and that * the source is the pg primary. */ bool OSD::require_same_or_newer_map(OpRequestRef& op, epoch_t epoch, bool is_fast_dispatch) { const Message *m = op->get_req(); const auto osdmap = get_osdmap(); dout(15) << "require_same_or_newer_map " << epoch << " (i am " << osdmap->get_epoch() << ") " << m << dendl; ceph_assert(osd_lock.is_locked()); // do they have a newer map? if (epoch > osdmap->get_epoch()) { dout(7) << "waiting for newer map epoch " << epoch << " > my " << osdmap->get_epoch() << " with " << m << dendl; wait_for_new_map(op); return false; } if (!require_self_aliveness(op->get_req(), epoch)) { return false; } // ok, our map is same or newer.. do they still exist? if (m->get_connection()->get_messenger() == cluster_messenger && !require_same_peer_instance(op->get_req(), osdmap, is_fast_dispatch)) { return false; } return true; } // ---------------------------------------- // pg creation void OSD::split_pgs( PG *parent, const set &childpgids, set *out_pgs, OSDMapRef curmap, OSDMapRef nextmap, PG::RecoveryCtx *rctx) { unsigned pg_num = nextmap->get_pg_num(parent->pg_id.pool()); parent->update_snap_mapper_bits(parent->get_pgid().get_split_bits(pg_num)); vector updated_stats; parent->start_split_stats(childpgids, &updated_stats); vector::iterator stat_iter = updated_stats.begin(); for (set::const_iterator i = childpgids.begin(); i != childpgids.end(); ++i, ++stat_iter) { ceph_assert(stat_iter != updated_stats.end()); dout(10) << __func__ << " splitting " << *parent << " into " << *i << dendl; PG* child = _make_pg(nextmap, *i); child->lock(true); out_pgs->insert(child); child->ch = store->create_new_collection(child->coll); { uint32_t shard_index = i->hash_to_shard(shards.size()); assert(NULL != shards[shard_index]); store->set_collection_commit_queue(child->coll, &(shards[shard_index]->context_queue)); } unsigned split_bits = i->get_split_bits(pg_num); dout(10) << " pg_num is " << pg_num << ", m_seed " << i->ps() << ", split_bits is " << split_bits << dendl; parent->split_colls( *i, split_bits, i->ps(), &child->get_pool().info, rctx->transaction); parent->split_into( i->pgid, child, split_bits); child->init_collection_pool_opts(); child->finish_split_stats(*stat_iter, rctx->transaction); child->unlock(); } ceph_assert(stat_iter != updated_stats.end()); parent->finish_split_stats(*stat_iter, rctx->transaction); } /* * holding osd_lock */ void OSD::handle_pg_create(OpRequestRef op) { const MOSDPGCreate *m = static_cast(op->get_req()); ceph_assert(m->get_type() == MSG_OSD_PG_CREATE); dout(10) << "handle_pg_create " << *m << dendl; if (!require_mon_peer(op->get_req())) { return; } if (!require_same_or_newer_map(op, m->epoch, false)) return; op->mark_started(); const auto osdmap = get_osdmap(); map::const_iterator ci = m->ctimes.begin(); for (map::const_iterator p = m->mkpg.begin(); p != m->mkpg.end(); ++p, ++ci) { ceph_assert(ci != m->ctimes.end() && ci->first == p->first); epoch_t created = p->second.created; if (p->second.split_bits) // Skip split pgs continue; pg_t on = p->first; if (!osdmap->have_pg_pool(on.pool())) { dout(20) << "ignoring pg on deleted pool " << on << dendl; continue; } dout(20) << "mkpg " << on << " e" << created << "@" << ci->second << dendl; // is it still ours? vector up, acting; int up_primary = -1; int acting_primary = -1; osdmap->pg_to_up_acting_osds(on, &up, &up_primary, &acting, &acting_primary); int role = osdmap->calc_pg_role(whoami, acting, acting.size()); if (acting_primary != whoami) { dout(10) << "mkpg " << on << " not acting_primary (" << acting_primary << "), my role=" << role << ", skipping" << dendl; continue; } spg_t pgid; bool mapped = osdmap->get_primary_shard(on, &pgid); ceph_assert(mapped); PastIntervals pi; pg_history_t history; build_initial_pg_history(pgid, created, ci->second, &history, &pi); // The mon won't resend unless the primary changed, so we ignore // same_interval_since. We'll pass this history with the current // epoch as the event. if (history.same_primary_since > m->epoch) { dout(10) << __func__ << ": got obsolete pg create on pgid " << pgid << " from epoch " << m->epoch << ", primary changed in " << history.same_primary_since << dendl; continue; } enqueue_peering_evt( pgid, PGPeeringEventRef( std::make_shared( osdmap->get_epoch(), osdmap->get_epoch(), NullEvt(), true, new PGCreateInfo( pgid, osdmap->get_epoch(), history, pi, true) ))); } { std::lock_guard l(pending_creates_lock); if (pending_creates_from_mon == 0) { last_pg_create_epoch = m->epoch; } } maybe_update_heartbeat_peers(); } // ---------------------------------------- // peering and recovery PG::RecoveryCtx OSD::create_context() { ObjectStore::Transaction *t = new ObjectStore::Transaction; map > *query_map = new map >; map > > *notify_list = new map > >; map > > *info_map = new map > >; PG::RecoveryCtx rctx(query_map, info_map, notify_list, t); return rctx; } void OSD::dispatch_context_transaction(PG::RecoveryCtx &ctx, PG *pg, ThreadPool::TPHandle *handle) { if (!ctx.transaction->empty() || ctx.transaction->has_contexts()) { int tr = store->queue_transaction( pg->ch, std::move(*ctx.transaction), TrackedOpRef(), handle); ceph_assert(tr == 0); delete (ctx.transaction); ctx.transaction = new ObjectStore::Transaction; } } void OSD::dispatch_context(PG::RecoveryCtx &ctx, PG *pg, OSDMapRef curmap, ThreadPool::TPHandle *handle) { if (!service.get_osdmap()->is_up(whoami)) { dout(20) << __func__ << " not up in osdmap" << dendl; } else if (!is_active()) { dout(20) << __func__ << " not active" << dendl; } else { do_notifies(*ctx.notify_list, curmap); do_queries(*ctx.query_map, curmap); do_infos(*ctx.info_map, curmap); } if ((!ctx.transaction->empty() || ctx.transaction->has_contexts()) && pg) { int tr = store->queue_transaction( pg->ch, std::move(*ctx.transaction), TrackedOpRef(), handle); ceph_assert(tr == 0); } delete ctx.notify_list; delete ctx.query_map; delete ctx.info_map; delete ctx.transaction; } void OSD::discard_context(PG::RecoveryCtx& ctx) { delete ctx.notify_list; delete ctx.query_map; delete ctx.info_map; delete ctx.transaction; } /** do_notifies * Send an MOSDPGNotify to a primary, with a list of PGs that I have * content for, and they are primary for. */ void OSD::do_notifies( map > >& notify_list, OSDMapRef curmap) { for (map > >::iterator it = notify_list.begin(); it != notify_list.end(); ++it) { if (!curmap->is_up(it->first)) { dout(20) << __func__ << " skipping down osd." << it->first << dendl; continue; } ConnectionRef con = service.get_con_osd_cluster( it->first, curmap->get_epoch()); if (!con) { dout(20) << __func__ << " skipping osd." << it->first << " (NULL con)" << dendl; continue; } service.share_map_peer(it->first, con.get(), curmap); dout(7) << __func__ << " osd." << it->first << " on " << it->second.size() << " PGs" << dendl; MOSDPGNotify *m = new MOSDPGNotify(curmap->get_epoch(), it->second); con->send_message(m); } } /** do_queries * send out pending queries for info | summaries */ void OSD::do_queries(map >& query_map, OSDMapRef curmap) { for (map >::iterator pit = query_map.begin(); pit != query_map.end(); ++pit) { if (!curmap->is_up(pit->first)) { dout(20) << __func__ << " skipping down osd." << pit->first << dendl; continue; } int who = pit->first; ConnectionRef con = service.get_con_osd_cluster(who, curmap->get_epoch()); if (!con) { dout(20) << __func__ << " skipping osd." << who << " (NULL con)" << dendl; continue; } service.share_map_peer(who, con.get(), curmap); dout(7) << __func__ << " querying osd." << who << " on " << pit->second.size() << " PGs" << dendl; MOSDPGQuery *m = new MOSDPGQuery(curmap->get_epoch(), pit->second); con->send_message(m); } } void OSD::do_infos(map > >& info_map, OSDMapRef curmap) { for (map > >::iterator p = info_map.begin(); p != info_map.end(); ++p) { if (!curmap->is_up(p->first)) { dout(20) << __func__ << " skipping down osd." << p->first << dendl; continue; } for (vector >::iterator i = p->second.begin(); i != p->second.end(); ++i) { dout(20) << __func__ << " sending info " << i->first.info << " to shard " << p->first << dendl; } ConnectionRef con = service.get_con_osd_cluster( p->first, curmap->get_epoch()); if (!con) { dout(20) << __func__ << " skipping osd." << p->first << " (NULL con)" << dendl; continue; } service.share_map_peer(p->first, con.get(), curmap); MOSDPGInfo *m = new MOSDPGInfo(curmap->get_epoch()); m->pg_list = p->second; con->send_message(m); } info_map.clear(); } void OSD::handle_fast_pg_create(MOSDPGCreate2 *m) { dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl; if (!require_mon_peer(m)) { m->put(); return; } for (auto& p : m->pgs) { spg_t pgid = p.first; epoch_t created = p.second.first; utime_t created_stamp = p.second.second; dout(20) << __func__ << " " << pgid << " e" << created << "@" << created_stamp << dendl; pg_history_t h; h.epoch_created = created; h.epoch_pool_created = created; h.same_up_since = created; h.same_interval_since = created; h.same_primary_since = created; h.last_scrub_stamp = created_stamp; h.last_deep_scrub_stamp = created_stamp; h.last_clean_scrub_stamp = created_stamp; enqueue_peering_evt( pgid, PGPeeringEventRef( std::make_shared( m->epoch, m->epoch, NullEvt(), true, new PGCreateInfo( pgid, created, h, PastIntervals(), true) ))); } { std::lock_guard l(pending_creates_lock); if (pending_creates_from_mon == 0) { last_pg_create_epoch = m->epoch; } } m->put(); } void OSD::handle_fast_pg_query(MOSDPGQuery *m) { dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl; if (!require_osd_peer(m)) { m->put(); return; } int from = m->get_source().num(); for (auto& p : m->pg_list) { enqueue_peering_evt( p.first, PGPeeringEventRef( std::make_shared( p.second.epoch_sent, p.second.epoch_sent, MQuery( p.first, pg_shard_t(from, p.second.from), p.second, p.second.epoch_sent), false)) ); } m->put(); } void OSD::handle_fast_pg_notify(MOSDPGNotify* m) { dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl; if (!require_osd_peer(m)) { m->put(); return; } int from = m->get_source().num(); for (auto& p : m->get_pg_list()) { spg_t pgid(p.first.info.pgid.pgid, p.first.to); enqueue_peering_evt( pgid, PGPeeringEventRef( std::make_shared( p.first.epoch_sent, p.first.query_epoch, MNotifyRec( pgid, pg_shard_t(from, p.first.from), p.first, m->get_connection()->get_features(), p.second), true, new PGCreateInfo( pgid, p.first.query_epoch, p.first.info.history, p.second, false) ))); } m->put(); } void OSD::handle_fast_pg_info(MOSDPGInfo* m) { dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl; if (!require_osd_peer(m)) { m->put(); return; } int from = m->get_source().num(); for (auto& p : m->pg_list) { enqueue_peering_evt( spg_t(p.first.info.pgid.pgid, p.first.to), PGPeeringEventRef( std::make_shared( p.first.epoch_sent, p.first.query_epoch, MInfoRec( pg_shard_t(from, p.first.from), p.first.info, p.first.epoch_sent))) ); } m->put(); } void OSD::handle_fast_pg_remove(MOSDPGRemove *m) { dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl; if (!require_osd_peer(m)) { m->put(); return; } for (auto& pgid : m->pg_list) { enqueue_peering_evt( pgid, PGPeeringEventRef( std::make_shared( m->get_epoch(), m->get_epoch(), PG::DeleteStart()))); } m->put(); } void OSD::handle_fast_force_recovery(MOSDForceRecovery *m) { dout(10) << __func__ << " " << *m << dendl; if (!require_mon_or_mgr_peer(m)) { m->put(); return; } epoch_t epoch = get_osdmap_epoch(); for (auto pgid : m->forced_pgs) { if (m->options & OFR_BACKFILL) { if (m->options & OFR_CANCEL) { enqueue_peering_evt( pgid, PGPeeringEventRef( std::make_shared( epoch, epoch, PG::UnsetForceBackfill()))); } else { enqueue_peering_evt( pgid, PGPeeringEventRef( std::make_shared( epoch, epoch, PG::SetForceBackfill()))); } } else if (m->options & OFR_RECOVERY) { if (m->options & OFR_CANCEL) { enqueue_peering_evt( pgid, PGPeeringEventRef( std::make_shared( epoch, epoch, PG::UnsetForceRecovery()))); } else { enqueue_peering_evt( pgid, PGPeeringEventRef( std::make_shared( epoch, epoch, PG::SetForceRecovery()))); } } } m->put(); } void OSD::handle_pg_query_nopg(const MQuery& q) { spg_t pgid = q.pgid; dout(10) << __func__ << " " << pgid << dendl; OSDMapRef osdmap = get_osdmap(); if (!osdmap->have_pg_pool(pgid.pool())) return; dout(10) << " pg " << pgid << " dne" << dendl; pg_info_t empty(spg_t(pgid.pgid, q.query.to)); ConnectionRef con = service.get_con_osd_cluster(q.from.osd, osdmap->get_epoch()); if (con) { Message *m; if (q.query.type == pg_query_t::LOG || q.query.type == pg_query_t::FULLLOG) { m = new MOSDPGLog( q.query.from, q.query.to, osdmap->get_epoch(), empty, q.query.epoch_sent); } else { vector> ls; ls.push_back( make_pair( pg_notify_t( q.query.from, q.query.to, q.query.epoch_sent, osdmap->get_epoch(), empty), PastIntervals())); m = new MOSDPGNotify(osdmap->get_epoch(), ls); } service.share_map_peer(q.from.osd, con.get(), osdmap); con->send_message(m); } } // ========================================================= // RECOVERY void OSDService::_maybe_queue_recovery() { ceph_assert(recovery_lock.is_locked_by_me()); uint64_t available_pushes; while (!awaiting_throttle.empty() && _recover_now(&available_pushes)) { uint64_t to_start = std::min( available_pushes, cct->_conf->osd_recovery_max_single_start); _queue_for_recovery(awaiting_throttle.front(), to_start); awaiting_throttle.pop_front(); dout(10) << __func__ << " starting " << to_start << ", recovery_ops_reserved " << recovery_ops_reserved << " -> " << (recovery_ops_reserved + to_start) << dendl; recovery_ops_reserved += to_start; } } bool OSDService::_recover_now(uint64_t *available_pushes) { if (available_pushes) *available_pushes = 0; if (ceph_clock_now() < defer_recovery_until) { dout(15) << __func__ << " defer until " << defer_recovery_until << dendl; return false; } if (recovery_paused) { dout(15) << __func__ << " paused" << dendl; return false; } uint64_t max = cct->_conf->osd_recovery_max_active; if (max <= recovery_ops_active + recovery_ops_reserved) { dout(15) << __func__ << " active " << recovery_ops_active << " + reserved " << recovery_ops_reserved << " >= max " << max << dendl; return false; } if (available_pushes) *available_pushes = max - recovery_ops_active - recovery_ops_reserved; return true; } void OSD::do_recovery( PG *pg, epoch_t queued, uint64_t reserved_pushes, ThreadPool::TPHandle &handle) { uint64_t started = 0; /* * When the value of osd_recovery_sleep is set greater than zero, recovery * ops are scheduled after osd_recovery_sleep amount of time from the previous * recovery event's schedule time. This is done by adding a * recovery_requeue_callback event, which re-queues the recovery op using * queue_recovery_after_sleep. */ float recovery_sleep = get_osd_recovery_sleep(); { std::lock_guard l(service.sleep_lock); if (recovery_sleep > 0 && service.recovery_needs_sleep) { PGRef pgref(pg); auto recovery_requeue_callback = new FunctionContext([this, pgref, queued, reserved_pushes](int r) { dout(20) << "do_recovery wake up at " << ceph_clock_now() << ", re-queuing recovery" << dendl; std::lock_guard l(service.sleep_lock); service.recovery_needs_sleep = false; service.queue_recovery_after_sleep(pgref.get(), queued, reserved_pushes); }); // This is true for the first recovery op and when the previous recovery op // has been scheduled in the past. The next recovery op is scheduled after // completing the sleep from now. if (service.recovery_schedule_time < ceph_clock_now()) { service.recovery_schedule_time = ceph_clock_now(); } service.recovery_schedule_time += recovery_sleep; service.sleep_timer.add_event_at(service.recovery_schedule_time, recovery_requeue_callback); dout(20) << "Recovery event scheduled at " << service.recovery_schedule_time << dendl; return; } } { { std::lock_guard l(service.sleep_lock); service.recovery_needs_sleep = true; } if (pg->pg_has_reset_since(queued)) { goto out; } dout(10) << "do_recovery starting " << reserved_pushes << " " << *pg << dendl; #ifdef DEBUG_RECOVERY_OIDS dout(20) << " active was " << service.recovery_oids[pg->pg_id] << dendl; #endif bool do_unfound = pg->start_recovery_ops(reserved_pushes, handle, &started); dout(10) << "do_recovery started " << started << "/" << reserved_pushes << " on " << *pg << dendl; if (do_unfound) { PG::RecoveryCtx rctx = create_context(); rctx.handle = &handle; pg->find_unfound(queued, &rctx); dispatch_context(rctx, pg, pg->get_osdmap()); } } out: ceph_assert(started <= reserved_pushes); service.release_reserved_pushes(reserved_pushes); } void OSDService::start_recovery_op(PG *pg, const hobject_t& soid) { std::lock_guard l(recovery_lock); dout(10) << "start_recovery_op " << *pg << " " << soid << " (" << recovery_ops_active << "/" << cct->_conf->osd_recovery_max_active << " rops)" << dendl; recovery_ops_active++; #ifdef DEBUG_RECOVERY_OIDS dout(20) << " active was " << recovery_oids[pg->pg_id] << dendl; ceph_assert(recovery_oids[pg->pg_id].count(soid) == 0); recovery_oids[pg->pg_id].insert(soid); #endif } void OSDService::finish_recovery_op(PG *pg, const hobject_t& soid, bool dequeue) { std::lock_guard l(recovery_lock); dout(10) << "finish_recovery_op " << *pg << " " << soid << " dequeue=" << dequeue << " (" << recovery_ops_active << "/" << cct->_conf->osd_recovery_max_active << " rops)" << dendl; // adjust count ceph_assert(recovery_ops_active > 0); recovery_ops_active--; #ifdef DEBUG_RECOVERY_OIDS dout(20) << " active oids was " << recovery_oids[pg->pg_id] << dendl; ceph_assert(recovery_oids[pg->pg_id].count(soid)); recovery_oids[pg->pg_id].erase(soid); #endif _maybe_queue_recovery(); } bool OSDService::is_recovery_active() { if (cct->_conf->osd_debug_pretend_recovery_active) { return true; } return local_reserver.has_reservation() || remote_reserver.has_reservation(); } void OSDService::release_reserved_pushes(uint64_t pushes) { std::lock_guard l(recovery_lock); dout(10) << __func__ << "(" << pushes << "), recovery_ops_reserved " << recovery_ops_reserved << " -> " << (recovery_ops_reserved-pushes) << dendl; ceph_assert(recovery_ops_reserved >= pushes); recovery_ops_reserved -= pushes; _maybe_queue_recovery(); } // ========================================================= // OPS bool OSD::op_is_discardable(const MOSDOp *op) { // drop client request if they are not connected and can't get the // reply anyway. if (!op->get_connection()->is_connected()) { return true; } return false; } void OSD::enqueue_op(spg_t pg, OpRequestRef&& op, epoch_t epoch) { const utime_t stamp = op->get_req()->get_recv_stamp(); const utime_t latency = ceph_clock_now() - stamp; const unsigned priority = op->get_req()->get_priority(); const int cost = op->get_req()->get_cost(); const uint64_t owner = op->get_req()->get_source().num(); dout(15) << "enqueue_op " << op << " prio " << priority << " cost " << cost << " latency " << latency << " epoch " << epoch << " " << *(op->get_req()) << dendl; op->osd_trace.event("enqueue op"); op->osd_trace.keyval("priority", priority); op->osd_trace.keyval("cost", cost); op->mark_queued_for_pg(); logger->tinc(l_osd_op_before_queue_op_lat, latency); op_shardedwq.queue( OpQueueItem( unique_ptr(new PGOpItem(pg, std::move(op))), cost, priority, stamp, owner, epoch)); } void OSD::enqueue_peering_evt(spg_t pgid, PGPeeringEventRef evt) { dout(15) << __func__ << " " << pgid << " " << evt->get_desc() << dendl; op_shardedwq.queue( OpQueueItem( unique_ptr(new PGPeeringItem(pgid, evt)), 10, cct->_conf->osd_peering_op_priority, utime_t(), 0, evt->get_epoch_sent())); } void OSD::enqueue_peering_evt_front(spg_t pgid, PGPeeringEventRef evt) { dout(15) << __func__ << " " << pgid << " " << evt->get_desc() << dendl; op_shardedwq.queue_front( OpQueueItem( unique_ptr(new PGPeeringItem(pgid, evt)), 10, cct->_conf->osd_peering_op_priority, utime_t(), 0, evt->get_epoch_sent())); } /* * NOTE: dequeue called in worker thread, with pg lock */ void OSD::dequeue_op( PGRef pg, OpRequestRef op, ThreadPool::TPHandle &handle) { FUNCTRACE(cct); OID_EVENT_TRACE_WITH_MSG(op->get_req(), "DEQUEUE_OP_BEGIN", false); utime_t now = ceph_clock_now(); op->set_dequeued_time(now); utime_t latency = now - op->get_req()->get_recv_stamp(); dout(10) << "dequeue_op " << op << " prio " << op->get_req()->get_priority() << " cost " << op->get_req()->get_cost() << " latency " << latency << " " << *(op->get_req()) << " pg " << *pg << dendl; logger->tinc(l_osd_op_before_dequeue_op_lat, latency); auto priv = op->get_req()->get_connection()->get_priv(); if (auto session = static_cast(priv.get()); session) { maybe_share_map(session, op, pg->get_osdmap()); } if (pg->is_deleting()) return; op->mark_reached_pg(); op->osd_trace.event("dequeue_op"); pg->do_request(op, handle); // finish dout(10) << "dequeue_op " << op << " finish" << dendl; OID_EVENT_TRACE_WITH_MSG(op->get_req(), "DEQUEUE_OP_END", false); } void OSD::dequeue_peering_evt( OSDShard *sdata, PG *pg, PGPeeringEventRef evt, ThreadPool::TPHandle& handle) { PG::RecoveryCtx rctx = create_context(); auto curmap = sdata->get_osdmap(); epoch_t need_up_thru = 0, same_interval_since = 0; if (!pg) { if (const MQuery *q = dynamic_cast(evt->evt.get())) { handle_pg_query_nopg(*q); } else { derr << __func__ << " unrecognized pg-less event " << evt->get_desc() << dendl; ceph_abort(); } } else if (advance_pg(curmap->get_epoch(), pg, handle, &rctx)) { pg->do_peering_event(evt, &rctx); if (pg->is_deleted()) { // do not dispatch rctx; the final _delete_some already did it. discard_context(rctx); pg->unlock(); return; } dispatch_context_transaction(rctx, pg, &handle); need_up_thru = pg->get_need_up_thru(); same_interval_since = pg->get_same_interval_since(); pg->unlock(); } if (need_up_thru) { queue_want_up_thru(same_interval_since); } dispatch_context(rctx, pg, curmap, &handle); service.send_pg_temp(); } void OSD::dequeue_delete( OSDShard *sdata, PG *pg, epoch_t e, ThreadPool::TPHandle& handle) { dequeue_peering_evt( sdata, pg, PGPeeringEventRef( std::make_shared( e, e, PG::DeleteSome())), handle); } // -------------------------------- const char** OSD::get_tracked_conf_keys() const { static const char* KEYS[] = { "osd_max_backfills", "osd_min_recovery_priority", "osd_max_trimming_pgs", "osd_op_complaint_time", "osd_op_log_threshold", "osd_op_history_size", "osd_op_history_duration", "osd_op_history_slow_op_size", "osd_op_history_slow_op_threshold", "osd_enable_op_tracker", "osd_map_cache_size", "osd_pg_epoch_max_lag_factor", "osd_pg_epoch_persisted_max_stale", // clog & admin clog "clog_to_monitors", "clog_to_syslog", "clog_to_syslog_facility", "clog_to_syslog_level", "osd_objectstore_fuse", "clog_to_graylog", "clog_to_graylog_host", "clog_to_graylog_port", "host", "fsid", "osd_recovery_delay_start", "osd_client_message_size_cap", "osd_client_message_cap", "osd_heartbeat_min_size", "osd_heartbeat_interval", "osd_scrub_min_interval", "osd_scrub_max_interval", NULL }; return KEYS; } void OSD::handle_conf_change(const ConfigProxy& conf, const std::set &changed) { Mutex::Locker l(osd_lock); if (changed.count("osd_max_backfills")) { service.local_reserver.set_max(cct->_conf->osd_max_backfills); service.remote_reserver.set_max(cct->_conf->osd_max_backfills); } if (changed.count("osd_min_recovery_priority")) { service.local_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority); service.remote_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority); } if (changed.count("osd_max_trimming_pgs")) { service.snap_reserver.set_max(cct->_conf->osd_max_trimming_pgs); } if (changed.count("osd_op_complaint_time") || changed.count("osd_op_log_threshold")) { op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time, cct->_conf->osd_op_log_threshold); } if (changed.count("osd_op_history_size") || changed.count("osd_op_history_duration")) { op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size, cct->_conf->osd_op_history_duration); } if (changed.count("osd_op_history_slow_op_size") || changed.count("osd_op_history_slow_op_threshold")) { op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size, cct->_conf->osd_op_history_slow_op_threshold); } if (changed.count("osd_enable_op_tracker")) { op_tracker.set_tracking(cct->_conf->osd_enable_op_tracker); } if (changed.count("osd_map_cache_size")) { service.map_cache.set_size(cct->_conf->osd_map_cache_size); service.map_bl_cache.set_size(cct->_conf->osd_map_cache_size); service.map_bl_inc_cache.set_size(cct->_conf->osd_map_cache_size); } if (changed.count("clog_to_monitors") || changed.count("clog_to_syslog") || changed.count("clog_to_syslog_level") || changed.count("clog_to_syslog_facility") || changed.count("clog_to_graylog") || changed.count("clog_to_graylog_host") || changed.count("clog_to_graylog_port") || changed.count("host") || changed.count("fsid")) { update_log_config(); } if (changed.count("osd_pg_epoch_max_lag_factor")) { m_osd_pg_epoch_max_lag_factor = conf.get_val( "osd_pg_epoch_max_lag_factor"); } #ifdef HAVE_LIBFUSE if (changed.count("osd_objectstore_fuse")) { if (store) { enable_disable_fuse(false); } } #endif if (changed.count("osd_recovery_delay_start")) { service.defer_recovery(cct->_conf->osd_recovery_delay_start); service.kick_recovery_queue(); } if (changed.count("osd_client_message_cap")) { uint64_t newval = cct->_conf->osd_client_message_cap; Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT); if (pol.throttler_messages && newval > 0) { pol.throttler_messages->reset_max(newval); } } if (changed.count("osd_client_message_size_cap")) { uint64_t newval = cct->_conf->osd_client_message_size_cap; Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT); if (pol.throttler_bytes && newval > 0) { pol.throttler_bytes->reset_max(newval); } } if (changed.count("osd_scrub_min_interval") || changed.count("osd_scrub_max_interval")) { resched_all_scrubs(); dout(0) << __func__ << ": scrub interval change" << dendl; } check_config(); } void OSD::update_log_config() { map log_to_monitors; map log_to_syslog; map log_channel; map log_prio; map log_to_graylog; map log_to_graylog_host; map log_to_graylog_port; uuid_d fsid; string host; if (parse_log_client_options(cct, log_to_monitors, log_to_syslog, log_channel, log_prio, log_to_graylog, log_to_graylog_host, log_to_graylog_port, fsid, host) == 0) clog->update_config(log_to_monitors, log_to_syslog, log_channel, log_prio, log_to_graylog, log_to_graylog_host, log_to_graylog_port, fsid, host); derr << "log_to_monitors " << log_to_monitors << dendl; } void OSD::check_config() { // some sanity checks if (cct->_conf->osd_map_cache_size <= (int)cct->_conf->osd_pg_epoch_persisted_max_stale + 2) { clog->warn() << "osd_map_cache_size (" << cct->_conf->osd_map_cache_size << ")" << " is not > osd_pg_epoch_persisted_max_stale (" << cct->_conf->osd_pg_epoch_persisted_max_stale << ")"; } } // -------------------------------- void OSD::get_latest_osdmap() { dout(10) << __func__ << " -- start" << dendl; C_SaferCond cond; service.objecter->wait_for_latest_osdmap(&cond); cond.wait(); dout(10) << __func__ << " -- finish" << dendl; } // -------------------------------- int OSD::init_op_flags(OpRequestRef& op) { const MOSDOp *m = static_cast(op->get_req()); vector::const_iterator iter; // client flags have no bearing on whether an op is a read, write, etc. op->rmw_flags = 0; if (m->has_flag(CEPH_OSD_FLAG_RWORDERED)) { op->set_force_rwordered(); } // set bits based on op codes, called methods. for (iter = m->ops.begin(); iter != m->ops.end(); ++iter) { if ((iter->op.op == CEPH_OSD_OP_WATCH && iter->op.watch.op == CEPH_OSD_WATCH_OP_PING)) { /* This a bit odd. PING isn't actually a write. It can't * result in an update to the object_info. PINGs also aren't * resent, so there's no reason to write out a log entry. * * However, we pipeline them behind writes, so let's force * the write_ordered flag. */ op->set_force_rwordered(); } else { if (ceph_osd_op_mode_modify(iter->op.op)) op->set_write(); } if (ceph_osd_op_mode_read(iter->op.op)) op->set_read(); // set READ flag if there are src_oids if (iter->soid.oid.name.length()) op->set_read(); // set PGOP flag if there are PG ops if (ceph_osd_op_type_pg(iter->op.op)) op->set_pg_op(); if (ceph_osd_op_mode_cache(iter->op.op)) op->set_cache(); // check for ec base pool int64_t poolid = m->get_pg().pool(); const pg_pool_t *pool = get_osdmap()->get_pg_pool(poolid); if (pool && pool->is_tier()) { const pg_pool_t *base_pool = get_osdmap()->get_pg_pool(pool->tier_of); if (base_pool && base_pool->require_rollback()) { if ((iter->op.op != CEPH_OSD_OP_READ) && (iter->op.op != CEPH_OSD_OP_CHECKSUM) && (iter->op.op != CEPH_OSD_OP_CMPEXT) && (iter->op.op != CEPH_OSD_OP_STAT) && (iter->op.op != CEPH_OSD_OP_ISDIRTY) && (iter->op.op != CEPH_OSD_OP_UNDIRTY) && (iter->op.op != CEPH_OSD_OP_GETXATTR) && (iter->op.op != CEPH_OSD_OP_GETXATTRS) && (iter->op.op != CEPH_OSD_OP_CMPXATTR) && (iter->op.op != CEPH_OSD_OP_ASSERT_VER) && (iter->op.op != CEPH_OSD_OP_LIST_WATCHERS) && (iter->op.op != CEPH_OSD_OP_LIST_SNAPS) && (iter->op.op != CEPH_OSD_OP_SETALLOCHINT) && (iter->op.op != CEPH_OSD_OP_WRITEFULL) && (iter->op.op != CEPH_OSD_OP_ROLLBACK) && (iter->op.op != CEPH_OSD_OP_CREATE) && (iter->op.op != CEPH_OSD_OP_DELETE) && (iter->op.op != CEPH_OSD_OP_SETXATTR) && (iter->op.op != CEPH_OSD_OP_RMXATTR) && (iter->op.op != CEPH_OSD_OP_STARTSYNC) && (iter->op.op != CEPH_OSD_OP_COPY_GET) && (iter->op.op != CEPH_OSD_OP_COPY_FROM)) { op->set_promote(); } } } switch (iter->op.op) { case CEPH_OSD_OP_CALL: { bufferlist::iterator bp = const_cast(iter->indata).begin(); int is_write, is_read; string cname, mname; bp.copy(iter->op.cls.class_len, cname); bp.copy(iter->op.cls.method_len, mname); ClassHandler::ClassData *cls; int r = class_handler->open_class(cname, &cls); if (r) { derr << "class " << cname << " open got " << cpp_strerror(r) << dendl; if (r == -ENOENT) r = -EOPNOTSUPP; else if (r != -EPERM) // propagate permission errors r = -EIO; return r; } int flags = cls->get_method_flags(mname.c_str()); if (flags < 0) { if (flags == -ENOENT) r = -EOPNOTSUPP; else r = flags; return r; } is_read = flags & CLS_METHOD_RD; is_write = flags & CLS_METHOD_WR; bool is_promote = flags & CLS_METHOD_PROMOTE; dout(10) << "class " << cname << " method " << mname << " " << "flags=" << (is_read ? "r" : "") << (is_write ? "w" : "") << (is_promote ? "p" : "") << dendl; if (is_read) op->set_class_read(); if (is_write) op->set_class_write(); if (is_promote) op->set_promote(); op->add_class(std::move(cname), std::move(mname), is_read, is_write, cls->whitelisted); break; } case CEPH_OSD_OP_WATCH: // force the read bit for watch since it is depends on previous // watch state (and may return early if the watch exists) or, in // the case of ping, is simply a read op. op->set_read(); // fall through case CEPH_OSD_OP_NOTIFY: case CEPH_OSD_OP_NOTIFY_ACK: { op->set_promote(); break; } case CEPH_OSD_OP_DELETE: // if we get a delete with FAILOK we can skip handle cache. without // FAILOK we still need to promote (or do something smarter) to // determine whether to return ENOENT or 0. if (iter == m->ops.begin() && iter->op.flags == CEPH_OSD_OP_FLAG_FAILOK) { op->set_skip_handle_cache(); } // skip promotion when proxying a delete op if (m->ops.size() == 1) { op->set_skip_promote(); } break; case CEPH_OSD_OP_CACHE_TRY_FLUSH: case CEPH_OSD_OP_CACHE_FLUSH: case CEPH_OSD_OP_CACHE_EVICT: // If try_flush/flush/evict is the only op, can skip handle cache. if (m->ops.size() == 1) { op->set_skip_handle_cache(); } break; case CEPH_OSD_OP_READ: case CEPH_OSD_OP_SYNC_READ: case CEPH_OSD_OP_SPARSE_READ: case CEPH_OSD_OP_CHECKSUM: case CEPH_OSD_OP_WRITEFULL: if (m->ops.size() == 1 && (iter->op.flags & CEPH_OSD_OP_FLAG_FADVISE_NOCACHE || iter->op.flags & CEPH_OSD_OP_FLAG_FADVISE_DONTNEED)) { op->set_skip_promote(); } break; // force promotion when pin an object in cache tier case CEPH_OSD_OP_CACHE_PIN: op->set_promote(); break; default: break; } } if (op->rmw_flags == 0) return -EINVAL; return 0; } void OSD::set_perf_queries( const std::map &queries) { dout(10) << "setting " << queries.size() << " queries" << dendl; std::list supported_queries; for (auto &it : queries) { auto &query = it.first; if (!query.key_descriptor.empty()) { supported_queries.push_back(query); } } if (supported_queries.size() < queries.size()) { dout(1) << queries.size() - supported_queries.size() << " unsupported queries" << dendl; } { Mutex::Locker locker(m_perf_queries_lock); m_perf_queries = supported_queries; m_perf_limits = queries; } std::vector pgs; _get_pgs(&pgs); for (auto& pg : pgs) { pg->lock(); pg->set_dynamic_perf_stats_queries(supported_queries); pg->unlock(); } } void OSD::get_perf_reports( std::map *reports) { std::vector pgs; _get_pgs(&pgs); DynamicPerfStats dps; for (auto& pg : pgs) { // m_perf_queries can be modified only in set_perf_queries by mgr client // request, and it is protected by by mgr client's lock, which is held // when set_perf_queries/get_perf_reports are called, so we may not hold // m_perf_queries_lock here. DynamicPerfStats pg_dps(m_perf_queries); pg->lock(); pg->get_dynamic_perf_stats(&pg_dps); pg->unlock(); dps.merge(pg_dps); } dps.add_to_reports(m_perf_limits, reports); dout(20) << "reports for " << reports->size() << " queries" << dendl; } // ============================================================= #undef dout_context #define dout_context cct #undef dout_prefix #define dout_prefix *_dout << "osd." << osd->get_nodeid() << ":" << shard_id << "." << __func__ << " " void OSDShard::_attach_pg(OSDShardPGSlot *slot, PG *pg) { dout(10) << pg->pg_id << " " << pg << dendl; slot->pg = pg; pg->osd_shard = this; pg->pg_slot = slot; osd->inc_num_pgs(); slot->epoch = pg->get_osdmap_epoch(); pg_slots_by_epoch.insert(*slot); } void OSDShard::_detach_pg(OSDShardPGSlot *slot) { dout(10) << slot->pg->pg_id << " " << slot->pg << dendl; slot->pg->osd_shard = nullptr; slot->pg->pg_slot = nullptr; slot->pg = nullptr; osd->dec_num_pgs(); pg_slots_by_epoch.erase(pg_slots_by_epoch.iterator_to(*slot)); slot->epoch = 0; if (waiting_for_min_pg_epoch) { min_pg_epoch_cond.notify_all(); } } void OSDShard::update_pg_epoch(OSDShardPGSlot *slot, epoch_t e) { std::lock_guard l(shard_lock); dout(30) << "min was " << pg_slots_by_epoch.begin()->epoch << " on " << pg_slots_by_epoch.begin()->pg->pg_id << dendl; pg_slots_by_epoch.erase(pg_slots_by_epoch.iterator_to(*slot)); dout(20) << slot->pg->pg_id << " " << slot->epoch << " -> " << e << dendl; slot->epoch = e; pg_slots_by_epoch.insert(*slot); dout(30) << "min is now " << pg_slots_by_epoch.begin()->epoch << " on " << pg_slots_by_epoch.begin()->pg->pg_id << dendl; if (waiting_for_min_pg_epoch) { min_pg_epoch_cond.notify_all(); } } epoch_t OSDShard::get_min_pg_epoch() { std::lock_guard l(shard_lock); auto p = pg_slots_by_epoch.begin(); if (p == pg_slots_by_epoch.end()) { return 0; } return p->epoch; } void OSDShard::wait_min_pg_epoch(epoch_t need) { std::unique_lock l{shard_lock}; ++waiting_for_min_pg_epoch; min_pg_epoch_cond.wait(l, [need, this] { if (pg_slots_by_epoch.empty()) { return true; } else if (pg_slots_by_epoch.begin()->epoch >= need) { return true; } else { dout(10) << need << " waiting on " << pg_slots_by_epoch.begin()->epoch << dendl; return false; } }); --waiting_for_min_pg_epoch; } epoch_t OSDShard::get_max_waiting_epoch() { std::lock_guard l(shard_lock); epoch_t r = 0; for (auto& i : pg_slots) { if (!i.second->waiting_peering.empty()) { r = std::max(r, i.second->waiting_peering.rbegin()->first); } } return r; } void OSDShard::consume_map( const OSDMapRef& new_osdmap, unsigned *pushes_to_free) { std::lock_guard l(shard_lock); OSDMapRef old_osdmap; { std::lock_guard l(osdmap_lock); old_osdmap = std::move(shard_osdmap); shard_osdmap = new_osdmap; } dout(10) << new_osdmap->get_epoch() << " (was " << (old_osdmap ? old_osdmap->get_epoch() : 0) << ")" << dendl; bool queued = false; // check slots auto p = pg_slots.begin(); while (p != pg_slots.end()) { OSDShardPGSlot *slot = p->second.get(); const spg_t& pgid = p->first; dout(20) << __func__ << " " << pgid << dendl; if (!slot->waiting_for_split.empty()) { dout(20) << __func__ << " " << pgid << " waiting for split " << slot->waiting_for_split << dendl; ++p; continue; } if (slot->waiting_for_merge_epoch > new_osdmap->get_epoch()) { dout(20) << __func__ << " " << pgid << " waiting for merge by epoch " << slot->waiting_for_merge_epoch << dendl; ++p; continue; } if (!slot->waiting_peering.empty()) { epoch_t first = slot->waiting_peering.begin()->first; if (first <= new_osdmap->get_epoch()) { dout(20) << __func__ << " " << pgid << " pending_peering first epoch " << first << " <= " << new_osdmap->get_epoch() << ", requeueing" << dendl; _wake_pg_slot(pgid, slot); queued = true; } ++p; continue; } if (!slot->waiting.empty()) { if (new_osdmap->is_up_acting_osd_shard(pgid, osd->get_nodeid())) { dout(20) << __func__ << " " << pgid << " maps to us, keeping" << dendl; ++p; continue; } while (!slot->waiting.empty() && slot->waiting.front().get_map_epoch() <= new_osdmap->get_epoch()) { auto& qi = slot->waiting.front(); dout(20) << __func__ << " " << pgid << " waiting item " << qi << " epoch " << qi.get_map_epoch() << " <= " << new_osdmap->get_epoch() << ", " << (qi.get_map_epoch() < new_osdmap->get_epoch() ? "stale" : "misdirected") << ", dropping" << dendl; *pushes_to_free += qi.get_reserved_pushes(); slot->waiting.pop_front(); } } if (slot->waiting.empty() && slot->num_running == 0 && slot->waiting_for_split.empty() && !slot->pg) { dout(20) << __func__ << " " << pgid << " empty, pruning" << dendl; p = pg_slots.erase(p); continue; } ++p; } if (queued) { std::lock_guard l{sdata_wait_lock}; sdata_cond.notify_one(); } } void OSDShard::_wake_pg_slot( spg_t pgid, OSDShardPGSlot *slot) { dout(20) << __func__ << " " << pgid << " to_process " << slot->to_process << " waiting " << slot->waiting << " waiting_peering " << slot->waiting_peering << dendl; for (auto i = slot->to_process.rbegin(); i != slot->to_process.rend(); ++i) { _enqueue_front(std::move(*i), osd->op_prio_cutoff); } slot->to_process.clear(); for (auto i = slot->waiting.rbegin(); i != slot->waiting.rend(); ++i) { _enqueue_front(std::move(*i), osd->op_prio_cutoff); } slot->waiting.clear(); for (auto i = slot->waiting_peering.rbegin(); i != slot->waiting_peering.rend(); ++i) { // this is overkill; we requeue everything, even if some of these // items are waiting for maps we don't have yet. FIXME, maybe, // someday, if we decide this inefficiency matters for (auto j = i->second.rbegin(); j != i->second.rend(); ++j) { _enqueue_front(std::move(*j), osd->op_prio_cutoff); } } slot->waiting_peering.clear(); ++slot->requeue_seq; } void OSDShard::identify_splits_and_merges( const OSDMapRef& as_of_osdmap, set> *split_pgs, set> *merge_pgs) { std::lock_guard l(shard_lock); if (shard_osdmap) { for (auto& i : pg_slots) { const spg_t& pgid = i.first; auto *slot = i.second.get(); if (slot->pg) { osd->service.identify_splits_and_merges( shard_osdmap, as_of_osdmap, pgid, split_pgs, merge_pgs); } else if (!slot->waiting_for_split.empty()) { osd->service.identify_splits_and_merges( shard_osdmap, as_of_osdmap, pgid, split_pgs, nullptr); } else { dout(20) << __func__ << " slot " << pgid << " has no pg and waiting_for_split " << slot->waiting_for_split << dendl; } } } } void OSDShard::prime_splits(const OSDMapRef& as_of_osdmap, set> *pgids) { std::lock_guard l(shard_lock); _prime_splits(pgids); if (shard_osdmap->get_epoch() > as_of_osdmap->get_epoch()) { set> newer_children; for (auto i : *pgids) { osd->service.identify_splits_and_merges( as_of_osdmap, shard_osdmap, i.first, &newer_children, nullptr); } newer_children.insert(pgids->begin(), pgids->end()); dout(10) << "as_of_osdmap " << as_of_osdmap->get_epoch() << " < shard " << shard_osdmap->get_epoch() << ", new children " << newer_children << dendl; _prime_splits(&newer_children); // note: we don't care what is left over here for other shards. // if this shard is ahead of us and one isn't, e.g., one thread is // calling into prime_splits via _process (due to a newly created // pg) and this shard has a newer map due to a racing consume_map, // then any grandchildren left here will be identified (or were // identified) when the slower shard's osdmap is advanced. // _prime_splits() will tolerate the case where the pgid is // already primed. } } void OSDShard::_prime_splits(set> *pgids) { dout(10) << *pgids << dendl; auto p = pgids->begin(); while (p != pgids->end()) { unsigned shard_index = p->first.hash_to_shard(osd->num_shards); if (shard_index == shard_id) { auto r = pg_slots.emplace(p->first, nullptr); if (r.second) { dout(10) << "priming slot " << p->first << " e" << p->second << dendl; r.first->second = make_unique(); r.first->second->waiting_for_split.insert(p->second); } else { auto q = r.first; ceph_assert(q != pg_slots.end()); dout(10) << "priming (existing) slot " << p->first << " e" << p->second << dendl; q->second->waiting_for_split.insert(p->second); } p = pgids->erase(p); } else { ++p; } } } void OSDShard::prime_merges(const OSDMapRef& as_of_osdmap, set> *merge_pgs) { std::lock_guard l(shard_lock); dout(20) << __func__ << " checking shard " << shard_id << " for remaining merge pgs " << merge_pgs << dendl; auto p = merge_pgs->begin(); while (p != merge_pgs->end()) { spg_t pgid = p->first; epoch_t epoch = p->second; unsigned shard_index = pgid.hash_to_shard(osd->num_shards); if (shard_index != shard_id) { ++p; continue; } OSDShardPGSlot *slot; auto r = pg_slots.emplace(pgid, nullptr); if (r.second) { r.first->second = make_unique(); } slot = r.first->second.get(); if (slot->pg) { // already have pg dout(20) << __func__ << " have merge participant pg " << pgid << " " << slot->pg << dendl; } else if (!slot->waiting_for_split.empty() && *slot->waiting_for_split.begin() < epoch) { dout(20) << __func__ << " pending split on merge participant pg " << pgid << " " << slot->waiting_for_split << dendl; } else { dout(20) << __func__ << " creating empty merge participant " << pgid << " for merge in " << epoch << dendl; // leave history zeroed; PG::merge_from() will fill it in. pg_history_t history; PGCreateInfo cinfo(pgid, epoch - 1, history, PastIntervals(), false); PGRef pg = osd->handle_pg_create_info(shard_osdmap, &cinfo); _attach_pg(r.first->second.get(), pg.get()); _wake_pg_slot(pgid, slot); pg->unlock(); } // mark slot for merge dout(20) << __func__ << " marking merge participant " << pgid << dendl; slot->waiting_for_merge_epoch = epoch; p = merge_pgs->erase(p); } } void OSDShard::register_and_wake_split_child(PG *pg) { epoch_t epoch; { std::lock_guard l(shard_lock); dout(10) << pg->pg_id << " " << pg << dendl; auto p = pg_slots.find(pg->pg_id); ceph_assert(p != pg_slots.end()); auto *slot = p->second.get(); dout(20) << pg->pg_id << " waiting_for_split " << slot->waiting_for_split << dendl; ceph_assert(!slot->pg); ceph_assert(!slot->waiting_for_split.empty()); _attach_pg(slot, pg); epoch = pg->get_osdmap_epoch(); ceph_assert(slot->waiting_for_split.count(epoch)); slot->waiting_for_split.erase(epoch); if (slot->waiting_for_split.empty()) { _wake_pg_slot(pg->pg_id, slot); } else { dout(10) << __func__ << " still waiting for split on " << slot->waiting_for_split << dendl; } } // kick child to ensure it pulls up to the latest osdmap osd->enqueue_peering_evt( pg->pg_id, PGPeeringEventRef( std::make_shared( epoch, epoch, NullEvt()))); std::lock_guard l{sdata_wait_lock}; sdata_cond.notify_one(); } void OSDShard::unprime_split_children(spg_t parent, unsigned old_pg_num) { std::lock_guard l(shard_lock); vector to_delete; for (auto& i : pg_slots) { if (i.first != parent && i.first.get_ancestor(old_pg_num) == parent) { dout(10) << __func__ << " parent " << parent << " clearing " << i.first << dendl; _wake_pg_slot(i.first, i.second.get()); to_delete.push_back(i.first); } } for (auto pgid : to_delete) { pg_slots.erase(pgid); } } // ============================================================= #undef dout_context #define dout_context osd->cct #undef dout_prefix #define dout_prefix *_dout << "osd." << osd->whoami << " op_wq " void OSD::ShardedOpWQ::_add_slot_waiter( spg_t pgid, OSDShardPGSlot *slot, OpQueueItem&& qi) { if (qi.is_peering()) { dout(20) << __func__ << " " << pgid << " peering, item epoch is " << qi.get_map_epoch() << ", will wait on " << qi << dendl; slot->waiting_peering[qi.get_map_epoch()].push_back(std::move(qi)); } else { dout(20) << __func__ << " " << pgid << " item epoch is " << qi.get_map_epoch() << ", will wait on " << qi << dendl; slot->waiting.push_back(std::move(qi)); } } #undef dout_prefix #define dout_prefix *_dout << "osd." << osd->whoami << " op_wq(" << shard_index << ") " void OSD::ShardedOpWQ::_process(uint32_t thread_index, heartbeat_handle_d *hb) { uint32_t shard_index = thread_index % osd->num_shards; auto& sdata = osd->shards[shard_index]; ceph_assert(sdata); // If all threads of shards do oncommits, there is a out-of-order // problem. So we choose the thread which has the smallest // thread_index(thread_index < num_shards) of shard to do oncommit // callback. bool is_smallest_thread_index = thread_index < osd->num_shards; // peek at spg_t sdata->shard_lock.lock(); if (sdata->pqueue->empty() && (!is_smallest_thread_index || sdata->context_queue.empty())) { std::unique_lock wait_lock{sdata->sdata_wait_lock}; if (is_smallest_thread_index && !sdata->context_queue.empty()) { // we raced with a context_queue addition, don't wait wait_lock.unlock(); } else if (!sdata->stop_waiting) { dout(20) << __func__ << " empty q, waiting" << dendl; osd->cct->get_heartbeat_map()->clear_timeout(hb); sdata->shard_lock.unlock(); sdata->sdata_cond.wait(wait_lock); wait_lock.unlock(); sdata->shard_lock.lock(); if (sdata->pqueue->empty() && !(is_smallest_thread_index && !sdata->context_queue.empty())) { sdata->shard_lock.unlock(); return; } // found a work item; reapply default wq timeouts osd->cct->get_heartbeat_map()->reset_timeout(hb, timeout_interval, suicide_interval); } else { dout(20) << __func__ << " need return immediately" << dendl; wait_lock.unlock(); sdata->shard_lock.unlock(); return; } } list oncommits; if (is_smallest_thread_index && !sdata->context_queue.empty()) { sdata->context_queue.swap(oncommits); } if (sdata->pqueue->empty()) { if (osd->is_stopping()) { sdata->shard_lock.unlock(); for (auto c : oncommits) { dout(10) << __func__ << " discarding in-flight oncommit " << c << dendl; delete c; } return; // OSD shutdown, discard. } sdata->shard_lock.unlock(); handle_oncommits(oncommits); return; } OpQueueItem item = sdata->pqueue->dequeue(); if (osd->is_stopping()) { sdata->shard_lock.unlock(); for (auto c : oncommits) { dout(10) << __func__ << " discarding in-flight oncommit " << c << dendl; delete c; } return; // OSD shutdown, discard. } const auto token = item.get_ordering_token(); auto r = sdata->pg_slots.emplace(token, nullptr); if (r.second) { r.first->second = make_unique(); } OSDShardPGSlot *slot = r.first->second.get(); dout(20) << __func__ << " " << token << (r.second ? " (new)" : "") << " to_process " << slot->to_process << " waiting " << slot->waiting << " waiting_peering " << slot->waiting_peering << dendl; slot->to_process.push_back(std::move(item)); dout(20) << __func__ << " " << slot->to_process.back() << " queued" << dendl; retry_pg: PGRef pg = slot->pg; // lock pg (if we have it) if (pg) { // note the requeue seq now... uint64_t requeue_seq = slot->requeue_seq; ++slot->num_running; sdata->shard_lock.unlock(); osd->service.maybe_inject_dispatch_delay(); pg->lock(); osd->service.maybe_inject_dispatch_delay(); sdata->shard_lock.lock(); auto q = sdata->pg_slots.find(token); if (q == sdata->pg_slots.end()) { // this can happen if we race with pg removal. dout(20) << __func__ << " slot " << token << " no longer there" << dendl; pg->unlock(); sdata->shard_lock.unlock(); handle_oncommits(oncommits); return; } slot = q->second.get(); --slot->num_running; if (slot->to_process.empty()) { // raced with _wake_pg_slot or consume_map dout(20) << __func__ << " " << token << " nothing queued" << dendl; pg->unlock(); sdata->shard_lock.unlock(); handle_oncommits(oncommits); return; } if (requeue_seq != slot->requeue_seq) { dout(20) << __func__ << " " << token << " requeue_seq " << slot->requeue_seq << " > our " << requeue_seq << ", we raced with _wake_pg_slot" << dendl; pg->unlock(); sdata->shard_lock.unlock(); handle_oncommits(oncommits); return; } if (slot->pg != pg) { // this can happen if we race with pg removal. dout(20) << __func__ << " slot " << token << " no longer attached to " << pg << dendl; pg->unlock(); goto retry_pg; } } dout(20) << __func__ << " " << token << " to_process " << slot->to_process << " waiting " << slot->waiting << " waiting_peering " << slot->waiting_peering << dendl; ThreadPool::TPHandle tp_handle(osd->cct, hb, timeout_interval, suicide_interval); // take next item auto qi = std::move(slot->to_process.front()); slot->to_process.pop_front(); dout(20) << __func__ << " " << qi << " pg " << pg << dendl; set> new_children; OSDMapRef osdmap; while (!pg) { // should this pg shard exist on this osd in this (or a later) epoch? osdmap = sdata->shard_osdmap; const PGCreateInfo *create_info = qi.creates_pg(); if (!slot->waiting_for_split.empty()) { dout(20) << __func__ << " " << token << " splitting " << slot->waiting_for_split << dendl; _add_slot_waiter(token, slot, std::move(qi)); } else if (qi.get_map_epoch() > osdmap->get_epoch()) { dout(20) << __func__ << " " << token << " map " << qi.get_map_epoch() << " > " << osdmap->get_epoch() << dendl; _add_slot_waiter(token, slot, std::move(qi)); } else if (qi.is_peering()) { if (!qi.peering_requires_pg()) { // for pg-less events, we run them under the ordering lock, since // we don't have the pg lock to keep them ordered. qi.run(osd, sdata, pg, tp_handle); } else if (osdmap->is_up_acting_osd_shard(token, osd->whoami)) { if (create_info) { if (create_info->by_mon && osdmap->get_pg_acting_primary(token.pgid) != osd->whoami) { dout(20) << __func__ << " " << token << " no pg, no longer primary, ignoring mon create on " << qi << dendl; } else { dout(20) << __func__ << " " << token << " no pg, should create on " << qi << dendl; pg = osd->handle_pg_create_info(osdmap, create_info); if (pg) { // we created the pg! drop out and continue "normally"! sdata->_attach_pg(slot, pg.get()); sdata->_wake_pg_slot(token, slot); // identify split children between create epoch and shard epoch. osd->service.identify_splits_and_merges( pg->get_osdmap(), osdmap, pg->pg_id, &new_children, nullptr); sdata->_prime_splits(&new_children); // distribute remaining split children to other shards below! break; } dout(20) << __func__ << " ignored create on " << qi << dendl; } } else { dout(20) << __func__ << " " << token << " no pg, peering, !create, discarding " << qi << dendl; } } else { dout(20) << __func__ << " " << token << " no pg, peering, doesn't map here e" << osdmap->get_epoch() << ", discarding " << qi << dendl; } } else if (osdmap->is_up_acting_osd_shard(token, osd->whoami)) { dout(20) << __func__ << " " << token << " no pg, should exist e" << osdmap->get_epoch() << ", will wait on " << qi << dendl; _add_slot_waiter(token, slot, std::move(qi)); } else { dout(20) << __func__ << " " << token << " no pg, shouldn't exist e" << osdmap->get_epoch() << ", dropping " << qi << dendl; // share map with client? if (boost::optional _op = qi.maybe_get_op()) { auto priv = (*_op)->get_req()->get_connection()->get_priv(); if (auto session = static_cast(priv.get()); session) { osd->maybe_share_map(session, *_op, sdata->shard_osdmap); } } unsigned pushes_to_free = qi.get_reserved_pushes(); if (pushes_to_free > 0) { sdata->shard_lock.unlock(); osd->service.release_reserved_pushes(pushes_to_free); handle_oncommits(oncommits); return; } } sdata->shard_lock.unlock(); handle_oncommits(oncommits); return; } if (qi.is_peering()) { OSDMapRef osdmap = sdata->shard_osdmap; if (qi.get_map_epoch() > osdmap->get_epoch()) { _add_slot_waiter(token, slot, std::move(qi)); sdata->shard_lock.unlock(); pg->unlock(); handle_oncommits(oncommits); return; } } sdata->shard_lock.unlock(); if (!new_children.empty()) { for (auto shard : osd->shards) { shard->prime_splits(osdmap, &new_children); } ceph_assert(new_children.empty()); } // osd_opwq_process marks the point at which an operation has been dequeued // and will begin to be handled by a worker thread. { #ifdef WITH_LTTNG osd_reqid_t reqid; if (boost::optional _op = qi.maybe_get_op()) { reqid = (*_op)->get_reqid(); } #endif tracepoint(osd, opwq_process_start, reqid.name._type, reqid.name._num, reqid.tid, reqid.inc); } lgeneric_subdout(osd->cct, osd, 30) << "dequeue status: "; Formatter *f = Formatter::create("json"); f->open_object_section("q"); dump(f); f->close_section(); f->flush(*_dout); delete f; *_dout << dendl; qi.run(osd, sdata, pg, tp_handle); { #ifdef WITH_LTTNG osd_reqid_t reqid; if (boost::optional _op = qi.maybe_get_op()) { reqid = (*_op)->get_reqid(); } #endif tracepoint(osd, opwq_process_finish, reqid.name._type, reqid.name._num, reqid.tid, reqid.inc); } handle_oncommits(oncommits); } void OSD::ShardedOpWQ::_enqueue(OpQueueItem&& item) { uint32_t shard_index = item.get_ordering_token().hash_to_shard(osd->shards.size()); OSDShard* sdata = osd->shards[shard_index]; assert (NULL != sdata); unsigned priority = item.get_priority(); unsigned cost = item.get_cost(); sdata->shard_lock.lock(); dout(20) << __func__ << " " << item << dendl; if (priority >= osd->op_prio_cutoff) sdata->pqueue->enqueue_strict( item.get_owner(), priority, std::move(item)); else sdata->pqueue->enqueue( item.get_owner(), priority, cost, std::move(item)); sdata->shard_lock.unlock(); std::lock_guard l{sdata->sdata_wait_lock}; sdata->sdata_cond.notify_one(); } void OSD::ShardedOpWQ::_enqueue_front(OpQueueItem&& item) { auto shard_index = item.get_ordering_token().hash_to_shard(osd->shards.size()); auto& sdata = osd->shards[shard_index]; ceph_assert(sdata); sdata->shard_lock.lock(); auto p = sdata->pg_slots.find(item.get_ordering_token()); if (p != sdata->pg_slots.end() && !p->second->to_process.empty()) { // we may be racing with _process, which has dequeued a new item // from pqueue, put it on to_process, and is now busy taking the // pg lock. ensure this old requeued item is ordered before any // such newer item in to_process. p->second->to_process.push_front(std::move(item)); item = std::move(p->second->to_process.back()); p->second->to_process.pop_back(); dout(20) << __func__ << " " << p->second->to_process.front() << " shuffled w/ " << item << dendl; } else { dout(20) << __func__ << " " << item << dendl; } sdata->_enqueue_front(std::move(item), osd->op_prio_cutoff); sdata->shard_lock.unlock(); std::lock_guard l{sdata->sdata_wait_lock}; sdata->sdata_cond.notify_one(); } namespace ceph { namespace osd_cmds { int heap(CephContext& cct, const cmdmap_t& cmdmap, Formatter& f, std::ostream& os) { if (!ceph_using_tcmalloc()) { os << "could not issue heap profiler command -- not using tcmalloc!"; return -EOPNOTSUPP; } string cmd; if (!cmd_getval(&cct, cmdmap, "heapcmd", cmd)) { os << "unable to get value for command \"" << cmd << "\""; return -EINVAL; } std::vector cmd_vec; get_str_vec(cmd, cmd_vec); string val; if (cmd_getval(&cct, cmdmap, "value", val)) { cmd_vec.push_back(val); } ceph_heap_profiler_handle_command(cmd_vec, os); return 0; } }} // namespace ceph::osd_cmds std::ostream& operator<<(std::ostream& out, const io_queue& q) { switch(q) { case io_queue::prioritized: out << "prioritized"; break; case io_queue::weightedpriority: out << "weightedpriority"; break; case io_queue::mclock_opclass: out << "mclock_opclass"; break; case io_queue::mclock_client: out << "mclock_client"; break; } return out; }