author     Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-07 18:45:59 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-07 18:45:59 +0000
commit     19fcec84d8d7d21e796c7624e521b60d28ee21ed (patch)
tree       42d26aa27d1e3f7c0b8bd3fd14e7d7082f5008dc /src/osd
parent     Initial commit. (diff)

Adding upstream version 16.2.11+ds.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/osd')
-rw-r--r--  src/osd/CMakeLists.txt  75
-rw-r--r--  src/osd/ClassHandler.cc  350
-rw-r--r--  src/osd/ClassHandler.h  126
-rw-r--r--  src/osd/DynamicPerfStats.h  267
-rw-r--r--  src/osd/ECBackend.cc  2637
-rw-r--r--  src/osd/ECBackend.h  686
-rw-r--r--  src/osd/ECMsgTypes.cc  393
-rw-r--r--  src/osd/ECMsgTypes.h  140
-rw-r--r--  src/osd/ECTransaction.cc  670
-rw-r--r--  src/osd/ECTransaction.h  200
-rw-r--r--  src/osd/ECUtil.cc  248
-rw-r--r--  src/osd/ECUtil.h  169
-rw-r--r--  src/osd/ExtentCache.cc  245
-rw-r--r--  src/osd/ExtentCache.h  489
-rw-r--r--  src/osd/HitSet.cc  256
-rw-r--r--  src/osd/HitSet.h  455
-rw-r--r--  src/osd/MissingLoc.cc  226
-rw-r--r--  src/osd/MissingLoc.h  353
-rw-r--r--  src/osd/OSD.cc  11378
-rw-r--r--  src/osd/OSD.h  2152
-rw-r--r--  src/osd/OSDCap.cc  532
-rw-r--r--  src/osd/OSDCap.h  261
-rw-r--r--  src/osd/OSDMap.cc  6412
-rw-r--r--  src/osd/OSDMap.h  1600
-rw-r--r--  src/osd/OSDMapMapping.cc  207
-rw-r--r--  src/osd/OSDMapMapping.h  352
-rw-r--r--  src/osd/ObjectVersioner.h  35
-rw-r--r--  src/osd/OpRequest.cc  170
-rw-r--r--  src/osd/OpRequest.h  200
-rw-r--r--  src/osd/PG.cc  2753
-rw-r--r--  src/osd/PG.h  1341
-rw-r--r--  src/osd/PGBackend.cc  1324
-rw-r--r--  src/osd/PGBackend.h  641
-rw-r--r--  src/osd/PGLog.cc  1189
-rw-r--r--  src/osd/PGLog.h  1697
-rw-r--r--  src/osd/PGPeeringEvent.cc  17
-rw-r--r--  src/osd/PGPeeringEvent.h  220
-rw-r--r--  src/osd/PGStateUtils.cc  57
-rw-r--r--  src/osd/PGStateUtils.h  85
-rw-r--r--  src/osd/PGTransaction.h  601
-rw-r--r--  src/osd/PeeringState.cc  7607
-rw-r--r--  src/osd/PeeringState.h  2442
-rw-r--r--  src/osd/PrimaryLogPG.cc  15470
-rw-r--r--  src/osd/PrimaryLogPG.h  1969
-rw-r--r--  src/osd/PrimaryLogScrub.cc  589
-rw-r--r--  src/osd/PrimaryLogScrub.h  71
-rw-r--r--  src/osd/ReplicatedBackend.cc  2425
-rw-r--r--  src/osd/ReplicatedBackend.h  437
-rw-r--r--  src/osd/ScrubStore.cc  198
-rw-r--r--  src/osd/ScrubStore.h  52
-rw-r--r--  src/osd/Session.cc  106
-rw-r--r--  src/osd/Session.h  240
-rw-r--r--  src/osd/SnapMapper.cc  752
-rw-r--r--  src/osd/SnapMapper.h  338
-rw-r--r--  src/osd/TierAgentState.h  128
-rw-r--r--  src/osd/Watch.cc  550
-rw-r--r--  src/osd/Watch.h  291
-rw-r--r--  src/osd/error_code.cc  105
-rw-r--r--  src/osd/error_code.h  53
-rw-r--r--  src/osd/objclass.cc  702
-rw-r--r--  src/osd/object_state.h  190
-rw-r--r--  src/osd/osd_internal_types.h  320
-rw-r--r--  src/osd/osd_op_util.cc  263
-rw-r--r--  src/osd/osd_op_util.h  83
-rw-r--r--  src/osd/osd_perf_counters.cc  321
-rw-r--r--  src/osd/osd_perf_counters.h  163
-rw-r--r--  src/osd/osd_types.cc  7212
-rw-r--r--  src/osd/osd_types.h  6568
-rw-r--r--  src/osd/pg_scrubber.cc  2384
-rw-r--r--  src/osd/pg_scrubber.h  821
-rw-r--r--  src/osd/recovery_types.cc  16
-rw-r--r--  src/osd/recovery_types.h  95
-rw-r--r--  src/osd/scheduler/OpScheduler.cc  56
-rw-r--r--  src/osd/scheduler/OpScheduler.h  147
-rw-r--r--  src/osd/scheduler/OpSchedulerItem.cc  259
-rw-r--r--  src/osd/scheduler/OpSchedulerItem.h  629
-rw-r--r--  src/osd/scheduler/mClockScheduler.cc  514
-rw-r--r--  src/osd/scheduler/mClockScheduler.h  204
-rw-r--r--  src/osd/scrub_machine.cc  534
-rw-r--r--  src/osd/scrub_machine.h  344
-rw-r--r--  src/osd/scrub_machine_lstnr.h  164
-rw-r--r--  src/osd/scrubber_common.h  299
82 files changed, 97320 insertions, 0 deletions
diff --git a/src/osd/CMakeLists.txt b/src/osd/CMakeLists.txt
new file mode 100644
index 000000000..373456fc6
--- /dev/null
+++ b/src/osd/CMakeLists.txt
@@ -0,0 +1,75 @@
+set(osdc_osd_srcs
+ ${CMAKE_SOURCE_DIR}/src/osdc/Objecter.cc
+ ${CMAKE_SOURCE_DIR}/src/osdc/Striper.cc)
+
+if(WITH_OSD_INSTRUMENT_FUNCTIONS AND CMAKE_CXX_COMPILER_ID STREQUAL GNU)
+ add_compile_options(
+ -finstrument-functions
+ -finstrument-functions-exclude-function-list=_mm_loadu_si128,_mm_cmpeq_epi32,_mm_movemask_epi8)
+ set(osd_cyg_functions_src ${CMAKE_SOURCE_DIR}/src/tracing/cyg_profile_functions.c)
+endif()
+
+set(osd_srcs
+ OSD.cc
+ pg_scrubber.cc
+ scrub_machine.cc
+ PrimaryLogScrub.cc
+ Watch.cc
+ ClassHandler.cc
+ PG.cc
+ PGLog.cc
+ PrimaryLogPG.cc
+ ReplicatedBackend.cc
+ ECBackend.cc
+ ECTransaction.cc
+ PGBackend.cc
+ OSDCap.cc
+ Watch.cc
+ Session.cc
+ SnapMapper.cc
+ ScrubStore.cc
+ osd_types.cc
+ ECUtil.cc
+ ExtentCache.cc
+ scheduler/OpScheduler.cc
+ scheduler/OpSchedulerItem.cc
+ scheduler/mClockScheduler.cc
+ PeeringState.cc
+ PGStateUtils.cc
+ recovery_types.cc
+ MissingLoc.cc
+ osd_perf_counters.cc
+ ${CMAKE_SOURCE_DIR}/src/common/TrackedOp.cc
+ ${CMAKE_SOURCE_DIR}/src/mgr/OSDPerfMetricTypes.cc
+ ${osd_cyg_functions_src}
+ ${osdc_osd_srcs})
+if(HAS_VTA)
+ set_source_files_properties(osdcap.cc
+ PROPERTIES COMPILE_FLAGS -fno-var-tracking-assignments)
+endif()
+add_library(osd STATIC ${osd_srcs})
+target_link_libraries(osd
+ PUBLIC dmclock::dmclock Boost::MPL
+ PRIVATE os heap_profiler cpu_profiler fmt::fmt ${CMAKE_DL_LIBS})
+if(WITH_LTTNG)
+ add_dependencies(osd osd-tp pg-tp)
+endif()
+if(WITH_EVENTTRACE)
+ add_dependencies(osd eventtrace_tp)
+endif()
+if(WITH_OSD_INSTRUMENT_FUNCTIONS)
+ add_dependencies(osd cyg_profile_tp)
+endif()
+
+# libcls_* are runtime dependencies
+add_dependencies(osd cls_journal cls_hello cls_lock cls_log cls_numops
+ cls_refcount cls_timeindex cls_user cls_version cls_cas cls_cmpomap)
+if(WITH_CEPHFS)
+ add_dependencies(osd cls_cephfs)
+endif()
+if(WITH_RBD)
+ add_dependencies(osd cls_rbd)
+endif()
+if(WITH_RADOSGW)
+ add_dependencies(osd cls_otp cls_rgw cls_queue cls_rgw_gc cls_2pc_queue cls_fifo)
+endif()
diff --git a/src/osd/ClassHandler.cc b/src/osd/ClassHandler.cc
new file mode 100644
index 000000000..d1e726408
--- /dev/null
+++ b/src/osd/ClassHandler.cc
@@ -0,0 +1,350 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "include/types.h"
+#include "ClassHandler.h"
+#include "common/errno.h"
+#include "common/ceph_context.h"
+#include "include/dlfcn_compat.h"
+
+#include <map>
+
+#if defined(__FreeBSD__)
+#include <sys/param.h>
+#endif
+
+#include "common/config.h"
+#include "common/debug.h"
+
+#define dout_subsys ceph_subsys_osd
+#undef dout_prefix
+#define dout_prefix *_dout
+
+
+#define CLS_PREFIX "libcls_"
+#define CLS_SUFFIX SHARED_LIB_SUFFIX
+
+using std::map;
+using std::set;
+using std::string;
+
+using ceph::bufferlist;
+
+
+int ClassHandler::open_class(const string& cname, ClassData **pcls)
+{
+ std::lock_guard lock(mutex);
+ ClassData *cls = _get_class(cname, true);
+ if (!cls)
+ return -EPERM;
+ if (cls->status != ClassData::CLASS_OPEN) {
+ int r = _load_class(cls);
+ if (r)
+ return r;
+ }
+ *pcls = cls;
+ return 0;
+}
+
+int ClassHandler::open_all_classes()
+{
+ ldout(cct, 10) << __func__ << dendl;
+ DIR *dir = ::opendir(cct->_conf->osd_class_dir.c_str());
+ if (!dir)
+ return -errno;
+
+ struct dirent *pde = nullptr;
+ int r = 0;
+ while ((pde = ::readdir(dir))) {
+ if (pde->d_name[0] == '.')
+ continue;
+ if (strlen(pde->d_name) > sizeof(CLS_PREFIX) - 1 + sizeof(CLS_SUFFIX) - 1 &&
+ strncmp(pde->d_name, CLS_PREFIX, sizeof(CLS_PREFIX) - 1) == 0 &&
+ strcmp(pde->d_name + strlen(pde->d_name) - (sizeof(CLS_SUFFIX) - 1), CLS_SUFFIX) == 0) {
+ char cname[PATH_MAX + 1];
+ strncpy(cname, pde->d_name + sizeof(CLS_PREFIX) - 1, sizeof(cname) -1);
+ cname[strlen(cname) - (sizeof(CLS_SUFFIX) - 1)] = '\0';
+ ldout(cct, 10) << __func__ << " found " << cname << dendl;
+ ClassData *cls;
+ // skip classes that aren't in 'osd class load list'
+ r = open_class(cname, &cls);
+ if (r < 0 && r != -EPERM)
+ goto out;
+ }
+ }
+ out:
+ closedir(dir);
+ return r;
+}
+
+void ClassHandler::shutdown()
+{
+ for (auto& cls : classes) {
+ if (cls.second.handle) {
+ dlclose(cls.second.handle);
+ }
+ }
+ classes.clear();
+}
+
+/*
+ * Check if @cname is in the whitespace delimited list @list, or the @list
+ * contains the wildcard "*".
+ *
+ * This is expensive but doesn't consume memory for an index, and is performed
+ * only once when a class is loaded.
+ */
+bool ClassHandler::in_class_list(const std::string& cname,
+ const std::string& list)
+{
+ std::istringstream ss(list);
+ std::istream_iterator<std::string> begin{ss};
+ std::istream_iterator<std::string> end{};
+
+ const std::vector<std::string> targets{cname, "*"};
+
+ auto it = std::find_first_of(begin, end,
+ targets.begin(), targets.end());
+
+ return it != end;
+}
+
+ClassHandler::ClassData *ClassHandler::_get_class(const string& cname,
+ bool check_allowed)
+{
+ ClassData *cls;
+ map<string, ClassData>::iterator iter = classes.find(cname);
+
+ if (iter != classes.end()) {
+ cls = &iter->second;
+ } else {
+ if (check_allowed && !in_class_list(cname, cct->_conf->osd_class_load_list)) {
+ ldout(cct, 0) << "_get_class not permitted to load " << cname << dendl;
+ return NULL;
+ }
+ cls = &classes[cname];
+ ldout(cct, 10) << "_get_class adding new class name " << cname << " " << cls << dendl;
+ cls->name = cname;
+ cls->handler = this;
+ cls->allowed = in_class_list(cname, cct->_conf->osd_class_default_list);
+ }
+ return cls;
+}
+
+int ClassHandler::_load_class(ClassData *cls)
+{
+ // already open
+ if (cls->status == ClassData::CLASS_OPEN)
+ return 0;
+
+ if (cls->status == ClassData::CLASS_UNKNOWN ||
+ cls->status == ClassData::CLASS_MISSING) {
+ char fname[PATH_MAX];
+ snprintf(fname, sizeof(fname), "%s/" CLS_PREFIX "%s" CLS_SUFFIX,
+ cct->_conf->osd_class_dir.c_str(),
+ cls->name.c_str());
+ ldout(cct, 10) << "_load_class " << cls->name << " from " << fname << dendl;
+
+ cls->handle = dlopen(fname, RTLD_NOW);
+ if (!cls->handle) {
+ struct stat st;
+ int r = ::stat(fname, &st);
+ if (r < 0) {
+ r = -errno;
+ ldout(cct, 0) << __func__ << " could not stat class " << fname
+ << ": " << cpp_strerror(r) << dendl;
+ } else {
+ ldout(cct, 0) << "_load_class could not open class " << fname
+ << " (dlopen failed): " << dlerror() << dendl;
+ r = -EIO;
+ }
+ cls->status = ClassData::CLASS_MISSING;
+ return r;
+ }
+
+ cls_deps_t *(*cls_deps)();
+ cls_deps = (cls_deps_t *(*)())dlsym(cls->handle, "class_deps");
+ if (cls_deps) {
+ cls_deps_t *deps = cls_deps();
+ while (deps) {
+ if (!deps->name)
+ break;
+ ClassData *cls_dep = _get_class(deps->name, false);
+ cls->dependencies.insert(cls_dep);
+ if (cls_dep->status != ClassData::CLASS_OPEN)
+ cls->missing_dependencies.insert(cls_dep);
+ deps++;
+ }
+ }
+ }
+
+ // resolve dependencies
+ set<ClassData*>::iterator p = cls->missing_dependencies.begin();
+ while (p != cls->missing_dependencies.end()) {
+ ClassData *dc = *p;
+ int r = _load_class(dc);
+ if (r < 0) {
+ cls->status = ClassData::CLASS_MISSING_DEPS;
+ return r;
+ }
+
+ ldout(cct, 10) << "_load_class " << cls->name << " satisfied dependency " << dc->name << dendl;
+ cls->missing_dependencies.erase(p++);
+ }
+
+ // initialize
+ void (*cls_init)() = (void (*)())dlsym(cls->handle, "__cls_init");
+ if (cls_init) {
+ cls->status = ClassData::CLASS_INITIALIZING;
+ cls_init();
+ }
+
+ ldout(cct, 10) << "_load_class " << cls->name << " success" << dendl;
+ cls->status = ClassData::CLASS_OPEN;
+ return 0;
+}
+
+
+
+ClassHandler::ClassData *ClassHandler::register_class(const char *cname)
+{
+ ceph_assert(ceph_mutex_is_locked(mutex));
+
+ ClassData *cls = _get_class(cname, false);
+ ldout(cct, 10) << "register_class " << cname << " status " << cls->status << dendl;
+
+ if (cls->status != ClassData::CLASS_INITIALIZING) {
+ ldout(cct, 0) << "class " << cname << " isn't loaded; is the class registering under the wrong name?" << dendl;
+ return NULL;
+ }
+ return cls;
+}
+
+void ClassHandler::unregister_class(ClassHandler::ClassData *cls)
+{
+ /* FIXME: do we really need this one? */
+}
+
+ClassHandler::ClassMethod *ClassHandler::ClassData::register_method(const char *mname,
+ int flags,
+ cls_method_call_t func)
+{
+ /* no need for locking, called under the class_init mutex */
+ if (!flags) {
+ lderr(handler->cct) << "register_method " << name << "." << mname
+ << " flags " << flags << " " << (void*)func
+ << " FAILED -- flags must be non-zero" << dendl;
+ return NULL;
+ }
+ ldout(handler->cct, 10) << "register_method " << name << "." << mname << " flags " << flags << " " << (void*)func << dendl;
+ [[maybe_unused]] auto [method, added] = methods_map.try_emplace(mname, mname, func, flags, this);
+ return &method->second;
+}
+
+ClassHandler::ClassMethod *ClassHandler::ClassData::register_cxx_method(const char *mname,
+ int flags,
+ cls_method_cxx_call_t func)
+{
+ /* no need for locking, called under the class_init mutex */
+ ldout(handler->cct, 10) << "register_cxx_method " << name << "." << mname << " flags " << flags << " " << (void*)func << dendl;
+ [[maybe_unused]] auto [method, added] = methods_map.try_emplace(mname, mname, func, flags, this);
+ return &method->second;
+}
+
+ClassHandler::ClassFilter *ClassHandler::ClassData::register_cxx_filter(
+ const std::string &filter_name,
+ cls_cxx_filter_factory_t fn)
+{
+ ClassFilter &filter = filters_map[filter_name];
+ filter.fn = fn;
+ filter.name = filter_name;
+ filter.cls = this;
+ return &filter;
+}
+
+ClassHandler::ClassMethod *ClassHandler::ClassData::_get_method(
+ const std::string& mname)
+{
+ if (auto iter = methods_map.find(mname); iter != methods_map.end()) {
+ return &(iter->second);
+ } else {
+ return nullptr;
+ }
+}
+
+int ClassHandler::ClassData::get_method_flags(const std::string& mname)
+{
+ std::lock_guard l(handler->mutex);
+ ClassMethod *method = _get_method(mname);
+ if (!method)
+ return -ENOENT;
+ return method->flags;
+}
+
+void ClassHandler::ClassData::unregister_method(ClassHandler::ClassMethod *method)
+{
+ /* no need for locking, called under the class_init mutex */
+ map<string, ClassMethod>::iterator iter = methods_map.find(method->name);
+ if (iter == methods_map.end())
+ return;
+ methods_map.erase(iter);
+}
+
+void ClassHandler::ClassMethod::unregister()
+{
+ cls->unregister_method(this);
+}
+
+void ClassHandler::ClassData::unregister_filter(ClassHandler::ClassFilter *filter)
+{
+ /* no need for locking, called under the class_init mutex */
+ map<string, ClassFilter>::iterator iter = filters_map.find(filter->name);
+ if (iter == filters_map.end())
+ return;
+ filters_map.erase(iter);
+}
+
+void ClassHandler::ClassFilter::unregister()
+{
+ cls->unregister_filter(this);
+}
+
+int ClassHandler::ClassMethod::exec(cls_method_context_t ctx, bufferlist& indata, bufferlist& outdata)
+{
+ int ret = 0;
+ std::visit([&](auto method) {
+ using method_t = decltype(method);
+ if constexpr (std::is_same_v<method_t, cls_method_cxx_call_t>) {
+ // C++ call version
+ ret = method(ctx, &indata, &outdata);
+ } else if constexpr (std::is_same_v<method_t, cls_method_call_t>) {
+ // C version
+ char *out = nullptr;
+ int olen = 0;
+ ret = method(ctx, indata.c_str(), indata.length(), &out, &olen);
+ if (out) {
+ // assume *out was allocated via cls_alloc (which calls malloc!)
+ ceph::buffer::ptr bp = ceph::buffer::claim_malloc(olen, out);
+ outdata.push_back(bp);
+ }
+ } else {
+ static_assert(std::is_same_v<method_t, void>);
+ }
+ }, func);
+ return ret;
+}
+
+ClassHandler& ClassHandler::get_instance()
+{
+#ifdef WITH_SEASTAR
+ // the context is being used solely for:
+ // 1. random number generation (cls_gen_random_bytes)
+ // 2. accessing the configuration
+ // 3. logging
+ static CephContext cct;
+ static ClassHandler single(&cct);
+#else
+ static ClassHandler single(g_ceph_context);
+#endif // WITH_SEASTAR
+ return single;
+}
diff --git a/src/osd/ClassHandler.h b/src/osd/ClassHandler.h
new file mode 100644
index 000000000..fff61d5d2
--- /dev/null
+++ b/src/osd/ClassHandler.h
@@ -0,0 +1,126 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef CEPH_CLASSHANDLER_H
+#define CEPH_CLASSHANDLER_H
+
+#include <variant>
+
+#include "include/types.h"
+#include "include/common_fwd.h"
+#include "common/ceph_mutex.h"
+#include "objclass/objclass.h"
+
+//forward declaration
+class ClassHandler
+{
+public:
+ CephContext *cct;
+ struct ClassData;
+
+ struct ClassMethod {
+ const std::string name;
+ using func_t = std::variant<cls_method_cxx_call_t, cls_method_call_t>;
+ func_t func;
+ int flags = 0;
+ ClassData *cls = nullptr;
+
+ int exec(cls_method_context_t ctx,
+ ceph::bufferlist& indata,
+ ceph::bufferlist& outdata);
+ void unregister();
+
+ int get_flags() {
+ std::lock_guard l(cls->handler->mutex);
+ return flags;
+ }
+ ClassMethod(const char* name, func_t call, int flags, ClassData* cls)
+ : name{name}, func{call}, flags{flags}, cls{cls}
+ {}
+ };
+
+ struct ClassFilter {
+ ClassData *cls = nullptr;
+ std::string name;
+ cls_cxx_filter_factory_t fn = nullptr;
+
+ void unregister();
+ };
+
+ struct ClassData {
+ enum Status {
+ CLASS_UNKNOWN,
+ CLASS_MISSING, // missing
+ CLASS_MISSING_DEPS, // missing dependencies
+ CLASS_INITIALIZING, // calling init() right now
+ CLASS_OPEN, // initialized, usable
+ } status = CLASS_UNKNOWN;
+
+ std::string name;
+ ClassHandler *handler = nullptr;
+ void *handle = nullptr;
+
+ bool allowed = false;
+
+ std::map<std::string, ClassMethod> methods_map;
+ std::map<std::string, ClassFilter> filters_map;
+
+ std::set<ClassData *> dependencies; /* our dependencies */
+ std::set<ClassData *> missing_dependencies; /* only missing dependencies */
+
+ ClassMethod *_get_method(const std::string& mname);
+
+ ClassMethod *register_method(const char *mname,
+ int flags,
+ cls_method_call_t func);
+ ClassMethod *register_cxx_method(const char *mname,
+ int flags,
+ cls_method_cxx_call_t func);
+ void unregister_method(ClassMethod *method);
+
+ ClassFilter *register_cxx_filter(const std::string &filter_name,
+ cls_cxx_filter_factory_t fn);
+ void unregister_filter(ClassFilter *method);
+
+ ClassMethod *get_method(const std::string& mname) {
+ std::lock_guard l(handler->mutex);
+ return _get_method(mname);
+ }
+ int get_method_flags(const std::string& mname);
+
+ ClassFilter *get_filter(const std::string &filter_name) {
+ std::lock_guard l(handler->mutex);
+ if (auto i = filters_map.find(filter_name); i == filters_map.end()) {
+ return nullptr;
+ } else {
+ return &(i->second);
+ }
+ }
+ };
+
+private:
+ std::map<std::string, ClassData> classes;
+
+ ClassData *_get_class(const std::string& cname, bool check_allowed);
+ int _load_class(ClassData *cls);
+
+ static bool in_class_list(const std::string& cname,
+ const std::string& list);
+
+ ceph::mutex mutex = ceph::make_mutex("ClassHandler");
+
+public:
+ explicit ClassHandler(CephContext *cct) : cct(cct) {}
+
+ int open_all_classes();
+ int open_class(const std::string& cname, ClassData **pcls);
+
+ ClassData *register_class(const char *cname);
+ void unregister_class(ClassData *cls);
+
+ void shutdown();
+
+ static ClassHandler& get_instance();
+};
+
+
+#endif
diff --git a/src/osd/DynamicPerfStats.h b/src/osd/DynamicPerfStats.h
new file mode 100644
index 000000000..1c6c26c71
--- /dev/null
+++ b/src/osd/DynamicPerfStats.h
@@ -0,0 +1,267 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef DYNAMIC_PERF_STATS_H
+#define DYNAMIC_PERF_STATS_H
+
+#include "include/random.h"
+#include "messages/MOSDOp.h"
+#include "mgr/OSDPerfMetricTypes.h"
+#include "osd/OSD.h"
+#include "osd/OpRequest.h"
+
+class DynamicPerfStats {
+public:
+ DynamicPerfStats() {
+ }
+
+ DynamicPerfStats(const std::list<OSDPerfMetricQuery> &queries) {
+ for (auto &query : queries) {
+ data[query];
+ }
+ }
+
+ void merge(const DynamicPerfStats &dps) {
+ for (auto &query_it : dps.data) {
+ auto &query = query_it.first;
+ for (auto &key_it : query_it.second) {
+ auto &key = key_it.first;
+ auto counter_it = key_it.second.begin();
+ auto update_counter_fnc =
+ [&counter_it](const PerformanceCounterDescriptor &d,
+ PerformanceCounter *c) {
+ c->first += counter_it->first;
+ c->second += counter_it->second;
+ counter_it++;
+ };
+
+ ceph_assert(key_it.second.size() >= data[query][key].size());
+ query.update_counters(update_counter_fnc, &data[query][key]);
+ }
+ }
+ }
+
+ void set_queries(const std::list<OSDPerfMetricQuery> &queries) {
+ std::map<OSDPerfMetricQuery,
+ std::map<OSDPerfMetricKey, PerformanceCounters>> new_data;
+ for (auto &query : queries) {
+ std::swap(new_data[query], data[query]);
+ }
+ std::swap(data, new_data);
+ }
+
+ bool is_enabled() {
+ return !data.empty();
+ }
+
+ void add(const OSDService *osd, const pg_info_t &pg_info, const OpRequest& op,
+ uint64_t inb, uint64_t outb, const utime_t &latency) {
+
+ auto update_counter_fnc =
+ [&op, inb, outb, &latency](const PerformanceCounterDescriptor &d,
+ PerformanceCounter *c) {
+ ceph_assert(d.is_supported());
+
+ switch(d.type) {
+ case PerformanceCounterType::OPS:
+ c->first++;
+ return;
+ case PerformanceCounterType::WRITE_OPS:
+ if (op.may_write() || op.may_cache()) {
+ c->first++;
+ }
+ return;
+ case PerformanceCounterType::READ_OPS:
+ if (op.may_read()) {
+ c->first++;
+ }
+ return;
+ case PerformanceCounterType::BYTES:
+ c->first += inb + outb;
+ return;
+ case PerformanceCounterType::WRITE_BYTES:
+ if (op.may_write() || op.may_cache()) {
+ c->first += inb;
+ }
+ return;
+ case PerformanceCounterType::READ_BYTES:
+ if (op.may_read()) {
+ c->first += outb;
+ }
+ return;
+ case PerformanceCounterType::LATENCY:
+ c->first += latency.to_nsec();
+ c->second++;
+ return;
+ case PerformanceCounterType::WRITE_LATENCY:
+ if (op.may_write() || op.may_cache()) {
+ c->first += latency.to_nsec();
+ c->second++;
+ }
+ return;
+ case PerformanceCounterType::READ_LATENCY:
+ if (op.may_read()) {
+ c->first += latency.to_nsec();
+ c->second++;
+ }
+ return;
+ default:
+ ceph_abort_msg("unknown counter type");
+ }
+ };
+
+ auto get_subkey_fnc =
+ [&osd, &pg_info, &op](const OSDPerfMetricSubKeyDescriptor &d,
+ OSDPerfMetricSubKey *sub_key) {
+ ceph_assert(d.is_supported());
+
+ auto m = op.get_req<MOSDOp>();
+ std::string match_string;
+ switch(d.type) {
+ case OSDPerfMetricSubKeyType::CLIENT_ID:
+ match_string = stringify(m->get_reqid().name);
+ break;
+ case OSDPerfMetricSubKeyType::CLIENT_ADDRESS:
+ match_string = stringify(m->get_connection()->get_peer_addr());
+ break;
+ case OSDPerfMetricSubKeyType::POOL_ID:
+ match_string = stringify(m->get_spg().pool());
+ break;
+ case OSDPerfMetricSubKeyType::NAMESPACE:
+ match_string = m->get_hobj().nspace;
+ break;
+ case OSDPerfMetricSubKeyType::OSD_ID:
+ match_string = stringify(osd->get_nodeid());
+ break;
+ case OSDPerfMetricSubKeyType::PG_ID:
+ match_string = stringify(pg_info.pgid);
+ break;
+ case OSDPerfMetricSubKeyType::OBJECT_NAME:
+ match_string = m->get_oid().name;
+ break;
+ case OSDPerfMetricSubKeyType::SNAP_ID:
+ match_string = stringify(m->get_snapid());
+ break;
+ default:
+ ceph_abort_msg("unknown counter type");
+ }
+
+ std::smatch match;
+ if (!std::regex_search(match_string, match, d.regex)) {
+ return false;
+ }
+ if (match.size() <= 1) {
+ return false;
+ }
+ for (size_t i = 1; i < match.size(); i++) {
+ sub_key->push_back(match[i].str());
+ }
+ return true;
+ };
+
+ for (auto &it : data) {
+ auto &query = it.first;
+ OSDPerfMetricKey key;
+ if (query.get_key(get_subkey_fnc, &key)) {
+ query.update_counters(update_counter_fnc, &it.second[key]);
+ }
+ }
+ }
+
+ void add_to_reports(
+ const std::map<OSDPerfMetricQuery, OSDPerfMetricLimits> &limits,
+ std::map<OSDPerfMetricQuery, OSDPerfMetricReport> *reports) {
+ for (auto &it : data) {
+ auto &query = it.first;
+ auto limit_it = limits.find(query);
+ if (limit_it == limits.end()) {
+ continue;
+ }
+ auto &query_limits = limit_it->second;
+ auto &counters = it.second;
+ auto &report = (*reports)[query];
+
+ query.get_performance_counter_descriptors(
+ &report.performance_counter_descriptors);
+
+ auto &descriptors = report.performance_counter_descriptors;
+ ceph_assert(descriptors.size() > 0);
+
+ if (!is_limited(query_limits, counters.size())) {
+ for (auto &it_counters : counters) {
+ auto &bl = report.group_packed_performance_counters[it_counters.first];
+ query.pack_counters(it_counters.second, &bl);
+ }
+ continue;
+ }
+
+ for (auto &limit : query_limits) {
+ size_t index = 0;
+ for (; index < descriptors.size(); index++) {
+ if (descriptors[index] == limit.order_by) {
+ break;
+ }
+ }
+ if (index == descriptors.size()) {
+ // should not happen
+ continue;
+ }
+
+ // Weighted Random Sampling (Algorithm A-Chao):
+ // Select the first [0, max_count) samples, randomly replace
+ // with samples from [max_count, end) using weighted
+ // probability, and return [0, max_count) as the result.
+
+ ceph_assert(limit.max_count < counters.size());
+ typedef std::map<OSDPerfMetricKey, PerformanceCounters>::iterator
+ Iterator;
+ std::vector<Iterator> counter_iterators;
+ counter_iterators.reserve(limit.max_count);
+
+ Iterator it_counters = counters.begin();
+ uint64_t wsum = 0;
+ for (size_t i = 0; i < limit.max_count; i++) {
+ wsum += it_counters->second[index].first;
+ counter_iterators.push_back(it_counters++);
+ }
+ for (; it_counters != counters.end(); it_counters++) {
+ wsum += it_counters->second[index].first;
+ if (ceph::util::generate_random_number(0, wsum) <=
+ it_counters->second[index].first) {
+ auto i = ceph::util::generate_random_number(0, limit.max_count - 1);
+ counter_iterators[i] = it_counters;
+ }
+ }
+
+ for (auto it_counters : counter_iterators) {
+ auto &bl =
+ report.group_packed_performance_counters[it_counters->first];
+ if (bl.length() == 0) {
+ query.pack_counters(it_counters->second, &bl);
+ }
+ }
+ }
+ }
+ }
+
+private:
+ static bool is_limited(const OSDPerfMetricLimits &limits,
+ size_t counters_size) {
+ if (limits.empty()) {
+ return false;
+ }
+
+ for (auto &limit : limits) {
+ if (limit.max_count >= counters_size) {
+ return false;
+ }
+ }
+
+ return true;
+ }
+
+ std::map<OSDPerfMetricQuery,
+ std::map<OSDPerfMetricKey, PerformanceCounters>> data;
+};
+
+#endif // DYNAMIC_PERF_STATS_H
diff --git a/src/osd/ECBackend.cc b/src/osd/ECBackend.cc
new file mode 100644
index 000000000..b13a99fbc
--- /dev/null
+++ b/src/osd/ECBackend.cc
@@ -0,0 +1,2637 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Inktank Storage, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <iostream>
+#include <sstream>
+
+#include "ECBackend.h"
+#include "messages/MOSDPGPush.h"
+#include "messages/MOSDPGPushReply.h"
+#include "messages/MOSDECSubOpWrite.h"
+#include "messages/MOSDECSubOpWriteReply.h"
+#include "messages/MOSDECSubOpRead.h"
+#include "messages/MOSDECSubOpReadReply.h"
+#include "ECMsgTypes.h"
+
+#include "PrimaryLogPG.h"
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_osd
+#define DOUT_PREFIX_ARGS this
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, this)
+
+using std::dec;
+using std::hex;
+using std::list;
+using std::make_pair;
+using std::map;
+using std::pair;
+using std::ostream;
+using std::set;
+using std::string;
+using std::unique_ptr;
+using std::vector;
+
+using ceph::bufferhash;
+using ceph::bufferlist;
+using ceph::bufferptr;
+using ceph::ErasureCodeInterfaceRef;
+using ceph::Formatter;
+
+static ostream& _prefix(std::ostream *_dout, ECBackend *pgb) {
+ return pgb->get_parent()->gen_dbg_prefix(*_dout);
+}
+
+struct ECRecoveryHandle : public PGBackend::RecoveryHandle {
+ list<ECBackend::RecoveryOp> ops;
+};
+
+ostream &operator<<(ostream &lhs, const ECBackend::pipeline_state_t &rhs) {
+ switch (rhs.pipeline_state) {
+ case ECBackend::pipeline_state_t::CACHE_VALID:
+ return lhs << "CACHE_VALID";
+ case ECBackend::pipeline_state_t::CACHE_INVALID:
+ return lhs << "CACHE_INVALID";
+ default:
+ ceph_abort_msg("invalid pipeline state");
+ }
+ return lhs; // unreachable
+}
+
+static ostream &operator<<(ostream &lhs, const map<pg_shard_t, bufferlist> &rhs)
+{
+ lhs << "[";
+ for (map<pg_shard_t, bufferlist>::const_iterator i = rhs.begin();
+ i != rhs.end();
+ ++i) {
+ if (i != rhs.begin())
+ lhs << ", ";
+ lhs << make_pair(i->first, i->second.length());
+ }
+ return lhs << "]";
+}
+
+static ostream &operator<<(ostream &lhs, const map<int, bufferlist> &rhs)
+{
+ lhs << "[";
+ for (map<int, bufferlist>::const_iterator i = rhs.begin();
+ i != rhs.end();
+ ++i) {
+ if (i != rhs.begin())
+ lhs << ", ";
+ lhs << make_pair(i->first, i->second.length());
+ }
+ return lhs << "]";
+}
+
+static ostream &operator<<(
+ ostream &lhs,
+ const boost::tuple<uint64_t, uint64_t, map<pg_shard_t, bufferlist> > &rhs)
+{
+ return lhs << "(" << rhs.get<0>() << ", "
+ << rhs.get<1>() << ", " << rhs.get<2>() << ")";
+}
+
+ostream &operator<<(ostream &lhs, const ECBackend::read_request_t &rhs)
+{
+ return lhs << "read_request_t(to_read=[" << rhs.to_read << "]"
+ << ", need=" << rhs.need
+ << ", want_attrs=" << rhs.want_attrs
+ << ")";
+}
+
+ostream &operator<<(ostream &lhs, const ECBackend::read_result_t &rhs)
+{
+ lhs << "read_result_t(r=" << rhs.r
+ << ", errors=" << rhs.errors;
+ if (rhs.attrs) {
+ lhs << ", attrs=" << *(rhs.attrs);
+ } else {
+ lhs << ", noattrs";
+ }
+ return lhs << ", returned=" << rhs.returned << ")";
+}
+
+ostream &operator<<(ostream &lhs, const ECBackend::ReadOp &rhs)
+{
+ lhs << "ReadOp(tid=" << rhs.tid;
+ if (rhs.op && rhs.op->get_req()) {
+ lhs << ", op=";
+ rhs.op->get_req()->print(lhs);
+ }
+ return lhs << ", to_read=" << rhs.to_read
+ << ", complete=" << rhs.complete
+ << ", priority=" << rhs.priority
+ << ", obj_to_source=" << rhs.obj_to_source
+ << ", source_to_obj=" << rhs.source_to_obj
+ << ", in_progress=" << rhs.in_progress << ")";
+}
+
+void ECBackend::ReadOp::dump(Formatter *f) const
+{
+ f->dump_unsigned("tid", tid);
+ if (op && op->get_req()) {
+ f->dump_stream("op") << *(op->get_req());
+ }
+ f->dump_stream("to_read") << to_read;
+ f->dump_stream("complete") << complete;
+ f->dump_int("priority", priority);
+ f->dump_stream("obj_to_source") << obj_to_source;
+ f->dump_stream("source_to_obj") << source_to_obj;
+ f->dump_stream("in_progress") << in_progress;
+}
+
+ostream &operator<<(ostream &lhs, const ECBackend::Op &rhs)
+{
+ lhs << "Op(" << rhs.hoid
+ << " v=" << rhs.version
+ << " tt=" << rhs.trim_to
+ << " tid=" << rhs.tid
+ << " reqid=" << rhs.reqid;
+ if (rhs.client_op && rhs.client_op->get_req()) {
+ lhs << " client_op=";
+ rhs.client_op->get_req()->print(lhs);
+ }
+ lhs << " roll_forward_to=" << rhs.roll_forward_to
+ << " temp_added=" << rhs.temp_added
+ << " temp_cleared=" << rhs.temp_cleared
+ << " pending_read=" << rhs.pending_read
+ << " remote_read=" << rhs.remote_read
+ << " remote_read_result=" << rhs.remote_read_result
+ << " pending_apply=" << rhs.pending_apply
+ << " pending_commit=" << rhs.pending_commit
+ << " plan.to_read=" << rhs.plan.to_read
+ << " plan.will_write=" << rhs.plan.will_write
+ << ")";
+ return lhs;
+}
+
+ostream &operator<<(ostream &lhs, const ECBackend::RecoveryOp &rhs)
+{
+ return lhs << "RecoveryOp("
+ << "hoid=" << rhs.hoid
+ << " v=" << rhs.v
+ << " missing_on=" << rhs.missing_on
+ << " missing_on_shards=" << rhs.missing_on_shards
+ << " recovery_info=" << rhs.recovery_info
+ << " recovery_progress=" << rhs.recovery_progress
+ << " obc refcount=" << rhs.obc.use_count()
+ << " state=" << ECBackend::RecoveryOp::tostr(rhs.state)
+ << " waiting_on_pushes=" << rhs.waiting_on_pushes
+ << " extent_requested=" << rhs.extent_requested
+ << ")";
+}
+
+void ECBackend::RecoveryOp::dump(Formatter *f) const
+{
+ f->dump_stream("hoid") << hoid;
+ f->dump_stream("v") << v;
+ f->dump_stream("missing_on") << missing_on;
+ f->dump_stream("missing_on_shards") << missing_on_shards;
+ f->dump_stream("recovery_info") << recovery_info;
+ f->dump_stream("recovery_progress") << recovery_progress;
+ f->dump_stream("state") << tostr(state);
+ f->dump_stream("waiting_on_pushes") << waiting_on_pushes;
+ f->dump_stream("extent_requested") << extent_requested;
+}
+
+ECBackend::ECBackend(
+ PGBackend::Listener *pg,
+ const coll_t &coll,
+ ObjectStore::CollectionHandle &ch,
+ ObjectStore *store,
+ CephContext *cct,
+ ErasureCodeInterfaceRef ec_impl,
+ uint64_t stripe_width)
+ : PGBackend(cct, pg, store, coll, ch),
+ ec_impl(ec_impl),
+ sinfo(ec_impl->get_data_chunk_count(), stripe_width) {
+ ceph_assert((ec_impl->get_data_chunk_count() *
+ ec_impl->get_chunk_size(stripe_width)) == stripe_width);
+}
+
+PGBackend::RecoveryHandle *ECBackend::open_recovery_op()
+{
+ return new ECRecoveryHandle;
+}
+
+void ECBackend::_failed_push(const hobject_t &hoid,
+ pair<RecoveryMessages *, ECBackend::read_result_t &> &in)
+{
+ ECBackend::read_result_t &res = in.second;
+ dout(10) << __func__ << ": Read error " << hoid << " r="
+ << res.r << " errors=" << res.errors << dendl;
+ dout(10) << __func__ << ": canceling recovery op for obj " << hoid
+ << dendl;
+ ceph_assert(recovery_ops.count(hoid));
+ eversion_t v = recovery_ops[hoid].v;
+ recovery_ops.erase(hoid);
+
+ set<pg_shard_t> fl;
+ for (auto&& i : res.errors) {
+ fl.insert(i.first);
+ }
+ get_parent()->on_failed_pull(fl, hoid, v);
+}
+
+struct OnRecoveryReadComplete :
+ public GenContext<pair<RecoveryMessages*, ECBackend::read_result_t& > &> {
+ ECBackend *pg;
+ hobject_t hoid;
+ OnRecoveryReadComplete(ECBackend *pg, const hobject_t &hoid)
+ : pg(pg), hoid(hoid) {}
+ void finish(pair<RecoveryMessages *, ECBackend::read_result_t &> &in) override {
+ ECBackend::read_result_t &res = in.second;
+ if (!(res.r == 0 && res.errors.empty())) {
+ pg->_failed_push(hoid, in);
+ return;
+ }
+ ceph_assert(res.returned.size() == 1);
+ pg->handle_recovery_read_complete(
+ hoid,
+ res.returned.back(),
+ res.attrs,
+ in.first);
+ }
+};
+
+struct RecoveryMessages {
+ map<hobject_t,
+ ECBackend::read_request_t> reads;
+ map<hobject_t, set<int>> want_to_read;
+ void read(
+ ECBackend *ec,
+ const hobject_t &hoid, uint64_t off, uint64_t len,
+ set<int> &&_want_to_read,
+ const map<pg_shard_t, vector<pair<int, int>>> &need,
+ bool attrs) {
+ list<boost::tuple<uint64_t, uint64_t, uint32_t> > to_read;
+ to_read.push_back(boost::make_tuple(off, len, 0));
+ ceph_assert(!reads.count(hoid));
+ want_to_read.insert(make_pair(hoid, std::move(_want_to_read)));
+ reads.insert(
+ make_pair(
+ hoid,
+ ECBackend::read_request_t(
+ to_read,
+ need,
+ attrs,
+ new OnRecoveryReadComplete(
+ ec,
+ hoid))));
+ }
+
+ map<pg_shard_t, vector<PushOp> > pushes;
+ map<pg_shard_t, vector<PushReplyOp> > push_replies;
+ ObjectStore::Transaction t;
+ RecoveryMessages() {}
+ ~RecoveryMessages() {}
+};
+
+void ECBackend::handle_recovery_push(
+ const PushOp &op,
+ RecoveryMessages *m,
+ bool is_repair)
+{
+ if (get_parent()->check_failsafe_full()) {
+ dout(10) << __func__ << " Out of space (failsafe) processing push request." << dendl;
+ ceph_abort();
+ }
+
+ bool oneshot = op.before_progress.first && op.after_progress.data_complete;
+ ghobject_t tobj;
+ if (oneshot) {
+ tobj = ghobject_t(op.soid, ghobject_t::NO_GEN,
+ get_parent()->whoami_shard().shard);
+ } else {
+ tobj = ghobject_t(get_parent()->get_temp_recovery_object(op.soid,
+ op.version),
+ ghobject_t::NO_GEN,
+ get_parent()->whoami_shard().shard);
+ if (op.before_progress.first) {
+ dout(10) << __func__ << ": Adding oid "
+ << tobj.hobj << " in the temp collection" << dendl;
+ add_temp_obj(tobj.hobj);
+ }
+ }
+
+ if (op.before_progress.first) {
+ m->t.remove(coll, tobj);
+ m->t.touch(coll, tobj);
+ }
+
+ if (!op.data_included.empty()) {
+ uint64_t start = op.data_included.range_start();
+ uint64_t end = op.data_included.range_end();
+ ceph_assert(op.data.length() == (end - start));
+
+ m->t.write(
+ coll,
+ tobj,
+ start,
+ op.data.length(),
+ op.data);
+ } else {
+ ceph_assert(op.data.length() == 0);
+ }
+
+ if (get_parent()->pg_is_remote_backfilling()) {
+ get_parent()->pg_add_local_num_bytes(op.data.length());
+ get_parent()->pg_add_num_bytes(op.data.length() * get_ec_data_chunk_count());
+ dout(10) << __func__ << " " << op.soid
+ << " add new actual data by " << op.data.length()
+ << " add new num_bytes by " << op.data.length() * get_ec_data_chunk_count()
+ << dendl;
+ }
+
+ if (op.before_progress.first) {
+ ceph_assert(op.attrset.count(string("_")));
+ m->t.setattrs(
+ coll,
+ tobj,
+ op.attrset);
+ }
+
+ if (op.after_progress.data_complete && !oneshot) {
+ dout(10) << __func__ << ": Removing oid "
+ << tobj.hobj << " from the temp collection" << dendl;
+ clear_temp_obj(tobj.hobj);
+ m->t.remove(coll, ghobject_t(
+ op.soid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
+ m->t.collection_move_rename(
+ coll, tobj,
+ coll, ghobject_t(
+ op.soid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
+ }
+ if (op.after_progress.data_complete) {
+ if ((get_parent()->pgb_is_primary())) {
+ ceph_assert(recovery_ops.count(op.soid));
+ ceph_assert(recovery_ops[op.soid].obc);
+ if (get_parent()->pg_is_repair())
+ get_parent()->inc_osd_stat_repaired();
+ get_parent()->on_local_recover(
+ op.soid,
+ op.recovery_info,
+ recovery_ops[op.soid].obc,
+ false,
+ &m->t);
+ } else {
+ // If primary told us this is a repair, bump osd_stat_t::num_objects_repaired
+ if (is_repair)
+ get_parent()->inc_osd_stat_repaired();
+ get_parent()->on_local_recover(
+ op.soid,
+ op.recovery_info,
+ ObjectContextRef(),
+ false,
+ &m->t);
+ if (get_parent()->pg_is_remote_backfilling()) {
+ struct stat st;
+ int r = store->stat(ch, ghobject_t(op.soid, ghobject_t::NO_GEN,
+ get_parent()->whoami_shard().shard), &st);
+ if (r == 0) {
+ get_parent()->pg_sub_local_num_bytes(st.st_size);
+ // XXX: This can be way overestimated for small objects
+ get_parent()->pg_sub_num_bytes(st.st_size * get_ec_data_chunk_count());
+ dout(10) << __func__ << " " << op.soid
+ << " sub actual data by " << st.st_size
+ << " sub num_bytes by " << st.st_size * get_ec_data_chunk_count()
+ << dendl;
+ }
+ }
+ }
+ }
+ m->push_replies[get_parent()->primary_shard()].push_back(PushReplyOp());
+ m->push_replies[get_parent()->primary_shard()].back().soid = op.soid;
+}
+
+void ECBackend::handle_recovery_push_reply(
+ const PushReplyOp &op,
+ pg_shard_t from,
+ RecoveryMessages *m)
+{
+ if (!recovery_ops.count(op.soid))
+ return;
+ RecoveryOp &rop = recovery_ops[op.soid];
+ ceph_assert(rop.waiting_on_pushes.count(from));
+ rop.waiting_on_pushes.erase(from);
+ continue_recovery_op(rop, m);
+}
+
+void ECBackend::handle_recovery_read_complete(
+ const hobject_t &hoid,
+ boost::tuple<uint64_t, uint64_t, map<pg_shard_t, bufferlist> > &to_read,
+ std::optional<map<string, bufferlist> > attrs,
+ RecoveryMessages *m)
+{
+ dout(10) << __func__ << ": returned " << hoid << " "
+ << "(" << to_read.get<0>()
+ << ", " << to_read.get<1>()
+ << ", " << to_read.get<2>()
+ << ")"
+ << dendl;
+ ceph_assert(recovery_ops.count(hoid));
+ RecoveryOp &op = recovery_ops[hoid];
+ ceph_assert(op.returned_data.empty());
+ map<int, bufferlist*> target;
+ for (set<shard_id_t>::iterator i = op.missing_on_shards.begin();
+ i != op.missing_on_shards.end();
+ ++i) {
+ target[*i] = &(op.returned_data[*i]);
+ }
+ map<int, bufferlist> from;
+ for(map<pg_shard_t, bufferlist>::iterator i = to_read.get<2>().begin();
+ i != to_read.get<2>().end();
+ ++i) {
+ from[i->first.shard] = std::move(i->second);
+ }
+ dout(10) << __func__ << ": " << from << dendl;
+ int r;
+ r = ECUtil::decode(sinfo, ec_impl, from, target);
+ ceph_assert(r == 0);
+ if (attrs) {
+ op.xattrs.swap(*attrs);
+
+ if (!op.obc) {
+ // attrs only reference the origin bufferlist (decode from
+ // ECSubReadReply message) whose size is much greater than attrs
+ // in recovery. If obc cache it (get_obc maybe cache the attr),
+ // this causes the whole origin bufferlist would not be free
+ // until obc is evicted from obc cache. So rebuild the
+ // bufferlist before cache it.
+ for (map<string, bufferlist>::iterator it = op.xattrs.begin();
+ it != op.xattrs.end();
+ ++it) {
+ it->second.rebuild();
+ }
+ // Need to remove ECUtil::get_hinfo_key() since it should not leak out
+ // of the backend (see bug #12983)
+ map<string, bufferlist> sanitized_attrs(op.xattrs);
+ sanitized_attrs.erase(ECUtil::get_hinfo_key());
+ op.obc = get_parent()->get_obc(hoid, sanitized_attrs);
+ ceph_assert(op.obc);
+ op.recovery_info.size = op.obc->obs.oi.size;
+ op.recovery_info.oi = op.obc->obs.oi;
+ }
+
+ ECUtil::HashInfo hinfo(ec_impl->get_chunk_count());
+ if (op.obc->obs.oi.size > 0) {
+ ceph_assert(op.xattrs.count(ECUtil::get_hinfo_key()));
+ auto bp = op.xattrs[ECUtil::get_hinfo_key()].cbegin();
+ decode(hinfo, bp);
+ }
+ op.hinfo = unstable_hashinfo_registry.lookup_or_create(hoid, hinfo);
+ }
+ ceph_assert(op.xattrs.size());
+ ceph_assert(op.obc);
+ continue_recovery_op(op, m);
+}
+
+struct SendPushReplies : public Context {
+ PGBackend::Listener *l;
+ epoch_t epoch;
+ map<int, MOSDPGPushReply*> replies;
+ SendPushReplies(
+ PGBackend::Listener *l,
+ epoch_t epoch,
+ map<int, MOSDPGPushReply*> &in) : l(l), epoch(epoch) {
+ replies.swap(in);
+ }
+ void finish(int) override {
+ std::vector<std::pair<int, Message*>> messages;
+ messages.reserve(replies.size());
+ for (map<int, MOSDPGPushReply*>::iterator i = replies.begin();
+ i != replies.end();
+ ++i) {
+ messages.push_back(std::make_pair(i->first, i->second));
+ }
+ if (!messages.empty()) {
+ l->send_message_osd_cluster(messages, epoch);
+ }
+ replies.clear();
+ }
+ ~SendPushReplies() override {
+ for (map<int, MOSDPGPushReply*>::iterator i = replies.begin();
+ i != replies.end();
+ ++i) {
+ i->second->put();
+ }
+ replies.clear();
+ }
+};
+
+void ECBackend::dispatch_recovery_messages(RecoveryMessages &m, int priority)
+{
+ for (map<pg_shard_t, vector<PushOp> >::iterator i = m.pushes.begin();
+ i != m.pushes.end();
+ m.pushes.erase(i++)) {
+ MOSDPGPush *msg = new MOSDPGPush();
+ msg->set_priority(priority);
+ msg->map_epoch = get_osdmap_epoch();
+ msg->min_epoch = get_parent()->get_last_peering_reset_epoch();
+ msg->from = get_parent()->whoami_shard();
+ msg->pgid = spg_t(get_parent()->get_info().pgid.pgid, i->first.shard);
+ msg->pushes.swap(i->second);
+ msg->compute_cost(cct);
+ msg->is_repair = get_parent()->pg_is_repair();
+ get_parent()->send_message(
+ i->first.osd,
+ msg);
+ }
+ map<int, MOSDPGPushReply*> replies;
+ for (map<pg_shard_t, vector<PushReplyOp> >::iterator i =
+ m.push_replies.begin();
+ i != m.push_replies.end();
+ m.push_replies.erase(i++)) {
+ MOSDPGPushReply *msg = new MOSDPGPushReply();
+ msg->set_priority(priority);
+ msg->map_epoch = get_osdmap_epoch();
+ msg->min_epoch = get_parent()->get_last_peering_reset_epoch();
+ msg->from = get_parent()->whoami_shard();
+ msg->pgid = spg_t(get_parent()->get_info().pgid.pgid, i->first.shard);
+ msg->replies.swap(i->second);
+ msg->compute_cost(cct);
+ replies.insert(make_pair(i->first.osd, msg));
+ }
+
+ if (!replies.empty()) {
+ (m.t).register_on_complete(
+ get_parent()->bless_context(
+ new SendPushReplies(
+ get_parent(),
+ get_osdmap_epoch(),
+ replies)));
+ get_parent()->queue_transaction(std::move(m.t));
+ }
+
+ if (m.reads.empty())
+ return;
+ start_read_op(
+ priority,
+ m.want_to_read,
+ m.reads,
+ OpRequestRef(),
+ false, true);
+}
+
+void ECBackend::continue_recovery_op(
+ RecoveryOp &op,
+ RecoveryMessages *m)
+{
+ dout(10) << __func__ << ": continuing " << op << dendl;
+ while (1) {
+ switch (op.state) {
+ case RecoveryOp::IDLE: {
+ // start read
+ op.state = RecoveryOp::READING;
+ ceph_assert(!op.recovery_progress.data_complete);
+ set<int> want(op.missing_on_shards.begin(), op.missing_on_shards.end());
+ uint64_t from = op.recovery_progress.data_recovered_to;
+ uint64_t amount = get_recovery_chunk_size();
+
+ if (op.recovery_progress.first && op.obc) {
+ /* We've got the attrs and the hinfo, might as well use them */
+ op.hinfo = get_hash_info(op.hoid);
+ if (!op.hinfo) {
+ derr << __func__ << ": " << op.hoid << " has inconsistent hinfo"
+ << dendl;
+ ceph_assert(recovery_ops.count(op.hoid));
+ eversion_t v = recovery_ops[op.hoid].v;
+ recovery_ops.erase(op.hoid);
+ get_parent()->on_failed_pull({get_parent()->whoami_shard()},
+ op.hoid, v);
+ return;
+ }
+ op.xattrs = op.obc->attr_cache;
+ encode(*(op.hinfo), op.xattrs[ECUtil::get_hinfo_key()]);
+ }
+
+ map<pg_shard_t, vector<pair<int, int>>> to_read;
+ int r = get_min_avail_to_read_shards(
+ op.hoid, want, true, false, &to_read);
+ if (r != 0) {
+ // we must have lost a recovery source
+ ceph_assert(!op.recovery_progress.first);
+ dout(10) << __func__ << ": canceling recovery op for obj " << op.hoid
+ << dendl;
+ get_parent()->cancel_pull(op.hoid);
+ recovery_ops.erase(op.hoid);
+ return;
+ }
+ m->read(
+ this,
+ op.hoid,
+ op.recovery_progress.data_recovered_to,
+ amount,
+ std::move(want),
+ to_read,
+ op.recovery_progress.first && !op.obc);
+ op.extent_requested = make_pair(
+ from,
+ amount);
+ dout(10) << __func__ << ": IDLE return " << op << dendl;
+ return;
+ }
+ case RecoveryOp::READING: {
+ // read completed, start write
+ ceph_assert(op.xattrs.size());
+ ceph_assert(op.returned_data.size());
+ op.state = RecoveryOp::WRITING;
+ ObjectRecoveryProgress after_progress = op.recovery_progress;
+ after_progress.data_recovered_to += op.extent_requested.second;
+ after_progress.first = false;
+ if (after_progress.data_recovered_to >= op.obc->obs.oi.size) {
+ after_progress.data_recovered_to =
+ sinfo.logical_to_next_stripe_offset(
+ op.obc->obs.oi.size);
+ after_progress.data_complete = true;
+ }
+ for (set<pg_shard_t>::iterator mi = op.missing_on.begin();
+ mi != op.missing_on.end();
+ ++mi) {
+ ceph_assert(op.returned_data.count(mi->shard));
+ m->pushes[*mi].push_back(PushOp());
+ PushOp &pop = m->pushes[*mi].back();
+ pop.soid = op.hoid;
+ pop.version = op.v;
+ pop.data = op.returned_data[mi->shard];
+ dout(10) << __func__ << ": before_progress=" << op.recovery_progress
+ << ", after_progress=" << after_progress
+ << ", pop.data.length()=" << pop.data.length()
+ << ", size=" << op.obc->obs.oi.size << dendl;
+ ceph_assert(
+ pop.data.length() ==
+ sinfo.aligned_logical_offset_to_chunk_offset(
+ after_progress.data_recovered_to -
+ op.recovery_progress.data_recovered_to)
+ );
+ if (pop.data.length())
+ pop.data_included.insert(
+ sinfo.aligned_logical_offset_to_chunk_offset(
+ op.recovery_progress.data_recovered_to),
+ pop.data.length()
+ );
+ if (op.recovery_progress.first) {
+ pop.attrset = op.xattrs;
+ }
+ pop.recovery_info = op.recovery_info;
+ pop.before_progress = op.recovery_progress;
+ pop.after_progress = after_progress;
+ if (*mi != get_parent()->primary_shard())
+ get_parent()->begin_peer_recover(
+ *mi,
+ op.hoid);
+ }
+ op.returned_data.clear();
+ op.waiting_on_pushes = op.missing_on;
+ op.recovery_progress = after_progress;
+ dout(10) << __func__ << ": READING return " << op << dendl;
+ return;
+ }
+ case RecoveryOp::WRITING: {
+ if (op.waiting_on_pushes.empty()) {
+ if (op.recovery_progress.data_complete) {
+ op.state = RecoveryOp::COMPLETE;
+ for (set<pg_shard_t>::iterator i = op.missing_on.begin();
+ i != op.missing_on.end();
+ ++i) {
+ if (*i != get_parent()->primary_shard()) {
+ dout(10) << __func__ << ": on_peer_recover on " << *i
+ << ", obj " << op.hoid << dendl;
+ get_parent()->on_peer_recover(
+ *i,
+ op.hoid,
+ op.recovery_info);
+ }
+ }
+ object_stat_sum_t stat;
+ stat.num_bytes_recovered = op.recovery_info.size;
+ stat.num_keys_recovered = 0; // ??? op ... omap_entries.size(); ?
+ stat.num_objects_recovered = 1;
+ if (get_parent()->pg_is_repair())
+ stat.num_objects_repaired = 1;
+ get_parent()->on_global_recover(op.hoid, stat, false);
+ dout(10) << __func__ << ": WRITING return " << op << dendl;
+ recovery_ops.erase(op.hoid);
+ return;
+ } else {
+ op.state = RecoveryOp::IDLE;
+ dout(10) << __func__ << ": WRITING continue " << op << dendl;
+ continue;
+ }
+ }
+ return;
+ }
+ // should never be called once complete
+ case RecoveryOp::COMPLETE:
+ default: {
+ ceph_abort();
+ };
+ }
+ }
+}
+
+void ECBackend::run_recovery_op(
+ RecoveryHandle *_h,
+ int priority)
+{
+ ECRecoveryHandle *h = static_cast<ECRecoveryHandle*>(_h);
+ RecoveryMessages m;
+ for (list<RecoveryOp>::iterator i = h->ops.begin();
+ i != h->ops.end();
+ ++i) {
+ dout(10) << __func__ << ": starting " << *i << dendl;
+ ceph_assert(!recovery_ops.count(i->hoid));
+ RecoveryOp &op = recovery_ops.insert(make_pair(i->hoid, *i)).first->second;
+ continue_recovery_op(op, &m);
+ }
+
+ dispatch_recovery_messages(m, priority);
+ send_recovery_deletes(priority, h->deletes);
+ delete _h;
+}
+
+int ECBackend::recover_object(
+ const hobject_t &hoid,
+ eversion_t v,
+ ObjectContextRef head,
+ ObjectContextRef obc,
+ RecoveryHandle *_h)
+{
+ ECRecoveryHandle *h = static_cast<ECRecoveryHandle*>(_h);
+ h->ops.push_back(RecoveryOp());
+ h->ops.back().v = v;
+ h->ops.back().hoid = hoid;
+ h->ops.back().obc = obc;
+ h->ops.back().recovery_info.soid = hoid;
+ h->ops.back().recovery_info.version = v;
+ if (obc) {
+ h->ops.back().recovery_info.size = obc->obs.oi.size;
+ h->ops.back().recovery_info.oi = obc->obs.oi;
+ }
+ if (hoid.is_snap()) {
+ if (obc) {
+ ceph_assert(obc->ssc);
+ h->ops.back().recovery_info.ss = obc->ssc->snapset;
+ } else if (head) {
+ ceph_assert(head->ssc);
+ h->ops.back().recovery_info.ss = head->ssc->snapset;
+ } else {
+ ceph_abort_msg("neither obc nor head set for a snap object");
+ }
+ }
+ h->ops.back().recovery_progress.omap_complete = true;
+ for (set<pg_shard_t>::const_iterator i =
+ get_parent()->get_acting_recovery_backfill_shards().begin();
+ i != get_parent()->get_acting_recovery_backfill_shards().end();
+ ++i) {
+ dout(10) << "checking " << *i << dendl;
+ if (get_parent()->get_shard_missing(*i).is_missing(hoid)) {
+ h->ops.back().missing_on.insert(*i);
+ h->ops.back().missing_on_shards.insert(i->shard);
+ }
+ }
+ dout(10) << __func__ << ": built op " << h->ops.back() << dendl;
+ return 0;
+}
+
+bool ECBackend::can_handle_while_inactive(
+ OpRequestRef _op)
+{
+ return false;
+}
+
+bool ECBackend::_handle_message(
+ OpRequestRef _op)
+{
+ dout(10) << __func__ << ": " << *_op->get_req() << dendl;
+ int priority = _op->get_req()->get_priority();
+ switch (_op->get_req()->get_type()) {
+ case MSG_OSD_EC_WRITE: {
+ // NOTE: this is non-const because handle_sub_write modifies the embedded
+ // ObjectStore::Transaction in place (and then std::move's it). It does
+ // not conflict with ECSubWrite's operator<<.
+ MOSDECSubOpWrite *op = static_cast<MOSDECSubOpWrite*>(
+ _op->get_nonconst_req());
+ parent->maybe_preempt_replica_scrub(op->op.soid);
+ handle_sub_write(op->op.from, _op, op->op, _op->pg_trace);
+ return true;
+ }
+ case MSG_OSD_EC_WRITE_REPLY: {
+ const MOSDECSubOpWriteReply *op = static_cast<const MOSDECSubOpWriteReply*>(
+ _op->get_req());
+ handle_sub_write_reply(op->op.from, op->op, _op->pg_trace);
+ return true;
+ }
+ case MSG_OSD_EC_READ: {
+ auto op = _op->get_req<MOSDECSubOpRead>();
+ MOSDECSubOpReadReply *reply = new MOSDECSubOpReadReply;
+ reply->pgid = get_parent()->primary_spg_t();
+ reply->map_epoch = get_osdmap_epoch();
+ reply->min_epoch = get_parent()->get_interval_start_epoch();
+ handle_sub_read(op->op.from, op->op, &(reply->op), _op->pg_trace);
+ reply->trace = _op->pg_trace;
+ get_parent()->send_message_osd_cluster(
+ reply, _op->get_req()->get_connection());
+ return true;
+ }
+ case MSG_OSD_EC_READ_REPLY: {
+ // NOTE: this is non-const because handle_sub_read_reply steals resulting
+ // buffers. It does not conflict with ECSubReadReply operator<<.
+ MOSDECSubOpReadReply *op = static_cast<MOSDECSubOpReadReply*>(
+ _op->get_nonconst_req());
+ RecoveryMessages rm;
+ handle_sub_read_reply(op->op.from, op->op, &rm, _op->pg_trace);
+ dispatch_recovery_messages(rm, priority);
+ return true;
+ }
+ case MSG_OSD_PG_PUSH: {
+ auto op = _op->get_req<MOSDPGPush>();
+ RecoveryMessages rm;
+ for (vector<PushOp>::const_iterator i = op->pushes.begin();
+ i != op->pushes.end();
+ ++i) {
+ handle_recovery_push(*i, &rm, op->is_repair);
+ }
+ dispatch_recovery_messages(rm, priority);
+ return true;
+ }
+ case MSG_OSD_PG_PUSH_REPLY: {
+ const MOSDPGPushReply *op = static_cast<const MOSDPGPushReply *>(
+ _op->get_req());
+ RecoveryMessages rm;
+ for (vector<PushReplyOp>::const_iterator i = op->replies.begin();
+ i != op->replies.end();
+ ++i) {
+ handle_recovery_push_reply(*i, op->from, &rm);
+ }
+ dispatch_recovery_messages(rm, priority);
+ return true;
+ }
+ default:
+ return false;
+ }
+ return false;
+}
+
+struct SubWriteCommitted : public Context {
+ ECBackend *pg;
+ OpRequestRef msg;
+ ceph_tid_t tid;
+ eversion_t version;
+ eversion_t last_complete;
+ const ZTracer::Trace trace;
+ SubWriteCommitted(
+ ECBackend *pg,
+ OpRequestRef msg,
+ ceph_tid_t tid,
+ eversion_t version,
+ eversion_t last_complete,
+ const ZTracer::Trace &trace)
+ : pg(pg), msg(msg), tid(tid),
+ version(version), last_complete(last_complete), trace(trace) {}
+ void finish(int) override {
+ if (msg)
+ msg->mark_event("sub_op_committed");
+ pg->sub_write_committed(tid, version, last_complete, trace);
+ }
+};
+void ECBackend::sub_write_committed(
+ ceph_tid_t tid, eversion_t version, eversion_t last_complete,
+ const ZTracer::Trace &trace) {
+ if (get_parent()->pgb_is_primary()) {
+ ECSubWriteReply reply;
+ reply.tid = tid;
+ reply.last_complete = last_complete;
+ reply.committed = true;
+ reply.applied = true;
+ reply.from = get_parent()->whoami_shard();
+ handle_sub_write_reply(
+ get_parent()->whoami_shard(),
+ reply, trace);
+ } else {
+ get_parent()->update_last_complete_ondisk(last_complete);
+ MOSDECSubOpWriteReply *r = new MOSDECSubOpWriteReply;
+ r->pgid = get_parent()->primary_spg_t();
+ r->map_epoch = get_osdmap_epoch();
+ r->min_epoch = get_parent()->get_interval_start_epoch();
+ r->op.tid = tid;
+ r->op.last_complete = last_complete;
+ r->op.committed = true;
+ r->op.applied = true;
+ r->op.from = get_parent()->whoami_shard();
+ r->set_priority(CEPH_MSG_PRIO_HIGH);
+ r->trace = trace;
+ r->trace.event("sending sub op commit");
+ get_parent()->send_message_osd_cluster(
+ get_parent()->primary_shard().osd, r, get_osdmap_epoch());
+ }
+}
+
+void ECBackend::handle_sub_write(
+ pg_shard_t from,
+ OpRequestRef msg,
+ ECSubWrite &op,
+ const ZTracer::Trace &trace)
+{
+ if (msg)
+ msg->mark_event("sub_op_started");
+ trace.event("handle_sub_write");
+#ifdef HAVE_JAEGER
+ if (msg->osd_parent_span) {
+ auto ec_sub_trans = jaeger_tracing::child_span(__func__, msg->osd_parent_span);
+ }
+#endif
+ if (!get_parent()->pgb_is_primary())
+ get_parent()->update_stats(op.stats);
+ ObjectStore::Transaction localt;
+ if (!op.temp_added.empty()) {
+ add_temp_objs(op.temp_added);
+ }
+ if (op.backfill_or_async_recovery) {
+ for (set<hobject_t>::iterator i = op.temp_removed.begin();
+ i != op.temp_removed.end();
+ ++i) {
+ dout(10) << __func__ << ": removing object " << *i
+ << " since we won't get the transaction" << dendl;
+ localt.remove(
+ coll,
+ ghobject_t(
+ *i,
+ ghobject_t::NO_GEN,
+ get_parent()->whoami_shard().shard));
+ }
+ }
+ clear_temp_objs(op.temp_removed);
+ dout(30) << __func__ << " missing before " << get_parent()->get_log().get_missing().get_items() << dendl;
+ // flag set to true during async recovery
+ bool async = false;
+ pg_missing_tracker_t pmissing = get_parent()->get_local_missing();
+ if (pmissing.is_missing(op.soid)) {
+ async = true;
+ dout(30) << __func__ << " is_missing " << pmissing.is_missing(op.soid) << dendl;
+ for (auto &&e: op.log_entries) {
+ dout(30) << " add_next_event entry " << e << dendl;
+ get_parent()->add_local_next_event(e);
+ dout(30) << " entry is_delete " << e.is_delete() << dendl;
+ }
+ }
+ get_parent()->log_operation(
+ std::move(op.log_entries),
+ op.updated_hit_set_history,
+ op.trim_to,
+ op.roll_forward_to,
+ op.roll_forward_to,
+ !op.backfill_or_async_recovery,
+ localt,
+ async);
+
+ if (!get_parent()->pg_is_undersized() &&
+ (unsigned)get_parent()->whoami_shard().shard >=
+ ec_impl->get_data_chunk_count())
+ op.t.set_fadvise_flag(CEPH_OSD_OP_FLAG_FADVISE_DONTNEED);
+
+ localt.register_on_commit(
+ get_parent()->bless_context(
+ new SubWriteCommitted(
+ this, msg, op.tid,
+ op.at_version,
+ get_parent()->get_info().last_complete, trace)));
+ vector<ObjectStore::Transaction> tls;
+ tls.reserve(2);
+ tls.push_back(std::move(op.t));
+ tls.push_back(std::move(localt));
+ get_parent()->queue_transactions(tls, msg);
+ dout(30) << __func__ << " missing after" << get_parent()->get_log().get_missing().get_items() << dendl;
+ if (op.at_version != eversion_t()) {
+ // dummy rollforward transaction doesn't get at_version (and doesn't advance it)
+ get_parent()->op_applied(op.at_version);
+ }
+}
+
+void ECBackend::handle_sub_read(
+ pg_shard_t from,
+ const ECSubRead &op,
+ ECSubReadReply *reply,
+ const ZTracer::Trace &trace)
+{
+ trace.event("handle sub read");
+ shard_id_t shard = get_parent()->whoami_shard().shard;
+ for(auto i = op.to_read.begin();
+ i != op.to_read.end();
+ ++i) {
+ int r = 0;
+ for (auto j = i->second.begin(); j != i->second.end(); ++j) {
+ bufferlist bl;
+ if ((op.subchunks.find(i->first)->second.size() == 1) &&
+ (op.subchunks.find(i->first)->second.front().second ==
+ ec_impl->get_sub_chunk_count())) {
+ dout(25) << __func__ << " case1: reading the complete chunk/shard." << dendl;
+ r = store->read(
+ ch,
+ ghobject_t(i->first, ghobject_t::NO_GEN, shard),
+ j->get<0>(),
+ j->get<1>(),
+ bl, j->get<2>()); // Allow EIO return
+ } else {
+ dout(25) << __func__ << " case2: going to do fragmented read." << dendl;
+ int subchunk_size =
+ sinfo.get_chunk_size() / ec_impl->get_sub_chunk_count();
+ bool error = false;
+ for (int m = 0; m < (int)j->get<1>() && !error;
+ m += sinfo.get_chunk_size()) {
+ for (auto &&k:op.subchunks.find(i->first)->second) {
+ bufferlist bl0;
+ r = store->read(
+ ch,
+ ghobject_t(i->first, ghobject_t::NO_GEN, shard),
+ j->get<0>() + m + (k.first)*subchunk_size,
+ (k.second)*subchunk_size,
+ bl0, j->get<2>());
+ if (r < 0) {
+ error = true;
+ break;
+ }
+ bl.claim_append(bl0);
+ }
+ }
+ }
+
+ if (r < 0) {
+ // if we are doing fast reads, it's possible for one of the shard
+ // reads to cross paths with another update and get a (harmless)
+ // ENOENT. Suppress the message to the cluster log in that case.
+ if (r == -ENOENT && get_parent()->get_pool().fast_read) {
+ dout(5) << __func__ << ": Error " << r
+ << " reading " << i->first << ", fast read, probably ok"
+ << dendl;
+ } else {
+ get_parent()->clog_error() << "Error " << r
+ << " reading object "
+ << i->first;
+ dout(5) << __func__ << ": Error " << r
+ << " reading " << i->first << dendl;
+ }
+ goto error;
+ } else {
+ dout(20) << __func__ << " read request=" << j->get<1>() << " r=" << r << " len=" << bl.length() << dendl;
+ reply->buffers_read[i->first].push_back(
+ make_pair(
+ j->get<0>(),
+ bl)
+ );
+ }
+
+ if (!get_parent()->get_pool().allows_ecoverwrites()) {
+ // This shows that we still need deep scrub because large enough files
+      // are read in sections, so the digest check won't be done here.
+ // Do NOT check osd_read_eio_on_bad_digest here. We need to report
+ // the state of our chunk in case other chunks could substitute.
+ ECUtil::HashInfoRef hinfo;
+ hinfo = get_hash_info(i->first);
+ if (!hinfo) {
+ r = -EIO;
+ get_parent()->clog_error() << "Corruption detected: object "
+ << i->first
+ << " is missing hash_info";
+ dout(5) << __func__ << ": No hinfo for " << i->first << dendl;
+ goto error;
+ }
+ ceph_assert(hinfo->has_chunk_hash());
+ if ((bl.length() == hinfo->get_total_chunk_size()) &&
+ (j->get<0>() == 0)) {
+ dout(20) << __func__ << ": Checking hash of " << i->first << dendl;
+ bufferhash h(-1);
+ h << bl;
+ if (h.digest() != hinfo->get_chunk_hash(shard)) {
+ get_parent()->clog_error() << "Bad hash for " << i->first << " digest 0x"
+ << hex << h.digest() << " expected 0x" << hinfo->get_chunk_hash(shard) << dec;
+ dout(5) << __func__ << ": Bad hash for " << i->first << " digest 0x"
+ << hex << h.digest() << " expected 0x" << hinfo->get_chunk_hash(shard) << dec << dendl;
+ r = -EIO;
+ goto error;
+ }
+ }
+ }
+ }
+ continue;
+error:
+ // Do NOT check osd_read_eio_on_bad_digest here. We need to report
+ // the state of our chunk in case other chunks could substitute.
+ reply->buffers_read.erase(i->first);
+ reply->errors[i->first] = r;
+ }
+ for (set<hobject_t>::iterator i = op.attrs_to_read.begin();
+ i != op.attrs_to_read.end();
+ ++i) {
+ dout(10) << __func__ << ": fulfilling attr request on "
+ << *i << dendl;
+ if (reply->errors.count(*i))
+ continue;
+ int r = store->getattrs(
+ ch,
+ ghobject_t(
+ *i, ghobject_t::NO_GEN, shard),
+ reply->attrs_read[*i]);
+ if (r < 0) {
+      // If we hit a read error, we should not return the attrs either.
+ reply->attrs_read.erase(*i);
+ reply->buffers_read.erase(*i);
+ reply->errors[*i] = r;
+ }
+ }
+ reply->from = get_parent()->whoami_shard();
+ reply->tid = op.tid;
+}
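+
+/* Worked example of the fragmented-read math in handle_sub_read's case2
+ * above -- a sketch assuming an 8192-byte chunk and 4 sub-chunks per chunk
+ * (so subchunk_size = 2048), a request of (offset 0, length 16384) and a
+ * subchunk list {(1, 2)} for this shard:
+ *
+ *   m = 0:    store->read(off = 0    + 1*2048 = 2048,  len = 2*2048 = 4096)
+ *   m = 8192: store->read(off = 8192 + 1*2048 = 10240, len = 2*2048 = 4096)
+ *
+ * and bl.claim_append() concatenates the pieces in request order.  Actual
+ * sizes depend on the erasure-code plugin's get_sub_chunk_count().
+ */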
+
+void ECBackend::handle_sub_write_reply(
+ pg_shard_t from,
+ const ECSubWriteReply &op,
+ const ZTracer::Trace &trace)
+{
+ map<ceph_tid_t, Op>::iterator i = tid_to_op_map.find(op.tid);
+ ceph_assert(i != tid_to_op_map.end());
+ if (op.committed) {
+ trace.event("sub write committed");
+ ceph_assert(i->second.pending_commit.count(from));
+ i->second.pending_commit.erase(from);
+ if (from != get_parent()->whoami_shard()) {
+ get_parent()->update_peer_last_complete_ondisk(from, op.last_complete);
+ }
+ }
+ if (op.applied) {
+ trace.event("sub write applied");
+ ceph_assert(i->second.pending_apply.count(from));
+ i->second.pending_apply.erase(from);
+ }
+
+ if (i->second.pending_commit.empty() &&
+ i->second.on_all_commit &&
+ // also wait for apply, to preserve ordering with luminous peers.
+ i->second.pending_apply.empty()) {
+ dout(10) << __func__ << " Calling on_all_commit on " << i->second << dendl;
+ i->second.on_all_commit->complete(0);
+ i->second.on_all_commit = 0;
+ i->second.trace.event("ec write all committed");
+ }
+ check_ops();
+}
+
+void ECBackend::handle_sub_read_reply(
+ pg_shard_t from,
+ ECSubReadReply &op,
+ RecoveryMessages *m,
+ const ZTracer::Trace &trace)
+{
+ trace.event("ec sub read reply");
+ dout(10) << __func__ << ": reply " << op << dendl;
+ map<ceph_tid_t, ReadOp>::iterator iter = tid_to_read_map.find(op.tid);
+ if (iter == tid_to_read_map.end()) {
+ //canceled
+ dout(20) << __func__ << ": dropped " << op << dendl;
+ return;
+ }
+ ReadOp &rop = iter->second;
+ for (auto i = op.buffers_read.begin();
+ i != op.buffers_read.end();
+ ++i) {
+    ceph_assert(!op.errors.count(i->first)); // if an error was reported we'd better not have sent a buffer
+ if (!rop.to_read.count(i->first)) {
+ // We canceled this read! @see filter_read_op
+ dout(20) << __func__ << " to_read skipping" << dendl;
+ continue;
+ }
+ list<boost::tuple<uint64_t, uint64_t, uint32_t> >::const_iterator req_iter =
+ rop.to_read.find(i->first)->second.to_read.begin();
+ list<
+ boost::tuple<
+ uint64_t, uint64_t, map<pg_shard_t, bufferlist> > >::iterator riter =
+ rop.complete[i->first].returned.begin();
+ for (list<pair<uint64_t, bufferlist> >::iterator j = i->second.begin();
+ j != i->second.end();
+ ++j, ++req_iter, ++riter) {
+ ceph_assert(req_iter != rop.to_read.find(i->first)->second.to_read.end());
+ ceph_assert(riter != rop.complete[i->first].returned.end());
+ pair<uint64_t, uint64_t> adjusted =
+ sinfo.aligned_offset_len_to_chunk(
+ make_pair(req_iter->get<0>(), req_iter->get<1>()));
+ ceph_assert(adjusted.first == j->first);
+ riter->get<2>()[from] = std::move(j->second);
+ }
+ }
+ for (auto i = op.attrs_read.begin();
+ i != op.attrs_read.end();
+ ++i) {
+    ceph_assert(!op.errors.count(i->first)); // if a read error was reported we'd better not have sent an attribute
+ if (!rop.to_read.count(i->first)) {
+ // We canceled this read! @see filter_read_op
+ dout(20) << __func__ << " to_read skipping" << dendl;
+ continue;
+ }
+ rop.complete[i->first].attrs = map<string, bufferlist>();
+ (*(rop.complete[i->first].attrs)).swap(i->second);
+ }
+ for (auto i = op.errors.begin();
+ i != op.errors.end();
+ ++i) {
+ rop.complete[i->first].errors.insert(
+ make_pair(
+ from,
+ i->second));
+ dout(20) << __func__ << " shard=" << from << " error=" << i->second << dendl;
+ }
+
+ map<pg_shard_t, set<ceph_tid_t> >::iterator siter =
+ shard_to_read_map.find(from);
+ ceph_assert(siter != shard_to_read_map.end());
+ ceph_assert(siter->second.count(op.tid));
+ siter->second.erase(op.tid);
+
+ ceph_assert(rop.in_progress.count(from));
+ rop.in_progress.erase(from);
+ unsigned is_complete = 0;
+ bool need_resend = false;
+ // For redundant reads check for completion as each shard comes in,
+  // or in a non-recovery read check for completion once all the shards have been read.
+ if (rop.do_redundant_reads || rop.in_progress.empty()) {
+ for (map<hobject_t, read_result_t>::const_iterator iter =
+ rop.complete.begin();
+ iter != rop.complete.end();
+ ++iter) {
+ set<int> have;
+ for (map<pg_shard_t, bufferlist>::const_iterator j =
+ iter->second.returned.front().get<2>().begin();
+ j != iter->second.returned.front().get<2>().end();
+ ++j) {
+ have.insert(j->first.shard);
+ dout(20) << __func__ << " have shard=" << j->first.shard << dendl;
+ }
+ map<int, vector<pair<int, int>>> dummy_minimum;
+ int err;
+ if ((err = ec_impl->minimum_to_decode(rop.want_to_read[iter->first], have, &dummy_minimum)) < 0) {
+ dout(20) << __func__ << " minimum_to_decode failed" << dendl;
+ if (rop.in_progress.empty()) {
+ // If we don't have enough copies, try other pg_shard_ts if available.
+ // During recovery there may be multiple osds with copies of the same shard,
+ // so getting EIO from one may result in multiple passes through this code path.
+ if (!rop.do_redundant_reads) {
+ int r = send_all_remaining_reads(iter->first, rop);
+ if (r == 0) {
+	      // We changed the rop's to_read and are not incrementing is_complete
+ need_resend = true;
+ continue;
+ }
+ // Couldn't read any additional shards so handle as completed with errors
+ }
+ // We don't want to confuse clients / RBD with objectstore error
+ // values in particular ENOENT. We may have different error returns
+ // from different shards, so we'll return minimum_to_decode() error
+ // (usually EIO) to reader. It is likely an error here is due to a
+ // damaged pg.
+ rop.complete[iter->first].r = err;
+ ++is_complete;
+ }
+ } else {
+ ceph_assert(rop.complete[iter->first].r == 0);
+ if (!rop.complete[iter->first].errors.empty()) {
+ if (cct->_conf->osd_read_ec_check_for_errors) {
+ dout(10) << __func__ << ": Not ignoring errors, use one shard err=" << err << dendl;
+ err = rop.complete[iter->first].errors.begin()->second;
+ rop.complete[iter->first].r = err;
+ } else {
+ get_parent()->clog_warn() << "Error(s) ignored for "
+ << iter->first << " enough copies available";
+ dout(10) << __func__ << " Error(s) ignored for " << iter->first
+ << " enough copies available" << dendl;
+ rop.complete[iter->first].errors.clear();
+ }
+ }
+	// avoid re-read for completed object as we may send remaining reads for uncompleted objects
+ rop.to_read.at(iter->first).need.clear();
+ rop.to_read.at(iter->first).want_attrs = false;
+ ++is_complete;
+ }
+ }
+ }
+ if (need_resend) {
+ do_read_op(rop);
+ } else if (rop.in_progress.empty() ||
+ is_complete == rop.complete.size()) {
+ dout(20) << __func__ << " Complete: " << rop << dendl;
+ rop.trace.event("ec read complete");
+ complete_read_op(rop, m);
+ } else {
+ dout(10) << __func__ << " readop not complete: " << rop << dendl;
+ }
+}
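+
+/* Sketch of the completion decision above, assuming a hypothetical k=2,m=1
+ * profile and want_to_read = {0,1}.  If shards 0 and 2 returned buffers and
+ * shard 1 returned EIO, have = {0,2} and minimum_to_decode() succeeds, so the
+ * object is counted complete and the recorded error is dropped (unless
+ * osd_read_ec_check_for_errors is set).  If only shard 0 had replied cleanly,
+ * minimum_to_decode() would fail and, for a non-redundant read with nothing
+ * left in progress, send_all_remaining_reads() is tried before the op is
+ * failed with the decode error.
+ */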
+
+void ECBackend::complete_read_op(ReadOp &rop, RecoveryMessages *m)
+{
+ map<hobject_t, read_request_t>::iterator reqiter =
+ rop.to_read.begin();
+ map<hobject_t, read_result_t>::iterator resiter =
+ rop.complete.begin();
+ ceph_assert(rop.to_read.size() == rop.complete.size());
+ for (; reqiter != rop.to_read.end(); ++reqiter, ++resiter) {
+ if (reqiter->second.cb) {
+ pair<RecoveryMessages *, read_result_t &> arg(
+ m, resiter->second);
+ reqiter->second.cb->complete(arg);
+ reqiter->second.cb = nullptr;
+ }
+ }
+  // if the read op is over, clean up all the data for this tid.
+ for (set<pg_shard_t>::iterator iter = rop.in_progress.begin();
+ iter != rop.in_progress.end();
+ iter++) {
+ shard_to_read_map[*iter].erase(rop.tid);
+ }
+ rop.in_progress.clear();
+ tid_to_read_map.erase(rop.tid);
+}
+
+struct FinishReadOp : public GenContext<ThreadPool::TPHandle&> {
+ ECBackend *ec;
+ ceph_tid_t tid;
+ FinishReadOp(ECBackend *ec, ceph_tid_t tid) : ec(ec), tid(tid) {}
+ void finish(ThreadPool::TPHandle &handle) override {
+ auto ropiter = ec->tid_to_read_map.find(tid);
+ ceph_assert(ropiter != ec->tid_to_read_map.end());
+ int priority = ropiter->second.priority;
+ RecoveryMessages rm;
+ ec->complete_read_op(ropiter->second, &rm);
+ ec->dispatch_recovery_messages(rm, priority);
+ }
+};
+
+void ECBackend::filter_read_op(
+ const OSDMapRef& osdmap,
+ ReadOp &op)
+{
+ set<hobject_t> to_cancel;
+ for (map<pg_shard_t, set<hobject_t> >::iterator i = op.source_to_obj.begin();
+ i != op.source_to_obj.end();
+ ++i) {
+ if (osdmap->is_down(i->first.osd)) {
+ to_cancel.insert(i->second.begin(), i->second.end());
+ op.in_progress.erase(i->first);
+ continue;
+ }
+ }
+
+ if (to_cancel.empty())
+ return;
+
+ for (map<pg_shard_t, set<hobject_t> >::iterator i = op.source_to_obj.begin();
+ i != op.source_to_obj.end();
+ ) {
+ for (set<hobject_t>::iterator j = i->second.begin();
+ j != i->second.end();
+ ) {
+ if (to_cancel.count(*j))
+ i->second.erase(j++);
+ else
+ ++j;
+ }
+ if (i->second.empty()) {
+ op.source_to_obj.erase(i++);
+ } else {
+ ceph_assert(!osdmap->is_down(i->first.osd));
+ ++i;
+ }
+ }
+
+ for (set<hobject_t>::iterator i = to_cancel.begin();
+ i != to_cancel.end();
+ ++i) {
+ get_parent()->cancel_pull(*i);
+
+ ceph_assert(op.to_read.count(*i));
+ read_request_t &req = op.to_read.find(*i)->second;
+ dout(10) << __func__ << ": canceling " << req
+ << " for obj " << *i << dendl;
+ ceph_assert(req.cb);
+ delete req.cb;
+ req.cb = nullptr;
+
+ op.to_read.erase(*i);
+ op.complete.erase(*i);
+ recovery_ops.erase(*i);
+ }
+
+ if (op.in_progress.empty()) {
+ get_parent()->schedule_recovery_work(
+ get_parent()->bless_unlocked_gencontext(
+ new FinishReadOp(this, op.tid)));
+ }
+}
+
+void ECBackend::check_recovery_sources(const OSDMapRef& osdmap)
+{
+ set<ceph_tid_t> tids_to_filter;
+ for (map<pg_shard_t, set<ceph_tid_t> >::iterator
+ i = shard_to_read_map.begin();
+ i != shard_to_read_map.end();
+ ) {
+ if (osdmap->is_down(i->first.osd)) {
+ tids_to_filter.insert(i->second.begin(), i->second.end());
+ shard_to_read_map.erase(i++);
+ } else {
+ ++i;
+ }
+ }
+ for (set<ceph_tid_t>::iterator i = tids_to_filter.begin();
+ i != tids_to_filter.end();
+ ++i) {
+ map<ceph_tid_t, ReadOp>::iterator j = tid_to_read_map.find(*i);
+ ceph_assert(j != tid_to_read_map.end());
+ filter_read_op(osdmap, j->second);
+ }
+}
+
+void ECBackend::on_change()
+{
+ dout(10) << __func__ << dendl;
+
+ completed_to = eversion_t();
+ committed_to = eversion_t();
+ pipeline_state.clear();
+ waiting_reads.clear();
+ waiting_state.clear();
+ waiting_commit.clear();
+ for (auto &&op: tid_to_op_map) {
+ cache.release_write_pin(op.second.pin);
+ }
+ tid_to_op_map.clear();
+
+ for (map<ceph_tid_t, ReadOp>::iterator i = tid_to_read_map.begin();
+ i != tid_to_read_map.end();
+ ++i) {
+ dout(10) << __func__ << ": cancelling " << i->second << dendl;
+ for (map<hobject_t, read_request_t>::iterator j =
+ i->second.to_read.begin();
+ j != i->second.to_read.end();
+ ++j) {
+ delete j->second.cb;
+ j->second.cb = nullptr;
+ }
+ }
+ tid_to_read_map.clear();
+ in_progress_client_reads.clear();
+ shard_to_read_map.clear();
+ clear_recovery_state();
+}
+
+void ECBackend::clear_recovery_state()
+{
+ recovery_ops.clear();
+}
+
+void ECBackend::dump_recovery_info(Formatter *f) const
+{
+ f->open_array_section("recovery_ops");
+ for (map<hobject_t, RecoveryOp>::const_iterator i = recovery_ops.begin();
+ i != recovery_ops.end();
+ ++i) {
+ f->open_object_section("op");
+ i->second.dump(f);
+ f->close_section();
+ }
+ f->close_section();
+ f->open_array_section("read_ops");
+ for (map<ceph_tid_t, ReadOp>::const_iterator i = tid_to_read_map.begin();
+ i != tid_to_read_map.end();
+ ++i) {
+ f->open_object_section("read_op");
+ i->second.dump(f);
+ f->close_section();
+ }
+ f->close_section();
+}
+
+void ECBackend::submit_transaction(
+ const hobject_t &hoid,
+ const object_stat_sum_t &delta_stats,
+ const eversion_t &at_version,
+ PGTransactionUPtr &&t,
+ const eversion_t &trim_to,
+ const eversion_t &min_last_complete_ondisk,
+ vector<pg_log_entry_t>&& log_entries,
+ std::optional<pg_hit_set_history_t> &hset_history,
+ Context *on_all_commit,
+ ceph_tid_t tid,
+ osd_reqid_t reqid,
+ OpRequestRef client_op
+ )
+{
+ ceph_assert(!tid_to_op_map.count(tid));
+ Op *op = &(tid_to_op_map[tid]);
+ op->hoid = hoid;
+ op->delta_stats = delta_stats;
+ op->version = at_version;
+ op->trim_to = trim_to;
+ op->roll_forward_to = std::max(min_last_complete_ondisk, committed_to);
+ op->log_entries = log_entries;
+ std::swap(op->updated_hit_set_history, hset_history);
+ op->on_all_commit = on_all_commit;
+ op->tid = tid;
+ op->reqid = reqid;
+ op->client_op = client_op;
+ if (client_op)
+ op->trace = client_op->pg_trace;
+
+#ifdef HAVE_JAEGER
+  if (client_op && client_op->osd_parent_span) {
+ auto ec_sub_trans = jaeger_tracing::child_span("ECBackend::submit_transaction", client_op->osd_parent_span);
+ }
+#endif
+ dout(10) << __func__ << ": op " << *op << " starting" << dendl;
+ start_rmw(op, std::move(t));
+}
+
+void ECBackend::call_write_ordered(std::function<void(void)> &&cb) {
+ if (!waiting_state.empty()) {
+ waiting_state.back().on_write.emplace_back(std::move(cb));
+ } else if (!waiting_reads.empty()) {
+ waiting_reads.back().on_write.emplace_back(std::move(cb));
+ } else {
+ // Nothing earlier in the pipeline, just call it
+ cb();
+ }
+}
+
+void ECBackend::get_all_avail_shards(
+ const hobject_t &hoid,
+ const set<pg_shard_t> &error_shards,
+ set<int> &have,
+ map<shard_id_t, pg_shard_t> &shards,
+ bool for_recovery)
+{
+ for (set<pg_shard_t>::const_iterator i =
+ get_parent()->get_acting_shards().begin();
+ i != get_parent()->get_acting_shards().end();
+ ++i) {
+ dout(10) << __func__ << ": checking acting " << *i << dendl;
+ const pg_missing_t &missing = get_parent()->get_shard_missing(*i);
+ if (error_shards.find(*i) != error_shards.end())
+ continue;
+ if (!missing.is_missing(hoid)) {
+ ceph_assert(!have.count(i->shard));
+ have.insert(i->shard);
+ ceph_assert(!shards.count(i->shard));
+ shards.insert(make_pair(i->shard, *i));
+ }
+ }
+
+ if (for_recovery) {
+ for (set<pg_shard_t>::const_iterator i =
+ get_parent()->get_backfill_shards().begin();
+ i != get_parent()->get_backfill_shards().end();
+ ++i) {
+ if (error_shards.find(*i) != error_shards.end())
+ continue;
+ if (have.count(i->shard)) {
+ ceph_assert(shards.count(i->shard));
+ continue;
+ }
+ dout(10) << __func__ << ": checking backfill " << *i << dendl;
+ ceph_assert(!shards.count(i->shard));
+ const pg_info_t &info = get_parent()->get_shard_info(*i);
+ const pg_missing_t &missing = get_parent()->get_shard_missing(*i);
+ if (hoid < info.last_backfill &&
+ !missing.is_missing(hoid)) {
+ have.insert(i->shard);
+ shards.insert(make_pair(i->shard, *i));
+ }
+ }
+
+ map<hobject_t, set<pg_shard_t>>::const_iterator miter =
+ get_parent()->get_missing_loc_shards().find(hoid);
+ if (miter != get_parent()->get_missing_loc_shards().end()) {
+ for (set<pg_shard_t>::iterator i = miter->second.begin();
+ i != miter->second.end();
+ ++i) {
+ dout(10) << __func__ << ": checking missing_loc " << *i << dendl;
+ auto m = get_parent()->maybe_get_shard_missing(*i);
+ if (m) {
+ ceph_assert(!(*m).is_missing(hoid));
+ }
+ if (error_shards.find(*i) != error_shards.end())
+ continue;
+ have.insert(i->shard);
+ shards.insert(make_pair(i->shard, *i));
+ }
+ }
+ }
+}
+
+int ECBackend::get_min_avail_to_read_shards(
+ const hobject_t &hoid,
+ const set<int> &want,
+ bool for_recovery,
+ bool do_redundant_reads,
+ map<pg_shard_t, vector<pair<int, int>>> *to_read)
+{
+ // Make sure we don't do redundant reads for recovery
+ ceph_assert(!for_recovery || !do_redundant_reads);
+
+ set<int> have;
+ map<shard_id_t, pg_shard_t> shards;
+ set<pg_shard_t> error_shards;
+
+ get_all_avail_shards(hoid, error_shards, have, shards, for_recovery);
+
+ map<int, vector<pair<int, int>>> need;
+ int r = ec_impl->minimum_to_decode(want, have, &need);
+ if (r < 0)
+ return r;
+
+ if (do_redundant_reads) {
+ vector<pair<int, int>> subchunks_list;
+ subchunks_list.push_back(make_pair(0, ec_impl->get_sub_chunk_count()));
+ for (auto &&i: have) {
+ need[i] = subchunks_list;
+ }
+ }
+
+ if (!to_read)
+ return 0;
+
+ for (auto &&i:need) {
+ ceph_assert(shards.count(shard_id_t(i.first)));
+ to_read->insert(make_pair(shards[shard_id_t(i.first)], i.second));
+ }
+ return 0;
+}
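+
+/* Sketch of the want/have/need mapping above, assuming a hypothetical k=4,m=2
+ * profile: with want = {0,1,2,3} and have = {0,1,3,4,5} (shard 2 unavailable),
+ * minimum_to_decode() might return need = {0,1,3,4}; each entry is then mapped
+ * through `shards` to the pg_shard_t that will serve that chunk.  When
+ * do_redundant_reads is set, every shard in `have` is requested instead.  The
+ * exact need set is plugin-dependent.
+ */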
+
+int ECBackend::get_remaining_shards(
+ const hobject_t &hoid,
+ const set<int> &avail,
+ const set<int> &want,
+ const read_result_t &result,
+ map<pg_shard_t, vector<pair<int, int>>> *to_read,
+ bool for_recovery)
+{
+ ceph_assert(to_read);
+
+ set<int> have;
+ map<shard_id_t, pg_shard_t> shards;
+ set<pg_shard_t> error_shards;
+ for (auto &p : result.errors) {
+ error_shards.insert(p.first);
+ }
+
+ get_all_avail_shards(hoid, error_shards, have, shards, for_recovery);
+
+ map<int, vector<pair<int, int>>> need;
+ int r = ec_impl->minimum_to_decode(want, have, &need);
+ if (r < 0) {
+ dout(0) << __func__ << " not enough shards left to try for " << hoid
+ << " read result was " << result << dendl;
+ return -EIO;
+ }
+
+ set<int> shards_left;
+ for (auto p : need) {
+ if (avail.find(p.first) == avail.end()) {
+ shards_left.insert(p.first);
+ }
+ }
+
+ vector<pair<int, int>> subchunks;
+ subchunks.push_back(make_pair(0, ec_impl->get_sub_chunk_count()));
+ for (set<int>::iterator i = shards_left.begin();
+ i != shards_left.end();
+ ++i) {
+ ceph_assert(shards.count(shard_id_t(*i)));
+ ceph_assert(avail.find(*i) == avail.end());
+ to_read->insert(make_pair(shards[shard_id_t(*i)], subchunks));
+ }
+ return 0;
+}
+
+void ECBackend::start_read_op(
+ int priority,
+ map<hobject_t, set<int>> &want_to_read,
+ map<hobject_t, read_request_t> &to_read,
+ OpRequestRef _op,
+ bool do_redundant_reads,
+ bool for_recovery)
+{
+ ceph_tid_t tid = get_parent()->get_tid();
+ ceph_assert(!tid_to_read_map.count(tid));
+ auto &op = tid_to_read_map.emplace(
+ tid,
+ ReadOp(
+ priority,
+ tid,
+ do_redundant_reads,
+ for_recovery,
+ _op,
+ std::move(want_to_read),
+ std::move(to_read))).first->second;
+ dout(10) << __func__ << ": starting " << op << dendl;
+ if (_op) {
+ op.trace = _op->pg_trace;
+ op.trace.event("start ec read");
+ }
+ do_read_op(op);
+}
+
+void ECBackend::do_read_op(ReadOp &op)
+{
+ int priority = op.priority;
+ ceph_tid_t tid = op.tid;
+
+ dout(10) << __func__ << ": starting read " << op << dendl;
+
+ map<pg_shard_t, ECSubRead> messages;
+ for (map<hobject_t, read_request_t>::iterator i = op.to_read.begin();
+ i != op.to_read.end();
+ ++i) {
+ bool need_attrs = i->second.want_attrs;
+
+ for (auto j = i->second.need.begin();
+ j != i->second.need.end();
+ ++j) {
+ if (need_attrs) {
+ messages[j->first].attrs_to_read.insert(i->first);
+ need_attrs = false;
+ }
+ messages[j->first].subchunks[i->first] = j->second;
+ op.obj_to_source[i->first].insert(j->first);
+ op.source_to_obj[j->first].insert(i->first);
+ }
+ for (list<boost::tuple<uint64_t, uint64_t, uint32_t> >::const_iterator j =
+ i->second.to_read.begin();
+ j != i->second.to_read.end();
+ ++j) {
+ pair<uint64_t, uint64_t> chunk_off_len =
+ sinfo.aligned_offset_len_to_chunk(make_pair(j->get<0>(), j->get<1>()));
+ for (auto k = i->second.need.begin();
+ k != i->second.need.end();
+ ++k) {
+ messages[k->first].to_read[i->first].push_back(
+ boost::make_tuple(
+ chunk_off_len.first,
+ chunk_off_len.second,
+ j->get<2>()));
+ }
+ ceph_assert(!need_attrs);
+ }
+ }
+
+ std::vector<std::pair<int, Message*>> m;
+ m.reserve(messages.size());
+ for (map<pg_shard_t, ECSubRead>::iterator i = messages.begin();
+ i != messages.end();
+ ++i) {
+ op.in_progress.insert(i->first);
+ shard_to_read_map[i->first].insert(op.tid);
+ i->second.tid = tid;
+ MOSDECSubOpRead *msg = new MOSDECSubOpRead;
+ msg->set_priority(priority);
+ msg->pgid = spg_t(
+ get_parent()->whoami_spg_t().pgid,
+ i->first.shard);
+ msg->map_epoch = get_osdmap_epoch();
+ msg->min_epoch = get_parent()->get_interval_start_epoch();
+ msg->op = i->second;
+ msg->op.from = get_parent()->whoami_shard();
+ msg->op.tid = tid;
+ if (op.trace) {
+ // initialize a child span for this shard
+ msg->trace.init("ec sub read", nullptr, &op.trace);
+ msg->trace.keyval("shard", i->first.shard.id);
+ }
+ m.push_back(std::make_pair(i->first.osd, msg));
+ }
+ if (!m.empty()) {
+ get_parent()->send_message_osd_cluster(m, get_osdmap_epoch());
+ }
+
+ dout(10) << __func__ << ": started " << op << dendl;
+}
+
+ECUtil::HashInfoRef ECBackend::get_hash_info(
+ const hobject_t &hoid, bool create, const map<string,bufferptr> *attrs)
+{
+ dout(10) << __func__ << ": Getting attr on " << hoid << dendl;
+ ECUtil::HashInfoRef ref = unstable_hashinfo_registry.lookup(hoid);
+ if (!ref) {
+ dout(10) << __func__ << ": not in cache " << hoid << dendl;
+ struct stat st;
+ int r = store->stat(
+ ch,
+ ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
+ &st);
+ ECUtil::HashInfo hinfo(ec_impl->get_chunk_count());
+ if (r >= 0) {
+ dout(10) << __func__ << ": found on disk, size " << st.st_size << dendl;
+ bufferlist bl;
+ if (attrs) {
+ map<string, bufferptr>::const_iterator k = attrs->find(ECUtil::get_hinfo_key());
+ if (k == attrs->end()) {
+ dout(5) << __func__ << " " << hoid << " missing hinfo attr" << dendl;
+ } else {
+ bl.push_back(k->second);
+ }
+ } else {
+ r = store->getattr(
+ ch,
+ ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
+ ECUtil::get_hinfo_key(),
+ bl);
+ if (r < 0) {
+ dout(5) << __func__ << ": getattr failed: " << cpp_strerror(r) << dendl;
+ bl.clear(); // just in case
+ }
+ }
+ if (bl.length() > 0) {
+ auto bp = bl.cbegin();
+ try {
+ decode(hinfo, bp);
+ } catch(...) {
+ dout(0) << __func__ << ": Can't decode hinfo for " << hoid << dendl;
+ return ECUtil::HashInfoRef();
+ }
+ if (hinfo.get_total_chunk_size() != (uint64_t)st.st_size) {
+ dout(0) << __func__ << ": Mismatch of total_chunk_size "
+ << hinfo.get_total_chunk_size() << dendl;
+ return ECUtil::HashInfoRef();
+ }
+      } else if (st.st_size > 0) { // non-empty object with no hinfo is an error; an empty object falls through and gets a fresh hinfo
+ return ECUtil::HashInfoRef();
+ }
+ } else if (r != -ENOENT || !create) {
+ derr << __func__ << ": stat " << hoid << " failed: " << cpp_strerror(r)
+ << dendl;
+ return ECUtil::HashInfoRef();
+ }
+ ref = unstable_hashinfo_registry.lookup_or_create(hoid, hinfo);
+ }
+ return ref;
+}
+
+void ECBackend::start_rmw(Op *op, PGTransactionUPtr &&t)
+{
+ ceph_assert(op);
+
+ op->plan = ECTransaction::get_write_plan(
+ sinfo,
+ std::move(t),
+ [&](const hobject_t &i) {
+ ECUtil::HashInfoRef ref = get_hash_info(i, true);
+ if (!ref) {
+ derr << __func__ << ": get_hash_info(" << i << ")"
+ << " returned a null pointer and there is no "
+ << " way to recover from such an error in this "
+ << " context" << dendl;
+ ceph_abort();
+ }
+ return ref;
+ },
+ get_parent()->get_dpp());
+
+ dout(10) << __func__ << ": " << *op << dendl;
+
+ waiting_state.push_back(*op);
+ check_ops();
+}
+
+bool ECBackend::try_state_to_reads()
+{
+ if (waiting_state.empty())
+ return false;
+
+ Op *op = &(waiting_state.front());
+ if (op->requires_rmw() && pipeline_state.cache_invalid()) {
+ ceph_assert(get_parent()->get_pool().allows_ecoverwrites());
+ dout(20) << __func__ << ": blocking " << *op
+ << " because it requires an rmw and the cache is invalid "
+ << pipeline_state
+ << dendl;
+ return false;
+ }
+
+ if (!pipeline_state.caching_enabled()) {
+ op->using_cache = false;
+ } else if (op->invalidates_cache()) {
+ dout(20) << __func__ << ": invalidating cache after this op"
+ << dendl;
+ pipeline_state.invalidate();
+ }
+
+ waiting_state.pop_front();
+ waiting_reads.push_back(*op);
+
+ if (op->using_cache) {
+ cache.open_write_pin(op->pin);
+
+ extent_set empty;
+ for (auto &&hpair: op->plan.will_write) {
+ auto to_read_plan_iter = op->plan.to_read.find(hpair.first);
+ const extent_set &to_read_plan =
+ to_read_plan_iter == op->plan.to_read.end() ?
+ empty :
+ to_read_plan_iter->second;
+
+ extent_set remote_read = cache.reserve_extents_for_rmw(
+ hpair.first,
+ op->pin,
+ hpair.second,
+ to_read_plan);
+
+ extent_set pending_read = to_read_plan;
+ pending_read.subtract(remote_read);
+
+ if (!remote_read.empty()) {
+ op->remote_read[hpair.first] = std::move(remote_read);
+ }
+ if (!pending_read.empty()) {
+ op->pending_read[hpair.first] = std::move(pending_read);
+ }
+ }
+ } else {
+ op->remote_read = op->plan.to_read;
+ }
+
+ dout(10) << __func__ << ": " << *op << dendl;
+
+ if (!op->remote_read.empty()) {
+ ceph_assert(get_parent()->get_pool().allows_ecoverwrites());
+ objects_read_async_no_cache(
+ op->remote_read,
+ [this, op](map<hobject_t,pair<int, extent_map> > &&results) {
+ for (auto &&i: results) {
+ op->remote_read_result.emplace(i.first, i.second.second);
+ }
+ check_ops();
+ });
+ }
+
+ return true;
+}
+
+bool ECBackend::try_reads_to_commit()
+{
+ if (waiting_reads.empty())
+ return false;
+ Op *op = &(waiting_reads.front());
+ if (op->read_in_progress())
+ return false;
+ waiting_reads.pop_front();
+ waiting_commit.push_back(*op);
+
+ dout(10) << __func__ << ": starting commit on " << *op << dendl;
+ dout(20) << __func__ << ": " << cache << dendl;
+
+ get_parent()->apply_stats(
+ op->hoid,
+ op->delta_stats);
+
+ if (op->using_cache) {
+ for (auto &&hpair: op->pending_read) {
+ op->remote_read_result[hpair.first].insert(
+ cache.get_remaining_extents_for_rmw(
+ hpair.first,
+ op->pin,
+ hpair.second));
+ }
+ op->pending_read.clear();
+ } else {
+ ceph_assert(op->pending_read.empty());
+ }
+
+ map<shard_id_t, ObjectStore::Transaction> trans;
+ for (set<pg_shard_t>::const_iterator i =
+ get_parent()->get_acting_recovery_backfill_shards().begin();
+ i != get_parent()->get_acting_recovery_backfill_shards().end();
+ ++i) {
+ trans[i->shard];
+ }
+
+ op->trace.event("start ec write");
+
+ map<hobject_t,extent_map> written;
+ if (op->plan.t) {
+ ECTransaction::generate_transactions(
+ op->plan,
+ ec_impl,
+ get_parent()->get_info().pgid.pgid,
+ sinfo,
+ op->remote_read_result,
+ op->log_entries,
+ &written,
+ &trans,
+ &(op->temp_added),
+ &(op->temp_cleared),
+ get_parent()->get_dpp(),
+ get_osdmap()->require_osd_release);
+ }
+
+ dout(20) << __func__ << ": " << cache << dendl;
+ dout(20) << __func__ << ": written: " << written << dendl;
+ dout(20) << __func__ << ": op: " << *op << dendl;
+
+ if (!get_parent()->get_pool().allows_ecoverwrites()) {
+ for (auto &&i: op->log_entries) {
+ if (i.requires_kraken()) {
+ derr << __func__ << ": log entry " << i << " requires kraken"
+ << " but overwrites are not enabled!" << dendl;
+ ceph_abort();
+ }
+ }
+ }
+
+ map<hobject_t,extent_set> written_set;
+ for (auto &&i: written) {
+ written_set[i.first] = i.second.get_interval_set();
+ }
+ dout(20) << __func__ << ": written_set: " << written_set << dendl;
+ ceph_assert(written_set == op->plan.will_write);
+
+ if (op->using_cache) {
+ for (auto &&hpair: written) {
+ dout(20) << __func__ << ": " << hpair << dendl;
+ cache.present_rmw_update(hpair.first, op->pin, hpair.second);
+ }
+ }
+ op->remote_read.clear();
+ op->remote_read_result.clear();
+
+ ObjectStore::Transaction empty;
+ bool should_write_local = false;
+ ECSubWrite local_write_op;
+ std::vector<std::pair<int, Message*>> messages;
+ messages.reserve(get_parent()->get_acting_recovery_backfill_shards().size());
+ set<pg_shard_t> backfill_shards = get_parent()->get_backfill_shards();
+ for (set<pg_shard_t>::const_iterator i =
+ get_parent()->get_acting_recovery_backfill_shards().begin();
+ i != get_parent()->get_acting_recovery_backfill_shards().end();
+ ++i) {
+ op->pending_apply.insert(*i);
+ op->pending_commit.insert(*i);
+ map<shard_id_t, ObjectStore::Transaction>::iterator iter =
+ trans.find(i->shard);
+ ceph_assert(iter != trans.end());
+ bool should_send = get_parent()->should_send_op(*i, op->hoid);
+ const pg_stat_t &stats =
+ (should_send || !backfill_shards.count(*i)) ?
+ get_info().stats :
+ parent->get_shard_info().find(*i)->second.stats;
+
+ ECSubWrite sop(
+ get_parent()->whoami_shard(),
+ op->tid,
+ op->reqid,
+ op->hoid,
+ stats,
+ should_send ? iter->second : empty,
+ op->version,
+ op->trim_to,
+ op->roll_forward_to,
+ op->log_entries,
+ op->updated_hit_set_history,
+ op->temp_added,
+ op->temp_cleared,
+ !should_send);
+
+ ZTracer::Trace trace;
+ if (op->trace) {
+ // initialize a child span for this shard
+ trace.init("ec sub write", nullptr, &op->trace);
+ trace.keyval("shard", i->shard.id);
+ }
+
+ if (*i == get_parent()->whoami_shard()) {
+ should_write_local = true;
+ local_write_op.claim(sop);
+ } else {
+ MOSDECSubOpWrite *r = new MOSDECSubOpWrite(sop);
+ r->pgid = spg_t(get_parent()->primary_spg_t().pgid, i->shard);
+ r->map_epoch = get_osdmap_epoch();
+ r->min_epoch = get_parent()->get_interval_start_epoch();
+ r->trace = trace;
+ messages.push_back(std::make_pair(i->osd, r));
+ }
+ }
+
+#ifdef HAVE_JAEGER
+  if (op->client_op && op->client_op->osd_parent_span) {
+ auto sub_write_span = jaeger_tracing::child_span("EC sub write", op->client_op->osd_parent_span);
+ }
+#endif
+ if (!messages.empty()) {
+ get_parent()->send_message_osd_cluster(messages, get_osdmap_epoch());
+ }
+
+ if (should_write_local) {
+ handle_sub_write(
+ get_parent()->whoami_shard(),
+ op->client_op,
+ local_write_op,
+ op->trace);
+ }
+
+ for (auto i = op->on_write.begin();
+ i != op->on_write.end();
+ op->on_write.erase(i++)) {
+ (*i)();
+ }
+
+ return true;
+}
+
+bool ECBackend::try_finish_rmw()
+{
+ if (waiting_commit.empty())
+ return false;
+ Op *op = &(waiting_commit.front());
+ if (op->write_in_progress())
+ return false;
+ waiting_commit.pop_front();
+
+ dout(10) << __func__ << ": " << *op << dendl;
+ dout(20) << __func__ << ": " << cache << dendl;
+
+ if (op->roll_forward_to > completed_to)
+ completed_to = op->roll_forward_to;
+ if (op->version > committed_to)
+ committed_to = op->version;
+
+ if (get_osdmap()->require_osd_release >= ceph_release_t::kraken) {
+ if (op->version > get_parent()->get_log().get_can_rollback_to() &&
+ waiting_reads.empty() &&
+ waiting_commit.empty()) {
+ // submit a dummy transaction to kick the rollforward
+ auto tid = get_parent()->get_tid();
+ Op *nop = &(tid_to_op_map[tid]);
+ nop->hoid = op->hoid;
+ nop->trim_to = op->trim_to;
+ nop->roll_forward_to = op->version;
+ nop->tid = tid;
+ nop->reqid = op->reqid;
+ waiting_reads.push_back(*nop);
+ }
+ }
+
+ if (op->using_cache) {
+ cache.release_write_pin(op->pin);
+ }
+ tid_to_op_map.erase(op->tid);
+
+ if (waiting_reads.empty() &&
+ waiting_commit.empty()) {
+ pipeline_state.clear();
+ dout(20) << __func__ << ": clearing pipeline_state "
+ << pipeline_state
+ << dendl;
+ }
+ return true;
+}
+
+void ECBackend::check_ops()
+{
+ while (try_state_to_reads() ||
+ try_reads_to_commit() ||
+ try_finish_rmw());
+}
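+
+/* Sketch of the write pipeline that check_ops() drives: a submitted Op moves
+ * waiting_state -> waiting_reads (try_state_to_reads plans any RMW reads),
+ * then waiting_reads -> waiting_commit (try_reads_to_commit generates the
+ * per-shard transactions and sends ECSubWrite), and is retired by
+ * try_finish_rmw once every pending_commit/pending_apply ack has arrived.
+ * The loop repeats until no stage makes progress, so a single reply can
+ * advance several queued ops.
+ */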
+
+int ECBackend::objects_read_sync(
+ const hobject_t &hoid,
+ uint64_t off,
+ uint64_t len,
+ uint32_t op_flags,
+ bufferlist *bl)
+{
+ return -EOPNOTSUPP;
+}
+
+void ECBackend::objects_read_async(
+ const hobject_t &hoid,
+ const list<pair<boost::tuple<uint64_t, uint64_t, uint32_t>,
+ pair<bufferlist*, Context*> > > &to_read,
+ Context *on_complete,
+ bool fast_read)
+{
+ map<hobject_t,std::list<boost::tuple<uint64_t, uint64_t, uint32_t> > >
+ reads;
+
+ uint32_t flags = 0;
+ extent_set es;
+ for (list<pair<boost::tuple<uint64_t, uint64_t, uint32_t>,
+ pair<bufferlist*, Context*> > >::const_iterator i =
+ to_read.begin();
+ i != to_read.end();
+ ++i) {
+ pair<uint64_t, uint64_t> tmp =
+ sinfo.offset_len_to_stripe_bounds(
+ make_pair(i->first.get<0>(), i->first.get<1>()));
+
+ es.union_insert(tmp.first, tmp.second);
+ flags |= i->first.get<2>();
+ }
+
+ if (!es.empty()) {
+ auto &offsets = reads[hoid];
+ for (auto j = es.begin();
+ j != es.end();
+ ++j) {
+ offsets.push_back(
+ boost::make_tuple(
+ j.get_start(),
+ j.get_len(),
+ flags));
+ }
+ }
+
+ struct cb {
+ ECBackend *ec;
+ hobject_t hoid;
+ list<pair<boost::tuple<uint64_t, uint64_t, uint32_t>,
+ pair<bufferlist*, Context*> > > to_read;
+ unique_ptr<Context> on_complete;
+ cb(const cb&) = delete;
+ cb(cb &&) = default;
+ cb(ECBackend *ec,
+ const hobject_t &hoid,
+ const list<pair<boost::tuple<uint64_t, uint64_t, uint32_t>,
+ pair<bufferlist*, Context*> > > &to_read,
+ Context *on_complete)
+ : ec(ec),
+ hoid(hoid),
+ to_read(to_read),
+ on_complete(on_complete) {}
+ void operator()(map<hobject_t,pair<int, extent_map> > &&results) {
+ auto dpp = ec->get_parent()->get_dpp();
+ ldpp_dout(dpp, 20) << "objects_read_async_cb: got: " << results
+ << dendl;
+ ldpp_dout(dpp, 20) << "objects_read_async_cb: cache: " << ec->cache
+ << dendl;
+
+ auto &got = results[hoid];
+
+ int r = 0;
+ for (auto &&read: to_read) {
+ if (got.first < 0) {
+ if (read.second.second) {
+ read.second.second->complete(got.first);
+ }
+ if (r == 0)
+ r = got.first;
+ } else {
+ ceph_assert(read.second.first);
+ uint64_t offset = read.first.get<0>();
+ uint64_t length = read.first.get<1>();
+ auto range = got.second.get_containing_range(offset, length);
+ ceph_assert(range.first != range.second);
+ ceph_assert(range.first.get_off() <= offset);
+ ldpp_dout(dpp, 30) << "offset: " << offset << dendl;
+ ldpp_dout(dpp, 30) << "range offset: " << range.first.get_off() << dendl;
+ ldpp_dout(dpp, 30) << "length: " << length << dendl;
+ ldpp_dout(dpp, 30) << "range length: " << range.first.get_len() << dendl;
+ ceph_assert(
+ (offset + length) <=
+ (range.first.get_off() + range.first.get_len()));
+ read.second.first->substr_of(
+ range.first.get_val(),
+ offset - range.first.get_off(),
+ length);
+ if (read.second.second) {
+ read.second.second->complete(length);
+ read.second.second = nullptr;
+ }
+ }
+ }
+ to_read.clear();
+ if (on_complete) {
+ on_complete.release()->complete(r);
+ }
+ }
+ ~cb() {
+ for (auto &&i: to_read) {
+ delete i.second.second;
+ }
+ to_read.clear();
+ }
+ };
+ objects_read_and_reconstruct(
+ reads,
+ fast_read,
+ make_gen_lambda_context<
+ map<hobject_t,pair<int, extent_map> > &&, cb>(
+ cb(this,
+ hoid,
+ to_read,
+ on_complete)));
+}
+
+struct CallClientContexts :
+ public GenContext<pair<RecoveryMessages*, ECBackend::read_result_t& > &> {
+ hobject_t hoid;
+ ECBackend *ec;
+ ECBackend::ClientAsyncReadStatus *status;
+ list<boost::tuple<uint64_t, uint64_t, uint32_t> > to_read;
+ CallClientContexts(
+ hobject_t hoid,
+ ECBackend *ec,
+ ECBackend::ClientAsyncReadStatus *status,
+ const list<boost::tuple<uint64_t, uint64_t, uint32_t> > &to_read)
+ : hoid(hoid), ec(ec), status(status), to_read(to_read) {}
+ void finish(pair<RecoveryMessages *, ECBackend::read_result_t &> &in) override {
+ ECBackend::read_result_t &res = in.second;
+ extent_map result;
+ if (res.r != 0)
+ goto out;
+ ceph_assert(res.returned.size() == to_read.size());
+ ceph_assert(res.errors.empty());
+ for (auto &&read: to_read) {
+ pair<uint64_t, uint64_t> adjusted =
+ ec->sinfo.offset_len_to_stripe_bounds(
+ make_pair(read.get<0>(), read.get<1>()));
+ ceph_assert(res.returned.front().get<0>() == adjusted.first &&
+ res.returned.front().get<1>() == adjusted.second);
+ map<int, bufferlist> to_decode;
+ bufferlist bl;
+ for (map<pg_shard_t, bufferlist>::iterator j =
+ res.returned.front().get<2>().begin();
+ j != res.returned.front().get<2>().end();
+ ++j) {
+ to_decode[j->first.shard] = std::move(j->second);
+ }
+ int r = ECUtil::decode(
+ ec->sinfo,
+ ec->ec_impl,
+ to_decode,
+ &bl);
+ if (r < 0) {
+ res.r = r;
+ goto out;
+ }
+ bufferlist trimmed;
+ trimmed.substr_of(
+ bl,
+ read.get<0>() - adjusted.first,
+ std::min(read.get<1>(),
+ bl.length() - (read.get<0>() - adjusted.first)));
+ result.insert(
+ read.get<0>(), trimmed.length(), std::move(trimmed));
+ res.returned.pop_front();
+ }
+out:
+ status->complete_object(hoid, res.r, std::move(result));
+ ec->kick_reads();
+ }
+};
+
+void ECBackend::objects_read_and_reconstruct(
+ const map<hobject_t,
+ std::list<boost::tuple<uint64_t, uint64_t, uint32_t> >
+ > &reads,
+ bool fast_read,
+ GenContextURef<map<hobject_t,pair<int, extent_map> > &&> &&func)
+{
+ in_progress_client_reads.emplace_back(
+ reads.size(), std::move(func));
+ if (!reads.size()) {
+ kick_reads();
+ return;
+ }
+
+ map<hobject_t, set<int>> obj_want_to_read;
+ set<int> want_to_read;
+ get_want_to_read_shards(&want_to_read);
+
+ map<hobject_t, read_request_t> for_read_op;
+ for (auto &&to_read: reads) {
+ map<pg_shard_t, vector<pair<int, int>>> shards;
+ int r = get_min_avail_to_read_shards(
+ to_read.first,
+ want_to_read,
+ false,
+ fast_read,
+ &shards);
+ ceph_assert(r == 0);
+
+ CallClientContexts *c = new CallClientContexts(
+ to_read.first,
+ this,
+ &(in_progress_client_reads.back()),
+ to_read.second);
+ for_read_op.insert(
+ make_pair(
+ to_read.first,
+ read_request_t(
+ to_read.second,
+ shards,
+ false,
+ c)));
+ obj_want_to_read.insert(make_pair(to_read.first, want_to_read));
+ }
+
+ start_read_op(
+ CEPH_MSG_PRIO_DEFAULT,
+ obj_want_to_read,
+ for_read_op,
+ OpRequestRef(),
+ fast_read, false);
+ return;
+}
+
+
+int ECBackend::send_all_remaining_reads(
+ const hobject_t &hoid,
+ ReadOp &rop)
+{
+ set<int> already_read;
+ const set<pg_shard_t>& ots = rop.obj_to_source[hoid];
+ for (set<pg_shard_t>::iterator i = ots.begin(); i != ots.end(); ++i)
+ already_read.insert(i->shard);
+ dout(10) << __func__ << " have/error shards=" << already_read << dendl;
+ map<pg_shard_t, vector<pair<int, int>>> shards;
+ int r = get_remaining_shards(hoid, already_read, rop.want_to_read[hoid],
+ rop.complete[hoid], &shards, rop.for_recovery);
+ if (r)
+ return r;
+
+ list<boost::tuple<uint64_t, uint64_t, uint32_t> > offsets =
+ rop.to_read.find(hoid)->second.to_read;
+ GenContext<pair<RecoveryMessages *, read_result_t& > &> *c =
+ rop.to_read.find(hoid)->second.cb;
+
+  // (Note cuixf) If we need to read attrs and the read failed, try to read again.
+ bool want_attrs =
+ rop.to_read.find(hoid)->second.want_attrs &&
+ (!rop.complete[hoid].attrs || rop.complete[hoid].attrs->empty());
+ if (want_attrs) {
+ dout(10) << __func__ << " want attrs again" << dendl;
+ }
+
+ rop.to_read.erase(hoid);
+ rop.to_read.insert(make_pair(
+ hoid,
+ read_request_t(
+ offsets,
+ shards,
+ want_attrs,
+ c)));
+ return 0;
+}
+
+int ECBackend::objects_get_attrs(
+ const hobject_t &hoid,
+ map<string, bufferlist> *out)
+{
+ int r = store->getattrs(
+ ch,
+ ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
+ *out);
+ if (r < 0)
+ return r;
+
+ for (map<string, bufferlist>::iterator i = out->begin();
+ i != out->end();
+ ) {
+ if (ECUtil::is_hinfo_key_string(i->first))
+ out->erase(i++);
+ else
+ ++i;
+ }
+ return r;
+}
+
+void ECBackend::rollback_append(
+ const hobject_t &hoid,
+ uint64_t old_size,
+ ObjectStore::Transaction *t)
+{
+ ceph_assert(old_size % sinfo.get_stripe_width() == 0);
+ t->truncate(
+ coll,
+ ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
+ sinfo.aligned_logical_offset_to_chunk_offset(
+ old_size));
+}
+
+int ECBackend::be_deep_scrub(
+ const hobject_t &poid,
+ ScrubMap &map,
+ ScrubMapBuilder &pos,
+ ScrubMap::object &o)
+{
+ dout(10) << __func__ << " " << poid << " pos " << pos << dendl;
+ int r;
+
+ uint32_t fadvise_flags = CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL |
+ CEPH_OSD_OP_FLAG_FADVISE_DONTNEED;
+
+ utime_t sleeptime;
+ sleeptime.set_from_double(cct->_conf->osd_debug_deep_scrub_sleep);
+ if (sleeptime != utime_t()) {
+ lgeneric_derr(cct) << __func__ << " sleeping for " << sleeptime << dendl;
+ sleeptime.sleep();
+ }
+
+ if (pos.data_pos == 0) {
+ pos.data_hash = bufferhash(-1);
+ }
+
+ uint64_t stride = cct->_conf->osd_deep_scrub_stride;
+ if (stride % sinfo.get_chunk_size())
+ stride += sinfo.get_chunk_size() - (stride % sinfo.get_chunk_size());
+
+ bufferlist bl;
+ r = store->read(
+ ch,
+ ghobject_t(
+ poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
+ pos.data_pos,
+ stride, bl,
+ fadvise_flags);
+ if (r < 0) {
+ dout(20) << __func__ << " " << poid << " got "
+ << r << " on read, read_error" << dendl;
+ o.read_error = true;
+ return 0;
+ }
+ if (bl.length() % sinfo.get_chunk_size()) {
+ dout(20) << __func__ << " " << poid << " got "
+ << r << " on read, not chunk size " << sinfo.get_chunk_size() << " aligned"
+ << dendl;
+ o.read_error = true;
+ return 0;
+ }
+ if (r > 0) {
+ pos.data_hash << bl;
+ }
+ pos.data_pos += r;
+ if (r == (int)stride) {
+ return -EINPROGRESS;
+ }
+
+ ECUtil::HashInfoRef hinfo = get_hash_info(poid, false, &o.attrs);
+ if (!hinfo) {
+ dout(0) << "_scan_list " << poid << " could not retrieve hash info" << dendl;
+ o.read_error = true;
+ o.digest_present = false;
+ return 0;
+ } else {
+ if (!get_parent()->get_pool().allows_ecoverwrites()) {
+ if (!hinfo->has_chunk_hash()) {
+ dout(0) << "_scan_list " << poid << " got invalid hash info" << dendl;
+ o.ec_size_mismatch = true;
+ return 0;
+ }
+ if (hinfo->get_total_chunk_size() != (unsigned)pos.data_pos) {
+ dout(0) << "_scan_list " << poid << " got incorrect size on read 0x"
+	      << std::hex << pos.data_pos
+ << " expected 0x" << hinfo->get_total_chunk_size() << std::dec
+ << dendl;
+ o.ec_size_mismatch = true;
+ return 0;
+ }
+
+ if (hinfo->get_chunk_hash(get_parent()->whoami_shard().shard) !=
+ pos.data_hash.digest()) {
+ dout(0) << "_scan_list " << poid << " got incorrect hash on read 0x"
+ << std::hex << pos.data_hash.digest() << " != expected 0x"
+ << hinfo->get_chunk_hash(get_parent()->whoami_shard().shard)
+ << std::dec << dendl;
+ o.ec_hash_mismatch = true;
+ return 0;
+ }
+
+ /* We checked above that we match our own stored hash. We cannot
+ * send a hash of the actual object, so instead we simply send
+ * our locally stored hash of shard 0 on the assumption that if
+ * we match our chunk hash and our recollection of the hash for
+ * chunk 0 matches that of our peers, there is likely no corruption.
+ */
+ o.digest = hinfo->get_chunk_hash(0);
+ o.digest_present = true;
+ } else {
+ /* Hack! We must be using partial overwrites, and partial overwrites
+ * don't support deep-scrub yet
+ */
+ o.digest = 0;
+ o.digest_present = true;
+ }
+ }
+
+ o.omap_digest = -1;
+ o.omap_digest_present = true;
+ return 0;
+}
diff --git a/src/osd/ECBackend.h b/src/osd/ECBackend.h
new file mode 100644
index 000000000..45495376a
--- /dev/null
+++ b/src/osd/ECBackend.h
@@ -0,0 +1,686 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Inktank Storage, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef ECBACKEND_H
+#define ECBACKEND_H
+
+#include <boost/intrusive/set.hpp>
+#include <boost/intrusive/list.hpp>
+
+#include "OSD.h"
+#include "PGBackend.h"
+#include "erasure-code/ErasureCodeInterface.h"
+#include "ECUtil.h"
+#include "ECTransaction.h"
+#include "ExtentCache.h"
+
+// forward declarations
+struct ECSubWrite;
+struct ECSubWriteReply;
+struct ECSubRead;
+struct ECSubReadReply;
+
+struct RecoveryMessages;
+class ECBackend : public PGBackend {
+public:
+ RecoveryHandle *open_recovery_op() override;
+
+ void run_recovery_op(
+ RecoveryHandle *h,
+ int priority
+ ) override;
+
+ int recover_object(
+ const hobject_t &hoid,
+ eversion_t v,
+ ObjectContextRef head,
+ ObjectContextRef obc,
+ RecoveryHandle *h
+ ) override;
+
+ bool _handle_message(
+ OpRequestRef op
+ ) override;
+ bool can_handle_while_inactive(
+ OpRequestRef op
+ ) override;
+ friend struct SubWriteApplied;
+ friend struct SubWriteCommitted;
+ void sub_write_committed(
+ ceph_tid_t tid,
+ eversion_t version,
+ eversion_t last_complete,
+ const ZTracer::Trace &trace);
+ void handle_sub_write(
+ pg_shard_t from,
+ OpRequestRef msg,
+ ECSubWrite &op,
+ const ZTracer::Trace &trace
+ );
+ void handle_sub_read(
+ pg_shard_t from,
+ const ECSubRead &op,
+ ECSubReadReply *reply,
+ const ZTracer::Trace &trace
+ );
+ void handle_sub_write_reply(
+ pg_shard_t from,
+ const ECSubWriteReply &op,
+ const ZTracer::Trace &trace
+ );
+ void handle_sub_read_reply(
+ pg_shard_t from,
+ ECSubReadReply &op,
+ RecoveryMessages *m,
+ const ZTracer::Trace &trace
+ );
+
+ /// @see ReadOp below
+ void check_recovery_sources(const OSDMapRef& osdmap) override;
+
+ void on_change() override;
+ void clear_recovery_state() override;
+
+ void dump_recovery_info(ceph::Formatter *f) const override;
+
+ void call_write_ordered(std::function<void(void)> &&cb) override;
+
+ void submit_transaction(
+ const hobject_t &hoid,
+ const object_stat_sum_t &delta_stats,
+ const eversion_t &at_version,
+ PGTransactionUPtr &&t,
+ const eversion_t &trim_to,
+ const eversion_t &min_last_complete_ondisk,
+ std::vector<pg_log_entry_t>&& log_entries,
+ std::optional<pg_hit_set_history_t> &hset_history,
+ Context *on_all_commit,
+ ceph_tid_t tid,
+ osd_reqid_t reqid,
+ OpRequestRef op
+ ) override;
+
+ int objects_read_sync(
+ const hobject_t &hoid,
+ uint64_t off,
+ uint64_t len,
+ uint32_t op_flags,
+ ceph::buffer::list *bl) override;
+
+ /**
+ * Async read mechanism
+ *
+ * Async reads use the same async read mechanism as does recovery.
+ * CallClientContexts is responsible for reconstructing the response
+ * buffer as well as for calling the callbacks.
+ *
+   * One tricky bit is that two reads may possibly not read from the same
+   * set of replicas. This could result in two reads completing in the
+   * wrong order from the interface user's point of view. Thus, we
+ * maintain a queue of in progress reads (@see in_progress_client_reads)
+ * to ensure that we always call the completion callback in order.
+ *
+   * Another subtlety is that while we may read a degraded object, we will
+   * still only perform a client read from shards in the acting set. This
+ * ensures that we won't ever have to restart a client initiated read in
+ * check_recovery_sources.
+ */
+ void objects_read_and_reconstruct(
+ const std::map<hobject_t, std::list<boost::tuple<uint64_t, uint64_t, uint32_t> >
+ > &reads,
+ bool fast_read,
+ GenContextURef<std::map<hobject_t,std::pair<int, extent_map> > &&> &&func);
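+
+  /* Usage sketch of the ordered-completion queue described above (the member
+   * names are real; the scenario is hypothetical).  Both read entry points
+   * funnel into objects_read_and_reconstruct(), which appends a
+   * ClientAsyncReadStatus, and kick_reads() only ever runs the front of
+   * in_progress_client_reads:
+   *
+   *   objects_read_async(objA, ...);  // queued first
+   *   objects_read_async(objB, ...);  // queued second
+   *   // objB's shards reply first: its status is complete but not run;
+   *   // once objA completes, kick_reads() runs objA's callback, then objB's.
+   */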
+
+ friend struct CallClientContexts;
+ struct ClientAsyncReadStatus {
+ unsigned objects_to_read;
+ GenContextURef<std::map<hobject_t,std::pair<int, extent_map> > &&> func;
+ std::map<hobject_t,std::pair<int, extent_map> > results;
+ explicit ClientAsyncReadStatus(
+ unsigned objects_to_read,
+ GenContextURef<std::map<hobject_t,std::pair<int, extent_map> > &&> &&func)
+ : objects_to_read(objects_to_read), func(std::move(func)) {}
+ void complete_object(
+ const hobject_t &hoid,
+ int err,
+ extent_map &&buffers) {
+ ceph_assert(objects_to_read);
+ --objects_to_read;
+ ceph_assert(!results.count(hoid));
+ results.emplace(hoid, std::make_pair(err, std::move(buffers)));
+ }
+ bool is_complete() const {
+ return objects_to_read == 0;
+ }
+ void run() {
+ func.release()->complete(std::move(results));
+ }
+ };
+ std::list<ClientAsyncReadStatus> in_progress_client_reads;
+ void objects_read_async(
+ const hobject_t &hoid,
+ const std::list<std::pair<boost::tuple<uint64_t, uint64_t, uint32_t>,
+ std::pair<ceph::buffer::list*, Context*> > > &to_read,
+ Context *on_complete,
+ bool fast_read = false) override;
+
+ template <typename Func>
+ void objects_read_async_no_cache(
+ const std::map<hobject_t,extent_set> &to_read,
+ Func &&on_complete) {
+ std::map<hobject_t,std::list<boost::tuple<uint64_t, uint64_t, uint32_t> > > _to_read;
+ for (auto &&hpair: to_read) {
+ auto &l = _to_read[hpair.first];
+ for (auto extent: hpair.second) {
+ l.emplace_back(extent.first, extent.second, 0);
+ }
+ }
+ objects_read_and_reconstruct(
+ _to_read,
+ false,
+ make_gen_lambda_context<
+ std::map<hobject_t,std::pair<int, extent_map> > &&, Func>(
+ std::forward<Func>(on_complete)));
+ }
+ void kick_reads() {
+ while (in_progress_client_reads.size() &&
+ in_progress_client_reads.front().is_complete()) {
+ in_progress_client_reads.front().run();
+ in_progress_client_reads.pop_front();
+ }
+ }
+
+private:
+ friend struct ECRecoveryHandle;
+ uint64_t get_recovery_chunk_size() const {
+ return round_up_to(cct->_conf->osd_recovery_max_chunk,
+ sinfo.get_stripe_width());
+ }
+
+ void get_want_to_read_shards(std::set<int> *want_to_read) const {
+ const std::vector<int> &chunk_mapping = ec_impl->get_chunk_mapping();
+ for (int i = 0; i < (int)ec_impl->get_data_chunk_count(); ++i) {
+ int chunk = (int)chunk_mapping.size() > i ? chunk_mapping[i] : i;
+ want_to_read->insert(chunk);
+ }
+ }
+
+ /**
+ * Recovery
+ *
+ * Recovery uses the same underlying read mechanism as client reads
+ * with the slight difference that recovery reads may come from non
+ * acting shards. Thus, check_recovery_sources may wind up calling
+ * cancel_pull for a read originating with RecoveryOp.
+ *
+ * The recovery process is expressed as a state machine:
+ * - IDLE: Nothing is currently in progress, reads will be started and
+ * we will transition to READING
+ * - READING: We are awaiting a pending read op. Once complete, we will
+ * decode the buffers and proceed to WRITING
+ * - WRITING: We are awaiting a completed push. Once complete, we will
+ * either transition to COMPLETE or to IDLE to continue.
+ * - COMPLETE: complete
+ *
+ * We use the existing Push and PushReply messages and structures to
+ * handle actually shuffling the data over to the replicas. recovery_info
+ * and recovery_progress are expressed in terms of the logical offset
+ * space except for data_included which is in terms of the chunked object
+ * space (to match the passed buffer).
+ *
+ * xattrs are requested on the first read and used to initialize the
+ * object_context if missing on completion of the first read.
+ *
+ * In order to batch up reads and writes, we batch Push, PushReply,
+ * Transaction, and reads in a RecoveryMessages object which is passed
+ * among the recovery methods.
+ */
+ struct RecoveryOp {
+ hobject_t hoid;
+ eversion_t v;
+ std::set<pg_shard_t> missing_on;
+ std::set<shard_id_t> missing_on_shards;
+
+ ObjectRecoveryInfo recovery_info;
+ ObjectRecoveryProgress recovery_progress;
+
+ enum state_t { IDLE, READING, WRITING, COMPLETE } state;
+
+ static const char* tostr(state_t state) {
+ switch (state) {
+ case ECBackend::RecoveryOp::IDLE:
+ return "IDLE";
+ case ECBackend::RecoveryOp::READING:
+ return "READING";
+ case ECBackend::RecoveryOp::WRITING:
+ return "WRITING";
+ case ECBackend::RecoveryOp::COMPLETE:
+ return "COMPLETE";
+ default:
+ ceph_abort();
+ return "";
+ }
+ }
+
+ // must be filled if state == WRITING
+ std::map<int, ceph::buffer::list> returned_data;
+ std::map<std::string, ceph::buffer::list> xattrs;
+ ECUtil::HashInfoRef hinfo;
+ ObjectContextRef obc;
+ std::set<pg_shard_t> waiting_on_pushes;
+
+ // valid in state READING
+ std::pair<uint64_t, uint64_t> extent_requested;
+
+ void dump(ceph::Formatter *f) const;
+
+ RecoveryOp() : state(IDLE) {}
+ };
+ friend ostream &operator<<(ostream &lhs, const RecoveryOp &rhs);
+ std::map<hobject_t, RecoveryOp> recovery_ops;
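+
+  /* Sketch of one recovery pass through the state machine documented above,
+   * assuming the object needs more than one chunk-sized round:
+   *
+   *   IDLE    --continue_recovery_op------------> READING  (reads batched
+   *                                                         in RecoveryMessages)
+   *   READING --handle_recovery_read_complete---> WRITING  (decoded pushes sent)
+   *   WRITING --handle_recovery_push_reply------> IDLE or COMPLETE
+   *
+   * The IDLE/READING/WRITING loop repeats, one recovery_progress window at a
+   * time, until the object is fully pushed.
+   */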
+
+ void continue_recovery_op(
+ RecoveryOp &op,
+ RecoveryMessages *m);
+ void dispatch_recovery_messages(RecoveryMessages &m, int priority);
+ friend struct OnRecoveryReadComplete;
+ void handle_recovery_read_complete(
+ const hobject_t &hoid,
+ boost::tuple<uint64_t, uint64_t, std::map<pg_shard_t, ceph::buffer::list> > &to_read,
+ std::optional<std::map<std::string, ceph::buffer::list> > attrs,
+ RecoveryMessages *m);
+ void handle_recovery_push(
+ const PushOp &op,
+ RecoveryMessages *m,
+ bool is_repair);
+ void handle_recovery_push_reply(
+ const PushReplyOp &op,
+ pg_shard_t from,
+ RecoveryMessages *m);
+ void get_all_avail_shards(
+ const hobject_t &hoid,
+ const std::set<pg_shard_t> &error_shards,
+ std::set<int> &have,
+ std::map<shard_id_t, pg_shard_t> &shards,
+ bool for_recovery);
+
+public:
+ /**
+ * Low level async read mechanism
+ *
+ * To avoid duplicating the logic for requesting and waiting for
+ * multiple object shards, there is a common async read mechanism
+ * taking a std::map of hobject_t->read_request_t which defines callbacks
+ * taking read_result_ts as arguments.
+ *
+ * tid_to_read_map gives open read ops. check_recovery_sources uses
+ * shard_to_read_map and ReadOp::source_to_obj to restart reads
+ * involving down osds.
+ *
+ * The user is responsible for specifying replicas on which to read
+ * and for reassembling the buffer on the other side since client
+ * reads require the original object buffer while recovery only needs
+ * the missing pieces.
+ *
+ * Rather than handling reads on the primary directly, we simply send
+ * ourselves a message. This avoids a dedicated primary path for that
+ * part.
+ */
+ struct read_result_t {
+ int r;
+ std::map<pg_shard_t, int> errors;
+ std::optional<std::map<std::string, ceph::buffer::list> > attrs;
+ std::list<
+ boost::tuple<
+ uint64_t, uint64_t, std::map<pg_shard_t, ceph::buffer::list> > > returned;
+ read_result_t() : r(0) {}
+ };
+ struct read_request_t {
+ const std::list<boost::tuple<uint64_t, uint64_t, uint32_t> > to_read;
+ std::map<pg_shard_t, std::vector<std::pair<int, int>>> need;
+ bool want_attrs;
+ GenContext<std::pair<RecoveryMessages *, read_result_t& > &> *cb;
+ read_request_t(
+ const std::list<boost::tuple<uint64_t, uint64_t, uint32_t> > &to_read,
+ const std::map<pg_shard_t, std::vector<std::pair<int, int>>> &need,
+ bool want_attrs,
+ GenContext<std::pair<RecoveryMessages *, read_result_t& > &> *cb)
+ : to_read(to_read), need(need), want_attrs(want_attrs),
+ cb(cb) {}
+ };
+ friend ostream &operator<<(ostream &lhs, const read_request_t &rhs);
+
+ struct ReadOp {
+ int priority;
+ ceph_tid_t tid;
+ OpRequestRef op; // may be null if not on behalf of a client
+ // True if redundant reads are issued, false otherwise.
+ // This is useful to trade some resources (redundant ops) for
+ // lower read latency, especially on a relatively idle cluster.
+ bool do_redundant_reads;
+ // True if reading for recovery, which may read only a subset
+ // of the available shards.
+ bool for_recovery;
+
+ ZTracer::Trace trace;
+
+ std::map<hobject_t, std::set<int>> want_to_read;
+ std::map<hobject_t, read_request_t> to_read;
+ std::map<hobject_t, read_result_t> complete;
+
+ std::map<hobject_t, std::set<pg_shard_t>> obj_to_source;
+ std::map<pg_shard_t, std::set<hobject_t> > source_to_obj;
+
+ void dump(ceph::Formatter *f) const;
+
+ std::set<pg_shard_t> in_progress;
+
+ ReadOp(
+ int priority,
+ ceph_tid_t tid,
+ bool do_redundant_reads,
+ bool for_recovery,
+ OpRequestRef op,
+ std::map<hobject_t, std::set<int>> &&_want_to_read,
+ std::map<hobject_t, read_request_t> &&_to_read)
+ : priority(priority), tid(tid), op(op), do_redundant_reads(do_redundant_reads),
+ for_recovery(for_recovery), want_to_read(std::move(_want_to_read)),
+ to_read(std::move(_to_read)) {
+ for (auto &&hpair: to_read) {
+ auto &returned = complete[hpair.first].returned;
+ for (auto &&extent: hpair.second.to_read) {
+ returned.push_back(
+ boost::make_tuple(
+ extent.get<0>(),
+ extent.get<1>(),
+ std::map<pg_shard_t, ceph::buffer::list>()));
+ }
+ }
+ }
+ ReadOp() = delete;
+ ReadOp(const ReadOp &) = default;
+ ReadOp(ReadOp &&) = default;
+ };
+ friend struct FinishReadOp;
+ void filter_read_op(
+ const OSDMapRef& osdmap,
+ ReadOp &op);
+ void complete_read_op(ReadOp &rop, RecoveryMessages *m);
+ friend ostream &operator<<(ostream &lhs, const ReadOp &rhs);
+ std::map<ceph_tid_t, ReadOp> tid_to_read_map;
+ std::map<pg_shard_t, std::set<ceph_tid_t> > shard_to_read_map;
+ void start_read_op(
+ int priority,
+ std::map<hobject_t, std::set<int>> &want_to_read,
+ std::map<hobject_t, read_request_t> &to_read,
+ OpRequestRef op,
+ bool do_redundant_reads, bool for_recovery);
+
+ void do_read_op(ReadOp &rop);
+ int send_all_remaining_reads(
+ const hobject_t &hoid,
+ ReadOp &rop);
+
+
+ /**
+ * Client writes
+ *
+ * ECTransaction is responsible for generating a transaction for
+ * each shard to which we need to send the write. As required
+ * by the PGBackend interface, the ECBackend write mechanism
+ * passes trim information with the write and last_complete back
+ * with the reply.
+ *
+ * As with client reads, there is a possibility of out-of-order
+ * completions. Thus, callbacks and completions are called in order
+ * on the writing list.
+ */
+ struct Op : boost::intrusive::list_base_hook<> {
+ /// From submit_transaction caller, describes operation
+ hobject_t hoid;
+ object_stat_sum_t delta_stats;
+ eversion_t version;
+ eversion_t trim_to;
+ std::optional<pg_hit_set_history_t> updated_hit_set_history;
+ std::vector<pg_log_entry_t> log_entries;
+ ceph_tid_t tid;
+ osd_reqid_t reqid;
+ ZTracer::Trace trace;
+
+ eversion_t roll_forward_to; /// Soon to be generated internally
+
+ /// Ancillary also provided from submit_transaction caller
+ std::map<hobject_t, ObjectContextRef> obc_map;
+
+ /// see call_write_ordered
+ std::list<std::function<void(void)> > on_write;
+
+ /// Generated internally
+ std::set<hobject_t> temp_added;
+ std::set<hobject_t> temp_cleared;
+
+ ECTransaction::WritePlan plan;
+ bool requires_rmw() const { return !plan.to_read.empty(); }
+ bool invalidates_cache() const { return plan.invalidates_cache; }
+
+ // must be true if requires_rmw(), must be false if invalidates_cache()
+ bool using_cache = true;
+
+ /// In progress read state;
+ std::map<hobject_t,extent_set> pending_read; // subset already being read
+ std::map<hobject_t,extent_set> remote_read; // subset we must read
+ std::map<hobject_t,extent_map> remote_read_result;
+ bool read_in_progress() const {
+ return !remote_read.empty() && remote_read_result.empty();
+ }
+
+ /// In progress write state.
+ std::set<pg_shard_t> pending_commit;
+ // we need pending_apply for pre-mimic peers so that we don't issue a
+ // read on a remote shard before it has applied a previous write. We can
+ // remove this after nautilus.
+ std::set<pg_shard_t> pending_apply;
+ bool write_in_progress() const {
+ return !pending_commit.empty() || !pending_apply.empty();
+ }
+
+ /// optional, may be null, for tracking purposes
+ OpRequestRef client_op;
+
+ /// pin for cache
+ ExtentCache::write_pin pin;
+
+ /// Callbacks
+ Context *on_all_commit = nullptr;
+ ~Op() {
+ delete on_all_commit;
+ }
+ };
+ using op_list = boost::intrusive::list<Op>;
+ friend ostream &operator<<(ostream &lhs, const Op &rhs);
+
+ ExtentCache cache;
+ std::map<ceph_tid_t, Op> tid_to_op_map; /// Owns Op structure
+
+ /**
+ * We model the possible rmw states as a set of waitlists.
+ * All writes at this time complete in order, so a write blocked
+ * at waiting_state blocks all writes behind it as well (same for
+ * other states).
+ *
+ * Future work: We can break this up into a per-object pipeline
+ * (almost). First, provide an ordering token to submit_transaction
+ * and require that all operations within a single transaction take
+ * place on a subset of hobject_t space partitioned by that token
+ * (the hashid seems about right to me -- even works for temp objects
+ * if you recall that a temp object created for object head foo will
+ * only ever be referenced by other transactions on foo and isn't
+ * reused). Next, factor this part into a class and maintain one per
+ * ordering token. Next, fixup PrimaryLogPG's repop queue to be
+ * partitioned by ordering token. Finally, refactor the op pipeline
+ * so that the log entries passed into submit_transaction aren't
+ * versioned. We can't assign versions to them until we actually
+ * submit the operation. That's probably going to be the hard part.
+ */
+ class pipeline_state_t {
+ enum {
+ CACHE_VALID = 0,
+ CACHE_INVALID = 1
+ } pipeline_state = CACHE_VALID;
+ public:
+ bool caching_enabled() const {
+ return pipeline_state == CACHE_VALID;
+ }
+ bool cache_invalid() const {
+ return !caching_enabled();
+ }
+ void invalidate() {
+ pipeline_state = CACHE_INVALID;
+ }
+ void clear() {
+ pipeline_state = CACHE_VALID;
+ }
+ friend ostream &operator<<(ostream &lhs, const pipeline_state_t &rhs);
+ } pipeline_state;
+
+
+ op_list waiting_state; /// writes waiting on pipe_state
+ op_list waiting_reads; /// writes waiting on partial stripe reads
+ op_list waiting_commit; /// writes waiting on initial commit
+ eversion_t completed_to;
+ eversion_t committed_to;
+ void start_rmw(Op *op, PGTransactionUPtr &&t);
+ bool try_state_to_reads();
+ bool try_reads_to_commit();
+ bool try_finish_rmw();
+ void check_ops();
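+
+ // Hedged sketch of how the stages above are driven (an illustration of the
+ // intended pipeline, not necessarily a verbatim copy of the .cc): each pass
+ // advances ops waiting_state -> waiting_reads -> waiting_commit until no
+ // further progress is possible.
+ //
+ //   void check_ops() {
+ //     while (try_state_to_reads() ||
+ //            try_reads_to_commit() ||
+ //            try_finish_rmw());
+ //   }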
+
+ ceph::ErasureCodeInterfaceRef ec_impl;
+
+
+ /**
+ * ECRecPred
+ *
+ * Determines whether _have is sufficient to recover an object
+ */
+ class ECRecPred : public IsPGRecoverablePredicate {
+ std::set<int> want;
+ ceph::ErasureCodeInterfaceRef ec_impl;
+ public:
+ explicit ECRecPred(ceph::ErasureCodeInterfaceRef ec_impl) : ec_impl(ec_impl) {
+ for (unsigned i = 0; i < ec_impl->get_chunk_count(); ++i) {
+ want.insert(i);
+ }
+ }
+ bool operator()(const std::set<pg_shard_t> &_have) const override {
+ std::set<int> have;
+ for (std::set<pg_shard_t>::const_iterator i = _have.begin();
+ i != _have.end();
+ ++i) {
+ have.insert(i->shard);
+ }
+ std::map<int, std::vector<std::pair<int, int>>> min;
+ return ec_impl->minimum_to_decode(want, have, &min) == 0;
+ }
+ };
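+
+ // Worked example for ECRecPred above (the 2+1 erasure profile is an assumed
+ // illustration, not taken from this change):
+ //   want = {0, 1, 2}
+ //   _have holding shards {0, 2}  -> minimum_to_decode() == 0 -> recoverable
+ //   _have holding only shard {0} -> cannot decode            -> not recoverable
+ // Only shard ids matter here; which OSD currently holds a shard is ignored.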
+ IsPGRecoverablePredicate *get_is_recoverable_predicate() const override {
+ return new ECRecPred(ec_impl);
+ }
+
+ int get_ec_data_chunk_count() const override {
+ return ec_impl->get_data_chunk_count();
+ }
+ int get_ec_stripe_chunk_size() const override {
+ return sinfo.get_chunk_size();
+ }
+
+ /**
+ * ECReadPred
+ *
+ * Determines whether _have is sufficient to read an object
+ */
+ class ECReadPred : public IsPGReadablePredicate {
+ pg_shard_t whoami;
+ ECRecPred rec_pred;
+ public:
+ ECReadPred(
+ pg_shard_t whoami,
+ ceph::ErasureCodeInterfaceRef ec_impl) : whoami(whoami), rec_pred(ec_impl) {}
+ bool operator()(const std::set<pg_shard_t> &_have) const override {
+ return _have.count(whoami) && rec_pred(_have);
+ }
+ };
+ IsPGReadablePredicate *get_is_readable_predicate() const override {
+ return new ECReadPred(get_parent()->whoami_shard(), ec_impl);
+ }
+
+
+ const ECUtil::stripe_info_t sinfo;
+ /// If modified, ensure that the ref is held until the update is applied
+ SharedPtrRegistry<hobject_t, ECUtil::HashInfo> unstable_hashinfo_registry;
+ ECUtil::HashInfoRef get_hash_info(const hobject_t &hoid, bool create = false,
+ const std::map<std::string, ceph::buffer::ptr> *attr = NULL);
+
+public:
+ ECBackend(
+ PGBackend::Listener *pg,
+ const coll_t &coll,
+ ObjectStore::CollectionHandle &ch,
+ ObjectStore *store,
+ CephContext *cct,
+ ceph::ErasureCodeInterfaceRef ec_impl,
+ uint64_t stripe_width);
+
+ /// Returns to_read replicas sufficient to reconstruct want
+ int get_min_avail_to_read_shards(
+ const hobject_t &hoid, ///< [in] object
+ const std::set<int> &want, ///< [in] desired shards
+ bool for_recovery, ///< [in] true if we may use non-acting replicas
+ bool do_redundant_reads, ///< [in] true if we want to issue redundant reads to reduce latency
+ std::map<pg_shard_t, std::vector<std::pair<int, int>>> *to_read ///< [out] shards, corresponding subchunks to read
+ ); ///< @return error code, 0 on success
+
+ int get_remaining_shards(
+ const hobject_t &hoid,
+ const std::set<int> &avail,
+ const std::set<int> &want,
+ const read_result_t &result,
+ std::map<pg_shard_t, std::vector<std::pair<int, int>>> *to_read,
+ bool for_recovery);
+
+ int objects_get_attrs(
+ const hobject_t &hoid,
+ std::map<std::string, ceph::buffer::list> *out) override;
+
+ void rollback_append(
+ const hobject_t &hoid,
+ uint64_t old_size,
+ ObjectStore::Transaction *t) override;
+
+ bool auto_repair_supported() const override { return true; }
+
+ int be_deep_scrub(
+ const hobject_t &poid,
+ ScrubMap &map,
+ ScrubMapBuilder &pos,
+ ScrubMap::object &o) override;
+ uint64_t be_get_ondisk_size(uint64_t logical_size) override {
+ return sinfo.logical_to_next_chunk_offset(logical_size);
+ }
+ void _failed_push(const hobject_t &hoid,
+ std::pair<RecoveryMessages *, ECBackend::read_result_t &> &in);
+};
+ostream &operator<<(ostream &lhs, const ECBackend::pipeline_state_t &rhs);
+
+#endif
diff --git a/src/osd/ECMsgTypes.cc b/src/osd/ECMsgTypes.cc
new file mode 100644
index 000000000..a65676643
--- /dev/null
+++ b/src/osd/ECMsgTypes.cc
@@ -0,0 +1,393 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Inktank Storage, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "ECMsgTypes.h"
+
+using std::list;
+using std::make_pair;
+using std::map;
+using std::pair;
+using std::set;
+using ceph::bufferlist;
+using ceph::Formatter;
+
+void ECSubWrite::encode(bufferlist &bl) const
+{
+ ENCODE_START(4, 1, bl);
+ encode(from, bl);
+ encode(tid, bl);
+ encode(reqid, bl);
+ encode(soid, bl);
+ encode(stats, bl);
+ encode(t, bl);
+ encode(at_version, bl);
+ encode(trim_to, bl);
+ encode(log_entries, bl);
+ encode(temp_added, bl);
+ encode(temp_removed, bl);
+ encode(updated_hit_set_history, bl);
+ encode(roll_forward_to, bl);
+ encode(backfill_or_async_recovery, bl);
+ ENCODE_FINISH(bl);
+}
+
+void ECSubWrite::decode(bufferlist::const_iterator &bl)
+{
+ DECODE_START(4, bl);
+ decode(from, bl);
+ decode(tid, bl);
+ decode(reqid, bl);
+ decode(soid, bl);
+ decode(stats, bl);
+ decode(t, bl);
+ decode(at_version, bl);
+ decode(trim_to, bl);
+ decode(log_entries, bl);
+ decode(temp_added, bl);
+ decode(temp_removed, bl);
+ if (struct_v >= 2) {
+ decode(updated_hit_set_history, bl);
+ }
+ if (struct_v >= 3) {
+ decode(roll_forward_to, bl);
+ } else {
+ roll_forward_to = trim_to;
+ }
+ if (struct_v >= 4) {
+ decode(backfill_or_async_recovery, bl);
+ } else {
+ // The old protocol used an empty transaction to indicate backfill or async_recovery
+ backfill_or_async_recovery = t.empty();
+ }
+ DECODE_FINISH(bl);
+}
+
+std::ostream &operator<<(
+ std::ostream &lhs, const ECSubWrite &rhs)
+{
+ lhs << "ECSubWrite(tid=" << rhs.tid
+ << ", reqid=" << rhs.reqid
+ << ", at_version=" << rhs.at_version
+ << ", trim_to=" << rhs.trim_to
+ << ", roll_forward_to=" << rhs.roll_forward_to;
+ if (rhs.updated_hit_set_history)
+ lhs << ", has_updated_hit_set_history";
+ if (rhs.backfill_or_async_recovery)
+ lhs << ", backfill_or_async_recovery";
+ return lhs << ")";
+}
+
+void ECSubWrite::dump(Formatter *f) const
+{
+ f->dump_unsigned("tid", tid);
+ f->dump_stream("reqid") << reqid;
+ f->dump_stream("at_version") << at_version;
+ f->dump_stream("trim_to") << trim_to;
+ f->dump_stream("roll_forward_to") << roll_forward_to;
+ f->dump_bool("has_updated_hit_set_history",
+ static_cast<bool>(updated_hit_set_history));
+ f->dump_bool("backfill_or_async_recovery", backfill_or_async_recovery);
+}
+
+void ECSubWrite::generate_test_instances(list<ECSubWrite*> &o)
+{
+ o.push_back(new ECSubWrite());
+ o.back()->tid = 1;
+ o.back()->at_version = eversion_t(2, 100);
+ o.back()->trim_to = eversion_t(1, 40);
+ o.push_back(new ECSubWrite());
+ o.back()->tid = 4;
+ o.back()->reqid = osd_reqid_t(entity_name_t::CLIENT(123), 1, 45678);
+ o.back()->at_version = eversion_t(10, 300);
+ o.back()->trim_to = eversion_t(5, 42);
+ o.push_back(new ECSubWrite());
+ o.back()->tid = 9;
+ o.back()->reqid = osd_reqid_t(entity_name_t::CLIENT(123), 1, 45678);
+ o.back()->at_version = eversion_t(10, 300);
+ o.back()->trim_to = eversion_t(5, 42);
+ o.back()->roll_forward_to = eversion_t(8, 250);
+}
+
+void ECSubWriteReply::encode(bufferlist &bl) const
+{
+ ENCODE_START(1, 1, bl);
+ encode(from, bl);
+ encode(tid, bl);
+ encode(last_complete, bl);
+ encode(committed, bl);
+ encode(applied, bl);
+ ENCODE_FINISH(bl);
+}
+
+void ECSubWriteReply::decode(bufferlist::const_iterator &bl)
+{
+ DECODE_START(1, bl);
+ decode(from, bl);
+ decode(tid, bl);
+ decode(last_complete, bl);
+ decode(committed, bl);
+ decode(applied, bl);
+ DECODE_FINISH(bl);
+}
+
+std::ostream &operator<<(
+ std::ostream &lhs, const ECSubWriteReply &rhs)
+{
+ return lhs
+ << "ECSubWriteReply(tid=" << rhs.tid
+ << ", last_complete=" << rhs.last_complete
+ << ", committed=" << rhs.committed
+ << ", applied=" << rhs.applied << ")";
+}
+
+void ECSubWriteReply::dump(Formatter *f) const
+{
+ f->dump_unsigned("tid", tid);
+ f->dump_stream("last_complete") << last_complete;
+ f->dump_bool("committed", committed);
+ f->dump_bool("applied", applied);
+}
+
+void ECSubWriteReply::generate_test_instances(list<ECSubWriteReply*>& o)
+{
+ o.push_back(new ECSubWriteReply());
+ o.back()->tid = 20;
+ o.back()->last_complete = eversion_t(100, 2000);
+ o.back()->committed = true;
+ o.push_back(new ECSubWriteReply());
+ o.back()->tid = 80;
+ o.back()->last_complete = eversion_t(50, 200);
+ o.back()->applied = true;
+}
+
+void ECSubRead::encode(bufferlist &bl, uint64_t features) const
+{
+ if ((features & CEPH_FEATURE_OSD_FADVISE_FLAGS) == 0) {
+ ENCODE_START(2, 1, bl);
+ encode(from, bl);
+ encode(tid, bl);
+ map<hobject_t, list<pair<uint64_t, uint64_t> >> tmp;
+ for (auto m = to_read.cbegin(); m != to_read.cend(); ++m) {
+ list<pair<uint64_t, uint64_t> > tlist;
+ for (auto l = m->second.cbegin(); l != m->second.cend(); ++l) {
+ tlist.push_back(std::make_pair(l->get<0>(), l->get<1>()));
+ }
+ tmp[m->first] = tlist;
+ }
+ encode(tmp, bl);
+ encode(attrs_to_read, bl);
+ encode(subchunks, bl);
+ ENCODE_FINISH(bl);
+ return;
+ }
+
+ ENCODE_START(3, 2, bl);
+ encode(from, bl);
+ encode(tid, bl);
+ encode(to_read, bl);
+ encode(attrs_to_read, bl);
+ encode(subchunks, bl);
+ ENCODE_FINISH(bl);
+}
+
+void ECSubRead::decode(bufferlist::const_iterator &bl)
+{
+ DECODE_START(3, bl);
+ decode(from, bl);
+ decode(tid, bl);
+ if (struct_v == 1) {
+ map<hobject_t, list<pair<uint64_t, uint64_t> >>tmp;
+ decode(tmp, bl);
+ for (auto m = tmp.cbegin(); m != tmp.cend(); ++m) {
+ list<boost::tuple<uint64_t, uint64_t, uint32_t> > tlist;
+ for (auto l = m->second.cbegin(); l != m->second.cend(); ++l) {
+ tlist.push_back(boost::make_tuple(l->first, l->second, 0));
+ }
+ to_read[m->first] = tlist;
+ }
+ } else {
+ decode(to_read, bl);
+ }
+ decode(attrs_to_read, bl);
+ if (struct_v > 2 && struct_v > struct_compat) {
+ decode(subchunks, bl);
+ } else {
+ for (auto &i : to_read) {
+ subchunks[i.first].push_back(make_pair(0, 1));
+ }
+ }
+ DECODE_FINISH(bl);
+}
+
+std::ostream &operator<<(
+ std::ostream &lhs, const ECSubRead &rhs)
+{
+ return lhs
+ << "ECSubRead(tid=" << rhs.tid
+ << ", to_read=" << rhs.to_read
+ << ", subchunks=" << rhs.subchunks
+ << ", attrs_to_read=" << rhs.attrs_to_read << ")";
+}
+
+void ECSubRead::dump(Formatter *f) const
+{
+ f->dump_stream("from") << from;
+ f->dump_unsigned("tid", tid);
+ f->open_array_section("objects");
+ for (auto i = to_read.cbegin(); i != to_read.cend(); ++i) {
+ f->open_object_section("object");
+ f->dump_stream("oid") << i->first;
+ f->open_array_section("extents");
+ for (auto j = i->second.cbegin(); j != i->second.cend(); ++j) {
+ f->open_object_section("extent");
+ f->dump_unsigned("off", j->get<0>());
+ f->dump_unsigned("len", j->get<1>());
+ f->dump_unsigned("flags", j->get<2>());
+ f->close_section();
+ }
+ f->close_section();
+ f->close_section();
+ }
+ f->close_section();
+
+ f->open_array_section("object_attrs_requested");
+ for (auto i = attrs_to_read.cbegin(); i != attrs_to_read.cend(); ++i) {
+ f->open_object_section("object");
+ f->dump_stream("oid") << *i;
+ f->close_section();
+ }
+ f->close_section();
+}
+
+void ECSubRead::generate_test_instances(list<ECSubRead*>& o)
+{
+ hobject_t hoid1(sobject_t("asdf", 1));
+ hobject_t hoid2(sobject_t("asdf2", CEPH_NOSNAP));
+ o.push_back(new ECSubRead());
+ o.back()->from = pg_shard_t(2, shard_id_t(-1));
+ o.back()->tid = 1;
+ o.back()->to_read[hoid1].push_back(boost::make_tuple(100, 200, 0));
+ o.back()->to_read[hoid1].push_back(boost::make_tuple(400, 600, 0));
+ o.back()->to_read[hoid2].push_back(boost::make_tuple(400, 600, 0));
+ o.back()->attrs_to_read.insert(hoid1);
+ o.push_back(new ECSubRead());
+ o.back()->from = pg_shard_t(2, shard_id_t(-1));
+ o.back()->tid = 300;
+ o.back()->to_read[hoid1].push_back(boost::make_tuple(300, 200, 0));
+ o.back()->to_read[hoid2].push_back(boost::make_tuple(400, 600, 0));
+ o.back()->to_read[hoid2].push_back(boost::make_tuple(2000, 600, 0));
+ o.back()->attrs_to_read.insert(hoid2);
+}
+
+void ECSubReadReply::encode(bufferlist &bl) const
+{
+ ENCODE_START(1, 1, bl);
+ encode(from, bl);
+ encode(tid, bl);
+ encode(buffers_read, bl);
+ encode(attrs_read, bl);
+ encode(errors, bl);
+ ENCODE_FINISH(bl);
+}
+
+void ECSubReadReply::decode(bufferlist::const_iterator &bl)
+{
+ DECODE_START(1, bl);
+ decode(from, bl);
+ decode(tid, bl);
+ decode(buffers_read, bl);
+ decode(attrs_read, bl);
+ decode(errors, bl);
+ DECODE_FINISH(bl);
+}
+
+std::ostream &operator<<(
+ std::ostream &lhs, const ECSubReadReply &rhs)
+{
+ return lhs
+ << "ECSubReadReply(tid=" << rhs.tid
+ << ", attrs_read=" << rhs.attrs_read.size()
+ << ")";
+}
+
+void ECSubReadReply::dump(Formatter *f) const
+{
+ f->dump_stream("from") << from;
+ f->dump_unsigned("tid", tid);
+ f->open_array_section("buffers_read");
+ for (auto i = buffers_read.cbegin(); i != buffers_read.cend(); ++i) {
+ f->open_object_section("object");
+ f->dump_stream("oid") << i->first;
+ f->open_array_section("data");
+ for (auto j = i->second.cbegin(); j != i->second.cend(); ++j) {
+ f->open_object_section("extent");
+ f->dump_unsigned("off", j->first);
+ f->dump_unsigned("buf_len", j->second.length());
+ f->close_section();
+ }
+ f->close_section();
+ f->close_section();
+ }
+ f->close_section();
+
+ f->open_array_section("attrs_returned");
+ for (auto i = attrs_read.cbegin(); i != attrs_read.cend(); ++i) {
+ f->open_object_section("object_attrs");
+ f->dump_stream("oid") << i->first;
+ f->open_array_section("attrs");
+ for (auto j = i->second.cbegin(); j != i->second.cend(); ++j) {
+ f->open_object_section("attr");
+ f->dump_string("attr", j->first);
+ f->dump_unsigned("val_len", j->second.length());
+ f->close_section();
+ }
+ f->close_section();
+ f->close_section();
+ }
+ f->close_section();
+
+ f->open_array_section("errors");
+ for (auto i = errors.cbegin(); i != errors.cend(); ++i) {
+ f->open_object_section("error_pair");
+ f->dump_stream("oid") << i->first;
+ f->dump_int("error", i->second);
+ f->close_section();
+ }
+ f->close_section();
+}
+
+void ECSubReadReply::generate_test_instances(list<ECSubReadReply*>& o)
+{
+ hobject_t hoid1(sobject_t("asdf", 1));
+ hobject_t hoid2(sobject_t("asdf2", CEPH_NOSNAP));
+ bufferlist bl;
+ bl.append_zero(100);
+ bufferlist bl2;
+ bl2.append_zero(200);
+ o.push_back(new ECSubReadReply());
+ o.back()->from = pg_shard_t(2, shard_id_t(-1));
+ o.back()->tid = 1;
+ o.back()->buffers_read[hoid1].push_back(make_pair(20, bl));
+ o.back()->buffers_read[hoid1].push_back(make_pair(2000, bl2));
+ o.back()->buffers_read[hoid2].push_back(make_pair(0, bl));
+ o.back()->attrs_read[hoid1]["foo"] = bl;
+ o.back()->attrs_read[hoid1]["_"] = bl2;
+ o.push_back(new ECSubReadReply());
+ o.back()->from = pg_shard_t(2, shard_id_t(-1));
+ o.back()->tid = 300;
+ o.back()->buffers_read[hoid2].push_back(make_pair(0, bl2));
+ o.back()->attrs_read[hoid2]["foo"] = bl;
+ o.back()->attrs_read[hoid2]["_"] = bl2;
+ o.back()->errors[hoid1] = -2;
+}
diff --git a/src/osd/ECMsgTypes.h b/src/osd/ECMsgTypes.h
new file mode 100644
index 000000000..77b4222b2
--- /dev/null
+++ b/src/osd/ECMsgTypes.h
@@ -0,0 +1,140 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Inktank Storage, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef ECBMSGTYPES_H
+#define ECBMSGTYPES_H
+
+#include "osd_types.h"
+#include "include/buffer.h"
+#include "os/ObjectStore.h"
+#include "boost/tuple/tuple.hpp"
+
+struct ECSubWrite {
+ pg_shard_t from;
+ ceph_tid_t tid;
+ osd_reqid_t reqid;
+ hobject_t soid;
+ pg_stat_t stats;
+ ObjectStore::Transaction t;
+ eversion_t at_version;
+ eversion_t trim_to;
+ eversion_t roll_forward_to;
+ std::vector<pg_log_entry_t> log_entries;
+ std::set<hobject_t> temp_added;
+ std::set<hobject_t> temp_removed;
+ std::optional<pg_hit_set_history_t> updated_hit_set_history;
+ bool backfill_or_async_recovery = false;
+ ECSubWrite() : tid(0) {}
+ ECSubWrite(
+ pg_shard_t from,
+ ceph_tid_t tid,
+ osd_reqid_t reqid,
+ hobject_t soid,
+ const pg_stat_t &stats,
+ const ObjectStore::Transaction &t,
+ eversion_t at_version,
+ eversion_t trim_to,
+ eversion_t roll_forward_to,
+ std::vector<pg_log_entry_t> log_entries,
+ std::optional<pg_hit_set_history_t> updated_hit_set_history,
+ const std::set<hobject_t> &temp_added,
+ const std::set<hobject_t> &temp_removed,
+ bool backfill_or_async_recovery)
+ : from(from), tid(tid), reqid(reqid),
+ soid(soid), stats(stats), t(t),
+ at_version(at_version),
+ trim_to(trim_to), roll_forward_to(roll_forward_to),
+ log_entries(log_entries),
+ temp_added(temp_added),
+ temp_removed(temp_removed),
+ updated_hit_set_history(updated_hit_set_history),
+ backfill_or_async_recovery(backfill_or_async_recovery)
+ {}
+ void claim(ECSubWrite &other) {
+ from = other.from;
+ tid = other.tid;
+ reqid = other.reqid;
+ soid = other.soid;
+ stats = other.stats;
+ t.swap(other.t);
+ at_version = other.at_version;
+ trim_to = other.trim_to;
+ roll_forward_to = other.roll_forward_to;
+ log_entries.swap(other.log_entries);
+ temp_added.swap(other.temp_added);
+ temp_removed.swap(other.temp_removed);
+ updated_hit_set_history = other.updated_hit_set_history;
+ backfill_or_async_recovery = other.backfill_or_async_recovery;
+ }
+ void encode(ceph::buffer::list &bl) const;
+ void decode(ceph::buffer::list::const_iterator &bl);
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<ECSubWrite*>& o);
+private:
+ // no outside copying -- slow
+ ECSubWrite(ECSubWrite& other);
+ const ECSubWrite& operator=(const ECSubWrite& other);
+};
+WRITE_CLASS_ENCODER(ECSubWrite)
+
+struct ECSubWriteReply {
+ pg_shard_t from;
+ ceph_tid_t tid;
+ eversion_t last_complete;
+ bool committed;
+ bool applied;
+ ECSubWriteReply() : tid(0), committed(false), applied(false) {}
+ void encode(ceph::buffer::list &bl) const;
+ void decode(ceph::buffer::list::const_iterator &bl);
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<ECSubWriteReply*>& o);
+};
+WRITE_CLASS_ENCODER(ECSubWriteReply)
+
+struct ECSubRead {
+ pg_shard_t from;
+ ceph_tid_t tid;
+ std::map<hobject_t, std::list<boost::tuple<uint64_t, uint64_t, uint32_t> >> to_read;
+ std::set<hobject_t> attrs_to_read;
+ std::map<hobject_t, std::vector<std::pair<int, int>>> subchunks;
+ void encode(ceph::buffer::list &bl, uint64_t features) const;
+ void decode(ceph::buffer::list::const_iterator &bl);
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<ECSubRead*>& o);
+};
+WRITE_CLASS_ENCODER_FEATURES(ECSubRead)
+
+struct ECSubReadReply {
+ pg_shard_t from;
+ ceph_tid_t tid;
+ std::map<hobject_t, std::list<std::pair<uint64_t, ceph::buffer::list> >> buffers_read;
+ std::map<hobject_t, std::map<std::string, ceph::buffer::list>> attrs_read;
+ std::map<hobject_t, int> errors;
+ void encode(ceph::buffer::list &bl) const;
+ void decode(ceph::buffer::list::const_iterator &bl);
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<ECSubReadReply*>& o);
+};
+WRITE_CLASS_ENCODER(ECSubReadReply)
+
+std::ostream &operator<<(
+ std::ostream &lhs, const ECSubWrite &rhs);
+std::ostream &operator<<(
+ std::ostream &lhs, const ECSubWriteReply &rhs);
+std::ostream &operator<<(
+ std::ostream &lhs, const ECSubRead &rhs);
+std::ostream &operator<<(
+ std::ostream &lhs, const ECSubReadReply &rhs);
+
+#endif
diff --git a/src/osd/ECTransaction.cc b/src/osd/ECTransaction.cc
new file mode 100644
index 000000000..603f9af0e
--- /dev/null
+++ b/src/osd/ECTransaction.cc
@@ -0,0 +1,670 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Inktank Storage, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <iostream>
+#include <vector>
+#include <sstream>
+
+#include "ECTransaction.h"
+#include "ECUtil.h"
+#include "os/ObjectStore.h"
+#include "common/inline_variant.h"
+
+using std::make_pair;
+using std::map;
+using std::pair;
+using std::set;
+using std::string;
+using std::vector;
+
+using ceph::bufferlist;
+using ceph::decode;
+using ceph::encode;
+using ceph::ErasureCodeInterfaceRef;
+
+void encode_and_write(
+ pg_t pgid,
+ const hobject_t &oid,
+ const ECUtil::stripe_info_t &sinfo,
+ ErasureCodeInterfaceRef &ecimpl,
+ const set<int> &want,
+ uint64_t offset,
+ bufferlist bl,
+ uint32_t flags,
+ ECUtil::HashInfoRef hinfo,
+ extent_map &written,
+ map<shard_id_t, ObjectStore::Transaction> *transactions,
+ DoutPrefixProvider *dpp) {
+ const uint64_t before_size = hinfo->get_total_logical_size(sinfo);
+ ceph_assert(sinfo.logical_offset_is_stripe_aligned(offset));
+ ceph_assert(sinfo.logical_offset_is_stripe_aligned(bl.length()));
+ ceph_assert(bl.length());
+
+ map<int, bufferlist> buffers;
+ int r = ECUtil::encode(
+ sinfo, ecimpl, bl, want, &buffers);
+ ceph_assert(r == 0);
+
+ written.insert(offset, bl.length(), bl);
+
+ ldpp_dout(dpp, 20) << __func__ << ": " << oid
+ << " new_size "
+ << offset + bl.length()
+ << dendl;
+
+ if (offset >= before_size) {
+ ceph_assert(offset == before_size);
+ hinfo->append(
+ sinfo.aligned_logical_offset_to_chunk_offset(offset),
+ buffers);
+ }
+
+ for (auto &&i : *transactions) {
+ ceph_assert(buffers.count(i.first));
+ bufferlist &enc_bl = buffers[i.first];
+ if (offset >= before_size) {
+ i.second.set_alloc_hint(
+ coll_t(spg_t(pgid, i.first)),
+ ghobject_t(oid, ghobject_t::NO_GEN, i.first),
+ 0, 0,
+ CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_WRITE |
+ CEPH_OSD_ALLOC_HINT_FLAG_APPEND_ONLY);
+ }
+ i.second.write(
+ coll_t(spg_t(pgid, i.first)),
+ ghobject_t(oid, ghobject_t::NO_GEN, i.first),
+ sinfo.logical_to_prev_chunk_offset(
+ offset),
+ enc_bl.length(),
+ enc_bl,
+ flags);
+ }
+}
+
+bool ECTransaction::requires_overwrite(
+ uint64_t prev_size,
+ const PGTransaction::ObjectOperation &op) {
+ // special handling for truncates to 0
+ if (op.truncate && op.truncate->first == 0)
+ return false;
+ return op.is_none() &&
+ ((!op.buffer_updates.empty() &&
+ (op.buffer_updates.begin().get_off() < prev_size)) ||
+ (op.truncate &&
+ (op.truncate->first < prev_size)));
+}
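+
+// Illustrative cases for requires_overwrite() (the sizes are assumptions,
+// not taken from any call site): with prev_size = 8192, a plain modify op
+// whose first buffer_update starts at offset 4096 touches existing data and
+// returns true; a truncate to 0 returns false regardless; ops that create,
+// clone or rename the object (is_none() == false) return false.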
+
+void ECTransaction::generate_transactions(
+ WritePlan &plan,
+ ErasureCodeInterfaceRef &ecimpl,
+ pg_t pgid,
+ const ECUtil::stripe_info_t &sinfo,
+ const map<hobject_t,extent_map> &partial_extents,
+ vector<pg_log_entry_t> &entries,
+ map<hobject_t,extent_map> *written_map,
+ map<shard_id_t, ObjectStore::Transaction> *transactions,
+ set<hobject_t> *temp_added,
+ set<hobject_t> *temp_removed,
+ DoutPrefixProvider *dpp,
+ const ceph_release_t require_osd_release)
+{
+ ceph_assert(written_map);
+ ceph_assert(transactions);
+ ceph_assert(temp_added);
+ ceph_assert(temp_removed);
+ ceph_assert(plan.t);
+ auto &t = *(plan.t);
+
+ auto &hash_infos = plan.hash_infos;
+
+ map<hobject_t, pg_log_entry_t*> obj_to_log;
+ for (auto &&i: entries) {
+ obj_to_log.insert(make_pair(i.soid, &i));
+ }
+
+ t.safe_create_traverse(
+ [&](pair<const hobject_t, PGTransaction::ObjectOperation> &opair) {
+ const hobject_t &oid = opair.first;
+ auto &op = opair.second;
+ auto &obc_map = t.obc_map;
+ auto &written = (*written_map)[oid];
+
+ auto iter = obj_to_log.find(oid);
+ pg_log_entry_t *entry = iter != obj_to_log.end() ? iter->second : nullptr;
+
+ ObjectContextRef obc;
+ auto obiter = t.obc_map.find(oid);
+ if (obiter != t.obc_map.end()) {
+ obc = obiter->second;
+ }
+ if (entry) {
+ ceph_assert(obc);
+ } else {
+ ceph_assert(oid.is_temp());
+ }
+
+ ECUtil::HashInfoRef hinfo;
+ {
+ auto iter = hash_infos.find(oid);
+ ceph_assert(iter != hash_infos.end());
+ hinfo = iter->second;
+ }
+
+ if (oid.is_temp()) {
+ if (op.is_fresh_object()) {
+ temp_added->insert(oid);
+ } else if (op.is_delete()) {
+ temp_removed->insert(oid);
+ }
+ }
+
+ if (entry &&
+ entry->is_modify() &&
+ op.updated_snaps) {
+ bufferlist bl(op.updated_snaps->second.size() * 8 + 8);
+ encode(op.updated_snaps->second, bl);
+ entry->snaps.swap(bl);
+ entry->snaps.reassign_to_mempool(mempool::mempool_osd_pglog);
+ }
+
+ ldpp_dout(dpp, 20) << "generate_transactions: "
+ << opair.first
+ << ", current size is "
+ << hinfo->get_total_logical_size(sinfo)
+ << " buffers are "
+ << op.buffer_updates
+ << dendl;
+ if (op.truncate) {
+ ldpp_dout(dpp, 20) << "generate_transactions: "
+ << " truncate is "
+ << *(op.truncate)
+ << dendl;
+ }
+
+ if (entry && op.updated_snaps) {
+ entry->mod_desc.update_snaps(op.updated_snaps->first);
+ }
+
+ map<string, std::optional<bufferlist> > xattr_rollback;
+ ceph_assert(hinfo);
+ bufferlist old_hinfo;
+ encode(*hinfo, old_hinfo);
+ xattr_rollback[ECUtil::get_hinfo_key()] = old_hinfo;
+
+ if (op.is_none() && op.truncate && op.truncate->first == 0) {
+ ceph_assert(op.truncate->first == 0);
+ ceph_assert(op.truncate->first ==
+ op.truncate->second);
+ ceph_assert(entry);
+ ceph_assert(obc);
+
+ if (op.truncate->first != op.truncate->second) {
+ op.truncate->first = op.truncate->second;
+ } else {
+ op.truncate = std::nullopt;
+ }
+
+ op.delete_first = true;
+ op.init_type = PGTransaction::ObjectOperation::Init::Create();
+
+ if (obc) {
+ /* We need to reapply all of the cached xattrs.
+ * std::map insert fortunately only writes keys
+ * which don't already exist, so this should do
+ * the right thing. */
+ op.attr_updates.insert(
+ obc->attr_cache.begin(),
+ obc->attr_cache.end());
+ }
+ }
+
+ if (op.delete_first) {
+ /* We also want to remove the std::nullopt entries since
+ * the keys won't exist anyway */
+ for (auto j = op.attr_updates.begin();
+ j != op.attr_updates.end();
+ ) {
+ if (j->second) {
+ ++j;
+ } else {
+ op.attr_updates.erase(j++);
+ }
+ }
+ /* Fill in all current entries for xattr rollback */
+ if (obc) {
+ xattr_rollback.insert(
+ obc->attr_cache.begin(),
+ obc->attr_cache.end());
+ obc->attr_cache.clear();
+ }
+ if (entry) {
+ entry->mod_desc.rmobject(entry->version.version);
+ for (auto &&st: *transactions) {
+ st.second.collection_move_rename(
+ coll_t(spg_t(pgid, st.first)),
+ ghobject_t(oid, ghobject_t::NO_GEN, st.first),
+ coll_t(spg_t(pgid, st.first)),
+ ghobject_t(oid, entry->version.version, st.first));
+ }
+ } else {
+ for (auto &&st: *transactions) {
+ st.second.remove(
+ coll_t(spg_t(pgid, st.first)),
+ ghobject_t(oid, ghobject_t::NO_GEN, st.first));
+ }
+ }
+ hinfo->clear();
+ }
+
+ if (op.is_fresh_object() && entry) {
+ entry->mod_desc.create();
+ }
+
+ match(
+ op.init_type,
+ [&](const PGTransaction::ObjectOperation::Init::None &) {},
+ [&](const PGTransaction::ObjectOperation::Init::Create &op) {
+ for (auto &&st: *transactions) {
+ if (require_osd_release >= ceph_release_t::octopus) {
+ st.second.create(
+ coll_t(spg_t(pgid, st.first)),
+ ghobject_t(oid, ghobject_t::NO_GEN, st.first));
+ } else {
+ st.second.touch(
+ coll_t(spg_t(pgid, st.first)),
+ ghobject_t(oid, ghobject_t::NO_GEN, st.first));
+ }
+ }
+ },
+ [&](const PGTransaction::ObjectOperation::Init::Clone &op) {
+ for (auto &&st: *transactions) {
+ st.second.clone(
+ coll_t(spg_t(pgid, st.first)),
+ ghobject_t(op.source, ghobject_t::NO_GEN, st.first),
+ ghobject_t(oid, ghobject_t::NO_GEN, st.first));
+ }
+
+ auto siter = hash_infos.find(op.source);
+ ceph_assert(siter != hash_infos.end());
+ hinfo->update_to(*(siter->second));
+
+ if (obc) {
+ auto cobciter = obc_map.find(op.source);
+ ceph_assert(cobciter != obc_map.end());
+ obc->attr_cache = cobciter->second->attr_cache;
+ }
+ },
+ [&](const PGTransaction::ObjectOperation::Init::Rename &op) {
+ ceph_assert(op.source.is_temp());
+ for (auto &&st: *transactions) {
+ st.second.collection_move_rename(
+ coll_t(spg_t(pgid, st.first)),
+ ghobject_t(op.source, ghobject_t::NO_GEN, st.first),
+ coll_t(spg_t(pgid, st.first)),
+ ghobject_t(oid, ghobject_t::NO_GEN, st.first));
+ }
+ auto siter = hash_infos.find(op.source);
+ ceph_assert(siter != hash_infos.end());
+ hinfo->update_to(*(siter->second));
+ if (obc) {
+ auto cobciter = obc_map.find(op.source);
+ ceph_assert(cobciter == obc_map.end());
+ obc->attr_cache.clear();
+ }
+ });
+
+ // omap not supported (except 0, handled above)
+ ceph_assert(!(op.clear_omap));
+ ceph_assert(!(op.omap_header));
+ ceph_assert(op.omap_updates.empty());
+
+ if (!op.attr_updates.empty()) {
+ map<string, bufferlist> to_set;
+ for (auto &&j: op.attr_updates) {
+ if (j.second) {
+ to_set[j.first] = *(j.second);
+ } else {
+ for (auto &&st : *transactions) {
+ st.second.rmattr(
+ coll_t(spg_t(pgid, st.first)),
+ ghobject_t(oid, ghobject_t::NO_GEN, st.first),
+ j.first);
+ }
+ }
+ if (obc) {
+ auto citer = obc->attr_cache.find(j.first);
+ if (entry) {
+ if (citer != obc->attr_cache.end()) {
+ // won't overwrite anything we put in earlier
+ xattr_rollback.insert(
+ make_pair(
+ j.first,
+ std::optional<bufferlist>(citer->second)));
+ } else {
+ // won't overwrite anything we put in earlier
+ xattr_rollback.insert(
+ make_pair(
+ j.first,
+ std::nullopt));
+ }
+ }
+ if (j.second) {
+ obc->attr_cache[j.first] = *(j.second);
+ } else if (citer != obc->attr_cache.end()) {
+ obc->attr_cache.erase(citer);
+ }
+ } else {
+ ceph_assert(!entry);
+ }
+ }
+ for (auto &&st : *transactions) {
+ st.second.setattrs(
+ coll_t(spg_t(pgid, st.first)),
+ ghobject_t(oid, ghobject_t::NO_GEN, st.first),
+ to_set);
+ }
+ ceph_assert(!xattr_rollback.empty());
+ }
+ if (entry && !xattr_rollback.empty()) {
+ entry->mod_desc.setattrs(xattr_rollback);
+ }
+
+ if (op.alloc_hint) {
+ /* logical_to_next_chunk_offset() scales down both aligned and
+ * unaligned offsets.
+ *
+ * We don't bother to roll this back at this time for two reasons:
+ * 1) it's advisory
+ * 2) we don't track the old value */
+ uint64_t object_size = sinfo.logical_to_next_chunk_offset(
+ op.alloc_hint->expected_object_size);
+ uint64_t write_size = sinfo.logical_to_next_chunk_offset(
+ op.alloc_hint->expected_write_size);
+
+ for (auto &&st : *transactions) {
+ st.second.set_alloc_hint(
+ coll_t(spg_t(pgid, st.first)),
+ ghobject_t(oid, ghobject_t::NO_GEN, st.first),
+ object_size,
+ write_size,
+ op.alloc_hint->flags);
+ }
+ }
+
+ extent_map to_write;
+ auto pextiter = partial_extents.find(oid);
+ if (pextiter != partial_extents.end()) {
+ to_write = pextiter->second;
+ }
+
+ vector<pair<uint64_t, uint64_t> > rollback_extents;
+ const uint64_t orig_size = hinfo->get_total_logical_size(sinfo);
+
+ uint64_t new_size = orig_size;
+ uint64_t append_after = new_size;
+ ldpp_dout(dpp, 20) << __func__ << ": new_size start " << new_size << dendl;
+ if (op.truncate && op.truncate->first < new_size) {
+ ceph_assert(!op.is_fresh_object());
+ new_size = sinfo.logical_to_next_stripe_offset(
+ op.truncate->first);
+ ldpp_dout(dpp, 20) << __func__ << ": new_size truncate down "
+ << new_size << dendl;
+ if (new_size != op.truncate->first) { // 0 the unaligned part
+ bufferlist bl;
+ bl.append_zero(new_size - op.truncate->first);
+ to_write.insert(
+ op.truncate->first,
+ bl.length(),
+ bl);
+ append_after = sinfo.logical_to_prev_stripe_offset(
+ op.truncate->first);
+ } else {
+ append_after = new_size;
+ }
+ to_write.erase(
+ new_size,
+ std::numeric_limits<uint64_t>::max() - new_size);
+
+ if (entry && !op.is_fresh_object()) {
+ uint64_t restore_from = sinfo.logical_to_prev_chunk_offset(
+ op.truncate->first);
+ uint64_t restore_len = sinfo.aligned_logical_offset_to_chunk_offset(
+ orig_size -
+ sinfo.logical_to_prev_stripe_offset(op.truncate->first));
+ ceph_assert(rollback_extents.empty());
+
+ ldpp_dout(dpp, 20) << __func__ << ": saving extent "
+ << make_pair(restore_from, restore_len)
+ << dendl;
+ ldpp_dout(dpp, 20) << __func__ << ": truncating to "
+ << new_size
+ << dendl;
+ rollback_extents.emplace_back(
+ make_pair(restore_from, restore_len));
+ for (auto &&st : *transactions) {
+ st.second.touch(
+ coll_t(spg_t(pgid, st.first)),
+ ghobject_t(oid, entry->version.version, st.first));
+ st.second.clone_range(
+ coll_t(spg_t(pgid, st.first)),
+ ghobject_t(oid, ghobject_t::NO_GEN, st.first),
+ ghobject_t(oid, entry->version.version, st.first),
+ restore_from,
+ restore_len,
+ restore_from);
+
+ }
+ } else {
+ ldpp_dout(dpp, 20) << __func__ << ": not saving extents, fresh object"
+ << dendl;
+ }
+ for (auto &&st : *transactions) {
+ st.second.truncate(
+ coll_t(spg_t(pgid, st.first)),
+ ghobject_t(oid, ghobject_t::NO_GEN, st.first),
+ sinfo.aligned_logical_offset_to_chunk_offset(new_size));
+ }
+ }
+
+ uint32_t fadvise_flags = 0;
+ for (auto &&extent: op.buffer_updates) {
+ using BufferUpdate = PGTransaction::ObjectOperation::BufferUpdate;
+ bufferlist bl;
+ match(
+ extent.get_val(),
+ [&](const BufferUpdate::Write &op) {
+ bl = op.buffer;
+ fadvise_flags |= op.fadvise_flags;
+ },
+ [&](const BufferUpdate::Zero &) {
+ bl.append_zero(extent.get_len());
+ },
+ [&](const BufferUpdate::CloneRange &) {
+ ceph_assert(
+ 0 ==
+ "CloneRange is not allowed, do_op should have returned ENOTSUPP");
+ });
+
+ uint64_t off = extent.get_off();
+ uint64_t len = extent.get_len();
+ uint64_t end = off + len;
+ ldpp_dout(dpp, 20) << __func__ << ": adding buffer_update "
+ << make_pair(off, len)
+ << dendl;
+ ceph_assert(len > 0);
+ if (off > new_size) {
+ ceph_assert(off > append_after);
+ bl.prepend_zero(off - new_size);
+ len += off - new_size;
+ ldpp_dout(dpp, 20) << __func__ << ": prepending zeroes to align "
+ << off << "->" << new_size
+ << dendl;
+ off = new_size;
+ }
+ if (!sinfo.logical_offset_is_stripe_aligned(end) && (end > append_after)) {
+ uint64_t aligned_end = sinfo.logical_to_next_stripe_offset(
+ end);
+ uint64_t tail = aligned_end - end;
+ bl.append_zero(tail);
+ ldpp_dout(dpp, 20) << __func__ << ": appending zeroes to align end "
+ << end << "->" << end+tail
+ << ", len: " << len << "->" << len+tail
+ << dendl;
+ end += tail;
+ len += tail;
+ }
+
+ to_write.insert(off, len, bl);
+ if (end > new_size)
+ new_size = end;
+ }
+
+ if (op.truncate &&
+ op.truncate->second > new_size) {
+ ceph_assert(op.truncate->second > append_after);
+ uint64_t truncate_to =
+ sinfo.logical_to_next_stripe_offset(
+ op.truncate->second);
+ uint64_t zeroes = truncate_to - new_size;
+ bufferlist bl;
+ bl.append_zero(zeroes);
+ to_write.insert(
+ new_size,
+ zeroes,
+ bl);
+ new_size = truncate_to;
+ ldpp_dout(dpp, 20) << __func__ << ": truncating out to "
+ << truncate_to
+ << dendl;
+ }
+
+ set<int> want;
+ for (unsigned i = 0; i < ecimpl->get_chunk_count(); ++i) {
+ want.insert(i);
+ }
+ auto to_overwrite = to_write.intersect(0, append_after);
+ ldpp_dout(dpp, 20) << __func__ << ": to_overwrite: "
+ << to_overwrite
+ << dendl;
+ for (auto &&extent: to_overwrite) {
+ ceph_assert(extent.get_off() + extent.get_len() <= append_after);
+ ceph_assert(sinfo.logical_offset_is_stripe_aligned(extent.get_off()));
+ ceph_assert(sinfo.logical_offset_is_stripe_aligned(extent.get_len()));
+ if (entry) {
+ uint64_t restore_from = sinfo.aligned_logical_offset_to_chunk_offset(
+ extent.get_off());
+ uint64_t restore_len = sinfo.aligned_logical_offset_to_chunk_offset(
+ extent.get_len());
+ ldpp_dout(dpp, 20) << __func__ << ": overwriting "
+ << restore_from << "~" << restore_len
+ << dendl;
+ if (rollback_extents.empty()) {
+ for (auto &&st : *transactions) {
+ st.second.touch(
+ coll_t(spg_t(pgid, st.first)),
+ ghobject_t(oid, entry->version.version, st.first));
+ }
+ }
+ rollback_extents.emplace_back(make_pair(restore_from, restore_len));
+ for (auto &&st : *transactions) {
+ st.second.clone_range(
+ coll_t(spg_t(pgid, st.first)),
+ ghobject_t(oid, ghobject_t::NO_GEN, st.first),
+ ghobject_t(oid, entry->version.version, st.first),
+ restore_from,
+ restore_len,
+ restore_from);
+ }
+ }
+ encode_and_write(
+ pgid,
+ oid,
+ sinfo,
+ ecimpl,
+ want,
+ extent.get_off(),
+ extent.get_val(),
+ fadvise_flags,
+ hinfo,
+ written,
+ transactions,
+ dpp);
+ }
+
+ auto to_append = to_write.intersect(
+ append_after,
+ std::numeric_limits<uint64_t>::max() - append_after);
+ ldpp_dout(dpp, 20) << __func__ << ": to_append: "
+ << to_append
+ << dendl;
+ for (auto &&extent: to_append) {
+ ceph_assert(sinfo.logical_offset_is_stripe_aligned(extent.get_off()));
+ ceph_assert(sinfo.logical_offset_is_stripe_aligned(extent.get_len()));
+ ldpp_dout(dpp, 20) << __func__ << ": appending "
+ << extent.get_off() << "~" << extent.get_len()
+ << dendl;
+ encode_and_write(
+ pgid,
+ oid,
+ sinfo,
+ ecimpl,
+ want,
+ extent.get_off(),
+ extent.get_val(),
+ fadvise_flags,
+ hinfo,
+ written,
+ transactions,
+ dpp);
+ }
+
+ ldpp_dout(dpp, 20) << __func__ << ": " << oid
+ << " resetting hinfo to logical size "
+ << new_size
+ << dendl;
+ if (!rollback_extents.empty() && entry) {
+ if (entry) {
+ ldpp_dout(dpp, 20) << __func__ << ": " << oid
+ << " marking rollback extents "
+ << rollback_extents
+ << dendl;
+ entry->mod_desc.rollback_extents(
+ entry->version.version, rollback_extents);
+ }
+ hinfo->set_total_chunk_size_clear_hash(
+ sinfo.aligned_logical_offset_to_chunk_offset(new_size));
+ } else {
+ ceph_assert(hinfo->get_total_logical_size(sinfo) == new_size);
+ }
+
+ if (entry && !to_append.empty()) {
+ ldpp_dout(dpp, 20) << __func__ << ": marking append "
+ << append_after
+ << dendl;
+ entry->mod_desc.append(append_after);
+ }
+
+ if (!op.is_delete()) {
+ bufferlist hbuf;
+ encode(*hinfo, hbuf);
+ for (auto &&i : *transactions) {
+ i.second.setattr(
+ coll_t(spg_t(pgid, i.first)),
+ ghobject_t(oid, ghobject_t::NO_GEN, i.first),
+ ECUtil::get_hinfo_key(),
+ hbuf);
+ }
+ }
+ });
+}
diff --git a/src/osd/ECTransaction.h b/src/osd/ECTransaction.h
new file mode 100644
index 000000000..5cb16261a
--- /dev/null
+++ b/src/osd/ECTransaction.h
@@ -0,0 +1,200 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Inktank Storage, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef ECTRANSACTION_H
+#define ECTRANSACTION_H
+
+#include "OSD.h"
+#include "PGBackend.h"
+#include "ECUtil.h"
+#include "erasure-code/ErasureCodeInterface.h"
+#include "PGTransaction.h"
+#include "ExtentCache.h"
+
+namespace ECTransaction {
+ struct WritePlan {
+ PGTransactionUPtr t;
+ bool invalidates_cache = false; // Yes, both are possible
+ std::map<hobject_t,extent_set> to_read;
+ std::map<hobject_t,extent_set> will_write; // superset of to_read
+
+ std::map<hobject_t,ECUtil::HashInfoRef> hash_infos;
+ };
+
+ bool requires_overwrite(
+ uint64_t prev_size,
+ const PGTransaction::ObjectOperation &op);
+
+ template <typename F>
+ WritePlan get_write_plan(
+ const ECUtil::stripe_info_t &sinfo,
+ PGTransactionUPtr &&t,
+ F &&get_hinfo,
+ DoutPrefixProvider *dpp) {
+ WritePlan plan;
+ t->safe_create_traverse(
+ [&](std::pair<const hobject_t, PGTransaction::ObjectOperation> &i) {
+ ECUtil::HashInfoRef hinfo = get_hinfo(i.first);
+ plan.hash_infos[i.first] = hinfo;
+
+ uint64_t projected_size =
+ hinfo->get_projected_total_logical_size(sinfo);
+
+ if (i.second.deletes_first()) {
+ ldpp_dout(dpp, 20) << __func__ << ": delete, setting projected size"
+ << " to 0" << dendl;
+ projected_size = 0;
+ }
+
+ hobject_t source;
+ if (i.second.has_source(&source)) {
+ plan.invalidates_cache = true;
+
+ ECUtil::HashInfoRef shinfo = get_hinfo(source);
+ projected_size = shinfo->get_projected_total_logical_size(sinfo);
+ plan.hash_infos[source] = shinfo;
+ }
+
+ auto &will_write = plan.will_write[i.first];
+ if (i.second.truncate &&
+ i.second.truncate->first < projected_size) {
+ if (!(sinfo.logical_offset_is_stripe_aligned(
+ i.second.truncate->first))) {
+ plan.to_read[i.first].union_insert(
+ sinfo.logical_to_prev_stripe_offset(i.second.truncate->first),
+ sinfo.get_stripe_width());
+
+ ldpp_dout(dpp, 20) << __func__ << ": unaligned truncate" << dendl;
+
+ will_write.union_insert(
+ sinfo.logical_to_prev_stripe_offset(i.second.truncate->first),
+ sinfo.get_stripe_width());
+ }
+ projected_size = sinfo.logical_to_next_stripe_offset(
+ i.second.truncate->first);
+ }
+
+ extent_set raw_write_set;
+ for (auto &&extent: i.second.buffer_updates) {
+ using BufferUpdate = PGTransaction::ObjectOperation::BufferUpdate;
+ if (boost::get<BufferUpdate::CloneRange>(&(extent.get_val()))) {
+ ceph_assert(
+ 0 ==
+ "CloneRange is not allowed, do_op should have returned ENOTSUPP");
+ }
+ raw_write_set.insert(extent.get_off(), extent.get_len());
+ }
+
+ auto orig_size = projected_size;
+ for (auto extent = raw_write_set.begin();
+ extent != raw_write_set.end();
+ ++extent) {
+ uint64_t head_start =
+ sinfo.logical_to_prev_stripe_offset(extent.get_start());
+ uint64_t head_finish =
+ sinfo.logical_to_next_stripe_offset(extent.get_start());
+ if (head_start > projected_size) {
+ head_start = projected_size;
+ }
+ if (head_start != head_finish &&
+ head_start < orig_size) {
+ ceph_assert(head_finish <= orig_size);
+ ceph_assert(head_finish - head_start == sinfo.get_stripe_width());
+ ldpp_dout(dpp, 20) << __func__ << ": reading partial head stripe "
+ << head_start << "~" << sinfo.get_stripe_width()
+ << dendl;
+ plan.to_read[i.first].union_insert(
+ head_start, sinfo.get_stripe_width());
+ }
+
+ uint64_t tail_start =
+ sinfo.logical_to_prev_stripe_offset(
+ extent.get_start() + extent.get_len());
+ uint64_t tail_finish =
+ sinfo.logical_to_next_stripe_offset(
+ extent.get_start() + extent.get_len());
+ if (tail_start != tail_finish &&
+ (head_start == head_finish || tail_start != head_start) &&
+ tail_start < orig_size) {
+ ceph_assert(tail_finish <= orig_size);
+ ceph_assert(tail_finish - tail_start == sinfo.get_stripe_width());
+ ldpp_dout(dpp, 20) << __func__ << ": reading partial tail stripe "
+ << tail_start << "~" << sinfo.get_stripe_width()
+ << dendl;
+ plan.to_read[i.first].union_insert(
+ tail_start, sinfo.get_stripe_width());
+ }
+
+ if (head_start != tail_finish) {
+ ceph_assert(
+ sinfo.logical_offset_is_stripe_aligned(
+ tail_finish - head_start)
+ );
+ will_write.union_insert(
+ head_start, tail_finish - head_start);
+ if (tail_finish > projected_size)
+ projected_size = tail_finish;
+ } else {
+ ceph_assert(tail_finish <= projected_size);
+ }
+ }
+
+ if (i.second.truncate &&
+ i.second.truncate->second > projected_size) {
+ uint64_t truncating_to =
+ sinfo.logical_to_next_stripe_offset(i.second.truncate->second);
+ ldpp_dout(dpp, 20) << __func__ << ": truncating out to "
+ << truncating_to
+ << dendl;
+ will_write.union_insert(projected_size,
+ truncating_to - projected_size);
+ projected_size = truncating_to;
+ }
+
+ ldpp_dout(dpp, 20) << __func__ << ": " << i.first
+ << " projected size "
+ << projected_size
+ << dendl;
+ hinfo->set_projected_total_logical_size(
+ sinfo,
+ projected_size);
+
+ /* validate post conditions:
+ * to_read should have an entry for i.first iff it isn't empty
+ * and if we are reading from i.first, we can't be renaming or
+ * cloning it */
+ ceph_assert(plan.to_read.count(i.first) == 0 ||
+ (!plan.to_read.at(i.first).empty() &&
+ !i.second.has_source()));
+ });
+ plan.t = std::move(t);
+ return plan;
+ }
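+
+ // Worked example of the planning above (parameters are assumptions for
+ // illustration: a 4+2 profile with 4 KiB chunks, i.e. a 16 KiB stripe, and
+ // an existing 32 KiB object). A buffer_update covering bytes [6000, 9000):
+ //   head_start = 0, head_finish = 16384 (stripe bounds of offset 6000)
+ //   tail_start = 0, tail_finish = 16384 (stripe bounds of offset 9000)
+ // so to_read gets the single partial stripe [0, 16384), will_write gets
+ // [0, 16384), no separate tail read is needed (same stripe as the head),
+ // and projected_size stays at 32768.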
+
+ void generate_transactions(
+ WritePlan &plan,
+ ceph::ErasureCodeInterfaceRef &ecimpl,
+ pg_t pgid,
+ const ECUtil::stripe_info_t &sinfo,
+ const std::map<hobject_t,extent_map> &partial_extents,
+ std::vector<pg_log_entry_t> &entries,
+ std::map<hobject_t,extent_map> *written,
+ std::map<shard_id_t, ObjectStore::Transaction> *transactions,
+ std::set<hobject_t> *temp_added,
+ std::set<hobject_t> *temp_removed,
+ DoutPrefixProvider *dpp,
+ const ceph_release_t require_osd_release = ceph_release_t::unknown);
+};
+
+#endif
diff --git a/src/osd/ECUtil.cc b/src/osd/ECUtil.cc
new file mode 100644
index 000000000..94b328458
--- /dev/null
+++ b/src/osd/ECUtil.cc
@@ -0,0 +1,248 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+
+#include <errno.h>
+#include "include/encoding.h"
+#include "ECUtil.h"
+
+using namespace std;
+using ceph::bufferlist;
+using ceph::ErasureCodeInterfaceRef;
+using ceph::Formatter;
+
+int ECUtil::decode(
+ const stripe_info_t &sinfo,
+ ErasureCodeInterfaceRef &ec_impl,
+ map<int, bufferlist> &to_decode,
+ bufferlist *out) {
+ ceph_assert(to_decode.size());
+
+ uint64_t total_data_size = to_decode.begin()->second.length();
+ ceph_assert(total_data_size % sinfo.get_chunk_size() == 0);
+
+ ceph_assert(out);
+ ceph_assert(out->length() == 0);
+
+ for (map<int, bufferlist>::iterator i = to_decode.begin();
+ i != to_decode.end();
+ ++i) {
+ ceph_assert(i->second.length() == total_data_size);
+ }
+
+ if (total_data_size == 0)
+ return 0;
+
+ for (uint64_t i = 0; i < total_data_size; i += sinfo.get_chunk_size()) {
+ map<int, bufferlist> chunks;
+ for (map<int, bufferlist>::iterator j = to_decode.begin();
+ j != to_decode.end();
+ ++j) {
+ chunks[j->first].substr_of(j->second, i, sinfo.get_chunk_size());
+ }
+ bufferlist bl;
+ int r = ec_impl->decode_concat(chunks, &bl);
+ ceph_assert(r == 0);
+ ceph_assert(bl.length() == sinfo.get_stripe_width());
+ out->claim_append(bl);
+ }
+ return 0;
+}
+
+int ECUtil::decode(
+ const stripe_info_t &sinfo,
+ ErasureCodeInterfaceRef &ec_impl,
+ map<int, bufferlist> &to_decode,
+ map<int, bufferlist*> &out) {
+
+ ceph_assert(to_decode.size());
+
+ for (auto &&i : to_decode) {
+ if(i.second.length() == 0)
+ return 0;
+ }
+
+ set<int> need;
+ for (map<int, bufferlist*>::iterator i = out.begin();
+ i != out.end();
+ ++i) {
+ ceph_assert(i->second);
+ ceph_assert(i->second->length() == 0);
+ need.insert(i->first);
+ }
+
+ set<int> avail;
+ for (auto &&i : to_decode) {
+ ceph_assert(i.second.length() != 0);
+ avail.insert(i.first);
+ }
+
+ map<int, vector<pair<int, int>>> min;
+ int r = ec_impl->minimum_to_decode(need, avail, &min);
+ ceph_assert(r == 0);
+
+ int chunks_count = 0;
+ int repair_data_per_chunk = 0;
+ int subchunk_size = sinfo.get_chunk_size()/ec_impl->get_sub_chunk_count();
+
+ for (auto &&i : to_decode) {
+ auto found = min.find(i.first);
+ if (found != min.end()) {
+ int repair_subchunk_count = 0;
+ for (auto& subchunks : min[i.first]) {
+ repair_subchunk_count += subchunks.second;
+ }
+ repair_data_per_chunk = repair_subchunk_count * subchunk_size;
+ chunks_count = (int)i.second.length() / repair_data_per_chunk;
+ break;
+ }
+ }
+
+ for (int i = 0; i < chunks_count; i++) {
+ map<int, bufferlist> chunks;
+ for (auto j = to_decode.begin();
+ j != to_decode.end();
+ ++j) {
+ chunks[j->first].substr_of(j->second,
+ i*repair_data_per_chunk,
+ repair_data_per_chunk);
+ }
+ map<int, bufferlist> out_bls;
+ r = ec_impl->decode(need, chunks, &out_bls, sinfo.get_chunk_size());
+ ceph_assert(r == 0);
+ for (auto j = out.begin(); j != out.end(); ++j) {
+ ceph_assert(out_bls.count(j->first));
+ ceph_assert(out_bls[j->first].length() == sinfo.get_chunk_size());
+ j->second->claim_append(out_bls[j->first]);
+ }
+ }
+ for (auto &&i : out) {
+ ceph_assert(i.second->length() == chunks_count * sinfo.get_chunk_size());
+ }
+ return 0;
+}
+
+int ECUtil::encode(
+ const stripe_info_t &sinfo,
+ ErasureCodeInterfaceRef &ec_impl,
+ bufferlist &in,
+ const set<int> &want,
+ map<int, bufferlist> *out) {
+
+ uint64_t logical_size = in.length();
+
+ ceph_assert(logical_size % sinfo.get_stripe_width() == 0);
+ ceph_assert(out);
+ ceph_assert(out->empty());
+
+ if (logical_size == 0)
+ return 0;
+
+ for (uint64_t i = 0; i < logical_size; i += sinfo.get_stripe_width()) {
+ map<int, bufferlist> encoded;
+ bufferlist buf;
+ buf.substr_of(in, i, sinfo.get_stripe_width());
+ int r = ec_impl->encode(want, buf, &encoded);
+ ceph_assert(r == 0);
+ for (map<int, bufferlist>::iterator i = encoded.begin();
+ i != encoded.end();
+ ++i) {
+ ceph_assert(i->second.length() == sinfo.get_chunk_size());
+ (*out)[i->first].claim_append(i->second);
+ }
+ }
+
+ for (map<int, bufferlist>::iterator i = out->begin();
+ i != out->end();
+ ++i) {
+ ceph_assert(i->second.length() % sinfo.get_chunk_size() == 0);
+ ceph_assert(
+ sinfo.aligned_chunk_offset_to_logical_offset(i->second.length()) ==
+ logical_size);
+ }
+ return 0;
+}
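+
+// Example of the encode() contract above (the 2+1 profile and sizes are
+// assumptions for illustration): with chunk_size = 4096 the stripe width is
+// 8192, so a stripe-aligned 16384-byte input and want = {0, 1, 2} produce
+// out = {0: 8192B, 1: 8192B, 2: 8192B} -- one chunk_size chunk appended per
+// wanted shard for each of the two input stripes.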
+
+void ECUtil::HashInfo::append(uint64_t old_size,
+ map<int, bufferlist> &to_append) {
+ ceph_assert(old_size == total_chunk_size);
+ uint64_t size_to_append = to_append.begin()->second.length();
+ if (has_chunk_hash()) {
+ ceph_assert(to_append.size() == cumulative_shard_hashes.size());
+ for (map<int, bufferlist>::iterator i = to_append.begin();
+ i != to_append.end();
+ ++i) {
+ ceph_assert(size_to_append == i->second.length());
+ ceph_assert((unsigned)i->first < cumulative_shard_hashes.size());
+ uint32_t new_hash = i->second.crc32c(cumulative_shard_hashes[i->first]);
+ cumulative_shard_hashes[i->first] = new_hash;
+ }
+ }
+ total_chunk_size += size_to_append;
+}
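+
+// Sketch of the running hash maintained above (sizes assumed): appending a
+// 4096-byte chunk per shard at old_size 0 and again at old_size 4096 leaves
+// total_chunk_size == 8192 and each shard hash equal to
+// crc32c(second_chunk, seeded with crc32c(first_chunk, initial seed)).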
+
+void ECUtil::HashInfo::encode(bufferlist &bl) const
+{
+ ENCODE_START(1, 1, bl);
+ encode(total_chunk_size, bl);
+ encode(cumulative_shard_hashes, bl);
+ ENCODE_FINISH(bl);
+}
+
+void ECUtil::HashInfo::decode(bufferlist::const_iterator &bl)
+{
+ DECODE_START(1, bl);
+ decode(total_chunk_size, bl);
+ decode(cumulative_shard_hashes, bl);
+ projected_total_chunk_size = total_chunk_size;
+ DECODE_FINISH(bl);
+}
+
+void ECUtil::HashInfo::dump(Formatter *f) const
+{
+ f->dump_unsigned("total_chunk_size", total_chunk_size);
+ f->open_array_section("cumulative_shard_hashes");
+ for (unsigned i = 0; i != cumulative_shard_hashes.size(); ++i) {
+ f->open_object_section("hash");
+ f->dump_unsigned("shard", i);
+ f->dump_unsigned("hash", cumulative_shard_hashes[i]);
+ f->close_section();
+ }
+ f->close_section();
+}
+
+namespace ECUtil {
+std::ostream& operator<<(std::ostream& out, const HashInfo& hi)
+{
+ ostringstream hashes;
+ for (auto hash: hi.cumulative_shard_hashes)
+ hashes << " " << hex << hash;
+ return out << "tcs=" << hi.total_chunk_size << hashes.str();
+}
+}
+
+void ECUtil::HashInfo::generate_test_instances(list<HashInfo*>& o)
+{
+ o.push_back(new HashInfo(3));
+ {
+ bufferlist bl;
+ bl.append_zero(20);
+ map<int, bufferlist> buffers;
+ buffers[0] = bl;
+ buffers[1] = bl;
+ buffers[2] = bl;
+ o.back()->append(0, buffers);
+ o.back()->append(20, buffers);
+ }
+ o.push_back(new HashInfo(4));
+}
+
+const string HINFO_KEY = "hinfo_key";
+
+bool ECUtil::is_hinfo_key_string(const string &key)
+{
+ return key == HINFO_KEY;
+}
+
+const string &ECUtil::get_hinfo_key()
+{
+ return HINFO_KEY;
+}
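
For context, a minimal sketch of how ECUtil::encode() above might be driven, assuming a Ceph build tree with an already-initialized ceph::ErasureCodeInterfaceRef (the k=4 data chunks and 4 KiB stripe are illustrative assumptions, not part of this change):

#include "osd/ECUtil.h"

// Encode one stripe-aligned object into per-shard chunk buffers.
std::map<int, ceph::buffer::list> encode_object(
  ceph::ErasureCodeInterfaceRef &ec_impl,   // assumed k=4 data + m coding shards
  ceph::buffer::list &data)                 // length must be stripe aligned
{
  ECUtil::stripe_info_t sinfo(4, 4096);     // 4 data chunks, 4 KiB stripe -> 1 KiB chunks
  std::set<int> want;
  for (unsigned i = 0; i < ec_impl->get_chunk_count(); ++i)
    want.insert(i);                         // ask for every shard, data and coding
  std::map<int, ceph::buffer::list> out;
  int r = ECUtil::encode(sinfo, ec_impl, data, want, &out);
  ceph_assert(r == 0);
  // each out[shard] now holds data.length()/4 bytes: one chunk per input stripe
  return out;
}

The trailing asserts in ECUtil::encode() above are what guarantee each returned shard buffer maps back to the original logical size.
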
diff --git a/src/osd/ECUtil.h b/src/osd/ECUtil.h
new file mode 100644
index 000000000..dce78b8a8
--- /dev/null
+++ b/src/osd/ECUtil.h
@@ -0,0 +1,169 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Inktank Storage, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef ECUTIL_H
+#define ECUTIL_H
+
+#include <ostream>
+#include "erasure-code/ErasureCodeInterface.h"
+#include "include/buffer_fwd.h"
+#include "include/ceph_assert.h"
+#include "include/encoding.h"
+#include "common/Formatter.h"
+
+namespace ECUtil {
+
+class stripe_info_t {
+ const uint64_t stripe_width;
+ const uint64_t chunk_size;
+public:
+ stripe_info_t(uint64_t stripe_size, uint64_t stripe_width)
+ : stripe_width(stripe_width),
+ chunk_size(stripe_width / stripe_size) {
+ ceph_assert(stripe_width % stripe_size == 0);
+ }
+ bool logical_offset_is_stripe_aligned(uint64_t logical) const {
+ return (logical % stripe_width) == 0;
+ }
+ uint64_t get_stripe_width() const {
+ return stripe_width;
+ }
+ uint64_t get_chunk_size() const {
+ return chunk_size;
+ }
+ uint64_t logical_to_prev_chunk_offset(uint64_t offset) const {
+ return (offset / stripe_width) * chunk_size;
+ }
+ uint64_t logical_to_next_chunk_offset(uint64_t offset) const {
+ return ((offset + stripe_width - 1)/ stripe_width) * chunk_size;
+ }
+ uint64_t logical_to_prev_stripe_offset(uint64_t offset) const {
+ return offset - (offset % stripe_width);
+ }
+ uint64_t logical_to_next_stripe_offset(uint64_t offset) const {
+ return ((offset % stripe_width) ?
+ (offset - (offset % stripe_width) + stripe_width) :
+ offset);
+ }
+ uint64_t aligned_logical_offset_to_chunk_offset(uint64_t offset) const {
+ ceph_assert(offset % stripe_width == 0);
+ return (offset / stripe_width) * chunk_size;
+ }
+ uint64_t aligned_chunk_offset_to_logical_offset(uint64_t offset) const {
+ ceph_assert(offset % chunk_size == 0);
+ return (offset / chunk_size) * stripe_width;
+ }
+ std::pair<uint64_t, uint64_t> aligned_offset_len_to_chunk(
+ std::pair<uint64_t, uint64_t> in) const {
+ return std::make_pair(
+ aligned_logical_offset_to_chunk_offset(in.first),
+ aligned_logical_offset_to_chunk_offset(in.second));
+ }
+ std::pair<uint64_t, uint64_t> offset_len_to_stripe_bounds(
+ std::pair<uint64_t, uint64_t> in) const {
+ uint64_t off = logical_to_prev_stripe_offset(in.first);
+ uint64_t len = logical_to_next_stripe_offset(
+ (in.first - off) + in.second);
+ return std::make_pair(off, len);
+ }
+};
+
+int decode(
+ const stripe_info_t &sinfo,
+ ceph::ErasureCodeInterfaceRef &ec_impl,
+ std::map<int, ceph::buffer::list> &to_decode,
+ ceph::buffer::list *out);
+
+int decode(
+ const stripe_info_t &sinfo,
+ ceph::ErasureCodeInterfaceRef &ec_impl,
+ std::map<int, ceph::buffer::list> &to_decode,
+ std::map<int, ceph::buffer::list*> &out);
+
+int encode(
+ const stripe_info_t &sinfo,
+ ceph::ErasureCodeInterfaceRef &ec_impl,
+ ceph::buffer::list &in,
+ const std::set<int> &want,
+ std::map<int, ceph::buffer::list> *out);
+
+class HashInfo {
+ uint64_t total_chunk_size = 0;
+ std::vector<uint32_t> cumulative_shard_hashes;
+
+ // purely ephemeral, represents the size once all in-flight ops commit
+ uint64_t projected_total_chunk_size = 0;
+public:
+ HashInfo() {}
+ explicit HashInfo(unsigned num_chunks) :
+ cumulative_shard_hashes(num_chunks, -1) {}
+ void append(uint64_t old_size, std::map<int, ceph::buffer::list> &to_append);
+ void clear() {
+ total_chunk_size = 0;
+ cumulative_shard_hashes = std::vector<uint32_t>(
+ cumulative_shard_hashes.size(),
+ -1);
+ }
+ void encode(ceph::buffer::list &bl) const;
+ void decode(ceph::buffer::list::const_iterator &bl);
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<HashInfo*>& o);
+ uint32_t get_chunk_hash(int shard) const {
+ ceph_assert((unsigned)shard < cumulative_shard_hashes.size());
+ return cumulative_shard_hashes[shard];
+ }
+ uint64_t get_total_chunk_size() const {
+ return total_chunk_size;
+ }
+ uint64_t get_projected_total_chunk_size() const {
+ return projected_total_chunk_size;
+ }
+ uint64_t get_total_logical_size(const stripe_info_t &sinfo) const {
+ return get_total_chunk_size() *
+ (sinfo.get_stripe_width()/sinfo.get_chunk_size());
+ }
+ uint64_t get_projected_total_logical_size(const stripe_info_t &sinfo) const {
+ return get_projected_total_chunk_size() *
+ (sinfo.get_stripe_width()/sinfo.get_chunk_size());
+ }
+ void set_projected_total_logical_size(
+ const stripe_info_t &sinfo,
+ uint64_t logical_size) {
+ ceph_assert(sinfo.logical_offset_is_stripe_aligned(logical_size));
+ projected_total_chunk_size = sinfo.aligned_logical_offset_to_chunk_offset(
+ logical_size);
+ }
+ void set_total_chunk_size_clear_hash(uint64_t new_chunk_size) {
+ cumulative_shard_hashes.clear();
+ total_chunk_size = new_chunk_size;
+ }
+ bool has_chunk_hash() const {
+ return !cumulative_shard_hashes.empty();
+ }
+ void update_to(const HashInfo &rhs) {
+ auto ptcs = projected_total_chunk_size;
+ *this = rhs;
+ projected_total_chunk_size = ptcs;
+ }
+ friend std::ostream& operator<<(std::ostream& out, const HashInfo& hi);
+};
+
+typedef std::shared_ptr<HashInfo> HashInfoRef;
+
+bool is_hinfo_key_string(const std::string &key);
+const std::string &get_hinfo_key();
+
+WRITE_CLASS_ENCODER(ECUtil::HashInfo)
+}
+#endif
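
A quick worked example of the stripe_info_t offset arithmetic defined above; the k=4 data chunks and 4 KiB stripe are illustrative values only, and the sketch assumes the Ceph tree:

#include "osd/ECUtil.h"

void stripe_math_example()
{
  ECUtil::stripe_info_t sinfo(4, 4096);   // k=4 data chunks, 1 KiB per chunk

  uint64_t prev = sinfo.logical_to_prev_stripe_offset(5000);          // 4096
  uint64_t next = sinfo.logical_to_next_stripe_offset(5000);          // 8192
  uint64_t coff = sinfo.aligned_logical_offset_to_chunk_offset(next); // 2048 on each shard

  // widen an arbitrary (offset, length) pair to full stripe bounds
  auto bounds = sinfo.offset_len_to_stripe_bounds({5000, 100});       // {4096, 4096}
  ceph_assert(prev == 4096 && coff == 2048 && bounds.second == 4096);
}

This is the arithmetic the EC backend relies on to turn logical client extents into per-shard chunk reads and writes.
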
diff --git a/src/osd/ExtentCache.cc b/src/osd/ExtentCache.cc
new file mode 100644
index 000000000..3a8bbf11b
--- /dev/null
+++ b/src/osd/ExtentCache.cc
@@ -0,0 +1,245 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "ExtentCache.h"
+
+using std::ostream;
+
+using ceph::bufferlist;
+
+void ExtentCache::extent::_link_pin_state(pin_state &pin_state)
+{
+ ceph_assert(parent_extent_set);
+ ceph_assert(!parent_pin_state);
+ parent_pin_state = &pin_state;
+ pin_state.pin_list.push_back(*this);
+}
+
+void ExtentCache::extent::_unlink_pin_state()
+{
+ ceph_assert(parent_extent_set);
+ ceph_assert(parent_pin_state);
+ auto liter = pin_state::list::s_iterator_to(*this);
+ parent_pin_state->pin_list.erase(liter);
+ parent_pin_state = nullptr;
+}
+
+void ExtentCache::extent::unlink()
+{
+ ceph_assert(parent_extent_set);
+ ceph_assert(parent_pin_state);
+
+ _unlink_pin_state();
+
+ // remove from extent set
+ {
+ auto siter = object_extent_set::set::s_iterator_to(*this);
+ auto &set = object_extent_set::set::container_from_iterator(siter);
+ ceph_assert(&set == &(parent_extent_set->extent_set));
+ set.erase(siter);
+ }
+
+ parent_extent_set = nullptr;
+ ceph_assert(!parent_pin_state);
+}
+
+void ExtentCache::extent::link(
+ object_extent_set &extent_set,
+ pin_state &pin_state)
+{
+ ceph_assert(!parent_extent_set);
+ parent_extent_set = &extent_set;
+ extent_set.extent_set.insert(*this);
+
+ _link_pin_state(pin_state);
+}
+
+void ExtentCache::extent::move(
+ pin_state &to)
+{
+ _unlink_pin_state();
+ _link_pin_state(to);
+}
+
+void ExtentCache::remove_and_destroy_if_empty(object_extent_set &eset)
+{
+ if (eset.extent_set.empty()) {
+ auto siter = cache_set::s_iterator_to(eset);
+ auto &set = cache_set::container_from_iterator(siter);
+ ceph_assert(&set == &per_object_caches);
+
+ // per_object_caches owns eset
+ per_object_caches.erase(eset);
+ delete &eset;
+ }
+}
+
+ExtentCache::object_extent_set &ExtentCache::get_or_create(
+ const hobject_t &oid)
+{
+ cache_set::insert_commit_data data;
+ auto p = per_object_caches.insert_check(oid, Cmp(), data);
+ if (p.second) {
+ auto *eset = new object_extent_set(oid);
+ per_object_caches.insert_commit(*eset, data);
+ return *eset;
+ } else {
+ return *(p.first);
+ }
+}
+
+ExtentCache::object_extent_set *ExtentCache::get_if_exists(
+ const hobject_t &oid)
+{
+ cache_set::insert_commit_data data;
+ auto p = per_object_caches.insert_check(oid, Cmp(), data);
+ if (p.second) {
+ return nullptr;
+ } else {
+ return &*(p.first);
+ }
+}
+
+std::pair<
+ ExtentCache::object_extent_set::set::iterator,
+ ExtentCache::object_extent_set::set::iterator
+ > ExtentCache::object_extent_set::get_containing_range(
+ uint64_t off, uint64_t len)
+{
+ // fst is first iterator with end after off (may be end)
+ auto fst = extent_set.upper_bound(off, uint_cmp());
+ if (fst != extent_set.begin())
+ --fst;
+ if (fst != extent_set.end() && off >= (fst->offset + fst->get_length()))
+ ++fst;
+
+ // lst is first iterator with start >= off + len (may be end)
+ auto lst = extent_set.lower_bound(off + len, uint_cmp());
+ return std::make_pair(fst, lst);
+}
+
+extent_set ExtentCache::reserve_extents_for_rmw(
+ const hobject_t &oid,
+ write_pin &pin,
+ const extent_set &to_write,
+ const extent_set &to_read)
+{
+ if (to_write.empty() && to_read.empty()) {
+ return extent_set();
+ }
+ extent_set must_read;
+ auto &eset = get_or_create(oid);
+ extent_set missing;
+ for (auto &&res: to_write) {
+ eset.traverse_update(
+ pin,
+ res.first,
+ res.second,
+ [&](uint64_t off, uint64_t len,
+ extent *ext, object_extent_set::update_action *action) {
+ action->action = object_extent_set::update_action::UPDATE_PIN;
+ if (!ext) {
+ missing.insert(off, len);
+ }
+ });
+ }
+ must_read.intersection_of(
+ to_read,
+ missing);
+ return must_read;
+}
+
+extent_map ExtentCache::get_remaining_extents_for_rmw(
+ const hobject_t &oid,
+ write_pin &pin,
+ const extent_set &to_get)
+{
+ if (to_get.empty()) {
+ return extent_map();
+ }
+ extent_map ret;
+ auto &eset = get_or_create(oid);
+ for (auto &&res: to_get) {
+ bufferlist bl;
+ uint64_t cur = res.first;
+ eset.traverse_update(
+ pin,
+ res.first,
+ res.second,
+ [&](uint64_t off, uint64_t len,
+ extent *ext, object_extent_set::update_action *action) {
+ ceph_assert(off == cur);
+ cur = off + len;
+ action->action = object_extent_set::update_action::NONE;
+ ceph_assert(ext && ext->bl && ext->pinned_by_write());
+ bl.substr_of(
+ *(ext->bl),
+ off - ext->offset,
+ len);
+ ret.insert(off, len, bl);
+ });
+ }
+ return ret;
+}
+
+void ExtentCache::present_rmw_update(
+ const hobject_t &oid,
+ write_pin &pin,
+ const extent_map &extents)
+{
+ if (extents.empty()) {
+ return;
+ }
+ auto &eset = get_or_create(oid);
+ for (auto &&res: extents) {
+ eset.traverse_update(
+ pin,
+ res.get_off(),
+ res.get_len(),
+ [&](uint64_t off, uint64_t len,
+ extent *ext, object_extent_set::update_action *action) {
+ action->action = object_extent_set::update_action::NONE;
+ ceph_assert(ext && ext->pinned_by_write());
+ action->bl = bufferlist();
+ action->bl->substr_of(
+ res.get_val(),
+ off - res.get_off(),
+ len);
+ });
+ }
+}
+
+ostream &ExtentCache::print(ostream &out) const
+{
+ out << "ExtentCache(" << std::endl;
+ for (auto esiter = per_object_caches.begin();
+ esiter != per_object_caches.end();
+ ++esiter) {
+ out << " Extents(" << esiter->oid << ")[" << std::endl;
+ for (auto exiter = esiter->extent_set.begin();
+ exiter != esiter->extent_set.end();
+ ++exiter) {
+ out << " Extent(" << exiter->offset
+ << "~" << exiter->get_length()
+ << ":" << exiter->pin_tid()
+ << ")" << std::endl;
+ }
+ }
+ return out << ")" << std::endl;
+}
+
+ostream &operator<<(ostream &lhs, const ExtentCache &cache)
+{
+ return cache.print(lhs);
+}
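
A hedged sketch of the intended calling sequence for one overlapping overwrite, following the doc comments in ExtentCache.h below; the object and ranges are placeholders and the code assumes the Ceph tree:

#include "osd/ExtentCache.h"

void rmw_pipeline_sketch(const hobject_t &oid)
{
  ExtentCache cache;
  ExtentCache::write_pin pin;
  cache.open_write_pin(pin);                 // assigns the write a tid

  extent_set to_write, to_read;
  to_write.insert(0, 8192);                  // range being overwritten
  to_read.insert(0, 8192);                   // range needed for read-modify-write

  // Pin the extents; must_read is whatever neither the cache nor an
  // earlier pinned write can already supply.
  extent_set must_read =
    cache.reserve_extents_for_rmw(oid, pin, to_write, to_read);

  // ... read must_read from the backend, then collect what the cache had.
  extent_set cached = to_read;
  cached.subtract(must_read);
  extent_map have = cache.get_remaining_extents_for_rmw(oid, pin, cached);
  (void)have;

  // After computing the new buffers, publish them so later pinned writes
  // can reuse them instead of reading from disk.
  extent_map written;                        // would be filled with the new content
  cache.present_rmw_update(oid, pin, written);

  // Once the write has committed on all shards, drop the pin.
  cache.release_write_pin(pin);
}
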
diff --git a/src/osd/ExtentCache.h b/src/osd/ExtentCache.h
new file mode 100644
index 000000000..972228cd0
--- /dev/null
+++ b/src/osd/ExtentCache.h
@@ -0,0 +1,489 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef EXTENT_CACHE_H
+#define EXTENT_CACHE_H
+
+#include <map>
+#include <list>
+#include <vector>
+#include <utility>
+#include <optional>
+#include <boost/intrusive/set.hpp>
+#include <boost/intrusive/list.hpp>
+#include "include/interval_set.h"
+#include "common/interval_map.h"
+#include "include/buffer.h"
+#include "common/hobject.h"
+
+/**
+ ExtentCache
+
+ The main purpose of this cache is to ensure that we can pipeline
+ overlapping partial overwrites.
+
+ To that end we need to ensure that an extent pinned for an operation is
+ live until that operation completes. However, a particular extent
+ might be pinned by multiple operations (several pipelined writes
+ on the same object).
+
+  1) When we complete an operation, we only look at extents owned solely
+     by that operation.
+  2) Per-extent overhead is fixed size.
+  3) Per-operation metadata is fixed size.
+
+ This is simple enough to realize with two main structures:
+ - extent: contains a pointer to the pin owning it and intrusive list
+ pointers to other extents owned by the same pin
+ - pin_state: contains the list head for extents owned by it
+
+ This works as long as we only need to remember one "owner" for
+ each extent. To make this work, we'll need to leverage some
+ invariants guaranteed by higher layers:
+
+ 1) Writes on a particular object must be ordered
+ 2) A particular object will have outstanding reads or writes, but not
+ both (note that you can have a read while a write is committed, but
+ not applied).
+
+ Our strategy therefore will be to have whichever in-progress op will
+ finish "last" be the owner of a particular extent. For now, we won't
+ cache reads, so 2) simply means that we can assume that reads and
+ recovery operations imply no unstable extents on the object in
+ question.
+
+ Write: WaitRead -> WaitCommit -> Complete
+
+ Invariant 1) above actually indicates that we can't have writes
+ bypassing the WaitRead state while there are writes waiting on
+ Reads. Thus, the set of operations pinning a particular extent
+  must always complete in order of arrival.
+
+ This suggests that a particular extent may be in only the following
+ states:
+
+ 0) Empty (not in the map at all)
+ 1) Write Pending N
+ - Some write with reqid <= N is currently fetching the data for
+ this extent
+ - The extent must persist until Write reqid N completes
+ - All ops pinning this extent are writes in the WaitRead state of
+ the Write pipeline (there must be an in progress write, so no
+ reads can be in progress).
+ 2) Write Pinned N:
+ - This extent has data corresponding to some reqid M <= N
+ - The extent must persist until Write reqid N commits
+ - All ops pinning this extent are writes in some Write
+ state (all are possible). Reads are not possible
+ in this state (or the others) due to 2).
+
+ All of the above suggests that there are 3 things users can
+  ask of the cache corresponding to the 3 Write pipeline
+  states.
+ */
+
+/// If someone wants these types, but not ExtentCache, move to another file
+struct bl_split_merge {
+ ceph::buffer::list split(
+ uint64_t offset,
+ uint64_t length,
+ ceph::buffer::list &bl) const {
+ ceph::buffer::list out;
+ out.substr_of(bl, offset, length);
+ return out;
+ }
+ bool can_merge(const ceph::buffer::list &left, const ceph::buffer::list &right) const {
+ return true;
+ }
+ ceph::buffer::list merge(ceph::buffer::list &&left, ceph::buffer::list &&right) const {
+ ceph::buffer::list bl{std::move(left)};
+ bl.claim_append(right);
+ return bl;
+ }
+ uint64_t length(const ceph::buffer::list &b) const { return b.length(); }
+};
+using extent_set = interval_set<uint64_t>;
+using extent_map = interval_map<uint64_t, ceph::buffer::list, bl_split_merge>;
+
+class ExtentCache {
+ struct object_extent_set;
+ struct pin_state;
+private:
+
+ struct extent {
+ object_extent_set *parent_extent_set = nullptr;
+ pin_state *parent_pin_state = nullptr;
+ boost::intrusive::set_member_hook<> extent_set_member;
+ boost::intrusive::list_member_hook<> pin_list_member;
+
+ uint64_t offset;
+ uint64_t length;
+ std::optional<ceph::buffer::list> bl;
+
+ uint64_t get_length() const {
+ return length;
+ }
+
+ bool is_pending() const {
+ return bl == std::nullopt;
+ }
+
+ bool pinned_by_write() const {
+ ceph_assert(parent_pin_state);
+ return parent_pin_state->is_write();
+ }
+
+ uint64_t pin_tid() const {
+ ceph_assert(parent_pin_state);
+ return parent_pin_state->tid;
+ }
+
+ extent(uint64_t offset, ceph::buffer::list _bl)
+ : offset(offset), length(_bl.length()), bl(_bl) {}
+
+ extent(uint64_t offset, uint64_t length)
+ : offset(offset), length(length) {}
+
+ bool operator<(const extent &rhs) const {
+ return offset < rhs.offset;
+ }
+ private:
+ // can briefly violate the two link invariant, used in unlink() and move()
+ void _link_pin_state(pin_state &pin_state);
+ void _unlink_pin_state();
+ public:
+ void unlink();
+ void link(object_extent_set &parent_extent_set, pin_state &pin_state);
+ void move(pin_state &to);
+ };
+
+ struct object_extent_set : boost::intrusive::set_base_hook<> {
+ hobject_t oid;
+ explicit object_extent_set(const hobject_t &oid) : oid(oid) {}
+
+ using set_member_options = boost::intrusive::member_hook<
+ extent,
+ boost::intrusive::set_member_hook<>,
+ &extent::extent_set_member>;
+ using set = boost::intrusive::set<extent, set_member_options>;
+ set extent_set;
+
+ bool operator<(const object_extent_set &rhs) const {
+ return oid < rhs.oid;
+ }
+
+ struct uint_cmp {
+ bool operator()(uint64_t lhs, const extent &rhs) const {
+ return lhs < rhs.offset;
+ }
+ bool operator()(const extent &lhs, uint64_t rhs) const {
+ return lhs.offset < rhs;
+ }
+ };
+ std::pair<set::iterator, set::iterator> get_containing_range(
+ uint64_t offset, uint64_t length);
+
+ void erase(uint64_t offset, uint64_t length);
+
+ struct update_action {
+ enum type {
+ NONE,
+ UPDATE_PIN
+ };
+ type action = NONE;
+ std::optional<ceph::buffer::list> bl;
+ };
+ template <typename F>
+ void traverse_update(
+ pin_state &pin,
+ uint64_t offset,
+ uint64_t length,
+ F &&f) {
+ auto range = get_containing_range(offset, length);
+
+ if (range.first == range.second || range.first->offset > offset) {
+ uint64_t extlen = range.first == range.second ?
+ length : range.first->offset - offset;
+
+ update_action action;
+ f(offset, extlen, nullptr, &action);
+ ceph_assert(!action.bl || action.bl->length() == extlen);
+ if (action.action == update_action::UPDATE_PIN) {
+ extent *ext = action.bl ?
+ new extent(offset, *action.bl) :
+ new extent(offset, extlen);
+ ext->link(*this, pin);
+ } else {
+ ceph_assert(!action.bl);
+ }
+ }
+
+ for (auto p = range.first; p != range.second;) {
+ extent *ext = &*p;
+ ++p;
+
+ uint64_t extoff = std::max(ext->offset, offset);
+ uint64_t extlen = std::min(
+ ext->length - (extoff - ext->offset),
+ offset + length - extoff);
+
+ update_action action;
+ f(extoff, extlen, ext, &action);
+ ceph_assert(!action.bl || action.bl->length() == extlen);
+ extent *final_extent = nullptr;
+ if (action.action == update_action::NONE) {
+ final_extent = ext;
+ } else {
+ pin_state *ps = ext->parent_pin_state;
+ ext->unlink();
+ if ((ext->offset < offset) &&
+ (ext->offset + ext->get_length() > offset)) {
+ extent *head = nullptr;
+ if (ext->bl) {
+ ceph::buffer::list bl;
+ bl.substr_of(
+ *(ext->bl),
+ 0,
+ offset - ext->offset);
+ head = new extent(ext->offset, bl);
+ } else {
+ head = new extent(
+ ext->offset, offset - ext->offset);
+ }
+ head->link(*this, *ps);
+ }
+ if ((ext->offset + ext->length > offset + length) &&
+ (offset + length > ext->offset)) {
+ uint64_t nlen =
+ (ext->offset + ext->get_length()) - (offset + length);
+ extent *tail = nullptr;
+ if (ext->bl) {
+ ceph::buffer::list bl;
+ bl.substr_of(
+ *(ext->bl),
+ ext->get_length() - nlen,
+ nlen);
+ tail = new extent(offset + length, bl);
+ } else {
+ tail = new extent(offset + length, nlen);
+ }
+ tail->link(*this, *ps);
+ }
+ if (action.action == update_action::UPDATE_PIN) {
+ if (ext->bl) {
+ ceph::buffer::list bl;
+ bl.substr_of(
+ *(ext->bl),
+ extoff - ext->offset,
+ extlen);
+ final_extent = new ExtentCache::extent(
+ extoff,
+ bl);
+ } else {
+ final_extent = new ExtentCache::extent(
+ extoff, extlen);
+ }
+ final_extent->link(*this, pin);
+ }
+ delete ext;
+ }
+
+ if (action.bl) {
+ ceph_assert(final_extent);
+ ceph_assert(final_extent->length == action.bl->length());
+ final_extent->bl = *(action.bl);
+ }
+
+ uint64_t next_off = p == range.second ?
+ offset + length : p->offset;
+ if (extoff + extlen < next_off) {
+ uint64_t tailoff = extoff + extlen;
+ uint64_t taillen = next_off - tailoff;
+
+ update_action action;
+ f(tailoff, taillen, nullptr, &action);
+ ceph_assert(!action.bl || action.bl->length() == taillen);
+ if (action.action == update_action::UPDATE_PIN) {
+ extent *ext = action.bl ?
+ new extent(tailoff, *action.bl) :
+ new extent(tailoff, taillen);
+ ext->link(*this, pin);
+ } else {
+ ceph_assert(!action.bl);
+ }
+ }
+ }
+ }
+ };
+ struct Cmp {
+ bool operator()(const hobject_t &oid, const object_extent_set &rhs) const {
+ return oid < rhs.oid;
+ }
+ bool operator()(const object_extent_set &lhs, const hobject_t &oid) const {
+ return lhs.oid < oid;
+ }
+ };
+
+ object_extent_set &get_or_create(const hobject_t &oid);
+ object_extent_set *get_if_exists(const hobject_t &oid);
+
+ void remove_and_destroy_if_empty(object_extent_set &set);
+ using cache_set = boost::intrusive::set<object_extent_set>;
+ cache_set per_object_caches;
+
+ uint64_t next_write_tid = 1;
+ uint64_t next_read_tid = 1;
+ struct pin_state {
+ uint64_t tid = 0;
+ enum pin_type_t {
+ NONE,
+ WRITE,
+ };
+ pin_type_t pin_type = NONE;
+ bool is_write() const { return pin_type == WRITE; }
+
+ pin_state(const pin_state &other) = delete;
+ pin_state &operator=(const pin_state &other) = delete;
+ pin_state(pin_state &&other) = delete;
+ pin_state() = default;
+
+ using list_member_options = boost::intrusive::member_hook<
+ extent,
+ boost::intrusive::list_member_hook<>,
+ &extent::pin_list_member>;
+ using list = boost::intrusive::list<extent, list_member_options>;
+ list pin_list;
+ ~pin_state() {
+ ceph_assert(pin_list.empty());
+ ceph_assert(tid == 0);
+ ceph_assert(pin_type == NONE);
+ }
+ void _open(uint64_t in_tid, pin_type_t in_type) {
+ ceph_assert(pin_type == NONE);
+ ceph_assert(in_tid > 0);
+ tid = in_tid;
+ pin_type = in_type;
+ }
+ };
+
+ void release_pin(pin_state &p) {
+ for (auto iter = p.pin_list.begin(); iter != p.pin_list.end(); ) {
+ std::unique_ptr<extent> extent(&*iter); // we now own this
+ iter++; // unlink will invalidate
+ ceph_assert(extent->parent_extent_set);
+ auto &eset = *(extent->parent_extent_set);
+ extent->unlink();
+ remove_and_destroy_if_empty(eset);
+ }
+ p.tid = 0;
+ p.pin_type = pin_state::NONE;
+ }
+
+public:
+ class write_pin : private pin_state {
+ friend class ExtentCache;
+ private:
+ void open(uint64_t in_tid) {
+ _open(in_tid, pin_state::WRITE);
+ }
+ public:
+ write_pin() : pin_state() {}
+ };
+
+ void open_write_pin(write_pin &pin) {
+ pin.open(next_write_tid++);
+ }
+
+ /**
+   * Reserves extents required for rmw, and learns
+ * which need to be read
+ *
+ * Pins all extents in to_write. Returns subset of to_read not
+ * currently present in the cache. Caller must obtain those
+ * extents before calling get_remaining_extents_for_rmw.
+ *
+ * Transition table:
+ * - Empty -> Write Pending pin.reqid
+ * - Write Pending N -> Write Pending pin.reqid
+ * - Write Pinned N -> Write Pinned pin.reqid
+ *
+ * @param oid [in] object undergoing rmw
+ * @param pin [in,out] pin to use (obtained from create_write_pin)
+ * @param to_write [in] extents which will be written
+ * @param to_read [in] extents to read prior to write (must be subset
+ * of to_write)
+ * @return subset of to_read which isn't already present or pending
+ */
+ extent_set reserve_extents_for_rmw(
+ const hobject_t &oid,
+ write_pin &pin,
+ const extent_set &to_write,
+ const extent_set &to_read);
+
+ /**
+ * Gets extents required for rmw not returned from
+ * reserve_extents_for_rmw
+ *
+ * Requested extents (to_get) must be the set to_read \ the set
+ * returned from reserve_extents_for_rmw. No transition table,
+ * all extents at this point must be present and already pinned
+ * for this pin by reserve_extents_for_rmw.
+ *
+ * @param oid [in] object
+ * @param pin [in,out] pin associated with this IO
+ * @param to_get [in] extents to get (see above for restrictions)
+ * @return map of buffers from to_get
+ */
+ extent_map get_remaining_extents_for_rmw(
+ const hobject_t &oid,
+ write_pin &pin,
+ const extent_set &to_get);
+
+ /**
+ * Updates the cache to reflect the rmw write
+ *
+ * All presented extents must already have been specified in
+ * reserve_extents_for_rmw under to_write.
+ *
+ * Transition table:
+ * - Empty -> invalid, must call reserve_extents_for_rmw first
+ * - Write Pending N -> Write Pinned N, update buffer
+ * (assert N >= pin.reqid)
+ * - Write Pinned N -> Update buffer (assert N >= pin.reqid)
+ *
+ * @param oid [in] object
+ * @param pin [in,out] pin associated with this IO
+ * @param extents [in] map of buffers to update
+ * @return void
+ */
+ void present_rmw_update(
+ const hobject_t &oid,
+ write_pin &pin,
+ const extent_map &extents);
+
+ /**
+ * Release all buffers pinned by pin
+ */
+ void release_write_pin(
+ write_pin &pin) {
+ release_pin(pin);
+ }
+
+ std::ostream &print(std::ostream &out) const;
+};
+
+std::ostream &operator <<(std::ostream &lhs, const ExtentCache &cache);
+
+#endif
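
The extent_set/extent_map aliases above are built on interval_set and interval_map; a small illustration of the extent_map value type, using the same get_off()/get_len()/get_val() accessors that present_rmw_update() relies on (assumes the Ceph tree, not part of this change):

#include "osd/ExtentCache.h"

void extent_map_example()
{
  extent_map m;

  ceph::buffer::list a;
  a.append_zero(4096);
  m.insert(0, 4096, a);        // one 4 KiB extent at offset 0

  ceph::buffer::list b;
  b.append_zero(1024);
  m.insert(8192, 1024, b);     // disjoint from the first extent, so no merge happens

  for (auto &&ext : m) {
    // the same accessors present_rmw_update() uses above
    ceph_assert(ext.get_val().length() == ext.get_len());
    ceph_assert(ext.get_off() == 0 || ext.get_off() == 8192);
  }
}
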
diff --git a/src/osd/HitSet.cc b/src/osd/HitSet.cc
new file mode 100644
index 000000000..03475d36f
--- /dev/null
+++ b/src/osd/HitSet.cc
@@ -0,0 +1,256 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Inktank <info@inktank.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "HitSet.h"
+#include "common/Formatter.h"
+
+using std::ostream;
+using std::list;
+using ceph::Formatter;
+
+// -- HitSet --
+
+HitSet::HitSet(const HitSet::Params& params)
+ : sealed(false)
+{
+ switch (params.get_type()) {
+ case TYPE_BLOOM:
+ {
+ BloomHitSet::Params *p =
+ static_cast<BloomHitSet::Params*>(params.impl.get());
+ impl.reset(new BloomHitSet(p));
+ }
+ break;
+
+ case TYPE_EXPLICIT_HASH:
+ impl.reset(new ExplicitHashHitSet(static_cast<ExplicitHashHitSet::Params*>(params.impl.get())));
+ break;
+
+ case TYPE_EXPLICIT_OBJECT:
+ impl.reset(new ExplicitObjectHitSet(static_cast<ExplicitObjectHitSet::Params*>(params.impl.get())));
+ break;
+
+ default:
+ assert (0 == "unknown HitSet type");
+ }
+}
+
+void HitSet::encode(ceph::buffer::list &bl) const
+{
+ ENCODE_START(1, 1, bl);
+ encode(sealed, bl);
+ if (impl) {
+ encode((__u8)impl->get_type(), bl);
+ impl->encode(bl);
+ } else {
+ encode((__u8)TYPE_NONE, bl);
+ }
+ ENCODE_FINISH(bl);
+}
+
+void HitSet::decode(ceph::buffer::list::const_iterator& bl)
+{
+ DECODE_START(1, bl);
+ decode(sealed, bl);
+ __u8 type;
+ decode(type, bl);
+ switch ((impl_type_t)type) {
+ case TYPE_EXPLICIT_HASH:
+ impl.reset(new ExplicitHashHitSet);
+ break;
+ case TYPE_EXPLICIT_OBJECT:
+ impl.reset(new ExplicitObjectHitSet);
+ break;
+ case TYPE_BLOOM:
+ impl.reset(new BloomHitSet);
+ break;
+ case TYPE_NONE:
+ impl.reset(NULL);
+ break;
+ default:
+ throw ceph::buffer::malformed_input("unrecognized HitMap type");
+ }
+ if (impl)
+ impl->decode(bl);
+ DECODE_FINISH(bl);
+}
+
+void HitSet::dump(Formatter *f) const
+{
+ f->dump_string("type", get_type_name());
+ f->dump_string("sealed", sealed ? "yes" : "no");
+ if (impl)
+ impl->dump(f);
+}
+
+void HitSet::generate_test_instances(list<HitSet*>& o)
+{
+ o.push_back(new HitSet);
+ o.push_back(new HitSet(new BloomHitSet(10, .1, 1)));
+ o.back()->insert(hobject_t());
+ o.back()->insert(hobject_t("asdf", "", CEPH_NOSNAP, 123, 1, ""));
+ o.back()->insert(hobject_t("qwer", "", CEPH_NOSNAP, 456, 1, ""));
+ o.push_back(new HitSet(new ExplicitHashHitSet));
+ o.back()->insert(hobject_t());
+ o.back()->insert(hobject_t("asdf", "", CEPH_NOSNAP, 123, 1, ""));
+ o.back()->insert(hobject_t("qwer", "", CEPH_NOSNAP, 456, 1, ""));
+ o.push_back(new HitSet(new ExplicitObjectHitSet));
+ o.back()->insert(hobject_t());
+ o.back()->insert(hobject_t("asdf", "", CEPH_NOSNAP, 123, 1, ""));
+ o.back()->insert(hobject_t("qwer", "", CEPH_NOSNAP, 456, 1, ""));
+}
+
+HitSet::Params::Params(const Params& o) noexcept
+{
+ if (o.get_type() != TYPE_NONE) {
+ create_impl(o.get_type());
+ // it's annoying to write virtual operator= methods; use encode/decode
+ // instead.
+ ceph::buffer::list bl;
+ o.impl->encode(bl);
+ auto p = bl.cbegin();
+ impl->decode(p);
+ } // else we don't need to do anything
+}
+
+const HitSet::Params& HitSet::Params::operator=(const Params& o)
+{
+ create_impl(o.get_type());
+ if (o.impl) {
+ // it's annoying to write virtual operator= methods; use encode/decode
+ // instead.
+ ceph::buffer::list bl;
+ o.impl->encode(bl);
+ auto p = bl.cbegin();
+ impl->decode(p);
+ }
+ return *this;
+}
+
+void HitSet::Params::encode(ceph::buffer::list &bl) const
+{
+ ENCODE_START(1, 1, bl);
+ if (impl) {
+ encode((__u8)impl->get_type(), bl);
+ impl->encode(bl);
+ } else {
+ encode((__u8)TYPE_NONE, bl);
+ }
+ ENCODE_FINISH(bl);
+}
+
+bool HitSet::Params::create_impl(impl_type_t type)
+{
+ switch ((impl_type_t)type) {
+ case TYPE_EXPLICIT_HASH:
+ impl.reset(new ExplicitHashHitSet::Params);
+ break;
+ case TYPE_EXPLICIT_OBJECT:
+ impl.reset(new ExplicitObjectHitSet::Params);
+ break;
+ case TYPE_BLOOM:
+ impl.reset(new BloomHitSet::Params);
+ break;
+ case TYPE_NONE:
+ impl.reset(NULL);
+ break;
+ default:
+ return false;
+ }
+ return true;
+}
+
+void HitSet::Params::decode(ceph::buffer::list::const_iterator& bl)
+{
+ DECODE_START(1, bl);
+ __u8 type;
+ decode(type, bl);
+ if (!create_impl((impl_type_t)type))
+ throw ceph::buffer::malformed_input("unrecognized HitMap type");
+ if (impl)
+ impl->decode(bl);
+ DECODE_FINISH(bl);
+}
+
+void HitSet::Params::dump(Formatter *f) const
+{
+ f->dump_string("type", HitSet::get_type_name(get_type()));
+ if (impl)
+ impl->dump(f);
+}
+
+void HitSet::Params::generate_test_instances(list<HitSet::Params*>& o)
+{
+#define loop_hitset_params(kind) \
+{ \
+ list<kind::Params*> params; \
+ kind::Params::generate_test_instances(params); \
+ for (list<kind::Params*>::iterator i = params.begin(); \
+ i != params.end(); ++i) \
+ o.push_back(new Params(*i)); \
+}
+ o.push_back(new Params);
+ o.push_back(new Params(new BloomHitSet::Params));
+ loop_hitset_params(BloomHitSet);
+ o.push_back(new Params(new ExplicitHashHitSet::Params));
+ loop_hitset_params(ExplicitHashHitSet);
+ o.push_back(new Params(new ExplicitObjectHitSet::Params));
+ loop_hitset_params(ExplicitObjectHitSet);
+}
+
+ostream& operator<<(ostream& out, const HitSet::Params& p) {
+ out << HitSet::get_type_name(p.get_type());
+  if (p.impl) {
+    out << "{";
+    p.impl->dump_stream(out);
+    out << "}";
+  }
+ return out;
+}
+
+
+void ExplicitHashHitSet::dump(Formatter *f) const {
+ f->dump_unsigned("insert_count", count);
+ f->open_array_section("hash_set");
+ for (ceph::unordered_set<uint32_t>::const_iterator p = hits.begin();
+ p != hits.end();
+ ++p)
+ f->dump_unsigned("hash", *p);
+ f->close_section();
+}
+
+void ExplicitObjectHitSet::dump(Formatter *f) const {
+ f->dump_unsigned("insert_count", count);
+ f->open_array_section("set");
+ for (ceph::unordered_set<hobject_t>::const_iterator p = hits.begin();
+ p != hits.end();
+ ++p) {
+ f->open_object_section("object");
+ p->dump(f);
+ f->close_section();
+ }
+ f->close_section();
+}
+
+void BloomHitSet::Params::dump(Formatter *f) const {
+ f->dump_float("false_positive_probability", get_fpp());
+ f->dump_int("target_size", target_size);
+ f->dump_int("seed", seed);
+}
+
+void BloomHitSet::dump(Formatter *f) const {
+ f->open_object_section("bloom_filter");
+ bloom.dump(f);
+ f->close_section();
+}
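
A short, hedged example of the HitSet wrapper in use, mirroring generate_test_instances() above; the fpp/size/seed values are arbitrary and the code assumes the Ceph tree:

#include "osd/HitSet.h"

void hitset_example()
{
  // 1% target false-positive rate, ~1000 expected inserts, seed 42 (all arbitrary)
  HitSet::Params params(new BloomHitSet::Params(0.01, 1000, 42));
  HitSet hs(params);

  hobject_t obj("foo", "", CEPH_NOSNAP, 123, 1, "");
  hs.insert(obj);
  ceph_assert(hs.contains(obj));   // bloom filters have no false negatives

  hs.seal();                       // bloom-backed sets compress toward ~50% bit density

  // round-trip through the versioned, type-tagged encoding
  ceph::buffer::list bl;
  hs.encode(bl);
  HitSet decoded;
  auto it = bl.cbegin();
  decoded.decode(it);
  ceph_assert(decoded.contains(obj));
}
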
diff --git a/src/osd/HitSet.h b/src/osd/HitSet.h
new file mode 100644
index 000000000..dedc45ed4
--- /dev/null
+++ b/src/osd/HitSet.h
@@ -0,0 +1,455 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Inktank <info@inktank.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_OSD_HITSET_H
+#define CEPH_OSD_HITSET_H
+
+#include <string_view>
+
+#include <boost/scoped_ptr.hpp>
+
+#include "include/encoding.h"
+#include "include/unordered_set.h"
+#include "common/bloom_filter.hpp"
+#include "common/hobject.h"
+
+/**
+ * generic container for a HitSet
+ *
+ * Encapsulate a HitSetImpl of any type. Expose a generic interface
+ * to users and wrap the encoded object with a type so that it can be
+ * safely decoded later.
+ */
+
+class HitSet {
+public:
+ typedef enum {
+ TYPE_NONE = 0,
+ TYPE_EXPLICIT_HASH = 1,
+ TYPE_EXPLICIT_OBJECT = 2,
+ TYPE_BLOOM = 3
+ } impl_type_t;
+
+ static std::string_view get_type_name(impl_type_t t) {
+ switch (t) {
+ case TYPE_NONE: return "none";
+ case TYPE_EXPLICIT_HASH: return "explicit_hash";
+ case TYPE_EXPLICIT_OBJECT: return "explicit_object";
+ case TYPE_BLOOM: return "bloom";
+ default: return "???";
+ }
+ }
+ std::string_view get_type_name() const {
+ if (impl)
+ return get_type_name(impl->get_type());
+ return get_type_name(TYPE_NONE);
+ }
+
+ /// abstract interface for a HitSet implementation
+ class Impl {
+ public:
+ virtual impl_type_t get_type() const = 0;
+ virtual bool is_full() const = 0;
+ virtual void insert(const hobject_t& o) = 0;
+ virtual bool contains(const hobject_t& o) const = 0;
+ virtual unsigned insert_count() const = 0;
+ virtual unsigned approx_unique_insert_count() const = 0;
+ virtual void encode(ceph::buffer::list &bl) const = 0;
+ virtual void decode(ceph::buffer::list::const_iterator& p) = 0;
+ virtual void dump(ceph::Formatter *f) const = 0;
+ virtual Impl* clone() const = 0;
+ virtual void seal() {}
+ virtual ~Impl() {}
+ };
+
+ boost::scoped_ptr<Impl> impl;
+ bool sealed;
+
+ class Params {
+ /// create an Impl* of the given type
+ bool create_impl(impl_type_t t);
+
+ public:
+ class Impl {
+ public:
+ virtual impl_type_t get_type() const = 0;
+ virtual HitSet::Impl *get_new_impl() const = 0;
+ virtual void encode(ceph::buffer::list &bl) const {}
+ virtual void decode(ceph::buffer::list::const_iterator& p) {}
+ virtual void dump(ceph::Formatter *f) const {}
+ virtual void dump_stream(std::ostream& o) const {}
+ virtual ~Impl() {}
+ };
+
+ Params() {}
+ explicit Params(Impl *i) : impl(i) {}
+ virtual ~Params() {}
+
+ boost::scoped_ptr<Params::Impl> impl;
+
+ impl_type_t get_type() const {
+ if (impl)
+ return impl->get_type();
+ return TYPE_NONE;
+ }
+
+ Params(const Params& o) noexcept;
+ const Params& operator=(const Params& o);
+
+ void encode(ceph::buffer::list &bl) const;
+ void decode(ceph::buffer::list::const_iterator& bl);
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<HitSet::Params*>& o);
+
+ friend std::ostream& operator<<(std::ostream& out, const HitSet::Params& p);
+ };
+
+ HitSet() : impl(NULL), sealed(false) {}
+ explicit HitSet(Impl *i) : impl(i), sealed(false) {}
+ explicit HitSet(const HitSet::Params& params);
+
+ HitSet(const HitSet& o) {
+ sealed = o.sealed;
+ if (o.impl)
+ impl.reset(o.impl->clone());
+ else
+ impl.reset(NULL);
+ }
+ const HitSet& operator=(const HitSet& o) {
+ sealed = o.sealed;
+ if (o.impl)
+ impl.reset(o.impl->clone());
+ else
+ impl.reset(NULL);
+ return *this;
+ }
+
+
+ bool is_full() const {
+ return impl->is_full();
+ }
+ /// insert a hash into the set
+ void insert(const hobject_t& o) {
+ impl->insert(o);
+ }
+ /// query whether a hash is in the set
+ bool contains(const hobject_t& o) const {
+ return impl->contains(o);
+ }
+
+ unsigned insert_count() const {
+ return impl->insert_count();
+ }
+ unsigned approx_unique_insert_count() const {
+ return impl->approx_unique_insert_count();
+ }
+ void seal() {
+ ceph_assert(!sealed);
+ sealed = true;
+ impl->seal();
+ }
+
+ void encode(ceph::buffer::list &bl) const;
+ void decode(ceph::buffer::list::const_iterator& bl);
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<HitSet*>& o);
+
+private:
+ void reset_to_type(impl_type_t type);
+};
+WRITE_CLASS_ENCODER(HitSet)
+WRITE_CLASS_ENCODER(HitSet::Params)
+
+typedef boost::shared_ptr<HitSet> HitSetRef;
+
+std::ostream& operator<<(std::ostream& out, const HitSet::Params& p);
+
+/**
+ * explicitly enumerate hash hits in the set
+ */
+class ExplicitHashHitSet : public HitSet::Impl {
+ uint64_t count;
+ ceph::unordered_set<uint32_t> hits;
+public:
+ class Params : public HitSet::Params::Impl {
+ public:
+ HitSet::impl_type_t get_type() const override {
+ return HitSet::TYPE_EXPLICIT_HASH;
+ }
+ HitSet::Impl *get_new_impl() const override {
+ return new ExplicitHashHitSet;
+ }
+ static void generate_test_instances(std::list<Params*>& o) {
+ o.push_back(new Params);
+ }
+ };
+
+ ExplicitHashHitSet() : count(0) {}
+ explicit ExplicitHashHitSet(const ExplicitHashHitSet::Params *p) : count(0) {}
+ ExplicitHashHitSet(const ExplicitHashHitSet &o) : count(o.count),
+ hits(o.hits) {}
+
+ HitSet::Impl *clone() const override {
+ return new ExplicitHashHitSet(*this);
+ }
+
+ HitSet::impl_type_t get_type() const override {
+ return HitSet::TYPE_EXPLICIT_HASH;
+ }
+ bool is_full() const override {
+ return false;
+ }
+ void insert(const hobject_t& o) override {
+ hits.insert(o.get_hash());
+ ++count;
+ }
+ bool contains(const hobject_t& o) const override {
+ return hits.count(o.get_hash());
+ }
+ unsigned insert_count() const override {
+ return count;
+ }
+ unsigned approx_unique_insert_count() const override {
+ return hits.size();
+ }
+ void encode(ceph::buffer::list &bl) const override {
+ ENCODE_START(1, 1, bl);
+ encode(count, bl);
+ encode(hits, bl);
+ ENCODE_FINISH(bl);
+ }
+ void decode(ceph::buffer::list::const_iterator &bl) override {
+ DECODE_START(1, bl);
+ decode(count, bl);
+ decode(hits, bl);
+ DECODE_FINISH(bl);
+ }
+ void dump(ceph::Formatter *f) const override;
+ static void generate_test_instances(std::list<ExplicitHashHitSet*>& o) {
+ o.push_back(new ExplicitHashHitSet);
+ o.push_back(new ExplicitHashHitSet);
+ o.back()->insert(hobject_t());
+ o.back()->insert(hobject_t("asdf", "", CEPH_NOSNAP, 123, 1, ""));
+ o.back()->insert(hobject_t("qwer", "", CEPH_NOSNAP, 456, 1, ""));
+ }
+};
+WRITE_CLASS_ENCODER(ExplicitHashHitSet)
+
+/**
+ * explicitly enumerate objects in the set
+ */
+class ExplicitObjectHitSet : public HitSet::Impl {
+ uint64_t count;
+ ceph::unordered_set<hobject_t> hits;
+public:
+ class Params : public HitSet::Params::Impl {
+ public:
+ HitSet::impl_type_t get_type() const override {
+ return HitSet::TYPE_EXPLICIT_OBJECT;
+ }
+ HitSet::Impl *get_new_impl() const override {
+ return new ExplicitObjectHitSet;
+ }
+ static void generate_test_instances(std::list<Params*>& o) {
+ o.push_back(new Params);
+ }
+ };
+
+ ExplicitObjectHitSet() : count(0) {}
+ explicit ExplicitObjectHitSet(const ExplicitObjectHitSet::Params *p) : count(0) {}
+ ExplicitObjectHitSet(const ExplicitObjectHitSet &o) : count(o.count),
+ hits(o.hits) {}
+
+ HitSet::Impl *clone() const override {
+ return new ExplicitObjectHitSet(*this);
+ }
+
+ HitSet::impl_type_t get_type() const override {
+ return HitSet::TYPE_EXPLICIT_OBJECT;
+ }
+ bool is_full() const override {
+ return false;
+ }
+ void insert(const hobject_t& o) override {
+ hits.insert(o);
+ ++count;
+ }
+ bool contains(const hobject_t& o) const override {
+ return hits.count(o);
+ }
+ unsigned insert_count() const override {
+ return count;
+ }
+ unsigned approx_unique_insert_count() const override {
+ return hits.size();
+ }
+ void encode(ceph::buffer::list &bl) const override {
+ ENCODE_START(1, 1, bl);
+ encode(count, bl);
+ encode(hits, bl);
+ ENCODE_FINISH(bl);
+ }
+ void decode(ceph::buffer::list::const_iterator& bl) override {
+ DECODE_START(1, bl);
+ decode(count, bl);
+ decode(hits, bl);
+ DECODE_FINISH(bl);
+ }
+ void dump(ceph::Formatter *f) const override;
+ static void generate_test_instances(std::list<ExplicitObjectHitSet*>& o) {
+ o.push_back(new ExplicitObjectHitSet);
+ o.push_back(new ExplicitObjectHitSet);
+ o.back()->insert(hobject_t());
+ o.back()->insert(hobject_t("asdf", "", CEPH_NOSNAP, 123, 1, ""));
+ o.back()->insert(hobject_t("qwer", "", CEPH_NOSNAP, 456, 1, ""));
+ }
+};
+WRITE_CLASS_ENCODER(ExplicitObjectHitSet)
+
+/**
+ * use a bloom_filter to track hits to the set
+ */
+class BloomHitSet : public HitSet::Impl {
+ compressible_bloom_filter bloom;
+
+public:
+ HitSet::impl_type_t get_type() const override {
+ return HitSet::TYPE_BLOOM;
+ }
+
+ class Params : public HitSet::Params::Impl {
+ public:
+ HitSet::impl_type_t get_type() const override {
+ return HitSet::TYPE_BLOOM;
+ }
+ HitSet::Impl *get_new_impl() const override {
+ return new BloomHitSet;
+ }
+
+ uint32_t fpp_micro; ///< false positive probability / 1M
+    uint64_t target_size; ///< expected number of unique insertions into this HitSet
+ uint64_t seed; ///< seed to use when initializing the bloom filter
+
+ Params()
+ : fpp_micro(0), target_size(0), seed(0) {}
+ Params(double fpp, uint64_t t, uint64_t s)
+ : fpp_micro(fpp * 1000000.0), target_size(t), seed(s) {}
+ Params(const Params &o)
+ : fpp_micro(o.fpp_micro),
+ target_size(o.target_size),
+ seed(o.seed) {}
+ ~Params() override {}
+
+ double get_fpp() const {
+ return (double)fpp_micro / 1000000.0;
+ }
+ void set_fpp(double f) {
+ fpp_micro = (unsigned)(llrintl(f * 1000000.0));
+ }
+
+ void encode(ceph::buffer::list& bl) const override {
+ ENCODE_START(1, 1, bl);
+ encode(fpp_micro, bl);
+ encode(target_size, bl);
+ encode(seed, bl);
+ ENCODE_FINISH(bl);
+ }
+ void decode(ceph::buffer::list::const_iterator& bl) override {
+ DECODE_START(1, bl);
+ decode(fpp_micro, bl);
+ decode(target_size, bl);
+ decode(seed, bl);
+ DECODE_FINISH(bl);
+ }
+ void dump(ceph::Formatter *f) const override;
+ void dump_stream(std::ostream& o) const override {
+ o << "false_positive_probability: "
+ << get_fpp() << ", target_size: " << target_size
+ << ", seed: " << seed;
+ }
+ static void generate_test_instances(std::list<Params*>& o) {
+ o.push_back(new Params);
+ o.push_back(new Params);
+ (*o.rbegin())->fpp_micro = 123456;
+ (*o.rbegin())->target_size = 300;
+ (*o.rbegin())->seed = 99;
+ }
+ };
+
+ BloomHitSet() {}
+ BloomHitSet(unsigned inserts, double fpp, int seed)
+ : bloom(inserts, fpp, seed)
+ {}
+ explicit BloomHitSet(const BloomHitSet::Params *p) : bloom(p->target_size,
+ p->get_fpp(),
+ p->seed)
+ {}
+
+ BloomHitSet(const BloomHitSet &o) {
+    // no simple way to copy the bloom filter; round-trip through encode/decode
+ ceph::buffer::list bl;
+ o.encode(bl);
+ auto bli = std::cbegin(bl);
+ this->decode(bli);
+ }
+
+ HitSet::Impl *clone() const override {
+ return new BloomHitSet(*this);
+ }
+
+ bool is_full() const override {
+ return bloom.is_full();
+ }
+
+ void insert(const hobject_t& o) override {
+ bloom.insert(o.get_hash());
+ }
+ bool contains(const hobject_t& o) const override {
+ return bloom.contains(o.get_hash());
+ }
+ unsigned insert_count() const override {
+ return bloom.element_count();
+ }
+ unsigned approx_unique_insert_count() const override {
+ return bloom.approx_unique_element_count();
+ }
+ void seal() override {
+ // aim for a density of .5 (50% of bit set)
+ double pc = bloom.density() * 2.0;
+ if (pc < 1.0)
+ bloom.compress(pc);
+ }
+
+ void encode(ceph::buffer::list &bl) const override {
+ ENCODE_START(1, 1, bl);
+ encode(bloom, bl);
+ ENCODE_FINISH(bl);
+ }
+ void decode(ceph::buffer::list::const_iterator& bl) override {
+ DECODE_START(1, bl);
+ decode(bloom, bl);
+ DECODE_FINISH(bl);
+ }
+ void dump(ceph::Formatter *f) const override;
+ static void generate_test_instances(std::list<BloomHitSet*>& o) {
+ o.push_back(new BloomHitSet);
+ o.push_back(new BloomHitSet(10, .1, 1));
+ o.back()->insert(hobject_t());
+ o.back()->insert(hobject_t("asdf", "", CEPH_NOSNAP, 123, 1, ""));
+ o.back()->insert(hobject_t("qwer", "", CEPH_NOSNAP, 456, 1, ""));
+ }
+};
+WRITE_CLASS_ENCODER(BloomHitSet)
+
+#endif
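
The Params wrapper above deliberately clones its type-erased impl by encoding and re-decoding it rather than requiring a virtual operator= on every implementation. A minimal sketch of that behaviour (assumes the Ceph tree):

#include <iostream>
#include "osd/HitSet.h"

void params_copy_example()
{
  HitSet::Params a(new BloomHitSet::Params(0.05, 5000, 7));
  HitSet::Params b(a);   // copy ctor: encode a.impl, decode into b.impl
  HitSet::Params c;
  c = a;                 // operator= uses the same encode/decode round trip

  // all three print the same type and parameters, e.g. "bloom{...}"
  std::cout << a << "\n" << b << "\n" << c << std::endl;
}
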
diff --git a/src/osd/MissingLoc.cc b/src/osd/MissingLoc.cc
new file mode 100644
index 000000000..d45220a82
--- /dev/null
+++ b/src/osd/MissingLoc.cc
@@ -0,0 +1,226 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "MissingLoc.h"
+
+#define dout_context cct
+#undef dout_prefix
+#define dout_prefix (gen_prefix(*_dout))
+#define dout_subsys ceph_subsys_osd
+
+using std::set;
+
+bool MissingLoc::readable_with_acting(
+ const hobject_t &hoid,
+ const set<pg_shard_t> &acting,
+ eversion_t* v) const {
+ if (!needs_recovery(hoid, v))
+ return true;
+ if (is_deleted(hoid))
+ return false;
+ auto missing_loc_entry = missing_loc.find(hoid);
+ if (missing_loc_entry == missing_loc.end())
+ return false;
+ const set<pg_shard_t> &locs = missing_loc_entry->second;
+ ldout(cct, 10) << __func__ << ": locs:" << locs << dendl;
+ set<pg_shard_t> have_acting;
+ for (auto i = locs.begin(); i != locs.end(); ++i) {
+ if (acting.count(*i))
+ have_acting.insert(*i);
+ }
+ return (*is_readable)(have_acting);
+}
+
+void MissingLoc::add_batch_sources_info(
+ const set<pg_shard_t> &sources,
+ HBHandle *handle)
+{
+ ldout(cct, 10) << __func__ << ": adding sources in batch "
+ << sources.size() << dendl;
+ unsigned loop = 0;
+ bool sources_updated = false;
+ for (auto i = needs_recovery_map.begin();
+ i != needs_recovery_map.end();
+ ++i) {
+ if (handle && ++loop >= cct->_conf->osd_loop_before_reset_tphandle) {
+ handle->reset_tp_timeout();
+ loop = 0;
+ }
+ if (i->second.is_delete())
+ continue;
+
+ auto p = missing_loc.find(i->first);
+ if (p == missing_loc.end()) {
+ p = missing_loc.emplace(i->first, set<pg_shard_t>()).first;
+ } else {
+ _dec_count(p->second);
+ }
+ missing_loc[i->first].insert(sources.begin(), sources.end());
+ _inc_count(p->second);
+
+ if (!sources_updated) {
+ missing_loc_sources.insert(sources.begin(), sources.end());
+ sources_updated = true;
+ }
+ }
+}
+
+bool MissingLoc::add_source_info(
+ pg_shard_t fromosd,
+ const pg_info_t &oinfo,
+ const pg_missing_t &omissing,
+ HBHandle *handle)
+{
+ bool found_missing = false;
+ unsigned loop = 0;
+ bool sources_updated = false;
+ // found items?
+ for (auto p = needs_recovery_map.begin();
+ p != needs_recovery_map.end();
+ ++p) {
+ const hobject_t &soid(p->first);
+ eversion_t need = p->second.need;
+ if (handle && ++loop >= cct->_conf->osd_loop_before_reset_tphandle) {
+ handle->reset_tp_timeout();
+ loop = 0;
+ }
+ if (p->second.is_delete()) {
+ ldout(cct, 10) << __func__ << " " << soid
+ << " delete, ignoring source" << dendl;
+ continue;
+ }
+ if (oinfo.last_update < need) {
+ ldout(cct, 10) << "search_for_missing " << soid << " " << need
+ << " also missing on osd." << fromosd
+ << " (last_update " << oinfo.last_update
+ << " < needed " << need << ")" << dendl;
+ continue;
+ }
+ if (p->first >= oinfo.last_backfill) {
+ // FIXME: this is _probably_ true, although it could conceivably
+ // be in the undefined region! Hmm!
+ ldout(cct, 10) << "search_for_missing " << soid << " " << need
+ << " also missing on osd." << fromosd
+ << " (past last_backfill " << oinfo.last_backfill
+ << ")" << dendl;
+ continue;
+ }
+ if (omissing.is_missing(soid)) {
+ ldout(cct, 10) << "search_for_missing " << soid << " " << need
+ << " also missing on osd." << fromosd << dendl;
+ continue;
+ }
+
+ ldout(cct, 10) << "search_for_missing " << soid << " " << need
+ << " is on osd." << fromosd << dendl;
+
+ {
+ auto p = missing_loc.find(soid);
+ if (p == missing_loc.end()) {
+ p = missing_loc.emplace(soid, set<pg_shard_t>()).first;
+ } else {
+ _dec_count(p->second);
+ }
+ p->second.insert(fromosd);
+ _inc_count(p->second);
+ }
+
+ if (!sources_updated) {
+ missing_loc_sources.insert(fromosd);
+ sources_updated = true;
+ }
+ found_missing = true;
+ }
+
+ ldout(cct, 20) << "needs_recovery_map missing " << needs_recovery_map
+ << dendl;
+ return found_missing;
+}
+
+void MissingLoc::check_recovery_sources(const OSDMapRef& osdmap)
+{
+ set<pg_shard_t> now_down;
+ for (auto p = missing_loc_sources.begin();
+ p != missing_loc_sources.end();
+ ) {
+ if (osdmap->is_up(p->osd)) {
+ ++p;
+ continue;
+ }
+ ldout(cct, 10) << __func__ << " source osd." << *p << " now down" << dendl;
+ now_down.insert(*p);
+ missing_loc_sources.erase(p++);
+ }
+
+ if (now_down.empty()) {
+ ldout(cct, 10) << __func__ << " no source osds (" << missing_loc_sources << ") went down" << dendl;
+ } else {
+ ldout(cct, 10) << __func__ << " sources osds " << now_down << " now down, remaining sources are "
+ << missing_loc_sources << dendl;
+
+ // filter missing_loc
+ auto p = missing_loc.begin();
+ while (p != missing_loc.end()) {
+ auto q = p->second.begin();
+ bool changed = false;
+ while (q != p->second.end()) {
+ if (now_down.count(*q)) {
+ if (!changed) {
+ changed = true;
+ _dec_count(p->second);
+ }
+ p->second.erase(q++);
+ } else {
+ ++q;
+ }
+ }
+ if (p->second.empty()) {
+ missing_loc.erase(p++);
+ } else {
+ if (changed) {
+ _inc_count(p->second);
+ }
+ ++p;
+ }
+ }
+ }
+}
+
+void MissingLoc::remove_stray_recovery_sources(pg_shard_t stray)
+{
+ ldout(cct, 10) << __func__ << " remove osd " << stray << " from missing_loc" << dendl;
+ // filter missing_loc
+ auto p = missing_loc.begin();
+ while (p != missing_loc.end()) {
+ auto q = p->second.begin();
+ bool changed = false;
+ while (q != p->second.end()) {
+ if (*q == stray) {
+ if (!changed) {
+ changed = true;
+ _dec_count(p->second);
+ }
+ p->second.erase(q++);
+ } else {
+ ++q;
+ }
+ }
+ if (p->second.empty()) {
+ missing_loc.erase(p++);
+ } else {
+ if (changed) {
+ _inc_count(p->second);
+ }
+ ++p;
+ }
+ }
+ // filter missing_loc_sources
+ for (auto p = missing_loc_sources.begin(); p != missing_loc_sources.end();) {
+ if (*p != stray) {
+ ++p;
+ continue;
+ }
+ ldout(cct, 10) << __func__ << " remove osd" << stray << " from missing_loc_sources" << dendl;
+ missing_loc_sources.erase(p++);
+ }
+}
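
Every mutation in the functions above is bracketed by _dec_count()/_inc_count() so that missing_by_count keeps one bucket per (up, other) location count; the ordering of those buckets is defined by loc_count_t in MissingLoc.h below. A tiny sketch of that ordering (illustrative only, assumes the Ceph tree):

#include "osd/MissingLoc.h"

void loc_count_ordering_example()
{
  MissingLoc::loc_count_t none_up{0, 1};    // one known copy, not in the up set
  MissingLoc::loc_count_t one_up{1, 0};     // one known copy, in the up set
  MissingLoc::loc_count_t one_up_two{1, 2}; // one up copy plus two others

  ceph_assert(none_up < one_up);      // fewer up copies sorts first
  ceph_assert(one_up < one_up_two);   // ties on 'up' break on 'other'
}
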
diff --git a/src/osd/MissingLoc.h b/src/osd/MissingLoc.h
new file mode 100644
index 000000000..9bce3ceda
--- /dev/null
+++ b/src/osd/MissingLoc.h
@@ -0,0 +1,353 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <map>
+#include <set>
+
+#include "OSDMap.h"
+#include "common/HBHandle.h"
+#include "common/ceph_context.h"
+#include "common/dout.h"
+#include "osd_types.h"
+
+class MissingLoc {
+ public:
+
+ class MappingInfo {
+ public:
+ virtual const std::set<pg_shard_t> &get_upset() const = 0;
+ virtual bool is_ec_pg() const = 0;
+ virtual int get_pg_size() const = 0;
+ virtual ~MappingInfo() {}
+ };
+
+ // a loc_count indicates how many locations we know in each of
+ // these distinct sets
+ struct loc_count_t {
+    int up = 0;       ///< locations in the up set
+    int other = 0;    ///< locations outside the up set
+
+ friend bool operator<(const loc_count_t& l,
+ const loc_count_t& r) {
+ return (l.up < r.up ||
+ (l.up == r.up &&
+ (l.other < r.other)));
+ }
+ friend std::ostream& operator<<(std::ostream& out, const loc_count_t& l) {
+ ceph_assert(l.up >= 0);
+ ceph_assert(l.other >= 0);
+ return out << "(" << l.up << "+" << l.other << ")";
+ }
+ };
+
+
+ using missing_by_count_t = std::map<shard_id_t, std::map<loc_count_t,int>>;
+ private:
+ loc_count_t _get_count(const std::set<pg_shard_t> &shards) {
+ loc_count_t r;
+ for (auto s : shards) {
+ if (mapping_info->get_upset().count(s)) {
+ r.up++;
+ } else {
+ r.other++;
+ }
+ }
+ return r;
+ }
+
+ std::map<hobject_t, pg_missing_item> needs_recovery_map;
+ std::map<hobject_t, std::set<pg_shard_t> > missing_loc;
+ std::set<pg_shard_t> missing_loc_sources;
+
+ // for every entry in missing_loc, we count how many of each type of shard we have,
+ // and maintain totals here. The sum of the values for this std::map will always equal
+ // missing_loc.size().
+ missing_by_count_t missing_by_count;
+
+ void pgs_by_shard_id(
+ const std::set<pg_shard_t>& s,
+ std::map<shard_id_t, std::set<pg_shard_t> >& pgsbs) {
+ if (mapping_info->is_ec_pg()) {
+ int num_shards = mapping_info->get_pg_size();
+ // For completely missing shards initialize with empty std::set<pg_shard_t>
+ for (int i = 0 ; i < num_shards ; ++i) {
+ shard_id_t shard(i);
+ pgsbs[shard];
+ }
+ for (auto pgs: s)
+ pgsbs[pgs.shard].insert(pgs);
+ } else {
+ pgsbs[shard_id_t::NO_SHARD] = s;
+ }
+ }
+
+ void _inc_count(const std::set<pg_shard_t>& s) {
+ std::map< shard_id_t, std::set<pg_shard_t> > pgsbs;
+ pgs_by_shard_id(s, pgsbs);
+ for (auto shard: pgsbs)
+ ++missing_by_count[shard.first][_get_count(shard.second)];
+ }
+ void _dec_count(const std::set<pg_shard_t>& s) {
+ std::map< shard_id_t, std::set<pg_shard_t> > pgsbs;
+ pgs_by_shard_id(s, pgsbs);
+ for (auto shard: pgsbs) {
+ auto p = missing_by_count[shard.first].find(_get_count(shard.second));
+ ceph_assert(p != missing_by_count[shard.first].end());
+ if (--p->second == 0) {
+ missing_by_count[shard.first].erase(p);
+ }
+ }
+ }
+
+ spg_t pgid;
+ MappingInfo *mapping_info;
+ DoutPrefixProvider *dpp;
+ CephContext *cct;
+ std::set<pg_shard_t> empty_set;
+ public:
+ boost::scoped_ptr<IsPGReadablePredicate> is_readable;
+ boost::scoped_ptr<IsPGRecoverablePredicate> is_recoverable;
+ explicit MissingLoc(
+ spg_t pgid,
+ MappingInfo *mapping_info,
+ DoutPrefixProvider *dpp,
+ CephContext *cct)
+ : pgid(pgid), mapping_info(mapping_info), dpp(dpp), cct(cct) { }
+ void set_backend_predicates(
+ IsPGReadablePredicate *_is_readable,
+ IsPGRecoverablePredicate *_is_recoverable) {
+ is_readable.reset(_is_readable);
+ is_recoverable.reset(_is_recoverable);
+ }
+ const IsPGRecoverablePredicate &get_recoverable_predicate() const {
+ return *is_recoverable;
+ }
+ std::ostream& gen_prefix(std::ostream& out) const {
+ return dpp->gen_prefix(out);
+ }
+ bool needs_recovery(
+ const hobject_t &hoid,
+ eversion_t *v = 0) const {
+ std::map<hobject_t, pg_missing_item>::const_iterator i =
+ needs_recovery_map.find(hoid);
+ if (i == needs_recovery_map.end())
+ return false;
+ if (v)
+ *v = i->second.need;
+ return true;
+ }
+ bool is_deleted(const hobject_t &hoid) const {
+ auto i = needs_recovery_map.find(hoid);
+ if (i == needs_recovery_map.end())
+ return false;
+ return i->second.is_delete();
+ }
+ bool is_unfound(const hobject_t &hoid) const {
+ auto it = needs_recovery_map.find(hoid);
+ if (it == needs_recovery_map.end()) {
+ return false;
+ }
+ if (it->second.is_delete()) {
+ return false;
+ }
+ auto mit = missing_loc.find(hoid);
+ return mit == missing_loc.end() || !(*is_recoverable)(mit->second);
+ }
+ bool readable_with_acting(
+ const hobject_t &hoid,
+ const std::set<pg_shard_t> &acting,
+ eversion_t* v = 0) const;
+ uint64_t num_unfound() const {
+ uint64_t ret = 0;
+ for (std::map<hobject_t, pg_missing_item>::const_iterator i =
+ needs_recovery_map.begin();
+ i != needs_recovery_map.end();
+ ++i) {
+ if (i->second.is_delete())
+ continue;
+ auto mi = missing_loc.find(i->first);
+ if (mi == missing_loc.end() || !(*is_recoverable)(mi->second))
+ ++ret;
+ }
+ return ret;
+ }
+
+ bool have_unfound() const {
+ for (std::map<hobject_t, pg_missing_item>::const_iterator i =
+ needs_recovery_map.begin();
+ i != needs_recovery_map.end();
+ ++i) {
+ if (i->second.is_delete())
+ continue;
+ auto mi = missing_loc.find(i->first);
+ if (mi == missing_loc.end() || !(*is_recoverable)(mi->second))
+ return true;
+ }
+ return false;
+ }
+ void clear() {
+ needs_recovery_map.clear();
+ missing_loc.clear();
+ missing_loc_sources.clear();
+ missing_by_count.clear();
+ }
+
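+  /// Record that hoid has a copy at location (keeps missing_by_count in sync)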
+ void add_location(const hobject_t &hoid, pg_shard_t location) {
+ auto p = missing_loc.find(hoid);
+ if (p == missing_loc.end()) {
+ p = missing_loc.emplace(hoid, std::set<pg_shard_t>()).first;
+ } else {
+ _dec_count(p->second);
+ }
+ p->second.insert(location);
+ _inc_count(p->second);
+ }
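+  /// Forget that hoid has a copy at location (keeps missing_by_count in sync)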
+ void remove_location(const hobject_t &hoid, pg_shard_t location) {
+ auto p = missing_loc.find(hoid);
+ if (p != missing_loc.end()) {
+ _dec_count(p->second);
+ p->second.erase(location);
+ if (p->second.empty()) {
+ missing_loc.erase(p);
+ } else {
+ _inc_count(p->second);
+ }
+ }
+ }
+
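+  /// Forget all known locations for hoid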
+ void clear_location(const hobject_t &hoid) {
+ auto p = missing_loc.find(hoid);
+ if (p != missing_loc.end()) {
+ _dec_count(p->second);
+ missing_loc.erase(p);
+ }
+ }
+
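+  /// Merge a pg_missing_t's items into needs_recovery_map; an already-known
+  /// item must agree on the needed version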
+ void add_active_missing(const pg_missing_t &missing) {
+ for (std::map<hobject_t, pg_missing_item>::const_iterator i =
+ missing.get_items().begin();
+ i != missing.get_items().end();
+ ++i) {
+ std::map<hobject_t, pg_missing_item>::const_iterator j =
+ needs_recovery_map.find(i->first);
+ if (j == needs_recovery_map.end()) {
+ needs_recovery_map.insert(*i);
+ } else {
+ if (i->second.need != j->second.need) {
+ lgeneric_dout(cct, 0) << this << " " << pgid << " unexpected need for "
+ << i->first << " have " << j->second
+ << " tried to add " << i->second << dendl;
+ ceph_assert(0 == "unexpected need for missing item");
+ }
+ }
+ }
+ }
+
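+  /// Note that hoid is missing: version `need` is required, `have` is on disk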
+ void add_missing(const hobject_t &hoid, eversion_t need, eversion_t have, bool is_delete=false) {
+ needs_recovery_map[hoid] = pg_missing_item(need, have, is_delete);
+ }
+ void revise_need(const hobject_t &hoid, eversion_t need) {
+ auto it = needs_recovery_map.find(hoid);
+ ceph_assert(it != needs_recovery_map.end());
+ it->second.need = need;
+ }
+
+ /// Adds info about a possible recovery source
+ bool add_source_info(
+ pg_shard_t source, ///< [in] source
+ const pg_info_t &oinfo, ///< [in] info
+ const pg_missing_t &omissing, ///< [in] (optional) missing
+ HBHandle *handle ///< [in] ThreadPool handle
+ ); ///< @return whether a new object location was discovered
+
+ /// Adds recovery sources in batch
+ void add_batch_sources_info(
+ const std::set<pg_shard_t> &sources, ///< [in] a std::set of resources which can be used for all objects
+ HBHandle *handle ///< [in] ThreadPool handle
+ );
+
+ /// Uses osdmap to update structures for now down sources
+ void check_recovery_sources(const OSDMapRef& osdmap);
+
+ /// Remove stray from recovery sources
+ void remove_stray_recovery_sources(pg_shard_t stray);
+
+ /// Call when hoid is no longer missing in acting std::set
+ void recovered(const hobject_t &hoid) {
+ needs_recovery_map.erase(hoid);
+ auto p = missing_loc.find(hoid);
+ if (p != missing_loc.end()) {
+ _dec_count(p->second);
+ missing_loc.erase(p);
+ }
+ }
+
+ /// Call to update structures for hoid after a change
+ void rebuild(
+ const hobject_t &hoid,
+ pg_shard_t self,
+ const std::set<pg_shard_t> &to_recover,
+ const pg_info_t &info,
+ const pg_missing_t &missing,
+ const std::map<pg_shard_t, pg_missing_t> &pmissing,
+ const std::map<pg_shard_t, pg_info_t> &pinfo) {
+ recovered(hoid);
+ std::optional<pg_missing_item> item;
+ auto miter = missing.get_items().find(hoid);
+ if (miter != missing.get_items().end()) {
+ item = miter->second;
+ } else {
+ for (auto &&i: to_recover) {
+ if (i == self)
+ continue;
+ auto pmiter = pmissing.find(i);
+ ceph_assert(pmiter != pmissing.end());
+ miter = pmiter->second.get_items().find(hoid);
+ if (miter != pmiter->second.get_items().end()) {
+ item = miter->second;
+ break;
+ }
+ }
+ }
+ if (!item)
+ return; // recovered!
+
+ needs_recovery_map[hoid] = *item;
+ if (item->is_delete())
+ return;
+ auto mliter =
+ missing_loc.emplace(hoid, std::set<pg_shard_t>()).first;
+ ceph_assert(info.last_backfill.is_max());
+ ceph_assert(info.last_update >= item->need);
+ if (!missing.is_missing(hoid))
+ mliter->second.insert(self);
+ for (auto &&i: pmissing) {
+ if (i.first == self)
+ continue;
+ auto pinfoiter = pinfo.find(i.first);
+ ceph_assert(pinfoiter != pinfo.end());
+ if (item->need <= pinfoiter->second.last_update &&
+ hoid <= pinfoiter->second.last_backfill &&
+ !i.second.is_missing(hoid))
+ mliter->second.insert(i.first);
+ }
+ _inc_count(mliter->second);
+ }
+
+ const std::set<pg_shard_t> &get_locations(const hobject_t &hoid) const {
+ auto it = missing_loc.find(hoid);
+ return it == missing_loc.end() ? empty_set : it->second;
+ }
+ const std::map<hobject_t, std::set<pg_shard_t>> &get_missing_locs() const {
+ return missing_loc;
+ }
+ const std::map<hobject_t, pg_missing_item> &get_needs_recovery() const {
+ return needs_recovery_map;
+ }
+
+ const missing_by_count_t &get_missing_by_count() const {
+ return missing_by_count;
+ }
+};
diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc
new file mode 100644
index 000000000..4066a679f
--- /dev/null
+++ b/src/osd/OSD.cc
@@ -0,0 +1,11378 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ * Copyright (C) 2017 OVH
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "acconfig.h"
+
+#include <cctype>
+#include <fstream>
+#include <iostream>
+#include <iterator>
+
+#include <unistd.h>
+#include <sys/stat.h>
+#include <signal.h>
+#include <time.h>
+#include <boost/scoped_ptr.hpp>
+#include <boost/range/adaptor/reversed.hpp>
+
+#ifdef HAVE_SYS_PARAM_H
+#include <sys/param.h>
+#endif
+
+#ifdef HAVE_SYS_MOUNT_H
+#include <sys/mount.h>
+#endif
+
+#include "osd/PG.h"
+#include "osd/scrub_machine.h"
+#include "osd/pg_scrubber.h"
+
+#include "include/types.h"
+#include "include/compat.h"
+#include "include/random.h"
+
+#include "OSD.h"
+#include "OSDMap.h"
+#include "Watch.h"
+#include "osdc/Objecter.h"
+
+#include "common/errno.h"
+#include "common/ceph_argparse.h"
+#include "common/ceph_releases.h"
+#include "common/ceph_time.h"
+#include "common/version.h"
+#include "common/async/blocked_completion.h"
+#include "common/pick_address.h"
+#include "common/blkdev.h"
+#include "common/numa.h"
+
+#include "os/ObjectStore.h"
+#ifdef HAVE_LIBFUSE
+#include "os/FuseStore.h"
+#endif
+
+#include "PrimaryLogPG.h"
+
+#include "msg/Messenger.h"
+#include "msg/Message.h"
+
+#include "mon/MonClient.h"
+
+#include "messages/MLog.h"
+
+#include "messages/MGenericMessage.h"
+#include "messages/MOSDPing.h"
+#include "messages/MOSDFailure.h"
+#include "messages/MOSDMarkMeDown.h"
+#include "messages/MOSDMarkMeDead.h"
+#include "messages/MOSDFull.h"
+#include "messages/MOSDOp.h"
+#include "messages/MOSDOpReply.h"
+#include "messages/MOSDBackoff.h"
+#include "messages/MOSDBeacon.h"
+#include "messages/MOSDRepOp.h"
+#include "messages/MOSDRepOpReply.h"
+#include "messages/MOSDBoot.h"
+#include "messages/MOSDPGTemp.h"
+#include "messages/MOSDPGReadyToMerge.h"
+
+#include "messages/MOSDMap.h"
+#include "messages/MMonGetOSDMap.h"
+#include "messages/MOSDPGNotify.h"
+#include "messages/MOSDPGNotify2.h"
+#include "messages/MOSDPGQuery.h"
+#include "messages/MOSDPGQuery2.h"
+#include "messages/MOSDPGLog.h"
+#include "messages/MOSDPGRemove.h"
+#include "messages/MOSDPGInfo.h"
+#include "messages/MOSDPGInfo2.h"
+#include "messages/MOSDPGCreate.h"
+#include "messages/MOSDPGCreate2.h"
+#include "messages/MBackfillReserve.h"
+#include "messages/MRecoveryReserve.h"
+#include "messages/MOSDForceRecovery.h"
+#include "messages/MOSDECSubOpWrite.h"
+#include "messages/MOSDECSubOpWriteReply.h"
+#include "messages/MOSDECSubOpRead.h"
+#include "messages/MOSDECSubOpReadReply.h"
+#include "messages/MOSDPGCreated.h"
+#include "messages/MOSDPGUpdateLogMissing.h"
+#include "messages/MOSDPGUpdateLogMissingReply.h"
+
+#include "messages/MOSDPeeringOp.h"
+
+#include "messages/MOSDAlive.h"
+
+#include "messages/MOSDScrub.h"
+#include "messages/MOSDScrub2.h"
+#include "messages/MOSDRepScrub.h"
+
+#include "messages/MCommand.h"
+#include "messages/MCommandReply.h"
+
+#include "messages/MPGStats.h"
+
+#include "messages/MWatchNotify.h"
+#include "messages/MOSDPGPush.h"
+#include "messages/MOSDPGPushReply.h"
+#include "messages/MOSDPGPull.h"
+
+#include "messages/MMonGetPurgedSnaps.h"
+#include "messages/MMonGetPurgedSnapsReply.h"
+
+#include "common/perf_counters.h"
+#include "common/Timer.h"
+#include "common/LogClient.h"
+#include "common/AsyncReserver.h"
+#include "common/HeartbeatMap.h"
+#include "common/admin_socket.h"
+#include "common/ceph_context.h"
+
+#include "global/signal_handler.h"
+#include "global/pidfile.h"
+
+#include "include/color.h"
+#include "perfglue/cpu_profiler.h"
+#include "perfglue/heap_profiler.h"
+
+#include "osd/ClassHandler.h"
+#include "osd/OpRequest.h"
+
+#include "auth/AuthAuthorizeHandler.h"
+#include "auth/RotatingKeyRing.h"
+
+#include "objclass/objclass.h"
+
+#include "common/cmdparse.h"
+#include "include/str_list.h"
+#include "include/util.h"
+
+#include "include/ceph_assert.h"
+#include "common/config.h"
+#include "common/EventTrace.h"
+
+#include "json_spirit/json_spirit_reader.h"
+#include "json_spirit/json_spirit_writer.h"
+
+#ifdef WITH_LTTNG
+#define TRACEPOINT_DEFINE
+#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
+#include "tracing/osd.h"
+#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
+#undef TRACEPOINT_DEFINE
+#else
+#define tracepoint(...)
+#endif
+#ifdef HAVE_JAEGER
+#include "common/tracer.h"
+#endif
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_osd
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
+
+using std::deque;
+using std::list;
+using std::lock_guard;
+using std::make_pair;
+using std::make_tuple;
+using std::make_unique;
+using std::map;
+using std::ostream;
+using std::ostringstream;
+using std::pair;
+using std::set;
+using std::string;
+using std::stringstream;
+using std::to_string;
+using std::unique_ptr;
+using std::vector;
+
+using ceph::bufferlist;
+using ceph::bufferptr;
+using ceph::decode;
+using ceph::encode;
+using ceph::fixed_u_to_string;
+using ceph::Formatter;
+using ceph::heartbeat_handle_d;
+using ceph::make_mutex;
+
+using namespace ceph::osd::scheduler;
+using TOPNSPC::common::cmd_getval;
+
+static ostream& _prefix(std::ostream* _dout, int whoami, epoch_t epoch) {
+ return *_dout << "osd." << whoami << " " << epoch << " ";
+}
+
+// Initial features in a new superblock.
+// Features here are also automatically upgraded.

+CompatSet OSD::get_osd_initial_compat_set() {
+ CompatSet::FeatureSet ceph_osd_feature_compat;
+ CompatSet::FeatureSet ceph_osd_feature_ro_compat;
+ CompatSet::FeatureSet ceph_osd_feature_incompat;
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGINFO);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_OLOC);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEC);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BIGINFO);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HINTS);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGMETA);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_MISSING);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_FASTINFO);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER2);
+ return CompatSet(ceph_osd_feature_compat, ceph_osd_feature_ro_compat,
+ ceph_osd_feature_incompat);
+}
+
+// Features that this OSD supports are added here.
+CompatSet OSD::get_osd_compat_set() {
+ CompatSet compat = get_osd_initial_compat_set();
+  // Any features here can be set in code, but not in the initial superblock
+ compat.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
+ return compat;
+}
+
+OSDService::OSDService(OSD *osd, ceph::async::io_context_pool& poolctx) :
+ osd(osd),
+ cct(osd->cct),
+ whoami(osd->whoami), store(osd->store),
+ log_client(osd->log_client), clog(osd->clog),
+ pg_recovery_stats(osd->pg_recovery_stats),
+ cluster_messenger(osd->cluster_messenger),
+ client_messenger(osd->client_messenger),
+ logger(osd->logger),
+ recoverystate_perf(osd->recoverystate_perf),
+ monc(osd->monc),
+ osd_max_object_size(cct->_conf, "osd_max_object_size"),
+ osd_skip_data_digest(cct->_conf, "osd_skip_data_digest"),
+ publish_lock{ceph::make_mutex("OSDService::publish_lock")},
+ pre_publish_lock{ceph::make_mutex("OSDService::pre_publish_lock")},
+ max_oldest_map(0),
+ scrubs_local(0),
+ scrubs_remote(0),
+ agent_valid_iterator(false),
+ agent_ops(0),
+ flush_mode_high_count(0),
+ agent_active(true),
+ agent_thread(this),
+ agent_stop_flag(false),
+ agent_timer(osd->client_messenger->cct, agent_timer_lock),
+ last_recalibrate(ceph_clock_now()),
+ promote_max_objects(0),
+ promote_max_bytes(0),
+ poolctx(poolctx),
+ objecter(make_unique<Objecter>(osd->client_messenger->cct,
+ osd->objecter_messenger,
+ osd->monc, poolctx)),
+ m_objecter_finishers(cct->_conf->osd_objecter_finishers),
+ watch_timer(osd->client_messenger->cct, watch_lock),
+ next_notif_id(0),
+ recovery_request_timer(cct, recovery_request_lock, false),
+ sleep_timer(cct, sleep_lock, false),
+ reserver_finisher(cct),
+ local_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills,
+ cct->_conf->osd_min_recovery_priority),
+ remote_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills,
+ cct->_conf->osd_min_recovery_priority),
+ snap_reserver(cct, &reserver_finisher,
+ cct->_conf->osd_max_trimming_pgs),
+ recovery_ops_active(0),
+ recovery_ops_reserved(0),
+ recovery_paused(false),
+ map_cache(cct, cct->_conf->osd_map_cache_size),
+ map_bl_cache(cct->_conf->osd_map_cache_size),
+ map_bl_inc_cache(cct->_conf->osd_map_cache_size),
+ cur_state(NONE),
+ cur_ratio(0), physical_ratio(0),
+ boot_epoch(0), up_epoch(0), bind_epoch(0)
+{
+ objecter->init();
+
+ for (int i = 0; i < m_objecter_finishers; i++) {
+ ostringstream str;
+ str << "objecter-finisher-" << i;
+ auto fin = make_unique<Finisher>(osd->client_messenger->cct, str.str(), "finisher");
+ objecter_finishers.push_back(std::move(fin));
+ }
+}
+
+#ifdef PG_DEBUG_REFS
+void OSDService::add_pgid(spg_t pgid, PG *pg) {
+ std::lock_guard l(pgid_lock);
+ if (!pgid_tracker.count(pgid)) {
+ live_pgs[pgid] = pg;
+ }
+ pgid_tracker[pgid]++;
+}
+void OSDService::remove_pgid(spg_t pgid, PG *pg)
+{
+ std::lock_guard l(pgid_lock);
+ ceph_assert(pgid_tracker.count(pgid));
+ ceph_assert(pgid_tracker[pgid] > 0);
+ pgid_tracker[pgid]--;
+ if (pgid_tracker[pgid] == 0) {
+ pgid_tracker.erase(pgid);
+ live_pgs.erase(pgid);
+ }
+}
+void OSDService::dump_live_pgids()
+{
+ std::lock_guard l(pgid_lock);
+ derr << "live pgids:" << dendl;
+ for (map<spg_t, int>::const_iterator i = pgid_tracker.cbegin();
+ i != pgid_tracker.cend();
+ ++i) {
+ derr << "\t" << *i << dendl;
+ live_pgs[i->first]->dump_live_ids();
+ }
+}
+#endif
+
+
+ceph::signedspan OSDService::get_mnow()
+{
+ return ceph::mono_clock::now() - osd->startup_time;
+}
+
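+// Walk the pool's pg_num history between old_map's and new_map's epochs to
+// find every (pg, epoch) affected by a split or merge involving pgid. The BFS
+// over affected pgs also catches children/parents that do not exist locally
+// yet.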
+void OSDService::identify_splits_and_merges(
+ OSDMapRef old_map,
+ OSDMapRef new_map,
+ spg_t pgid,
+ set<pair<spg_t,epoch_t>> *split_children,
+ set<pair<spg_t,epoch_t>> *merge_pgs)
+{
+ if (!old_map->have_pg_pool(pgid.pool())) {
+ return;
+ }
+ int old_pgnum = old_map->get_pg_num(pgid.pool());
+ auto p = osd->pg_num_history.pg_nums.find(pgid.pool());
+ if (p == osd->pg_num_history.pg_nums.end()) {
+ return;
+ }
+ dout(20) << __func__ << " " << pgid << " e" << old_map->get_epoch()
+ << " to e" << new_map->get_epoch()
+ << " pg_nums " << p->second << dendl;
+ deque<spg_t> queue;
+ queue.push_back(pgid);
+ set<spg_t> did;
+ while (!queue.empty()) {
+ auto cur = queue.front();
+ queue.pop_front();
+ did.insert(cur);
+ unsigned pgnum = old_pgnum;
+ for (auto q = p->second.lower_bound(old_map->get_epoch());
+ q != p->second.end() &&
+ q->first <= new_map->get_epoch();
+ ++q) {
+ if (pgnum < q->second) {
+ // split?
+ if (cur.ps() < pgnum) {
+ set<spg_t> children;
+ if (cur.is_split(pgnum, q->second, &children)) {
+ dout(20) << __func__ << " " << cur << " e" << q->first
+ << " pg_num " << pgnum << " -> " << q->second
+ << " children " << children << dendl;
+ for (auto i : children) {
+ split_children->insert(make_pair(i, q->first));
+ if (!did.count(i))
+ queue.push_back(i);
+ }
+ }
+ } else if (cur.ps() < q->second) {
+ dout(20) << __func__ << " " << cur << " e" << q->first
+ << " pg_num " << pgnum << " -> " << q->second
+ << " is a child" << dendl;
+ // normally we'd capture this from the parent, but it's
+ // possible the parent doesn't exist yet (it will be
+ // fabricated to allow an intervening merge). note this PG
+ // as a split child here to be sure we catch it.
+ split_children->insert(make_pair(cur, q->first));
+ } else {
+ dout(20) << __func__ << " " << cur << " e" << q->first
+ << " pg_num " << pgnum << " -> " << q->second
+ << " is post-split, skipping" << dendl;
+ }
+ } else if (merge_pgs) {
+ // merge?
+ if (cur.ps() >= q->second) {
+ if (cur.ps() < pgnum) {
+ spg_t parent;
+ if (cur.is_merge_source(pgnum, q->second, &parent)) {
+ set<spg_t> children;
+ parent.is_split(q->second, pgnum, &children);
+ dout(20) << __func__ << " " << cur << " e" << q->first
+ << " pg_num " << pgnum << " -> " << q->second
+ << " is merge source, target " << parent
+ << ", source(s) " << children << dendl;
+ merge_pgs->insert(make_pair(parent, q->first));
+ if (!did.count(parent)) {
+ // queue (and re-scan) parent in case it might not exist yet
+ // and there are some future splits pending on it
+ queue.push_back(parent);
+ }
+ for (auto c : children) {
+ merge_pgs->insert(make_pair(c, q->first));
+ if (!did.count(c))
+ queue.push_back(c);
+ }
+ }
+ } else {
+ dout(20) << __func__ << " " << cur << " e" << q->first
+ << " pg_num " << pgnum << " -> " << q->second
+ << " is beyond old pgnum, skipping" << dendl;
+ }
+ } else {
+ set<spg_t> children;
+ if (cur.is_split(q->second, pgnum, &children)) {
+ dout(20) << __func__ << " " << cur << " e" << q->first
+ << " pg_num " << pgnum << " -> " << q->second
+ << " is merge target, source " << children << dendl;
+ for (auto c : children) {
+ merge_pgs->insert(make_pair(c, q->first));
+ if (!did.count(c))
+ queue.push_back(c);
+ }
+ merge_pgs->insert(make_pair(cur, q->first));
+ }
+ }
+ }
+ pgnum = q->second;
+ }
+ }
+}
+
+void OSDService::need_heartbeat_peer_update()
+{
+ osd->need_heartbeat_peer_update();
+}
+
+HeartbeatStampsRef OSDService::get_hb_stamps(unsigned peer)
+{
+ std::lock_guard l(hb_stamp_lock);
+ if (peer >= hb_stamps.size()) {
+ hb_stamps.resize(peer + 1);
+ }
+ if (!hb_stamps[peer]) {
+ hb_stamps[peer] = ceph::make_ref<HeartbeatStamps>(peer);
+ }
+ return hb_stamps[peer];
+}
+
+void OSDService::queue_renew_lease(epoch_t epoch, spg_t spgid)
+{
+ osd->enqueue_peering_evt(
+ spgid,
+ PGPeeringEventRef(
+ std::make_shared<PGPeeringEvent>(
+ epoch, epoch,
+ RenewLease())));
+}
+
+void OSDService::start_shutdown()
+{
+ {
+ std::lock_guard l(agent_timer_lock);
+ agent_timer.shutdown();
+ }
+
+ {
+ std::lock_guard l(sleep_lock);
+ sleep_timer.shutdown();
+ }
+
+ {
+ std::lock_guard l(recovery_request_lock);
+ recovery_request_timer.shutdown();
+ }
+}
+
+void OSDService::shutdown_reserver()
+{
+ reserver_finisher.wait_for_empty();
+ reserver_finisher.stop();
+}
+
+void OSDService::shutdown()
+{
+ mono_timer.suspend();
+
+ {
+ std::lock_guard l(watch_lock);
+ watch_timer.shutdown();
+ }
+
+ objecter->shutdown();
+ for (auto& f : objecter_finishers) {
+ f->wait_for_empty();
+ f->stop();
+ }
+
+ publish_map(OSDMapRef());
+ next_osdmap = OSDMapRef();
+}
+
+void OSDService::init()
+{
+ reserver_finisher.start();
+ for (auto& f : objecter_finishers) {
+ f->start();
+ }
+ objecter->set_client_incarnation(0);
+
+ // deprioritize objecter in daemonperf output
+ objecter->get_logger()->set_prio_adjust(-3);
+
+ watch_timer.init();
+ agent_timer.init();
+ mono_timer.resume();
+
+ agent_thread.create("osd_srv_agent");
+
+ if (cct->_conf->osd_recovery_delay_start)
+ defer_recovery(cct->_conf->osd_recovery_delay_start);
+}
+
+void OSDService::final_init()
+{
+ objecter->start(osdmap.get());
+}
+
+void OSDService::activate_map()
+{
+ // wake/unwake the tiering agent
+ std::lock_guard l{agent_lock};
+ agent_active =
+ !osdmap->test_flag(CEPH_OSDMAP_NOTIERAGENT) &&
+ osd->is_active();
+ agent_cond.notify_all();
+}
+
+void OSDService::request_osdmap_update(epoch_t e)
+{
+ osd->osdmap_subscribe(e, false);
+}
+
+
+class AgentTimeoutCB : public Context {
+ PGRef pg;
+public:
+ explicit AgentTimeoutCB(PGRef _pg) : pg(_pg) {}
+ void finish(int) override {
+ pg->agent_choose_mode_restart();
+ }
+};
+
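+// Tiering agent worker thread: repeatedly pick the highest level present in
+// agent_queue and give one of its PGs a chance to flush/evict, bounded by
+// osd_agent_max_ops (or osd_agent_max_low_ops while no PG is in high flush
+// mode).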
+void OSDService::agent_entry()
+{
+ dout(10) << __func__ << " start" << dendl;
+ std::unique_lock agent_locker{agent_lock};
+
+ while (!agent_stop_flag) {
+ if (agent_queue.empty()) {
+ dout(20) << __func__ << " empty queue" << dendl;
+ agent_cond.wait(agent_locker);
+ continue;
+ }
+ uint64_t level = agent_queue.rbegin()->first;
+ set<PGRef>& top = agent_queue.rbegin()->second;
+ dout(10) << __func__
+ << " tiers " << agent_queue.size()
+ << ", top is " << level
+ << " with pgs " << top.size()
+ << ", ops " << agent_ops << "/"
+ << cct->_conf->osd_agent_max_ops
+ << (agent_active ? " active" : " NOT ACTIVE")
+ << dendl;
+ dout(20) << __func__ << " oids " << agent_oids << dendl;
+ int max = cct->_conf->osd_agent_max_ops - agent_ops;
+ int agent_flush_quota = max;
+ if (!flush_mode_high_count)
+ agent_flush_quota = cct->_conf->osd_agent_max_low_ops - agent_ops;
+ if (agent_flush_quota <= 0 || top.empty() || !agent_active) {
+ agent_cond.wait(agent_locker);
+ continue;
+ }
+
+ if (!agent_valid_iterator || agent_queue_pos == top.end()) {
+ agent_queue_pos = top.begin();
+ agent_valid_iterator = true;
+ }
+ PGRef pg = *agent_queue_pos;
+ dout(10) << "high_count " << flush_mode_high_count
+ << " agent_ops " << agent_ops
+ << " flush_quota " << agent_flush_quota << dendl;
+ agent_locker.unlock();
+ if (!pg->agent_work(max, agent_flush_quota)) {
+ dout(10) << __func__ << " " << pg->pg_id
+ << " no agent_work, delay for " << cct->_conf->osd_agent_delay_time
+ << " seconds" << dendl;
+
+ logger->inc(l_osd_tier_delay);
+      // Queue a timer to call agent_choose_mode for this pg after
+      // osd_agent_delay_time seconds
+ std::lock_guard timer_locker{agent_timer_lock};
+ Context *cb = new AgentTimeoutCB(pg);
+ agent_timer.add_event_after(cct->_conf->osd_agent_delay_time, cb);
+ }
+ agent_locker.lock();
+ }
+ dout(10) << __func__ << " finish" << dendl;
+}
+
+void OSDService::agent_stop()
+{
+ {
+ std::lock_guard l(agent_lock);
+
+ // By this time all ops should be cancelled
+ ceph_assert(agent_ops == 0);
+ // By this time all PGs are shutdown and dequeued
+ if (!agent_queue.empty()) {
+ set<PGRef>& top = agent_queue.rbegin()->second;
+ derr << "agent queue not empty, for example " << (*top.begin())->get_pgid() << dendl;
+ ceph_abort_msg("agent queue not empty");
+ }
+
+ agent_stop_flag = true;
+ agent_cond.notify_all();
+ }
+ agent_thread.join();
+}
+
+// -------------------------------------
+
+void OSDService::promote_throttle_recalibrate()
+{
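+  // promote_probability_millis is a probability in thousandths (0..1000);
+  // recompute it from the promotion rate observed since the last
+  // recalibration so we track the configured object/byte targets.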
+ utime_t now = ceph_clock_now();
+ double dur = now - last_recalibrate;
+ last_recalibrate = now;
+ unsigned prob = promote_probability_millis;
+
+ uint64_t target_obj_sec = cct->_conf->osd_tier_promote_max_objects_sec;
+ uint64_t target_bytes_sec = cct->_conf->osd_tier_promote_max_bytes_sec;
+
+ unsigned min_prob = 1;
+
+ uint64_t attempts, obj, bytes;
+ promote_counter.sample_and_attenuate(&attempts, &obj, &bytes);
+ dout(10) << __func__ << " " << attempts << " attempts, promoted "
+ << obj << " objects and " << byte_u_t(bytes) << "; target "
+ << target_obj_sec << " obj/sec or "
+ << byte_u_t(target_bytes_sec) << "/sec"
+ << dendl;
+
+ // calculate what the probability *should* be, given the targets
+ unsigned new_prob;
+ if (attempts && dur > 0) {
+ uint64_t avg_size = 1;
+ if (obj)
+ avg_size = std::max<uint64_t>(bytes / obj, 1);
+ unsigned po = (double)target_obj_sec * dur * 1000.0 / (double)attempts;
+ unsigned pb = (double)target_bytes_sec / (double)avg_size * dur * 1000.0
+ / (double)attempts;
+ dout(20) << __func__ << " po " << po << " pb " << pb << " avg_size "
+ << avg_size << dendl;
+ if (target_obj_sec && target_bytes_sec)
+ new_prob = std::min(po, pb);
+ else if (target_obj_sec)
+ new_prob = po;
+ else if (target_bytes_sec)
+ new_prob = pb;
+ else
+ new_prob = 1000;
+ } else {
+ new_prob = 1000;
+ }
+ dout(20) << __func__ << " new_prob " << new_prob << dendl;
+
+ // correct for persistent skew between target rate and actual rate, adjust
+ double ratio = 1.0;
+ unsigned actual = 0;
+ if (attempts && obj) {
+ actual = obj * 1000 / attempts;
+ ratio = (double)actual / (double)prob;
+ new_prob = (double)new_prob / ratio;
+ }
+ new_prob = std::max(new_prob, min_prob);
+ new_prob = std::min(new_prob, 1000u);
+
+ // adjust
+ prob = (prob + new_prob) / 2;
+ prob = std::max(prob, min_prob);
+ prob = std::min(prob, 1000u);
+ dout(10) << __func__ << " actual " << actual
+ << ", actual/prob ratio " << ratio
+ << ", adjusted new_prob " << new_prob
+ << ", prob " << promote_probability_millis << " -> " << prob
+ << dendl;
+ promote_probability_millis = prob;
+
+ // set hard limits for this interval to mitigate stampedes
+ promote_max_objects = target_obj_sec * osd->OSD_TICK_INTERVAL * 2;
+ promote_max_bytes = target_bytes_sec * osd->OSD_TICK_INTERVAL * 2;
+}
+
+// -------------------------------------
+
+float OSDService::get_failsafe_full_ratio()
+{
+ float full_ratio = cct->_conf->osd_failsafe_full_ratio;
+ if (full_ratio > 1.0) full_ratio /= 100.0;
+ return full_ratio;
+}
+
+OSDService::s_names OSDService::recalc_full_state(float ratio, float pratio, string &inject)
+{
+  // The OSDMap ratios take precedence. So if the failsafe is .95 and
+  // the admin sets the cluster full to .96, the failsafe moves up to .96
+  // too. (Not that having failsafe == full is ideal, but it's better than
+  // dropping writes before the cluster appears full.)
+ OSDMapRef osdmap = get_osdmap();
+ if (!osdmap || osdmap->get_epoch() == 0) {
+ return NONE;
+ }
+ float nearfull_ratio = osdmap->get_nearfull_ratio();
+ float backfillfull_ratio = std::max(osdmap->get_backfillfull_ratio(), nearfull_ratio);
+ float full_ratio = std::max(osdmap->get_full_ratio(), backfillfull_ratio);
+ float failsafe_ratio = std::max(get_failsafe_full_ratio(), full_ratio);
+
+ if (osdmap->require_osd_release < ceph_release_t::luminous) {
+ // use the failsafe for nearfull and full; the mon isn't using the
+ // flags anyway because we're mid-upgrade.
+ full_ratio = failsafe_ratio;
+ backfillfull_ratio = failsafe_ratio;
+ nearfull_ratio = failsafe_ratio;
+ } else if (full_ratio <= 0 ||
+ backfillfull_ratio <= 0 ||
+ nearfull_ratio <= 0) {
+ derr << __func__ << " full_ratio, backfillfull_ratio or nearfull_ratio is <= 0" << dendl;
+ // use failsafe flag. ick. the monitor did something wrong or the user
+ // did something stupid.
+ full_ratio = failsafe_ratio;
+ backfillfull_ratio = failsafe_ratio;
+ nearfull_ratio = failsafe_ratio;
+ }
+
+ if (injectfull_state > NONE && injectfull) {
+ inject = "(Injected)";
+ return injectfull_state;
+ } else if (pratio > failsafe_ratio) {
+ return FAILSAFE;
+ } else if (ratio > full_ratio) {
+ return FULL;
+ } else if (ratio > backfillfull_ratio) {
+ return BACKFILLFULL;
+ } else if (pratio > nearfull_ratio) {
+ return NEARFULL;
+ }
+ return NONE;
+}
+
+void OSDService::check_full_status(float ratio, float pratio)
+{
+ std::lock_guard l(full_status_lock);
+
+ cur_ratio = ratio;
+ physical_ratio = pratio;
+
+ string inject;
+ s_names new_state;
+ new_state = recalc_full_state(ratio, pratio, inject);
+
+ dout(20) << __func__ << " cur ratio " << ratio
+ << ", physical ratio " << pratio
+ << ", new state " << get_full_state_name(new_state)
+ << " " << inject
+ << dendl;
+
+ // warn
+ if (cur_state != new_state) {
+ dout(10) << __func__ << " " << get_full_state_name(cur_state)
+ << " -> " << get_full_state_name(new_state) << dendl;
+ if (new_state == FAILSAFE) {
+ clog->error() << "full status failsafe engaged, dropping updates, now "
+ << (int)roundf(ratio * 100) << "% full";
+ } else if (cur_state == FAILSAFE) {
+ clog->error() << "full status failsafe disengaged, no longer dropping "
+ << "updates, now " << (int)roundf(ratio * 100) << "% full";
+ }
+ cur_state = new_state;
+ }
+}
+
+bool OSDService::need_fullness_update()
+{
+ OSDMapRef osdmap = get_osdmap();
+ s_names cur = NONE;
+ if (osdmap->exists(whoami)) {
+ if (osdmap->get_state(whoami) & CEPH_OSD_FULL) {
+ cur = FULL;
+ } else if (osdmap->get_state(whoami) & CEPH_OSD_BACKFILLFULL) {
+ cur = BACKFILLFULL;
+ } else if (osdmap->get_state(whoami) & CEPH_OSD_NEARFULL) {
+ cur = NEARFULL;
+ }
+ }
+ s_names want = NONE;
+ if (is_full())
+ want = FULL;
+ else if (is_backfillfull())
+ want = BACKFILLFULL;
+ else if (is_nearfull())
+ want = NEARFULL;
+ return want != cur;
+}
+
+bool OSDService::_check_inject_full(DoutPrefixProvider *dpp, s_names type) const
+{
+ if (injectfull && injectfull_state >= type) {
+    // injectfull is either a count of the number of times to return the
+    // injected full state, or -1 to always return it
+ if (injectfull > 0)
+ --injectfull;
+ ldpp_dout(dpp, 10) << __func__ << " Injected " << get_full_state_name(type) << " OSD ("
+ << (injectfull < 0 ? "set" : std::to_string(injectfull)) << ")"
+ << dendl;
+ return true;
+ }
+ return false;
+}
+
+bool OSDService::_check_full(DoutPrefixProvider *dpp, s_names type) const
+{
+ std::lock_guard l(full_status_lock);
+
+ if (_check_inject_full(dpp, type))
+ return true;
+
+ if (cur_state >= type)
+ ldpp_dout(dpp, 10) << __func__ << " current usage is " << cur_ratio
+ << " physical " << physical_ratio << dendl;
+
+ return cur_state >= type;
+}
+
+bool OSDService::_tentative_full(DoutPrefixProvider *dpp, s_names type, uint64_t adjust_used, osd_stat_t adjusted_stat)
+{
+ ldpp_dout(dpp, 20) << __func__ << " type " << get_full_state_name(type) << " adjust_used " << (adjust_used >> 10) << "KiB" << dendl;
+ {
+ std::lock_guard l(full_status_lock);
+ if (_check_inject_full(dpp, type)) {
+ return true;
+ }
+ }
+
+ float pratio;
+ float ratio = compute_adjusted_ratio(adjusted_stat, &pratio, adjust_used);
+
+ string notused;
+ s_names tentative_state = recalc_full_state(ratio, pratio, notused);
+
+ if (tentative_state >= type)
+ ldpp_dout(dpp, 10) << __func__ << " tentative usage is " << ratio << dendl;
+
+ return tentative_state >= type;
+}
+
+bool OSDService::check_failsafe_full(DoutPrefixProvider *dpp) const
+{
+ return _check_full(dpp, FAILSAFE);
+}
+
+bool OSDService::check_full(DoutPrefixProvider *dpp) const
+{
+ return _check_full(dpp, FULL);
+}
+
+bool OSDService::tentative_backfill_full(DoutPrefixProvider *dpp, uint64_t adjust_used, osd_stat_t stats)
+{
+ return _tentative_full(dpp, BACKFILLFULL, adjust_used, stats);
+}
+
+bool OSDService::check_backfill_full(DoutPrefixProvider *dpp) const
+{
+ return _check_full(dpp, BACKFILLFULL);
+}
+
+bool OSDService::check_nearfull(DoutPrefixProvider *dpp) const
+{
+ return _check_full(dpp, NEARFULL);
+}
+
+bool OSDService::is_failsafe_full() const
+{
+ std::lock_guard l(full_status_lock);
+ return cur_state == FAILSAFE;
+}
+
+bool OSDService::is_full() const
+{
+ std::lock_guard l(full_status_lock);
+ return cur_state >= FULL;
+}
+
+bool OSDService::is_backfillfull() const
+{
+ std::lock_guard l(full_status_lock);
+ return cur_state >= BACKFILLFULL;
+}
+
+bool OSDService::is_nearfull() const
+{
+ std::lock_guard l(full_status_lock);
+ return cur_state >= NEARFULL;
+}
+
+void OSDService::set_injectfull(s_names type, int64_t count)
+{
+ std::lock_guard l(full_status_lock);
+ injectfull_state = type;
+ injectfull = count;
+}
+
+void OSDService::set_statfs(const struct store_statfs_t &stbuf,
+ osd_alert_list_t& alerts)
+{
+ uint64_t bytes = stbuf.total;
+ uint64_t avail = stbuf.available;
+ uint64_t used = stbuf.get_used_raw();
+
+  // For testing, fake the statfs values so it doesn't matter whether all
+  // OSDs are using the same partition.
+ if (cct->_conf->fake_statfs_for_testing) {
+ uint64_t total_num_bytes = 0;
+ vector<PGRef> pgs;
+ osd->_get_pgs(&pgs);
+ for (auto p : pgs) {
+ total_num_bytes += p->get_stats_num_bytes();
+ }
+ bytes = cct->_conf->fake_statfs_for_testing;
+ if (total_num_bytes < bytes)
+ avail = bytes - total_num_bytes;
+ else
+ avail = 0;
+ dout(0) << __func__ << " fake total " << cct->_conf->fake_statfs_for_testing
+ << " adjust available " << avail
+ << dendl;
+ used = bytes - avail;
+ }
+
+ logger->set(l_osd_stat_bytes, bytes);
+ logger->set(l_osd_stat_bytes_used, used);
+ logger->set(l_osd_stat_bytes_avail, avail);
+
+ std::lock_guard l(stat_lock);
+ osd_stat.statfs = stbuf;
+ osd_stat.os_alerts.clear();
+ osd_stat.os_alerts[whoami].swap(alerts);
+ if (cct->_conf->fake_statfs_for_testing) {
+ osd_stat.statfs.total = bytes;
+ osd_stat.statfs.available = avail;
+    // For testing we don't want 'used' to go negative, so clear reserved
+ osd_stat.statfs.internally_reserved = 0;
+ }
+}
+
+osd_stat_t OSDService::set_osd_stat(vector<int>& hb_peers,
+ int num_pgs)
+{
+ utime_t now = ceph_clock_now();
+ auto stale_time = g_conf().get_val<int64_t>("osd_mon_heartbeat_stat_stale");
+ std::lock_guard l(stat_lock);
+ osd_stat.hb_peers.swap(hb_peers);
+ osd->op_tracker.get_age_ms_histogram(&osd_stat.op_queue_age_hist);
+ osd_stat.num_pgs = num_pgs;
+  // Clean out entries that haven't been updated.
+  // This is called often enough that we can just remove one at a time.
+ for (auto i: osd_stat.hb_pingtime) {
+ if (i.second.last_update == 0)
+ continue;
+ if (stale_time && now.sec() - i.second.last_update > stale_time) {
+ dout(20) << __func__ << " time out heartbeat for osd " << i.first
+ << " last_update " << i.second.last_update << dendl;
+ osd_stat.hb_pingtime.erase(i.first);
+ break;
+ }
+ }
+ return osd_stat;
+}
+
+void OSDService::inc_osd_stat_repaired()
+{
+ std::lock_guard l(stat_lock);
+ osd_stat.num_shards_repaired++;
+ return;
+}
+
+float OSDService::compute_adjusted_ratio(osd_stat_t new_stat, float *pratio,
+ uint64_t adjust_used)
+{
+ *pratio =
+ ((float)new_stat.statfs.get_used_raw()) / ((float)new_stat.statfs.total);
+
+ if (adjust_used) {
+ dout(20) << __func__ << " Before kb_used() " << new_stat.statfs.kb_used() << dendl;
+ if (new_stat.statfs.available > adjust_used)
+ new_stat.statfs.available -= adjust_used;
+ else
+ new_stat.statfs.available = 0;
+ dout(20) << __func__ << " After kb_used() " << new_stat.statfs.kb_used() << dendl;
+ }
+
+ // Check all pgs and adjust kb_used to include all pending backfill data
+ int backfill_adjusted = 0;
+ vector<PGRef> pgs;
+ osd->_get_pgs(&pgs);
+ for (auto p : pgs) {
+ backfill_adjusted += p->pg_stat_adjust(&new_stat);
+ }
+ if (backfill_adjusted) {
+ dout(20) << __func__ << " backfill adjusted " << new_stat << dendl;
+ }
+ return ((float)new_stat.statfs.get_used_raw()) / ((float)new_stat.statfs.total);
+}
+
+void OSDService::send_message_osd_cluster(int peer, Message *m, epoch_t from_epoch)
+{
+ OSDMapRef next_map = get_nextmap_reserved();
+ // service map is always newer/newest
+ ceph_assert(from_epoch <= next_map->get_epoch());
+
+ if (next_map->is_down(peer) ||
+ next_map->get_info(peer).up_from > from_epoch) {
+ m->put();
+ release_map(next_map);
+ return;
+ }
+ ConnectionRef peer_con;
+ if (peer == whoami) {
+ peer_con = osd->cluster_messenger->get_loopback_connection();
+ } else {
+ peer_con = osd->cluster_messenger->connect_to_osd(
+ next_map->get_cluster_addrs(peer), false, true);
+ }
+ maybe_share_map(peer_con.get(), next_map);
+ peer_con->send_message(m);
+ release_map(next_map);
+}
+
+void OSDService::send_message_osd_cluster(std::vector<std::pair<int, Message*>>& messages, epoch_t from_epoch)
+{
+ OSDMapRef next_map = get_nextmap_reserved();
+ // service map is always newer/newest
+ ceph_assert(from_epoch <= next_map->get_epoch());
+
+ for (auto& iter : messages) {
+ if (next_map->is_down(iter.first) ||
+ next_map->get_info(iter.first).up_from > from_epoch) {
+ iter.second->put();
+ continue;
+ }
+ ConnectionRef peer_con;
+ if (iter.first == whoami) {
+ peer_con = osd->cluster_messenger->get_loopback_connection();
+ } else {
+ peer_con = osd->cluster_messenger->connect_to_osd(
+ next_map->get_cluster_addrs(iter.first), false, true);
+ }
+ maybe_share_map(peer_con.get(), next_map);
+ peer_con->send_message(iter.second);
+ }
+ release_map(next_map);
+}
+ConnectionRef OSDService::get_con_osd_cluster(int peer, epoch_t from_epoch)
+{
+ OSDMapRef next_map = get_nextmap_reserved();
+ // service map is always newer/newest
+ ceph_assert(from_epoch <= next_map->get_epoch());
+
+ if (next_map->is_down(peer) ||
+ next_map->get_info(peer).up_from > from_epoch) {
+ release_map(next_map);
+ return NULL;
+ }
+ ConnectionRef con;
+ if (peer == whoami) {
+ con = osd->cluster_messenger->get_loopback_connection();
+ } else {
+ con = osd->cluster_messenger->connect_to_osd(
+ next_map->get_cluster_addrs(peer), false, true);
+ }
+ release_map(next_map);
+ return con;
+}
+
+pair<ConnectionRef,ConnectionRef> OSDService::get_con_osd_hb(int peer, epoch_t from_epoch)
+{
+ OSDMapRef next_map = get_nextmap_reserved();
+ // service map is always newer/newest
+ ceph_assert(from_epoch <= next_map->get_epoch());
+
+ pair<ConnectionRef,ConnectionRef> ret;
+ if (next_map->is_down(peer) ||
+ next_map->get_info(peer).up_from > from_epoch) {
+ release_map(next_map);
+ return ret;
+ }
+ ret.first = osd->hb_back_client_messenger->connect_to_osd(
+ next_map->get_hb_back_addrs(peer));
+ ret.second = osd->hb_front_client_messenger->connect_to_osd(
+ next_map->get_hb_front_addrs(peer));
+ release_map(next_map);
+ return ret;
+}
+
+entity_name_t OSDService::get_cluster_msgr_name() const
+{
+ return cluster_messenger->get_myname();
+}
+
+void OSDService::queue_want_pg_temp(pg_t pgid,
+ const vector<int>& want,
+ bool forced)
+{
+ std::lock_guard l(pg_temp_lock);
+ auto p = pg_temp_pending.find(pgid);
+ if (p == pg_temp_pending.end() ||
+ p->second.acting != want ||
+ forced) {
+ pg_temp_wanted[pgid] = {want, forced};
+ }
+}
+
+void OSDService::remove_want_pg_temp(pg_t pgid)
+{
+ std::lock_guard l(pg_temp_lock);
+ pg_temp_wanted.erase(pgid);
+ pg_temp_pending.erase(pgid);
+}
+
+void OSDService::_sent_pg_temp()
+{
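+  // Move the requests we just sent from pg_temp_wanted to pg_temp_pending.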
+#ifdef HAVE_STDLIB_MAP_SPLICING
+ pg_temp_pending.merge(pg_temp_wanted);
+#else
+ pg_temp_pending.insert(make_move_iterator(begin(pg_temp_wanted)),
+ make_move_iterator(end(pg_temp_wanted)));
+#endif
+ pg_temp_wanted.clear();
+}
+
+void OSDService::requeue_pg_temp()
+{
+ std::lock_guard l(pg_temp_lock);
+ // wanted overrides pending. note that remove_want_pg_temp
+ // clears the item out of both.
+ unsigned old_wanted = pg_temp_wanted.size();
+ unsigned old_pending = pg_temp_pending.size();
+ _sent_pg_temp();
+ pg_temp_wanted.swap(pg_temp_pending);
+ dout(10) << __func__ << " " << old_wanted << " + " << old_pending << " -> "
+ << pg_temp_wanted.size() << dendl;
+}
+
+std::ostream& operator<<(std::ostream& out,
+ const OSDService::pg_temp_t& pg_temp)
+{
+ out << pg_temp.acting;
+ if (pg_temp.forced) {
+ out << " (forced)";
+ }
+ return out;
+}
+
+void OSDService::send_pg_temp()
+{
+ std::lock_guard l(pg_temp_lock);
+ if (pg_temp_wanted.empty())
+ return;
+ dout(10) << "send_pg_temp " << pg_temp_wanted << dendl;
+ MOSDPGTemp *ms[2] = {nullptr, nullptr};
+ for (auto& [pgid, pg_temp] : pg_temp_wanted) {
+ auto& m = ms[pg_temp.forced];
+ if (!m) {
+ m = new MOSDPGTemp(osdmap->get_epoch());
+ m->forced = pg_temp.forced;
+ }
+ m->pg_temp.emplace(pgid, pg_temp.acting);
+ }
+ for (auto m : ms) {
+ if (m) {
+ monc->send_mon_message(m);
+ }
+ }
+ _sent_pg_temp();
+}
+
+void OSDService::send_pg_created(pg_t pgid)
+{
+ std::lock_guard l(pg_created_lock);
+ dout(20) << __func__ << dendl;
+ auto o = get_osdmap();
+ if (o->require_osd_release >= ceph_release_t::luminous) {
+ pg_created.insert(pgid);
+ monc->send_mon_message(new MOSDPGCreated(pgid));
+ }
+}
+
+void OSDService::send_pg_created()
+{
+ std::lock_guard l(pg_created_lock);
+ dout(20) << __func__ << dendl;
+ auto o = get_osdmap();
+ if (o->require_osd_release >= ceph_release_t::luminous) {
+ for (auto pgid : pg_created) {
+ monc->send_mon_message(new MOSDPGCreated(pgid));
+ }
+ }
+}
+
+void OSDService::prune_pg_created()
+{
+ std::lock_guard l(pg_created_lock);
+ dout(20) << __func__ << dendl;
+ auto o = get_osdmap();
+ auto i = pg_created.begin();
+ while (i != pg_created.end()) {
+ auto p = o->get_pg_pool(i->pool());
+ if (!p || !p->has_flag(pg_pool_t::FLAG_CREATING)) {
+ dout(20) << __func__ << " pruning " << *i << dendl;
+ i = pg_created.erase(i);
+ } else {
+ dout(20) << __func__ << " keeping " << *i << dendl;
+ ++i;
+ }
+ }
+}
+
+
+// --------------------------------------
+// dispatch
+
+bool OSDService::can_inc_scrubs()
+{
+ bool can_inc = false;
+ std::lock_guard l(sched_scrub_lock);
+
+ if (scrubs_local + scrubs_remote < cct->_conf->osd_max_scrubs) {
+ dout(20) << __func__ << " == true " << scrubs_local << " local + " << scrubs_remote
+ << " remote < max " << cct->_conf->osd_max_scrubs << dendl;
+ can_inc = true;
+ } else {
+ dout(20) << __func__ << " == false " << scrubs_local << " local + " << scrubs_remote
+ << " remote >= max " << cct->_conf->osd_max_scrubs << dendl;
+ }
+
+ return can_inc;
+}
+
+bool OSDService::inc_scrubs_local()
+{
+ bool result = false;
+ std::lock_guard l{sched_scrub_lock};
+ if (scrubs_local + scrubs_remote < cct->_conf->osd_max_scrubs) {
+ dout(20) << __func__ << " " << scrubs_local << " -> " << (scrubs_local+1)
+ << " (max " << cct->_conf->osd_max_scrubs << ", remote " << scrubs_remote << ")" << dendl;
+ result = true;
+ ++scrubs_local;
+ } else {
+ dout(20) << __func__ << " " << scrubs_local << " local + " << scrubs_remote << " remote >= max " << cct->_conf->osd_max_scrubs << dendl;
+ }
+ return result;
+}
+
+void OSDService::dec_scrubs_local()
+{
+ std::lock_guard l{sched_scrub_lock};
+ dout(20) << __func__ << " " << scrubs_local << " -> " << (scrubs_local-1)
+ << " (max " << cct->_conf->osd_max_scrubs << ", remote " << scrubs_remote << ")" << dendl;
+ --scrubs_local;
+ ceph_assert(scrubs_local >= 0);
+}
+
+bool OSDService::inc_scrubs_remote()
+{
+ bool result = false;
+ std::lock_guard l{sched_scrub_lock};
+ if (scrubs_local + scrubs_remote < cct->_conf->osd_max_scrubs) {
+ dout(20) << __func__ << " " << scrubs_remote << " -> " << (scrubs_remote+1)
+ << " (max " << cct->_conf->osd_max_scrubs << ", local " << scrubs_local << ")" << dendl;
+ result = true;
+ ++scrubs_remote;
+ } else {
+ dout(20) << __func__ << " " << scrubs_local << " local + " << scrubs_remote << " remote >= max " << cct->_conf->osd_max_scrubs << dendl;
+ }
+ return result;
+}
+
+void OSDService::dec_scrubs_remote()
+{
+ std::lock_guard l{sched_scrub_lock};
+ dout(20) << __func__ << " " << scrubs_remote << " -> " << (scrubs_remote-1)
+ << " (max " << cct->_conf->osd_max_scrubs << ", local " << scrubs_local << ")" << dendl;
+ --scrubs_remote;
+ ceph_assert(scrubs_remote >= 0);
+}
+
+void OSDService::dump_scrub_reservations(Formatter *f)
+{
+ std::lock_guard l{sched_scrub_lock};
+ f->dump_int("scrubs_local", scrubs_local);
+ f->dump_int("scrubs_remote", scrubs_remote);
+ f->dump_int("osd_max_scrubs", cct->_conf->osd_max_scrubs);
+}
+
+void OSDService::retrieve_epochs(epoch_t *_boot_epoch, epoch_t *_up_epoch,
+ epoch_t *_bind_epoch) const
+{
+ std::lock_guard l(epoch_lock);
+ if (_boot_epoch)
+ *_boot_epoch = boot_epoch;
+ if (_up_epoch)
+ *_up_epoch = up_epoch;
+ if (_bind_epoch)
+ *_bind_epoch = bind_epoch;
+}
+
+void OSDService::set_epochs(const epoch_t *_boot_epoch, const epoch_t *_up_epoch,
+ const epoch_t *_bind_epoch)
+{
+ std::lock_guard l(epoch_lock);
+ if (_boot_epoch) {
+ ceph_assert(*_boot_epoch == 0 || *_boot_epoch >= boot_epoch);
+ boot_epoch = *_boot_epoch;
+ }
+ if (_up_epoch) {
+ ceph_assert(*_up_epoch == 0 || *_up_epoch >= up_epoch);
+ up_epoch = *_up_epoch;
+ }
+ if (_bind_epoch) {
+ ceph_assert(*_bind_epoch == 0 || *_bind_epoch >= bind_epoch);
+ bind_epoch = *_bind_epoch;
+ }
+}
+
+bool OSDService::prepare_to_stop()
+{
+ std::unique_lock l(is_stopping_lock);
+ if (get_state() != NOT_STOPPING)
+ return false;
+
+ OSDMapRef osdmap = get_osdmap();
+ if (osdmap && osdmap->is_up(whoami)) {
+ dout(0) << __func__ << " telling mon we are shutting down and dead " << dendl;
+ set_state(PREPARING_TO_STOP);
+ monc->send_mon_message(
+ new MOSDMarkMeDown(
+ monc->get_fsid(),
+ whoami,
+ osdmap->get_addrs(whoami),
+ osdmap->get_epoch(),
+ true, // request ack
+ true // mark as down and dead
+ ));
+ const auto timeout = ceph::make_timespan(cct->_conf->osd_mon_shutdown_timeout);
+ is_stopping_cond.wait_for(l, timeout,
+ [this] { return get_state() == STOPPING; });
+ }
+
+ dout(0) << __func__ << " starting shutdown" << dendl;
+ set_state(STOPPING);
+ return true;
+}
+
+void OSDService::got_stop_ack()
+{
+ std::scoped_lock l(is_stopping_lock);
+ if (get_state() == PREPARING_TO_STOP) {
+ dout(0) << __func__ << " starting shutdown" << dendl;
+ set_state(STOPPING);
+ is_stopping_cond.notify_all();
+ } else {
+ dout(10) << __func__ << " ignoring msg" << dendl;
+ }
+}
+
+MOSDMap *OSDService::build_incremental_map_msg(epoch_t since, epoch_t to,
+ OSDSuperblock& sblock)
+{
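+  // Assemble an MOSDMap with incremental maps for (since, to], falling back
+  // to a full map when an incremental is unavailable, and bounded by
+  // osd_map_message_max and osd_map_message_max_bytes.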
+ MOSDMap *m = new MOSDMap(monc->get_fsid(),
+ osdmap->get_encoding_features());
+ m->oldest_map = max_oldest_map;
+ m->newest_map = sblock.newest_map;
+
+ int max = cct->_conf->osd_map_message_max;
+ ssize_t max_bytes = cct->_conf->osd_map_message_max_bytes;
+
+ if (since < m->oldest_map) {
+ // we don't have the next map the target wants, so start with a
+ // full map.
+ bufferlist bl;
+ dout(10) << __func__ << " oldest map " << max_oldest_map << " > since "
+ << since << ", starting with full map" << dendl;
+ since = m->oldest_map;
+ if (!get_map_bl(since, bl)) {
+ derr << __func__ << " missing full map " << since << dendl;
+ goto panic;
+ }
+ max--;
+ max_bytes -= bl.length();
+ m->maps[since] = std::move(bl);
+ }
+ for (epoch_t e = since + 1; e <= to; ++e) {
+ bufferlist bl;
+ if (get_inc_map_bl(e, bl)) {
+ m->incremental_maps[e] = std::move(bl);
+ } else {
+ dout(10) << __func__ << " missing incremental map " << e << dendl;
+ if (!get_map_bl(e, bl)) {
+ derr << __func__ << " also missing full map " << e << dendl;
+ goto panic;
+ }
+ m->maps[e] = std::move(bl);
+ }
+ max--;
+ max_bytes -= bl.length();
+ if (max <= 0 || max_bytes <= 0) {
+ break;
+ }
+ }
+ return m;
+
+ panic:
+ if (!m->maps.empty() ||
+ !m->incremental_maps.empty()) {
+ // send what we have so far
+ return m;
+ }
+ // send something
+ bufferlist bl;
+ if (get_inc_map_bl(m->newest_map, bl)) {
+ m->incremental_maps[m->newest_map] = std::move(bl);
+ } else {
+ derr << __func__ << " unable to load latest map " << m->newest_map << dendl;
+ if (!get_map_bl(m->newest_map, bl)) {
+ derr << __func__ << " unable to load latest full map " << m->newest_map
+ << dendl;
+ ceph_abort();
+ }
+ m->maps[m->newest_map] = std::move(bl);
+ }
+ return m;
+}
+
+void OSDService::send_map(MOSDMap *m, Connection *con)
+{
+ con->send_message(m);
+}
+
+void OSDService::send_incremental_map(epoch_t since, Connection *con,
+ const OSDMapRef& osdmap)
+{
+ epoch_t to = osdmap->get_epoch();
+ dout(10) << "send_incremental_map " << since << " -> " << to
+ << " to " << con << " " << con->get_peer_addr() << dendl;
+
+ MOSDMap *m = NULL;
+ while (!m) {
+ OSDSuperblock sblock(get_superblock());
+ if (since < sblock.oldest_map) {
+ // just send latest full map
+ MOSDMap *m = new MOSDMap(monc->get_fsid(),
+ osdmap->get_encoding_features());
+ m->oldest_map = max_oldest_map;
+ m->newest_map = sblock.newest_map;
+ get_map_bl(to, m->maps[to]);
+ send_map(m, con);
+ return;
+ }
+
+ if (to > since && (int64_t)(to - since) > cct->_conf->osd_map_share_max_epochs) {
+ dout(10) << " " << (to - since) << " > max " << cct->_conf->osd_map_share_max_epochs
+ << ", only sending most recent" << dendl;
+ since = to - cct->_conf->osd_map_share_max_epochs;
+ }
+
+ m = build_incremental_map_msg(since, to, sblock);
+ }
+ send_map(m, con);
+}
+
+bool OSDService::_get_map_bl(epoch_t e, bufferlist& bl)
+{
+ bool found = map_bl_cache.lookup(e, &bl);
+ if (found) {
+ logger->inc(l_osd_map_bl_cache_hit);
+ return true;
+ }
+ logger->inc(l_osd_map_bl_cache_miss);
+ found = store->read(meta_ch,
+ OSD::get_osdmap_pobject_name(e), 0, 0, bl,
+ CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0;
+ if (found) {
+ _add_map_bl(e, bl);
+ }
+ return found;
+}
+
+bool OSDService::get_inc_map_bl(epoch_t e, bufferlist& bl)
+{
+ std::lock_guard l(map_cache_lock);
+ bool found = map_bl_inc_cache.lookup(e, &bl);
+ if (found) {
+ logger->inc(l_osd_map_bl_cache_hit);
+ return true;
+ }
+ logger->inc(l_osd_map_bl_cache_miss);
+ found = store->read(meta_ch,
+ OSD::get_inc_osdmap_pobject_name(e), 0, 0, bl,
+ CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0;
+ if (found) {
+ _add_map_inc_bl(e, bl);
+ }
+ return found;
+}
+
+void OSDService::_add_map_bl(epoch_t e, bufferlist& bl)
+{
+ dout(10) << "add_map_bl " << e << " " << bl.length() << " bytes" << dendl;
+ // cache a contiguous buffer
+ if (bl.get_num_buffers() > 1) {
+ bl.rebuild();
+ }
+ bl.try_assign_to_mempool(mempool::mempool_osd_mapbl);
+ map_bl_cache.add(e, bl);
+}
+
+void OSDService::_add_map_inc_bl(epoch_t e, bufferlist& bl)
+{
+ dout(10) << "add_map_inc_bl " << e << " " << bl.length() << " bytes" << dendl;
+ // cache a contiguous buffer
+ if (bl.get_num_buffers() > 1) {
+ bl.rebuild();
+ }
+ bl.try_assign_to_mempool(mempool::mempool_osd_mapbl);
+ map_bl_inc_cache.add(e, bl);
+}
+
+OSDMapRef OSDService::_add_map(OSDMap *o)
+{
+ epoch_t e = o->get_epoch();
+
+ if (cct->_conf->osd_map_dedup) {
+ // Dedup against an existing map at a nearby epoch
+ OSDMapRef for_dedup = map_cache.lower_bound(e);
+ if (for_dedup) {
+ OSDMap::dedup(for_dedup.get(), o);
+ }
+ }
+ bool existed;
+ OSDMapRef l = map_cache.add(e, o, &existed);
+ if (existed) {
+ delete o;
+ }
+ return l;
+}
+
+OSDMapRef OSDService::try_get_map(epoch_t epoch)
+{
+ std::lock_guard l(map_cache_lock);
+ OSDMapRef retval = map_cache.lookup(epoch);
+ if (retval) {
+ dout(30) << "get_map " << epoch << " -cached" << dendl;
+ logger->inc(l_osd_map_cache_hit);
+ return retval;
+ }
+ {
+ logger->inc(l_osd_map_cache_miss);
+ epoch_t lb = map_cache.cached_key_lower_bound();
+ if (epoch < lb) {
+ dout(30) << "get_map " << epoch << " - miss, below lower bound" << dendl;
+ logger->inc(l_osd_map_cache_miss_low);
+ logger->inc(l_osd_map_cache_miss_low_avg, lb - epoch);
+ }
+ }
+
+ OSDMap *map = new OSDMap;
+ if (epoch > 0) {
+ dout(20) << "get_map " << epoch << " - loading and decoding " << map << dendl;
+ bufferlist bl;
+ if (!_get_map_bl(epoch, bl) || bl.length() == 0) {
+ derr << "failed to load OSD map for epoch " << epoch << ", got " << bl.length() << " bytes" << dendl;
+ delete map;
+ return OSDMapRef();
+ }
+ map->decode(bl);
+ } else {
+ dout(20) << "get_map " << epoch << " - return initial " << map << dendl;
+ }
+ return _add_map(map);
+}
+
+// ops
+
+
+void OSDService::reply_op_error(OpRequestRef op, int err)
+{
+ reply_op_error(op, err, eversion_t(), 0, {});
+}
+
+void OSDService::reply_op_error(OpRequestRef op, int err, eversion_t v,
+ version_t uv,
+ vector<pg_log_op_return_item_t> op_returns)
+{
+ auto m = op->get_req<MOSDOp>();
+ ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);
+  int flags = m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK);
+
+ MOSDOpReply *reply = new MOSDOpReply(m, err, osdmap->get_epoch(), flags,
+ !m->has_flag(CEPH_OSD_FLAG_RETURNVEC));
+ reply->set_reply_versions(v, uv);
+ reply->set_op_returns(op_returns);
+ m->get_connection()->send_message(reply);
+}
+
+void OSDService::handle_misdirected_op(PG *pg, OpRequestRef op)
+{
+ if (!cct->_conf->osd_debug_misdirected_ops) {
+ return;
+ }
+
+ auto m = op->get_req<MOSDOp>();
+ ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);
+
+ ceph_assert(m->get_map_epoch() >= pg->get_history().same_primary_since);
+
+ if (pg->is_ec_pg()) {
+ /**
+ * OSD recomputes op target based on current OSDMap. With an EC pg, we
+ * can get this result:
+ * 1) client at map 512 sends an op to osd 3, pg_t 3.9 based on mapping
+ * [CRUSH_ITEM_NONE, 2, 3]/3
+ * 2) OSD 3 at map 513 remaps op to osd 3, spg_t 3.9s0 based on mapping
+ * [3, 2, 3]/3
+ * 3) PG 3.9s0 dequeues the op at epoch 512 and notices that it isn't primary
+ * -- misdirected op
+ * 4) client resends and this time PG 3.9s0 having caught up to 513 gets
+ * it and fulfils it
+ *
+ * We can't compute the op target based on the sending map epoch due to
+ * splitting. The simplest thing is to detect such cases here and drop
+ * them without an error (the client will resend anyway).
+ */
+ ceph_assert(m->get_map_epoch() <= superblock.newest_map);
+ OSDMapRef opmap = try_get_map(m->get_map_epoch());
+ if (!opmap) {
+ dout(7) << __func__ << ": " << *pg << " no longer have map for "
+ << m->get_map_epoch() << ", dropping" << dendl;
+ return;
+ }
+ pg_t _pgid = m->get_raw_pg();
+ spg_t pgid;
+ if ((m->get_flags() & CEPH_OSD_FLAG_PGOP) == 0)
+ _pgid = opmap->raw_pg_to_pg(_pgid);
+ if (opmap->get_primary_shard(_pgid, &pgid) &&
+ pgid.shard != pg->pg_id.shard) {
+ dout(7) << __func__ << ": " << *pg << " primary changed since "
+ << m->get_map_epoch() << ", dropping" << dendl;
+ return;
+ }
+ }
+
+ dout(7) << *pg << " misdirected op in " << m->get_map_epoch() << dendl;
+ clog->warn() << m->get_source_inst() << " misdirected " << m->get_reqid()
+ << " pg " << m->get_raw_pg()
+ << " to osd." << whoami
+ << " not " << pg->get_acting()
+ << " in e" << m->get_map_epoch() << "/" << osdmap->get_epoch();
+}
+
+void OSDService::enqueue_back(OpSchedulerItem&& qi)
+{
+ osd->op_shardedwq.queue(std::move(qi));
+}
+
+void OSDService::enqueue_front(OpSchedulerItem&& qi)
+{
+ osd->op_shardedwq.queue_front(std::move(qi));
+}
+
+void OSDService::queue_recovery_context(
+ PG *pg,
+ GenContext<ThreadPool::TPHandle&> *c)
+{
+ epoch_t e = get_osdmap_epoch();
+ enqueue_back(
+ OpSchedulerItem(
+ unique_ptr<OpSchedulerItem::OpQueueable>(
+ new PGRecoveryContext(pg->get_pgid(), c, e)),
+ cct->_conf->osd_recovery_cost,
+ cct->_conf->osd_recovery_priority,
+ ceph_clock_now(),
+ 0,
+ e));
+}
+
+void OSDService::queue_for_snap_trim(PG *pg)
+{
+ dout(10) << "queueing " << *pg << " for snaptrim" << dendl;
+ enqueue_back(
+ OpSchedulerItem(
+ unique_ptr<OpSchedulerItem::OpQueueable>(
+ new PGSnapTrim(pg->get_pgid(), pg->get_osdmap_epoch())),
+ cct->_conf->osd_snap_trim_cost,
+ cct->_conf->osd_snap_trim_priority,
+ ceph_clock_now(),
+ 0,
+ pg->get_osdmap_epoch()));
+}
+
+template <class MSG_TYPE>
+void OSDService::queue_scrub_event_msg(PG* pg,
+ Scrub::scrub_prio_t with_priority,
+ unsigned int qu_priority,
+ Scrub::act_token_t act_token)
+{
+ const auto epoch = pg->get_osdmap_epoch();
+ auto msg = new MSG_TYPE(pg->get_pgid(), epoch, act_token);
+ dout(15) << "queue a scrub event (" << *msg << ") for " << *pg
+ << ". Epoch: " << epoch << " token: " << act_token << dendl;
+
+ enqueue_back(OpSchedulerItem(
+ unique_ptr<OpSchedulerItem::OpQueueable>(msg), cct->_conf->osd_scrub_cost,
+ pg->scrub_requeue_priority(with_priority, qu_priority), ceph_clock_now(), 0, epoch));
+}
+
+template <class MSG_TYPE>
+void OSDService::queue_scrub_event_msg(PG* pg,
+ Scrub::scrub_prio_t with_priority)
+{
+ const auto epoch = pg->get_osdmap_epoch();
+ auto msg = new MSG_TYPE(pg->get_pgid(), epoch);
+ dout(15) << "queue a scrub event (" << *msg << ") for " << *pg << ". Epoch: " << epoch << dendl;
+
+ enqueue_back(OpSchedulerItem(
+ unique_ptr<OpSchedulerItem::OpQueueable>(msg), cct->_conf->osd_scrub_cost,
+ pg->scrub_requeue_priority(with_priority), ceph_clock_now(), 0, epoch));
+}
+
+void OSDService::queue_for_scrub(PG* pg, Scrub::scrub_prio_t with_priority)
+{
+ queue_scrub_event_msg<PGScrub>(pg, with_priority);
+}
+
+void OSDService::queue_scrub_after_repair(PG* pg, Scrub::scrub_prio_t with_priority)
+{
+ queue_scrub_event_msg<PGScrubAfterRepair>(pg, with_priority);
+}
+
+void OSDService::queue_for_rep_scrub(PG* pg,
+ Scrub::scrub_prio_t with_priority,
+ unsigned int qu_priority,
+ Scrub::act_token_t act_token)
+{
+ queue_scrub_event_msg<PGRepScrub>(pg, with_priority, qu_priority, act_token);
+}
+
+void OSDService::queue_for_rep_scrub_resched(PG* pg,
+ Scrub::scrub_prio_t with_priority,
+ unsigned int qu_priority,
+ Scrub::act_token_t act_token)
+{
+ // Resulting scrub event: 'SchedReplica'
+ queue_scrub_event_msg<PGRepScrubResched>(pg, with_priority, qu_priority,
+ act_token);
+}
+
+void OSDService::queue_for_scrub_granted(PG* pg, Scrub::scrub_prio_t with_priority)
+{
+ // Resulting scrub event: 'RemotesReserved'
+ queue_scrub_event_msg<PGScrubResourcesOK>(pg, with_priority);
+}
+
+void OSDService::queue_for_scrub_denied(PG* pg, Scrub::scrub_prio_t with_priority)
+{
+ // Resulting scrub event: 'ReservationFailure'
+ queue_scrub_event_msg<PGScrubDenied>(pg, with_priority);
+}
+
+void OSDService::queue_for_scrub_resched(PG* pg, Scrub::scrub_prio_t with_priority)
+{
+ // Resulting scrub event: 'InternalSchedScrub'
+ queue_scrub_event_msg<PGScrubResched>(pg, with_priority);
+}
+
+void OSDService::queue_scrub_pushes_update(PG* pg, Scrub::scrub_prio_t with_priority)
+{
+ // Resulting scrub event: 'ActivePushesUpd'
+ queue_scrub_event_msg<PGScrubPushesUpdate>(pg, with_priority);
+}
+
+void OSDService::queue_scrub_chunk_free(PG* pg, Scrub::scrub_prio_t with_priority)
+{
+ // Resulting scrub event: 'SelectedChunkFree'
+ queue_scrub_event_msg<PGScrubChunkIsFree>(pg, with_priority);
+}
+
+void OSDService::queue_scrub_chunk_busy(PG* pg, Scrub::scrub_prio_t with_priority)
+{
+ // Resulting scrub event: 'ChunkIsBusy'
+ queue_scrub_event_msg<PGScrubChunkIsBusy>(pg, with_priority);
+}
+
+void OSDService::queue_scrub_applied_update(PG* pg, Scrub::scrub_prio_t with_priority)
+{
+ queue_scrub_event_msg<PGScrubAppliedUpdate>(pg, with_priority);
+}
+
+void OSDService::queue_scrub_unblocking(PG* pg, Scrub::scrub_prio_t with_priority)
+{
+ // Resulting scrub event: 'Unblocked'
+ queue_scrub_event_msg<PGScrubUnblocked>(pg, with_priority);
+}
+
+void OSDService::queue_scrub_digest_update(PG* pg, Scrub::scrub_prio_t with_priority)
+{
+ // Resulting scrub event: 'DigestUpdate'
+ queue_scrub_event_msg<PGScrubDigestUpdate>(pg, with_priority);
+}
+
+void OSDService::queue_scrub_got_local_map(PG* pg, Scrub::scrub_prio_t with_priority)
+{
+ // Resulting scrub event: 'IntLocalMapDone'
+ queue_scrub_event_msg<PGScrubGotLocalMap>(pg, with_priority);
+}
+
+void OSDService::queue_scrub_got_repl_maps(PG* pg, Scrub::scrub_prio_t with_priority)
+{
+ // Resulting scrub event: 'GotReplicas'
+ queue_scrub_event_msg<PGScrubGotReplMaps>(pg, with_priority);
+}
+
+void OSDService::queue_scrub_maps_compared(PG* pg, Scrub::scrub_prio_t with_priority)
+{
+ // Resulting scrub event: 'MapsCompared'
+ queue_scrub_event_msg<PGScrubMapsCompared>(pg, with_priority);
+}
+
+void OSDService::queue_scrub_replica_pushes(PG *pg, Scrub::scrub_prio_t with_priority)
+{
+ // Resulting scrub event: 'ReplicaPushesUpd'
+ queue_scrub_event_msg<PGScrubReplicaPushes>(pg, with_priority);
+}
+
+void OSDService::queue_scrub_is_finished(PG *pg)
+{
+ // Resulting scrub event: 'ScrubFinished'
+ queue_scrub_event_msg<PGScrubScrubFinished>(pg, Scrub::scrub_prio_t::high_priority);
+}
+
+void OSDService::queue_scrub_next_chunk(PG *pg, Scrub::scrub_prio_t with_priority)
+{
+ // Resulting scrub event: 'NextChunk'
+ queue_scrub_event_msg<PGScrubGetNextChunk>(pg, with_priority);
+}
+
+void OSDService::queue_for_pg_delete(spg_t pgid, epoch_t e)
+{
+ dout(10) << __func__ << " on " << pgid << " e " << e << dendl;
+ enqueue_back(
+ OpSchedulerItem(
+ unique_ptr<OpSchedulerItem::OpQueueable>(
+ new PGDelete(pgid, e)),
+ cct->_conf->osd_pg_delete_cost,
+ cct->_conf->osd_pg_delete_priority,
+ ceph_clock_now(),
+ 0,
+ e));
+}
+
+bool OSDService::try_finish_pg_delete(PG *pg, unsigned old_pg_num)
+{
+ return osd->try_finish_pg_delete(pg, old_pg_num);
+}
+
+// ---
+
+void OSDService::set_ready_to_merge_source(PG *pg, eversion_t version)
+{
+ std::lock_guard l(merge_lock);
+ dout(10) << __func__ << " " << pg->pg_id << dendl;
+ ready_to_merge_source[pg->pg_id.pgid] = version;
+ assert(not_ready_to_merge_source.count(pg->pg_id.pgid) == 0);
+ _send_ready_to_merge();
+}
+
+void OSDService::set_ready_to_merge_target(PG *pg,
+ eversion_t version,
+ epoch_t last_epoch_started,
+ epoch_t last_epoch_clean)
+{
+ std::lock_guard l(merge_lock);
+ dout(10) << __func__ << " " << pg->pg_id << dendl;
+ ready_to_merge_target.insert(make_pair(pg->pg_id.pgid,
+ make_tuple(version,
+ last_epoch_started,
+ last_epoch_clean)));
+ assert(not_ready_to_merge_target.count(pg->pg_id.pgid) == 0);
+ _send_ready_to_merge();
+}
+
+void OSDService::set_not_ready_to_merge_source(pg_t source)
+{
+ std::lock_guard l(merge_lock);
+ dout(10) << __func__ << " " << source << dendl;
+ not_ready_to_merge_source.insert(source);
+ assert(ready_to_merge_source.count(source) == 0);
+ _send_ready_to_merge();
+}
+
+void OSDService::set_not_ready_to_merge_target(pg_t target, pg_t source)
+{
+ std::lock_guard l(merge_lock);
+ dout(10) << __func__ << " " << target << " source " << source << dendl;
+ not_ready_to_merge_target[target] = source;
+ assert(ready_to_merge_target.count(target) == 0);
+ _send_ready_to_merge();
+}
+
+void OSDService::send_ready_to_merge()
+{
+ std::lock_guard l(merge_lock);
+ _send_ready_to_merge();
+}
+
+void OSDService::_send_ready_to_merge()
+{
+ dout(20) << __func__
+ << " ready_to_merge_source " << ready_to_merge_source
+ << " not_ready_to_merge_source " << not_ready_to_merge_source
+ << " ready_to_merge_target " << ready_to_merge_target
+ << " not_ready_to_merge_target " << not_ready_to_merge_target
+ << " sent_ready_to_merge_source " << sent_ready_to_merge_source
+ << dendl;
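+ // Three passes, each reporting a source pg at most once (tracked in
+ // sent_ready_to_merge_source): sources that are explicitly not ready,
+ // sources whose merge target is not ready (both sent with ready=false),
+ // and source/target pairs that are both ready (sent with ready=true).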
+ for (auto src : not_ready_to_merge_source) {
+ if (sent_ready_to_merge_source.count(src) == 0) {
+ monc->send_mon_message(new MOSDPGReadyToMerge(
+ src,
+ {}, {}, 0, 0,
+ false,
+ osdmap->get_epoch()));
+ sent_ready_to_merge_source.insert(src);
+ }
+ }
+ for (auto p : not_ready_to_merge_target) {
+ if (sent_ready_to_merge_source.count(p.second) == 0) {
+ monc->send_mon_message(new MOSDPGReadyToMerge(
+ p.second,
+ {}, {}, 0, 0,
+ false,
+ osdmap->get_epoch()));
+ sent_ready_to_merge_source.insert(p.second);
+ }
+ }
+ for (auto src : ready_to_merge_source) {
+ if (not_ready_to_merge_source.count(src.first) ||
+ not_ready_to_merge_target.count(src.first.get_parent())) {
+ continue;
+ }
+ auto p = ready_to_merge_target.find(src.first.get_parent());
+ if (p != ready_to_merge_target.end() &&
+ sent_ready_to_merge_source.count(src.first) == 0) {
+ monc->send_mon_message(new MOSDPGReadyToMerge(
+ src.first, // source pgid
+ src.second, // src version
+ std::get<0>(p->second), // target version
+ std::get<1>(p->second), // PG's last_epoch_started
+ std::get<2>(p->second), // PG's last_epoch_clean
+ true,
+ osdmap->get_epoch()));
+ sent_ready_to_merge_source.insert(src.first);
+ }
+ }
+}
+
+void OSDService::clear_ready_to_merge(PG *pg)
+{
+ std::lock_guard l(merge_lock);
+ dout(10) << __func__ << " " << pg->pg_id << dendl;
+ ready_to_merge_source.erase(pg->pg_id.pgid);
+ ready_to_merge_target.erase(pg->pg_id.pgid);
+ not_ready_to_merge_source.erase(pg->pg_id.pgid);
+ not_ready_to_merge_target.erase(pg->pg_id.pgid);
+ sent_ready_to_merge_source.erase(pg->pg_id.pgid);
+}
+
+void OSDService::clear_sent_ready_to_merge()
+{
+ std::lock_guard l(merge_lock);
+ sent_ready_to_merge_source.clear();
+}
+
+void OSDService::prune_sent_ready_to_merge(const OSDMapRef& osdmap)
+{
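+ // Drop 'sent' markers for source pgs that no longer exist in the given map.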
+ std::lock_guard l(merge_lock);
+ auto i = sent_ready_to_merge_source.begin();
+ while (i != sent_ready_to_merge_source.end()) {
+ if (!osdmap->pg_exists(*i)) {
+ dout(10) << __func__ << " " << *i << dendl;
+ i = sent_ready_to_merge_source.erase(i);
+ } else {
+ ++i;
+ }
+ }
+}
+
+// ---
+
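+// Caller must hold recovery_lock (asserted below); the reserved push count
+// travels with the queued PGRecovery item.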
+void OSDService::_queue_for_recovery(
+ std::pair<epoch_t, PGRef> p,
+ uint64_t reserved_pushes)
+{
+ ceph_assert(ceph_mutex_is_locked_by_me(recovery_lock));
+ enqueue_back(
+ OpSchedulerItem(
+ unique_ptr<OpSchedulerItem::OpQueueable>(
+ new PGRecovery(
+ p.second->get_pgid(), p.first, reserved_pushes)),
+ cct->_conf->osd_recovery_cost,
+ cct->_conf->osd_recovery_priority,
+ ceph_clock_now(),
+ 0,
+ p.first));
+}
+
+// ====================================================================
+// OSD
+
+#undef dout_prefix
+#define dout_prefix *_dout
+
+// Commands shared between OSD's console and admin console:
+namespace ceph::osd_cmds {
+
+int heap(CephContext& cct,
+ const cmdmap_t& cmdmap,
+ std::ostream& outos,
+ std::ostream& erros);
+
+} // namespace ceph::osd_cmds
+
+int OSD::mkfs(CephContext *cct, ObjectStore *store, uuid_d fsid, int whoami, string osdspec_affinity)
+{
+ int ret;
+
+ OSDSuperblock sb;
+ bufferlist sbbl;
+ ObjectStore::CollectionHandle ch;
+
+ // if we are fed a uuid for this osd, use it.
+ store->set_fsid(cct->_conf->osd_uuid);
+
+ ret = store->mkfs();
+ if (ret) {
+ derr << "OSD::mkfs: ObjectStore::mkfs failed with error "
+ << cpp_strerror(ret) << dendl;
+ goto free_store;
+ }
+
+ store->set_cache_shards(1); // doesn't matter for mkfs!
+
+ ret = store->mount();
+ if (ret) {
+ derr << "OSD::mkfs: couldn't mount ObjectStore: error "
+ << cpp_strerror(ret) << dendl;
+ goto free_store;
+ }
+
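+ // An existing meta collection means mkfs already ran here: verify that the
+ // stored superblock matches the osd id and cluster fsid we were given.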
+ ch = store->open_collection(coll_t::meta());
+ if (ch) {
+ ret = store->read(ch, OSD_SUPERBLOCK_GOBJECT, 0, 0, sbbl);
+ if (ret < 0) {
+ derr << "OSD::mkfs: have meta collection but no superblock" << dendl;
+ goto free_store;
+ }
+ /* if we already have superblock, check content of superblock */
+ dout(0) << " have superblock" << dendl;
+ auto p = sbbl.cbegin();
+ decode(sb, p);
+ if (whoami != sb.whoami) {
+ derr << "provided osd id " << whoami << " != superblock's " << sb.whoami
+ << dendl;
+ ret = -EINVAL;
+ goto umount_store;
+ }
+ if (fsid != sb.cluster_fsid) {
+ derr << "provided cluster fsid " << fsid
+ << " != superblock's " << sb.cluster_fsid << dendl;
+ ret = -EINVAL;
+ goto umount_store;
+ }
+ } else {
+ // create superblock
+ sb.cluster_fsid = fsid;
+ sb.osd_fsid = store->get_fsid();
+ sb.whoami = whoami;
+ sb.compat_features = get_osd_initial_compat_set();
+
+ bufferlist bl;
+ encode(sb, bl);
+
+ ObjectStore::CollectionHandle ch = store->create_new_collection(
+ coll_t::meta());
+ ObjectStore::Transaction t;
+ t.create_collection(coll_t::meta(), 0);
+ t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
+ ret = store->queue_transaction(ch, std::move(t));
+ if (ret) {
+ derr << "OSD::mkfs: error while writing OSD_SUPERBLOCK_GOBJECT: "
+ << "queue_transaction returned " << cpp_strerror(ret) << dendl;
+ goto umount_store;
+ }
+ ch->flush();
+ }
+
+ ret = write_meta(cct, store, sb.cluster_fsid, sb.osd_fsid, whoami, osdspec_affinity);
+ if (ret) {
+ derr << "OSD::mkfs: failed to write fsid file: error "
+ << cpp_strerror(ret) << dendl;
+ goto umount_store;
+ }
+
+umount_store:
+ if (ch) {
+ ch.reset();
+ }
+ store->umount();
+free_store:
+ delete store;
+ return ret;
+}
+
+int OSD::write_meta(CephContext *cct, ObjectStore *store, uuid_d& cluster_fsid, uuid_d& osd_fsid, int whoami, string& osdspec_affinity)
+{
+ char val[80];
+ int r;
+
+ snprintf(val, sizeof(val), "%s", CEPH_OSD_ONDISK_MAGIC);
+ r = store->write_meta("magic", val);
+ if (r < 0)
+ return r;
+
+ snprintf(val, sizeof(val), "%d", whoami);
+ r = store->write_meta("whoami", val);
+ if (r < 0)
+ return r;
+
+ cluster_fsid.print(val);
+ r = store->write_meta("ceph_fsid", val);
+ if (r < 0)
+ return r;
+
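+ // Prefer an explicit 'key' config value; otherwise fall back to the contents
+ // of 'keyfile', if one was specified.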
+ string key = cct->_conf.get_val<string>("key");
+ if (key.size()) {
+ r = store->write_meta("osd_key", key);
+ if (r < 0)
+ return r;
+ } else {
+ string keyfile = cct->_conf.get_val<string>("keyfile");
+ if (!keyfile.empty()) {
+ bufferlist keybl;
+ string err;
+ r = keybl.read_file(keyfile.c_str(), &err);
+ if (r < 0) {
+ derr << __func__ << " failed to read keyfile " << keyfile << ": "
+ << err << ": " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ r = store->write_meta("osd_key", keybl.to_str());
+ if (r < 0)
+ return r;
+ }
+ }
+ if (!osdspec_affinity.empty()) {
+ r = store->write_meta("osdspec_affinity", osdspec_affinity.c_str());
+ if (r < 0)
+ return r;
+ }
+
+ r = store->write_meta("ceph_version_when_created", pretty_version_to_str());
+ if (r < 0)
+ return r;
+
+ ostringstream created_at;
+ utime_t now = ceph_clock_now();
+ now.gmtime(created_at);
+ r = store->write_meta("created_at", created_at.str());
+ if (r < 0)
+ return r;
+
+ r = store->write_meta("ready", "ready");
+ if (r < 0)
+ return r;
+
+ return 0;
+}
+
+int OSD::peek_meta(ObjectStore *store,
+ std::string *magic,
+ uuid_d *cluster_fsid,
+ uuid_d *osd_fsid,
+ int *whoami,
+ ceph_release_t *require_osd_release)
+{
+ string val;
+
+ int r = store->read_meta("magic", &val);
+ if (r < 0)
+ return r;
+ *magic = val;
+
+ r = store->read_meta("whoami", &val);
+ if (r < 0)
+ return r;
+ *whoami = atoi(val.c_str());
+
+ r = store->read_meta("ceph_fsid", &val);
+ if (r < 0)
+ return r;
+ r = cluster_fsid->parse(val.c_str());
+ if (!r)
+ return -EINVAL;
+
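+ // A missing per-OSD 'fsid' meta entry is tolerated and reported as a zero uuid.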
+ r = store->read_meta("fsid", &val);
+ if (r < 0) {
+ *osd_fsid = uuid_d();
+ } else {
+ r = osd_fsid->parse(val.c_str());
+ if (!r)
+ return -EINVAL;
+ }
+
+ r = store->read_meta("require_osd_release", &val);
+ if (r >= 0) {
+ *require_osd_release = ceph_release_from_name(val);
+ }
+
+ return 0;
+}
+
+
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
+
+// cons/des
+
+OSD::OSD(CephContext *cct_, ObjectStore *store_,
+ int id,
+ Messenger *internal_messenger,
+ Messenger *external_messenger,
+ Messenger *hb_client_front,
+ Messenger *hb_client_back,
+ Messenger *hb_front_serverm,
+ Messenger *hb_back_serverm,
+ Messenger *osdc_messenger,
+ MonClient *mc,
+ const std::string &dev, const std::string &jdev,
+ ceph::async::io_context_pool& poolctx) :
+ Dispatcher(cct_),
+ tick_timer(cct, osd_lock),
+ tick_timer_without_osd_lock(cct, tick_timer_lock),
+ gss_ktfile_client(cct->_conf.get_val<std::string>("gss_ktab_client_file")),
+ cluster_messenger(internal_messenger),
+ client_messenger(external_messenger),
+ objecter_messenger(osdc_messenger),
+ monc(mc),
+ mgrc(cct_, client_messenger, &mc->monmap),
+ logger(create_logger()),
+ recoverystate_perf(create_recoverystate_perf()),
+ store(store_),
+ log_client(cct, client_messenger, &mc->monmap, LogClient::NO_FLAGS),
+ clog(log_client.create_channel()),
+ whoami(id),
+ dev_path(dev), journal_path(jdev),
+ store_is_rotational(store->is_rotational()),
+ trace_endpoint("0.0.0.0", 0, "osd"),
+ asok_hook(NULL),
+ m_osd_pg_epoch_max_lag_factor(cct->_conf.get_val<double>(
+ "osd_pg_epoch_max_lag_factor")),
+ osd_compat(get_osd_compat_set()),
+ osd_op_tp(cct, "OSD::osd_op_tp", "tp_osd_tp",
+ get_num_op_threads()),
+ heartbeat_stop(false),
+ heartbeat_need_update(true),
+ hb_front_client_messenger(hb_client_front),
+ hb_back_client_messenger(hb_client_back),
+ hb_front_server_messenger(hb_front_serverm),
+ hb_back_server_messenger(hb_back_serverm),
+ daily_loadavg(0.0),
+ heartbeat_thread(this),
+ heartbeat_dispatcher(this),
+ op_tracker(cct, cct->_conf->osd_enable_op_tracker,
+ cct->_conf->osd_num_op_tracker_shard),
+ test_ops_hook(NULL),
+ op_shardedwq(
+ this,
+ ceph::make_timespan(cct->_conf->osd_op_thread_timeout),
+ ceph::make_timespan(cct->_conf->osd_op_thread_suicide_timeout),
+ &osd_op_tp),
+ last_pg_create_epoch(0),
+ boot_finisher(cct),
+ up_thru_wanted(0),
+ requested_full_first(0),
+ requested_full_last(0),
+ service(this, poolctx)
+{
+
+ if (!gss_ktfile_client.empty()) {
+ // Assert we can export environment variable
+ /*
+ The default client keytab is used, if it is present and readable,
+ to automatically obtain initial credentials for GSSAPI client
+ applications. The principal name of the first entry in the client
+ keytab is used by default when obtaining initial credentials.
+ The default client keytab location is chosen from the following, in order:
+ 1. The KRB5_CLIENT_KTNAME environment variable.
+ 2. The default_client_keytab_name profile variable in [libdefaults].
+ 3. The hardcoded default, DEFCKTNAME.
+ */
+ const int32_t set_result(setenv("KRB5_CLIENT_KTNAME",
+ gss_ktfile_client.c_str(), 1));
+ ceph_assert(set_result == 0);
+ }
+
+ monc->set_messenger(client_messenger);
+ op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
+ cct->_conf->osd_op_log_threshold);
+ op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
+ cct->_conf->osd_op_history_duration);
+ op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size,
+ cct->_conf->osd_op_history_slow_op_threshold);
+ ObjectCleanRegions::set_max_num_intervals(cct->_conf->osd_object_clean_region_max_num_intervals);
+#ifdef WITH_BLKIN
+ std::stringstream ss;
+ ss << "osd." << whoami;
+ trace_endpoint.copy_name(ss.str());
+#endif
+
+ // initialize shards
+ num_shards = get_num_op_shards();
+ for (uint32_t i = 0; i < num_shards; i++) {
+ OSDShard *one_shard = new OSDShard(
+ i,
+ cct,
+ this);
+ shards.push_back(one_shard);
+ }
+}
+
+OSD::~OSD()
+{
+ while (!shards.empty()) {
+ delete shards.back();
+ shards.pop_back();
+ }
+ cct->get_perfcounters_collection()->remove(recoverystate_perf);
+ cct->get_perfcounters_collection()->remove(logger);
+ delete recoverystate_perf;
+ delete logger;
+ delete store;
+}
+
+double OSD::get_tick_interval() const
+{
+ // vary +/- 5% to avoid scrub scheduling livelocks
+ constexpr auto delta = 0.05;
+ return (OSD_TICK_INTERVAL *
+ ceph::util::generate_random_number(1.0 - delta, 1.0 + delta));
+}
+
+void OSD::handle_signal(int signum)
+{
+ ceph_assert(signum == SIGINT || signum == SIGTERM);
+ derr << "*** Got signal " << sig_str(signum) << " ***" << dendl;
+ shutdown();
+}
+
+int OSD::pre_init()
+{
+ std::lock_guard lock(osd_lock);
+ if (is_stopping())
+ return 0;
+
+ if (store->test_mount_in_use()) {
+ derr << "OSD::pre_init: object store '" << dev_path << "' is "
+ << "currently in use. (Is ceph-osd already running?)" << dendl;
+ return -EBUSY;
+ }
+
+ cct->_conf.add_observer(this);
+ return 0;
+}
+
+int OSD::set_numa_affinity()
+{
+ // storage numa node
+ int store_node = -1;
+ store->get_numa_node(&store_node, nullptr, nullptr);
+ if (store_node >= 0) {
+ dout(1) << __func__ << " storage numa node " << store_node << dendl;
+ }
+
+ // check network numa node(s)
+ int front_node = -1, back_node = -1;
+ string front_iface = pick_iface(
+ cct,
+ client_messenger->get_myaddrs().front().get_sockaddr_storage());
+ string back_iface = pick_iface(
+ cct,
+ cluster_messenger->get_myaddrs().front().get_sockaddr_storage());
+ int r = get_iface_numa_node(front_iface, &front_node);
+ if (r >= 0 && front_node >= 0) {
+ dout(1) << __func__ << " public network " << front_iface << " numa node "
+ << front_node << dendl;
+ r = get_iface_numa_node(back_iface, &back_node);
+ if (r >= 0 && back_node >= 0) {
+ dout(1) << __func__ << " cluster network " << back_iface << " numa node "
+ << back_node << dendl;
+ if (front_node == back_node &&
+ front_node == store_node) {
+ dout(1) << " objectstore and network numa nodes all match" << dendl;
+ if (g_conf().get_val<bool>("osd_numa_auto_affinity")) {
+ numa_node = front_node;
+ }
+ } else if (front_node != back_node) {
+ dout(1) << __func__ << " public and cluster network numa nodes do not match"
+ << dendl;
+ } else {
+ dout(1) << __func__ << " objectstore and network numa nodes do not match"
+ << dendl;
+ }
+ } else if (back_node == -2) {
+ dout(1) << __func__ << " cluster network " << back_iface
+ << " ports numa nodes do not match" << dendl;
+ } else {
+ derr << __func__ << " unable to identify cluster interface '" << back_iface
+ << "' numa node: " << cpp_strerror(r) << dendl;
+ }
+ } else if (front_node == -2) {
+ dout(1) << __func__ << " public network " << front_iface
+ << " ports numa nodes do not match" << dendl;
+ } else {
+ derr << __func__ << " unable to identify public interface '" << front_iface
+ << "' numa node: " << cpp_strerror(r) << dendl;
+ }
+ if (int node = g_conf().get_val<int64_t>("osd_numa_node"); node >= 0) {
+ // this takes precedence over the automagic logic above
+ numa_node = node;
+ }
+ if (numa_node >= 0) {
+ int r = get_numa_node_cpu_set(numa_node, &numa_cpu_set_size, &numa_cpu_set);
+ if (r < 0) {
+ dout(1) << __func__ << " unable to determine numa node " << numa_node
+ << " CPUs" << dendl;
+ numa_node = -1;
+ } else {
+ dout(1) << __func__ << " setting numa affinity to node " << numa_node
+ << " cpus "
+ << cpu_set_to_str_list(numa_cpu_set_size, &numa_cpu_set)
+ << dendl;
+ r = set_cpu_affinity_all_threads(numa_cpu_set_size, &numa_cpu_set);
+ if (r < 0) {
+ r = -errno;
+ derr << __func__ << " failed to set numa affinity: " << cpp_strerror(r)
+ << dendl;
+ numa_node = -1;
+ }
+ }
+ } else {
+ dout(1) << __func__ << " not setting numa affinity" << dendl;
+ }
+ return 0;
+}
+
+// asok
+
+class OSDSocketHook : public AdminSocketHook {
+ OSD *osd;
+public:
+ explicit OSDSocketHook(OSD *o) : osd(o) {}
+ int call(std::string_view prefix, const cmdmap_t& cmdmap,
+ Formatter *f,
+ std::ostream& ss,
+ bufferlist& out) override {
+ ceph_abort("should use async hook");
+ }
+ void call_async(
+ std::string_view prefix,
+ const cmdmap_t& cmdmap,
+ Formatter *f,
+ const bufferlist& inbl,
+ std::function<void(int,const std::string&,bufferlist&)> on_finish) override {
+ try {
+ osd->asok_command(prefix, cmdmap, f, inbl, on_finish);
+ } catch (const TOPNSPC::common::bad_cmd_get& e) {
+ bufferlist empty;
+ on_finish(-EINVAL, e.what(), empty);
+ }
+ }
+};
+
+std::set<int64_t> OSD::get_mapped_pools()
+{
+ std::set<int64_t> pools;
+ std::vector<spg_t> pgids;
+ _get_pgids(&pgids);
+ for (const auto &pgid : pgids) {
+ pools.insert(pgid.pool());
+ }
+ return pools;
+}
+
+void OSD::asok_command(
+ std::string_view prefix, const cmdmap_t& cmdmap,
+ Formatter *f,
+ const bufferlist& inbl,
+ std::function<void(int,const std::string&,bufferlist&)> on_finish)
+{
+ int ret = 0;
+ stringstream ss; // stderr error message stream
+ bufferlist outbl; // if empty at end, we'll dump formatter as output
+
+ // --- PG commands are routed here to PG::do_command ---
+ if (prefix == "pg" ||
+ prefix == "query" ||
+ prefix == "mark_unfound_lost" ||
+ prefix == "list_unfound" ||
+ prefix == "scrub" ||
+ prefix == "deep_scrub"
+ ) {
+ string pgidstr;
+ pg_t pgid;
+ if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
+ ss << "no pgid specified";
+ ret = -EINVAL;
+ goto out;
+ }
+ if (!pgid.parse(pgidstr.c_str())) {
+ ss << "couldn't parse pgid '" << pgidstr << "'";
+ ret = -EINVAL;
+ goto out;
+ }
+ spg_t pcand;
+ PGRef pg;
+ if (get_osdmap()->get_primary_shard(pgid, &pcand) &&
+ (pg = _lookup_lock_pg(pcand))) {
+ if (pg->is_primary()) {
+ cmdmap_t new_cmdmap = cmdmap;
+ try {
+ pg->do_command(prefix, new_cmdmap, inbl, on_finish);
+ pg->unlock();
+ return; // the pg handler calls on_finish directly
+ } catch (const TOPNSPC::common::bad_cmd_get& e) {
+ pg->unlock();
+ ss << e.what();
+ ret = -EINVAL;
+ goto out;
+ }
+ } else {
+ ss << "not primary for pgid " << pgid;
+ // do not reply; they will get newer maps and realize they
+ // need to resend.
+ pg->unlock();
+ ret = -EAGAIN;
+ goto out;
+ }
+ } else {
+ ss << "i don't have pgid " << pgid;
+ ret = -ENOENT;
+ }
+ }
+
+ // --- OSD commands follow ---
+
+ else if (prefix == "status") {
+ lock_guard l(osd_lock);
+ f->open_object_section("status");
+ f->dump_stream("cluster_fsid") << superblock.cluster_fsid;
+ f->dump_stream("osd_fsid") << superblock.osd_fsid;
+ f->dump_unsigned("whoami", superblock.whoami);
+ f->dump_string("state", get_state_name(get_state()));
+ f->dump_unsigned("oldest_map", superblock.oldest_map);
+ f->dump_unsigned("newest_map", superblock.newest_map);
+ f->dump_unsigned("num_pgs", num_pgs);
+ f->close_section();
+ } else if (prefix == "flush_journal") {
+ store->flush_journal();
+ } else if (prefix == "dump_ops_in_flight" ||
+ prefix == "ops" ||
+ prefix == "dump_blocked_ops" ||
+ prefix == "dump_historic_ops" ||
+ prefix == "dump_historic_ops_by_duration" ||
+ prefix == "dump_historic_slow_ops") {
+
+ const string error_str = "op_tracker tracking is not enabled now, so no ops are tracked currently, \
+not even those that are stuck. Please enable \"osd_enable_op_tracker\", and the tracker \
+will start to track new ops received afterwards.";
+
+ set<string> filters;
+ vector<string> filter_str;
+ if (cmd_getval(cmdmap, "filterstr", filter_str)) {
+ copy(filter_str.begin(), filter_str.end(),
+ inserter(filters, filters.end()));
+ }
+
+ if (prefix == "dump_ops_in_flight" ||
+ prefix == "ops") {
+ if (!op_tracker.dump_ops_in_flight(f, false, filters)) {
+ ss << error_str;
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+ if (prefix == "dump_blocked_ops") {
+ if (!op_tracker.dump_ops_in_flight(f, true, filters)) {
+ ss << error_str;
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+ if (prefix == "dump_historic_ops") {
+ if (!op_tracker.dump_historic_ops(f, false, filters)) {
+ ss << error_str;
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+ if (prefix == "dump_historic_ops_by_duration") {
+ if (!op_tracker.dump_historic_ops(f, true, filters)) {
+ ss << error_str;
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+ if (prefix == "dump_historic_slow_ops") {
+ if (!op_tracker.dump_historic_slow_ops(f, filters)) {
+ ss << error_str;
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+ } else if (prefix == "dump_op_pq_state") {
+ f->open_object_section("pq");
+ op_shardedwq.dump(f);
+ f->close_section();
+ } else if (prefix == "dump_blocklist") {
+ list<pair<entity_addr_t,utime_t> > bl;
+ list<pair<entity_addr_t,utime_t> > rbl;
+ OSDMapRef curmap = service.get_osdmap();
+ curmap->get_blocklist(&bl, &rbl);
+
+ f->open_array_section("blocklist");
+ for (list<pair<entity_addr_t,utime_t> >::iterator it = bl.begin();
+ it != bl.end(); ++it) {
+ f->open_object_section("entry");
+ f->open_object_section("entity_addr_t");
+ it->first.dump(f);
+ f->close_section(); //entity_addr_t
+ it->second.localtime(f->dump_stream("expire_time"));
+ f->close_section(); //entry
+ }
+ f->close_section(); //blocklist
+ f->open_array_section("range_blocklist");
+ for (list<pair<entity_addr_t,utime_t> >::iterator it = rbl.begin();
+ it != rbl.end(); ++it) {
+ f->open_object_section("entry");
+ f->open_object_section("entity_addr_t");
+ it->first.dump(f);
+ f->close_section(); //entity_addr_t
+ it->second.localtime(f->dump_stream("expire_time"));
+ f->close_section(); //entry
+ }
+ f->close_section(); //range_blocklist
+ } else if (prefix == "dump_watchers") {
+ list<obj_watch_item_t> watchers;
+ // scan pg's
+ vector<PGRef> pgs;
+ _get_pgs(&pgs);
+ for (auto& pg : pgs) {
+ list<obj_watch_item_t> pg_watchers;
+ pg->get_watchers(&pg_watchers);
+ watchers.splice(watchers.end(), pg_watchers);
+ }
+
+ f->open_array_section("watchers");
+ for (list<obj_watch_item_t>::iterator it = watchers.begin();
+ it != watchers.end(); ++it) {
+
+ f->open_object_section("watch");
+
+ f->dump_string("namespace", it->obj.nspace);
+ f->dump_string("object", it->obj.oid.name);
+
+ f->open_object_section("entity_name");
+ it->wi.name.dump(f);
+ f->close_section(); //entity_name_t
+
+ f->dump_unsigned("cookie", it->wi.cookie);
+ f->dump_unsigned("timeout", it->wi.timeout_seconds);
+
+ f->open_object_section("entity_addr_t");
+ it->wi.addr.dump(f);
+ f->close_section(); //entity_addr_t
+
+ f->close_section(); //watch
+ }
+
+ f->close_section(); //watchers
+ } else if (prefix == "dump_recovery_reservations") {
+ f->open_object_section("reservations");
+ f->open_object_section("local_reservations");
+ service.local_reserver.dump(f);
+ f->close_section();
+ f->open_object_section("remote_reservations");
+ service.remote_reserver.dump(f);
+ f->close_section();
+ f->close_section();
+ } else if (prefix == "dump_scrub_reservations") {
+ f->open_object_section("scrub_reservations");
+ service.dump_scrub_reservations(f);
+ f->close_section();
+ } else if (prefix == "get_latest_osdmap") {
+ get_latest_osdmap();
+ } else if (prefix == "set_heap_property") {
+ string property;
+ int64_t value = 0;
+ string error;
+ bool success = false;
+ if (!cmd_getval(cmdmap, "property", property)) {
+ error = "unable to get property";
+ success = false;
+ } else if (!cmd_getval(cmdmap, "value", value)) {
+ error = "unable to get value";
+ success = false;
+ } else if (value < 0) {
+ error = "negative value not allowed";
+ success = false;
+ } else if (!ceph_heap_set_numeric_property(property.c_str(), (size_t)value)) {
+ error = "invalid property";
+ success = false;
+ } else {
+ success = true;
+ }
+ f->open_object_section("result");
+ f->dump_string("error", error);
+ f->dump_bool("success", success);
+ f->close_section();
+ } else if (prefix == "get_heap_property") {
+ string property;
+ size_t value = 0;
+ string error;
+ bool success = false;
+ if (!cmd_getval(cmdmap, "property", property)) {
+ error = "unable to get property";
+ success = false;
+ } else if (!ceph_heap_get_numeric_property(property.c_str(), &value)) {
+ error = "invalid property";
+ success = false;
+ } else {
+ success = true;
+ }
+ f->open_object_section("result");
+ f->dump_string("error", error);
+ f->dump_bool("success", success);
+ f->dump_int("value", value);
+ f->close_section();
+ } else if (prefix == "dump_objectstore_kv_stats") {
+ store->get_db_statistics(f);
+ } else if (prefix == "dump_scrubs") {
+ service.dumps_scrub(f);
+ } else if (prefix == "calc_objectstore_db_histogram") {
+ store->generate_db_histogram(f);
+ } else if (prefix == "flush_store_cache") {
+ store->flush_cache(&ss);
+ } else if (prefix == "dump_pgstate_history") {
+ f->open_object_section("pgstate_history");
+ f->open_array_section("pgs");
+ vector<PGRef> pgs;
+ _get_pgs(&pgs);
+ for (auto& pg : pgs) {
+ f->open_object_section("pg");
+ f->dump_stream("pg") << pg->pg_id;
+ f->dump_string("currently", pg->get_current_state());
+ pg->dump_pgstate_history(f);
+ f->close_section();
+ }
+ f->close_section();
+ f->close_section();
+ } else if (prefix == "compact") {
+ dout(1) << "triggering manual compaction" << dendl;
+ auto start = ceph::coarse_mono_clock::now();
+ store->compact();
+ auto end = ceph::coarse_mono_clock::now();
+ double duration = std::chrono::duration<double>(end-start).count();
+ dout(1) << "finished manual compaction in "
+ << duration
+ << " seconds" << dendl;
+ f->open_object_section("compact_result");
+ f->dump_float("elapsed_time", duration);
+ f->close_section();
+ } else if (prefix == "get_mapped_pools") {
+ f->open_array_section("mapped_pools");
+ set<int64_t> poollist = get_mapped_pools();
+ for (auto pool : poollist) {
+ f->dump_int("pool_id", pool);
+ }
+ f->close_section();
+ } else if (prefix == "smart") {
+ string devid;
+ cmd_getval(cmdmap, "devid", devid);
+ ostringstream out;
+ probe_smart(devid, out);
+ outbl.append(out.str());
+ } else if (prefix == "list_devices") {
+ set<string> devnames;
+ store->get_devices(&devnames);
+ f->open_array_section("list_devices");
+ for (auto dev : devnames) {
+ if (dev.find("dm-") == 0) {
+ continue;
+ }
+ string err;
+ f->open_object_section("device");
+ f->dump_string("device", "/dev/" + dev);
+ f->dump_string("device_id", get_device_id(dev, &err));
+ f->close_section();
+ }
+ f->close_section();
+ } else if (prefix == "send_beacon") {
+ lock_guard l(osd_lock);
+ if (is_active()) {
+ send_beacon(ceph::coarse_mono_clock::now());
+ }
+ }
+
+ else if (prefix == "cluster_log") {
+ vector<string> msg;
+ cmd_getval(cmdmap, "message", msg);
+ if (msg.empty()) {
+ ret = -EINVAL;
+ ss << "ignoring empty log message";
+ goto out;
+ }
+ string message = msg.front();
+ for (vector<string>::iterator a = ++msg.begin(); a != msg.end(); ++a)
+ message += " " + *a;
+ string lvl;
+ cmd_getval(cmdmap, "level", lvl);
+ clog_type level = string_to_clog_type(lvl);
+ if (level < 0) {
+ ret = -EINVAL;
+ ss << "unknown level '" << lvl << "'";
+ goto out;
+ }
+ clog->do_log(level, message);
+ }
+
+ else if (prefix == "bench") {
+ int64_t count;
+ int64_t bsize;
+ int64_t osize, onum;
+ // default count 1G, size 4MB
+ cmd_getval(cmdmap, "count", count, (int64_t)1 << 30);
+ cmd_getval(cmdmap, "size", bsize, (int64_t)4 << 20);
+ cmd_getval(cmdmap, "object_size", osize, (int64_t)0);
+ cmd_getval(cmdmap, "object_num", onum, (int64_t)0);
+ double elapsed = 0.0;
+
+ ret = run_osd_bench_test(count, bsize, osize, onum, &elapsed, ss);
+ if (ret != 0) {
+ goto out;
+ }
+
+ double rate = count / elapsed;
+ double iops = rate / bsize;
+ f->open_object_section("osd_bench_results");
+ f->dump_int("bytes_written", count);
+ f->dump_int("blocksize", bsize);
+ f->dump_float("elapsed_sec", elapsed);
+ f->dump_float("bytes_per_sec", rate);
+ f->dump_float("iops", iops);
+ f->close_section();
+ }
+
+ else if (prefix == "flush_pg_stats") {
+ mgrc.send_pgstats();
+ f->dump_unsigned("stat_seq", service.get_osd_stat_seq());
+ }
+
+ else if (prefix == "heap") {
+ std::stringstream outss;
+ ret = ceph::osd_cmds::heap(*cct, cmdmap, outss, ss);
+ outbl.append(outss);
+ }
+
+ else if (prefix == "debug dump_missing") {
+ f->open_array_section("pgs");
+ vector<PGRef> pgs;
+ _get_pgs(&pgs);
+ for (auto& pg : pgs) {
+ string s = stringify(pg->pg_id);
+ f->open_array_section(s.c_str());
+ pg->lock();
+ pg->dump_missing(f);
+ pg->unlock();
+ f->close_section();
+ }
+ f->close_section();
+ }
+
+ else if (prefix == "debug kick_recovery_wq") {
+ int64_t delay;
+ cmd_getval(cmdmap, "delay", delay);
+ ostringstream oss;
+ oss << delay;
+ ret = cct->_conf.set_val("osd_recovery_delay_start", oss.str().c_str());
+ if (ret != 0) {
+ ss << "kick_recovery_wq: error setting "
+ << "osd_recovery_delay_start to '" << delay << "': error "
+ << ret;
+ goto out;
+ }
+ cct->_conf.apply_changes(nullptr);
+ ss << "kicking recovery queue. set osd_recovery_delay_start "
+ << "to " << cct->_conf->osd_recovery_delay_start;
+ }
+
+ else if (prefix == "cpu_profiler") {
+ ostringstream ds;
+ string arg;
+ cmd_getval(cmdmap, "arg", arg);
+ vector<string> argvec;
+ get_str_vec(arg, argvec);
+ cpu_profiler_handle_command(argvec, ds);
+ outbl.append(ds.str());
+ }
+
+ else if (prefix == "dump_pg_recovery_stats") {
+ lock_guard l(osd_lock);
+ pg_recovery_stats.dump_formatted(f);
+ }
+
+ else if (prefix == "reset_pg_recovery_stats") {
+ lock_guard l(osd_lock);
+ pg_recovery_stats.reset();
+ }
+
+ else if (prefix == "perf histogram dump") {
+ std::string logger;
+ std::string counter;
+ cmd_getval(cmdmap, "logger", logger);
+ cmd_getval(cmdmap, "counter", counter);
+ cct->get_perfcounters_collection()->dump_formatted_histograms(
+ f, false, logger, counter);
+ }
+
+ else if (prefix == "cache drop") {
+ lock_guard l(osd_lock);
+ dout(20) << "clearing all caches" << dendl;
+ // Clear the objectstore's cache - onode and buffer for Bluestore,
+ // system's pagecache for Filestore
+ ret = store->flush_cache(&ss);
+ if (ret < 0) {
+ ss << "Error flushing objectstore cache: " << cpp_strerror(ret);
+ goto out;
+ }
+ // Clear the objectcontext cache (per PG)
+ vector<PGRef> pgs;
+ _get_pgs(&pgs);
+ for (auto& pg: pgs) {
+ pg->clear_cache();
+ }
+ }
+
+ else if (prefix == "cache status") {
+ lock_guard l(osd_lock);
+ int obj_ctx_count = 0;
+ vector<PGRef> pgs;
+ _get_pgs(&pgs);
+ for (auto& pg: pgs) {
+ obj_ctx_count += pg->get_cache_obj_count();
+ }
+ f->open_object_section("cache_status");
+ f->dump_int("object_ctx", obj_ctx_count);
+ store->dump_cache_stats(f);
+ f->close_section();
+ }
+
+ else if (prefix == "scrub_purged_snaps") {
+ lock_guard l(osd_lock);
+ scrub_purged_snaps();
+ }
+
+ else if (prefix == "dump_osd_network") {
+ lock_guard l(osd_lock);
+ int64_t value = 0;
+ if (!(cmd_getval(cmdmap, "value", value))) {
+ // Convert milliseconds to microseconds
+ value = static_cast<double>(g_conf().get_val<double>(
+ "mon_warn_on_slow_ping_time")) * 1000;
+ if (value == 0) {
+ double ratio = g_conf().get_val<double>("mon_warn_on_slow_ping_ratio");
+ value = g_conf().get_val<int64_t>("osd_heartbeat_grace");
+ value *= 1000000 * ratio; // Seconds of grace to microseconds at ratio
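+ // e.g. a 20 second grace at a 0.05 ratio yields a 1,000,000 us (1 s) threshold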
+ }
+ } else {
+ // Convert user input to microseconds
+ value *= 1000;
+ }
+ if (value < 0) value = 0;
+
+ struct osd_ping_time_t {
+ uint32_t pingtime;
+ int to;
+ bool back;
+ std::array<uint32_t,3> times;
+ std::array<uint32_t,3> min;
+ std::array<uint32_t,3> max;
+ uint32_t last;
+ uint32_t last_update;
+
+ bool operator<(const osd_ping_time_t& rhs) const {
+ if (pingtime < rhs.pingtime)
+ return true;
+ if (pingtime > rhs.pingtime)
+ return false;
+ if (to < rhs.to)
+ return true;
+ if (to > rhs.to)
+ return false;
+ return back;
+ }
+ };
+
+ set<osd_ping_time_t> sorted;
+ // Get pingtimes under lock and not on the stack
+ map<int, osd_stat_t::Interfaces> *pingtimes = new map<int, osd_stat_t::Interfaces>;
+ service.get_hb_pingtime(pingtimes);
+ for (auto j : *pingtimes) {
+ if (j.second.last_update == 0)
+ continue;
+ osd_ping_time_t item;
+ item.pingtime = std::max(j.second.back_pingtime[0], j.second.back_pingtime[1]);
+ item.pingtime = std::max(item.pingtime, j.second.back_pingtime[2]);
+ if (item.pingtime >= value) {
+ item.to = j.first;
+ item.times[0] = j.second.back_pingtime[0];
+ item.times[1] = j.second.back_pingtime[1];
+ item.times[2] = j.second.back_pingtime[2];
+ item.min[0] = j.second.back_min[0];
+ item.min[1] = j.second.back_min[1];
+ item.min[2] = j.second.back_min[2];
+ item.max[0] = j.second.back_max[0];
+ item.max[1] = j.second.back_max[1];
+ item.max[2] = j.second.back_max[2];
+ item.last = j.second.back_last;
+ item.back = true;
+ item.last_update = j.second.last_update;
+ sorted.emplace(item);
+ }
+ if (j.second.front_last == 0)
+ continue;
+ item.pingtime = std::max(j.second.front_pingtime[0], j.second.front_pingtime[1]);
+ item.pingtime = std::max(item.pingtime, j.second.front_pingtime[2]);
+ if (item.pingtime >= value) {
+ item.to = j.first;
+ item.times[0] = j.second.front_pingtime[0];
+ item.times[1] = j.second.front_pingtime[1];
+ item.times[2] = j.second.front_pingtime[2];
+ item.min[0] = j.second.front_min[0];
+ item.min[1] = j.second.front_min[1];
+ item.min[2] = j.second.front_min[2];
+ item.max[0] = j.second.front_max[0];
+ item.max[1] = j.second.front_max[1];
+ item.max[2] = j.second.front_max[2];
+ item.last = j.second.front_last;
+ item.last_update = j.second.last_update;
+ item.back = false;
+ sorted.emplace(item);
+ }
+ }
+ delete pingtimes;
+ //
+ // Network ping times (1min 5min 15min)
+ f->open_object_section("network_ping_times");
+ f->dump_int("threshold", value / 1000);
+ f->open_array_section("entries");
+ for (auto &sitem : boost::adaptors::reverse(sorted)) {
+ ceph_assert(sitem.pingtime >= value);
+ f->open_object_section("entry");
+
+ const time_t lu(sitem.last_update);
+ char buffer[26];
+ string lustr(ctime_r(&lu, buffer));
+ lustr.pop_back(); // Remove trailing \n
+ auto stale = cct->_conf.get_val<int64_t>("osd_heartbeat_stale");
+ f->dump_string("last update", lustr);
+ f->dump_bool("stale", ceph_clock_now().sec() - sitem.last_update > stale);
+ f->dump_int("from osd", whoami);
+ f->dump_int("to osd", sitem.to);
+ f->dump_string("interface", (sitem.back ? "back" : "front"));
+ f->open_object_section("average");
+ f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.times[0],3).c_str());
+ f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.times[1],3).c_str());
+ f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.times[2],3).c_str());
+ f->close_section(); // average
+ f->open_object_section("min");
+ f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.min[0],3).c_str());
+ f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.min[1],3).c_str());
+ f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.min[2],3).c_str());
+ f->close_section(); // min
+ f->open_object_section("max");
+ f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.max[0],3).c_str());
+ f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.max[1],3).c_str());
+ f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.max[2],3).c_str());
+ f->close_section(); // max
+ f->dump_format_unquoted("last", "%s", fixed_u_to_string(sitem.last,3).c_str());
+ f->close_section(); // entry
+ }
+ f->close_section(); // entries
+ f->close_section(); // network_ping_times
+ } else {
+ ceph_abort_msg("broken asok registration");
+ }
+
+ out:
+ on_finish(ret, ss.str(), outbl);
+}
+
+int OSD::run_osd_bench_test(
+ int64_t count,
+ int64_t bsize,
+ int64_t osize,
+ int64_t onum,
+ double *elapsed,
+ ostream &ss)
+{
+ int ret = 0;
+ uint32_t duration = cct->_conf->osd_bench_duration;
+
+ if (bsize > (int64_t) cct->_conf->osd_bench_max_block_size) {
+ // let us limit the block size because the next checks rely on it
+ // having a sane value. If we allow any block size to be set things
+ // can still go sideways.
+ ss << "block 'size' values are capped at "
+ << byte_u_t(cct->_conf->osd_bench_max_block_size) << ". If you wish to use"
+ << " a higher value, please adjust 'osd_bench_max_block_size'";
+ ret = -EINVAL;
+ return ret;
+ } else if (bsize < (int64_t) (1 << 20)) {
+ // entering the realm of small block sizes.
+ // limit the count to a sane value, assuming a configurable amount of
+ // IOPS and duration, so that the OSD doesn't get hung up on this,
+ // preventing timeouts from going off
+ int64_t max_count =
+ bsize * duration * cct->_conf->osd_bench_small_size_max_iops;
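+ // e.g. bsize = 4 KiB, duration = 30 s, max_iops = 100 caps count at ~12 MB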
+ if (count > max_count) {
+ ss << "'count' values greater than " << max_count
+ << " for a block size of " << byte_u_t(bsize) << ", assuming "
+ << cct->_conf->osd_bench_small_size_max_iops << " IOPS,"
+ << " for " << duration << " seconds,"
+ << " can cause ill effects on osd. "
+ << " Please adjust 'osd_bench_small_size_max_iops' with a higher"
+ << " value if you wish to use a higher 'count'.";
+ ret = -EINVAL;
+ return ret;
+ }
+ } else {
+ // 1MB block sizes are big enough so that we get more stuff done.
+ // However, to avoid the osd from getting hung on this and having
+ // timers being triggered, we are going to limit the count assuming
+ // a configurable throughput and duration.
+ // NOTE: max_count is the total amount of bytes that we believe we
+ // will be able to write during 'duration' for the given
+ // throughput. The block size hardly impacts this unless it's
+ // way too big. Given we already check how big the block size
+ // is, it's safe to assume everything will check out.
+ int64_t max_count =
+ cct->_conf->osd_bench_large_size_max_throughput * duration;
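+ // e.g. a 100 MB/s throughput cap over a 30 s duration caps count at ~3 GB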
+ if (count > max_count) {
+ ss << "'count' values greater than " << max_count
+ << " for a block size of " << byte_u_t(bsize) << ", assuming "
+ << byte_u_t(cct->_conf->osd_bench_large_size_max_throughput) << "/s,"
+ << " for " << duration << " seconds,"
+ << " can cause ill effects on osd. "
+ << " Please adjust 'osd_bench_large_size_max_throughput'"
+ << " with a higher value if you wish to use a higher 'count'.";
+ ret = -EINVAL;
+ return ret;
+ }
+ }
+
+ if (osize && bsize > osize) {
+ bsize = osize;
+ }
+
+ dout(1) << " bench count " << count
+ << " bsize " << byte_u_t(bsize) << dendl;
+
+ ObjectStore::Transaction cleanupt;
+
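+ // If an object size and count were given, pre-create that many objects so the
+ // timed phase below rewrites existing objects at random offsets instead of
+ // creating a new object per write.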
+ if (osize && onum) {
+ bufferlist bl;
+ bufferptr bp(osize);
+ bp.zero();
+ bl.push_back(std::move(bp));
+ bl.rebuild_page_aligned();
+ for (int i=0; i<onum; ++i) {
+ char nm[30];
+ snprintf(nm, sizeof(nm), "disk_bw_test_%d", i);
+ object_t oid(nm);
+ hobject_t soid(sobject_t(oid, 0));
+ ObjectStore::Transaction t;
+ t.write(coll_t(), ghobject_t(soid), 0, osize, bl);
+ store->queue_transaction(service.meta_ch, std::move(t), nullptr);
+ cleanupt.remove(coll_t(), ghobject_t(soid));
+ }
+ }
+
+ bufferlist bl;
+ bufferptr bp(bsize);
+ bp.zero();
+ bl.push_back(std::move(bp));
+ bl.rebuild_page_aligned();
+
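+ // Make sure any preparatory writes have committed before the timed run starts.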
+ {
+ C_SaferCond waiter;
+ if (!service.meta_ch->flush_commit(&waiter)) {
+ waiter.wait();
+ }
+ }
+
+ utime_t start = ceph_clock_now();
+ for (int64_t pos = 0; pos < count; pos += bsize) {
+ char nm[30];
+ unsigned offset = 0;
+ if (onum && osize) {
+ snprintf(nm, sizeof(nm), "disk_bw_test_%d", (int)(rand() % onum));
+ offset = rand() % (osize / bsize) * bsize;
+ } else {
+ snprintf(nm, sizeof(nm), "disk_bw_test_%lld", (long long)pos);
+ }
+ object_t oid(nm);
+ hobject_t soid(sobject_t(oid, 0));
+ ObjectStore::Transaction t;
+ t.write(coll_t::meta(), ghobject_t(soid), offset, bsize, bl);
+ store->queue_transaction(service.meta_ch, std::move(t), nullptr);
+ if (!onum || !osize) {
+ cleanupt.remove(coll_t::meta(), ghobject_t(soid));
+ }
+ }
+
+ {
+ C_SaferCond waiter;
+ if (!service.meta_ch->flush_commit(&waiter)) {
+ waiter.wait();
+ }
+ }
+ utime_t end = ceph_clock_now();
+ *elapsed = end - start;
+
+ // clean up
+ store->queue_transaction(service.meta_ch, std::move(cleanupt), nullptr);
+ {
+ C_SaferCond waiter;
+ if (!service.meta_ch->flush_commit(&waiter)) {
+ waiter.wait();
+ }
+ }
+
+ return ret;
+}
+
+class TestOpsSocketHook : public AdminSocketHook {
+ OSDService *service;
+ ObjectStore *store;
+public:
+ TestOpsSocketHook(OSDService *s, ObjectStore *st) : service(s), store(st) {}
+ int call(std::string_view command, const cmdmap_t& cmdmap,
+ Formatter *f,
+ std::ostream& errss,
+ bufferlist& out) override {
+ int r = 0;
+ stringstream outss;
+ try {
+ test_ops(service, store, command, cmdmap, outss);
+ out.append(outss);
+ } catch (const TOPNSPC::common::bad_cmd_get& e) {
+ errss << e.what();
+ r = -EINVAL;
+ }
+ return r;
+ }
+ void test_ops(OSDService *service, ObjectStore *store,
+ std::string_view command, const cmdmap_t& cmdmap, ostream &ss);
+
+};
+
+class OSD::C_Tick : public Context {
+ OSD *osd;
+ public:
+ explicit C_Tick(OSD *o) : osd(o) {}
+ void finish(int r) override {
+ osd->tick();
+ }
+};
+
+class OSD::C_Tick_WithoutOSDLock : public Context {
+ OSD *osd;
+ public:
+ explicit C_Tick_WithoutOSDLock(OSD *o) : osd(o) {}
+ void finish(int r) override {
+ osd->tick_without_osd_lock();
+ }
+};
+
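+// Mount or unmount a FUSE view of the objectstore under $osd_data/fuse,
+// depending on 'stop' and the osd_objectstore_fuse setting.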
+int OSD::enable_disable_fuse(bool stop)
+{
+#ifdef HAVE_LIBFUSE
+ int r;
+ string mntpath = cct->_conf->osd_data + "/fuse";
+ if (fuse_store && (stop || !cct->_conf->osd_objectstore_fuse)) {
+ dout(1) << __func__ << " disabling" << dendl;
+ fuse_store->stop();
+ delete fuse_store;
+ fuse_store = NULL;
+ r = ::rmdir(mntpath.c_str());
+ if (r < 0) {
+ r = -errno;
+ derr << __func__ << " failed to rmdir " << mntpath << ": "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+ return 0;
+ }
+ if (!fuse_store && cct->_conf->osd_objectstore_fuse) {
+ dout(1) << __func__ << " enabling" << dendl;
+ r = ::mkdir(mntpath.c_str(), 0700);
+ if (r < 0)
+ r = -errno;
+ if (r < 0 && r != -EEXIST) {
+ derr << __func__ << " unable to create " << mntpath << ": "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+ fuse_store = new FuseStore(store, mntpath);
+ r = fuse_store->start();
+ if (r < 0) {
+ derr << __func__ << " unable to start fuse: " << cpp_strerror(r) << dendl;
+ delete fuse_store;
+ fuse_store = NULL;
+ return r;
+ }
+ }
+#endif // HAVE_LIBFUSE
+ return 0;
+}
+
+size_t OSD::get_num_cache_shards()
+{
+ return cct->_conf.get_val<Option::size_t>("osd_num_cache_shards");
+}
+
+int OSD::get_num_op_shards()
+{
+ if (cct->_conf->osd_op_num_shards)
+ return cct->_conf->osd_op_num_shards;
+ if (store_is_rotational)
+ return cct->_conf->osd_op_num_shards_hdd;
+ else
+ return cct->_conf->osd_op_num_shards_ssd;
+}
+
+int OSD::get_num_op_threads()
+{
+ if (cct->_conf->osd_op_num_threads_per_shard)
+ return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard;
+ if (store_is_rotational)
+ return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_hdd;
+ else
+ return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_ssd;
+}
+
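+// The helpers below choose hdd/ssd/hybrid tuning values based on whether the
+// backing store (and, where relevant, the journal) is rotational, unless an
+// explicit override is configured.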
+float OSD::get_osd_recovery_sleep()
+{
+ if (cct->_conf->osd_recovery_sleep)
+ return cct->_conf->osd_recovery_sleep;
+ if (!store_is_rotational && !journal_is_rotational)
+ return cct->_conf->osd_recovery_sleep_ssd;
+ else if (store_is_rotational && !journal_is_rotational)
+ return cct->_conf.get_val<double>("osd_recovery_sleep_hybrid");
+ else
+ return cct->_conf->osd_recovery_sleep_hdd;
+}
+
+float OSD::get_osd_delete_sleep()
+{
+ float osd_delete_sleep = cct->_conf.get_val<double>("osd_delete_sleep");
+ if (osd_delete_sleep > 0)
+ return osd_delete_sleep;
+ if (!store_is_rotational && !journal_is_rotational)
+ return cct->_conf.get_val<double>("osd_delete_sleep_ssd");
+ if (store_is_rotational && !journal_is_rotational)
+ return cct->_conf.get_val<double>("osd_delete_sleep_hybrid");
+ return cct->_conf.get_val<double>("osd_delete_sleep_hdd");
+}
+
+int OSD::get_recovery_max_active()
+{
+ if (cct->_conf->osd_recovery_max_active)
+ return cct->_conf->osd_recovery_max_active;
+ if (store_is_rotational)
+ return cct->_conf->osd_recovery_max_active_hdd;
+ else
+ return cct->_conf->osd_recovery_max_active_ssd;
+}
+
+float OSD::get_osd_snap_trim_sleep()
+{
+ float osd_snap_trim_sleep = cct->_conf.get_val<double>("osd_snap_trim_sleep");
+ if (osd_snap_trim_sleep > 0)
+ return osd_snap_trim_sleep;
+ if (!store_is_rotational && !journal_is_rotational)
+ return cct->_conf.get_val<double>("osd_snap_trim_sleep_ssd");
+ if (store_is_rotational && !journal_is_rotational)
+ return cct->_conf.get_val<double>("osd_snap_trim_sleep_hybrid");
+ return cct->_conf.get_val<double>("osd_snap_trim_sleep_hdd");
+}
+
+int OSD::init()
+{
+ OSDMapRef osdmap;
+ CompatSet initial, diff;
+ std::lock_guard lock(osd_lock);
+ if (is_stopping())
+ return 0;
+
+ tick_timer.init();
+ tick_timer_without_osd_lock.init();
+ service.recovery_request_timer.init();
+ service.sleep_timer.init();
+
+ boot_finisher.start();
+
+ {
+ string val;
+ store->read_meta("require_osd_release", &val);
+ last_require_osd_release = ceph_release_from_name(val);
+ }
+
+ // mount.
+ dout(2) << "init " << dev_path
+ << " (looks like " << (store_is_rotational ? "hdd" : "ssd") << ")"
+ << dendl;
+ dout(2) << "journal " << journal_path << dendl;
+ ceph_assert(store); // call pre_init() first!
+
+ store->set_cache_shards(get_num_cache_shards());
+
+ int r = store->mount();
+ if (r < 0) {
+ derr << "OSD::init: unable to mount object store" << dendl;
+ return r;
+ }
+ journal_is_rotational = store->is_journal_rotational();
+ dout(2) << "journal looks like " << (journal_is_rotational ? "hdd" : "ssd")
+ << dendl;
+
+ enable_disable_fuse(false);
+
+ dout(2) << "boot" << dendl;
+
+ service.meta_ch = store->open_collection(coll_t::meta());
+
+ // initialize the daily loadavg with current 15min loadavg
+ double loadavgs[3];
+ if (getloadavg(loadavgs, 3) == 3) {
+ daily_loadavg = loadavgs[2];
+ } else {
+ derr << "OSD::init() : couldn't read loadavgs\n" << dendl;
+ daily_loadavg = 1.0;
+ }
+
+ int rotating_auth_attempts = 0;
+ auto rotating_auth_timeout =
+ g_conf().get_val<int64_t>("rotating_keys_bootstrap_timeout");
+
+ // sanity check long object name handling
+ {
+ hobject_t l;
+ l.oid.name = string(cct->_conf->osd_max_object_name_len, 'n');
+ l.set_key(string(cct->_conf->osd_max_object_name_len, 'k'));
+ l.nspace = string(cct->_conf->osd_max_object_namespace_len, 's');
+ r = store->validate_hobject_key(l);
+ if (r < 0) {
+ derr << "backend (" << store->get_type() << ") is unable to support max "
+ << "object name[space] len" << dendl;
+ derr << " osd max object name len = "
+ << cct->_conf->osd_max_object_name_len << dendl;
+ derr << " osd max object namespace len = "
+ << cct->_conf->osd_max_object_namespace_len << dendl;
+ derr << cpp_strerror(r) << dendl;
+ if (cct->_conf->osd_check_max_object_name_len_on_startup) {
+ goto out;
+ }
+ derr << "osd_check_max_object_name_len_on_startup = false, starting anyway"
+ << dendl;
+ } else {
+ dout(20) << "configured osd_max_object_name[space]_len looks ok" << dendl;
+ }
+ }
+
+ // read superblock
+ r = read_superblock();
+ if (r < 0) {
+ derr << "OSD::init() : unable to read osd superblock" << dendl;
+ r = -EINVAL;
+ goto out;
+ }
+
+ if (osd_compat.compare(superblock.compat_features) < 0) {
+ derr << "The disk uses features unsupported by the executable." << dendl;
+ derr << " ondisk features " << superblock.compat_features << dendl;
+ derr << " daemon features " << osd_compat << dendl;
+
+ if (osd_compat.writeable(superblock.compat_features)) {
+ CompatSet diff = osd_compat.unsupported(superblock.compat_features);
+ derr << "it is still writeable, though. Missing features: " << diff << dendl;
+ r = -EOPNOTSUPP;
+ goto out;
+ }
+ else {
+ CompatSet diff = osd_compat.unsupported(superblock.compat_features);
+ derr << "Cannot write to disk! Missing features: " << diff << dendl;
+ r = -EOPNOTSUPP;
+ goto out;
+ }
+ }
+
+ assert_warn(whoami == superblock.whoami);
+ if (whoami != superblock.whoami) {
+ derr << "OSD::init: superblock says osd"
+ << superblock.whoami << " but I am osd." << whoami << dendl;
+ r = -EINVAL;
+ goto out;
+ }
+
+ startup_time = ceph::mono_clock::now();
+
+ // load up "current" osdmap
+ assert_warn(!get_osdmap());
+ if (get_osdmap()) {
+ derr << "OSD::init: unable to read current osdmap" << dendl;
+ r = -EINVAL;
+ goto out;
+ }
+ osdmap = get_map(superblock.current_epoch);
+ set_osdmap(osdmap);
+
+ // make sure we don't have legacy pgs deleting
+ {
+ vector<coll_t> ls;
+ int r = store->list_collections(ls);
+ ceph_assert(r >= 0);
+ for (auto c : ls) {
+ spg_t pgid;
+ if (c.is_pg(&pgid) &&
+ !osdmap->have_pg_pool(pgid.pool())) {
+ ghobject_t oid = make_final_pool_info_oid(pgid.pool());
+ if (!store->exists(service.meta_ch, oid)) {
+ derr << __func__ << " missing pg_pool_t for deleted pool "
+ << pgid.pool() << " for pg " << pgid
+ << "; please downgrade to luminous and allow "
+ << "pg deletion to complete before upgrading" << dendl;
+ ceph_abort();
+ }
+ }
+ }
+ }
+
+ initial = get_osd_initial_compat_set();
+ diff = superblock.compat_features.unsupported(initial);
+ if (superblock.compat_features.merge(initial)) {
+ // Are we adding SNAPMAPPER2?
+ if (diff.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER2)) {
+ dout(1) << __func__ << " upgrade snap_mapper (first start as octopus)"
+ << dendl;
+ auto ch = service.meta_ch;
+ auto hoid = make_snapmapper_oid();
+ unsigned max = cct->_conf->osd_target_transaction_size;
+ r = SnapMapper::convert_legacy(cct, store, ch, hoid, max);
+ if (r < 0)
+ goto out;
+ }
+ // We need to persist the new compat_set before we
+ // do anything else
+ dout(5) << "Upgrading superblock adding: " << diff << dendl;
+ ObjectStore::Transaction t;
+ write_superblock(t);
+ r = store->queue_transaction(service.meta_ch, std::move(t));
+ if (r < 0)
+ goto out;
+ }
+
+ // make sure snap mapper object exists
+ if (!store->exists(service.meta_ch, OSD::make_snapmapper_oid())) {
+ dout(10) << "init creating/touching snapmapper object" << dendl;
+ ObjectStore::Transaction t;
+ t.touch(coll_t::meta(), OSD::make_snapmapper_oid());
+ r = store->queue_transaction(service.meta_ch, std::move(t));
+ if (r < 0)
+ goto out;
+ }
+ if (!store->exists(service.meta_ch, OSD::make_purged_snaps_oid())) {
+ dout(10) << "init creating/touching purged_snaps object" << dendl;
+ ObjectStore::Transaction t;
+ t.touch(coll_t::meta(), OSD::make_purged_snaps_oid());
+ r = store->queue_transaction(service.meta_ch, std::move(t));
+ if (r < 0)
+ goto out;
+ }
+
+ if (cct->_conf->osd_open_classes_on_start) {
+ int r = ClassHandler::get_instance().open_all_classes();
+ if (r)
+ dout(1) << "warning: got an error loading one or more classes: " << cpp_strerror(r) << dendl;
+ }
+
+ check_osdmap_features();
+
+ {
+ epoch_t bind_epoch = osdmap->get_epoch();
+ service.set_epochs(NULL, NULL, &bind_epoch);
+ }
+
+ clear_temp_objects();
+
+ // initialize osdmap references in sharded wq
+ for (auto& shard : shards) {
+ std::lock_guard l(shard->osdmap_lock);
+ shard->shard_osdmap = osdmap;
+ }
+
+ // load up pgs (as they previously existed)
+ load_pgs();
+
+ dout(2) << "superblock: I am osd." << superblock.whoami << dendl;
+
+ if (cct->_conf.get_val<bool>("osd_compact_on_start")) {
+ dout(2) << "compacting object store's omap" << dendl;
+ store->compact();
+ }
+
+ // prime osd stats
+ {
+ struct store_statfs_t stbuf;
+ osd_alert_list_t alerts;
+ int r = store->statfs(&stbuf, &alerts);
+ ceph_assert(r == 0);
+ service.set_statfs(stbuf, alerts);
+ }
+
+ // client_messenger's auth_client will be set up by monc->init() later.
+ for (auto m : { cluster_messenger,
+ objecter_messenger,
+ hb_front_client_messenger,
+ hb_back_client_messenger,
+ hb_front_server_messenger,
+ hb_back_server_messenger } ) {
+ m->set_auth_client(monc);
+ }
+ for (auto m : { client_messenger,
+ cluster_messenger,
+ hb_front_server_messenger,
+ hb_back_server_messenger }) {
+ m->set_auth_server(monc);
+ }
+ monc->set_handle_authentication_dispatcher(this);
+
+ monc->set_want_keys(CEPH_ENTITY_TYPE_MON | CEPH_ENTITY_TYPE_OSD
+ | CEPH_ENTITY_TYPE_MGR);
+ r = monc->init();
+ if (r < 0)
+ goto out;
+
+ mgrc.set_pgstats_cb([this]() { return collect_pg_stats(); });
+ mgrc.set_perf_metric_query_cb(
+ [this](const ConfigPayload &config_payload) {
+ set_perf_queries(config_payload);
+ },
+ [this] {
+ return get_perf_reports();
+ });
+ mgrc.init();
+
+ // tell monc about log_client so it will know about mon session resets
+ monc->set_log_client(&log_client);
+ update_log_config();
+
+ // i'm ready!
+ client_messenger->add_dispatcher_tail(&mgrc);
+ client_messenger->add_dispatcher_tail(this);
+ cluster_messenger->add_dispatcher_head(this);
+
+ hb_front_client_messenger->add_dispatcher_head(&heartbeat_dispatcher);
+ hb_back_client_messenger->add_dispatcher_head(&heartbeat_dispatcher);
+ hb_front_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);
+ hb_back_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);
+
+ objecter_messenger->add_dispatcher_head(service.objecter.get());
+
+ service.init();
+ service.publish_map(osdmap);
+ service.publish_superblock(superblock);
+ service.max_oldest_map = superblock.oldest_map;
+
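+ // detect splits and merges that happened to each loaded PG while this OSD
+ // was down, and prime all shards accordingly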
+ for (auto& shard : shards) {
+ // put PGs in a temporary set because we may modify pg_slots
+ // unordered_map below.
+ set<PGRef> pgs;
+ for (auto& i : shard->pg_slots) {
+ PGRef pg = i.second->pg;
+ if (!pg) {
+ continue;
+ }
+ pgs.insert(pg);
+ }
+ for (auto pg : pgs) {
+ std::scoped_lock l{*pg};
+ set<pair<spg_t,epoch_t>> new_children;
+ set<pair<spg_t,epoch_t>> merge_pgs;
+ service.identify_splits_and_merges(pg->get_osdmap(), osdmap, pg->pg_id,
+ &new_children, &merge_pgs);
+ if (!new_children.empty()) {
+ for (auto shard : shards) {
+ shard->prime_splits(osdmap, &new_children);
+ }
+ assert(new_children.empty());
+ }
+ if (!merge_pgs.empty()) {
+ for (auto shard : shards) {
+ shard->prime_merges(osdmap, &merge_pgs);
+ }
+ assert(merge_pgs.empty());
+ }
+ }
+ }
+
+ osd_op_tp.start();
+
+ // start the heartbeat
+ heartbeat_thread.create("osd_srv_heartbt");
+
+ // tick
+ tick_timer.add_event_after(get_tick_interval(),
+ new C_Tick(this));
+ {
+ std::lock_guard l(tick_timer_lock);
+ tick_timer_without_osd_lock.add_event_after(get_tick_interval(),
+ new C_Tick_WithoutOSDLock(this));
+ }
+
+ osd_lock.unlock();
+
+ r = monc->authenticate();
+ if (r < 0) {
+ derr << __func__ << " authentication failed: " << cpp_strerror(r)
+ << dendl;
+ exit(1);
+ }
+
+ while (monc->wait_auth_rotating(rotating_auth_timeout) < 0) {
+ derr << "unable to obtain rotating service keys; retrying" << dendl;
+ ++rotating_auth_attempts;
+ if (rotating_auth_attempts > g_conf()->max_rotating_auth_attempts) {
+ derr << __func__ << " wait_auth_rotating timed out" << dendl;
+ exit(1);
+ }
+ }
+
+ r = update_crush_device_class();
+ if (r < 0) {
+ derr << __func__ << " unable to update_crush_device_class: "
+ << cpp_strerror(r) << dendl;
+ exit(1);
+ }
+
+ r = update_crush_location();
+ if (r < 0) {
+ derr << __func__ << " unable to update_crush_location: "
+ << cpp_strerror(r) << dendl;
+ exit(1);
+ }
+
+ osd_lock.lock();
+ if (is_stopping())
+ return 0;
+
+ // start objecter *after* we have authenticated, so that we don't ignore
+ // the OSDMaps it requests.
+ service.final_init();
+
+ check_config();
+
+ dout(10) << "ensuring pgs have consumed prior maps" << dendl;
+ consume_map();
+
+ dout(0) << "done with init, starting boot process" << dendl;
+
+ // subscribe to any pg creations
+ monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0);
+
+ // MgrClient needs this (it doesn't have MonClient reference itself)
+ monc->sub_want("mgrmap", 0, 0);
+
+ // we don't need to ask for an osdmap here; the objecter will do that for us:
+ //monc->sub_want("osdmap", osdmap->get_epoch(), CEPH_SUBSCRIBE_ONETIME);
+
+ monc->renew_subs();
+
+ start_boot();
+
+ // Override a few options if mclock scheduler is enabled.
+ maybe_override_max_osd_capacity_for_qos();
+ maybe_override_options_for_qos();
+
+ return 0;
+
+out:
+ enable_disable_fuse(true);
+ store->umount();
+ delete store;
+ store = NULL;
+ return r;
+}
+
+void OSD::final_init()
+{
+ AdminSocket *admin_socket = cct->get_admin_socket();
+ asok_hook = new OSDSocketHook(this);
+ int r = admin_socket->register_command("status", asok_hook,
+ "high-level status of OSD");
+ ceph_assert(r == 0);
+ r = admin_socket->register_command("flush_journal",
+ asok_hook,
+ "flush the journal to permanent store");
+ ceph_assert(r == 0);
+ r = admin_socket->register_command("dump_ops_in_flight " \
+ "name=filterstr,type=CephString,n=N,req=false",
+ asok_hook,
+ "show the ops currently in flight");
+ ceph_assert(r == 0);
+ r = admin_socket->register_command("ops " \
+ "name=filterstr,type=CephString,n=N,req=false",
+ asok_hook,
+ "show the ops currently in flight");
+ ceph_assert(r == 0);
+ r = admin_socket->register_command("dump_blocked_ops " \
+ "name=filterstr,type=CephString,n=N,req=false",
+ asok_hook,
+ "show the blocked ops currently in flight");
+ ceph_assert(r == 0);
+ r = admin_socket->register_command("dump_historic_ops " \
+ "name=filterstr,type=CephString,n=N,req=false",
+ asok_hook,
+ "show recent ops");
+ ceph_assert(r == 0);
+ r = admin_socket->register_command("dump_historic_slow_ops " \
+ "name=filterstr,type=CephString,n=N,req=false",
+ asok_hook,
+ "show slowest recent ops");
+ ceph_assert(r == 0);
+ r = admin_socket->register_command("dump_historic_ops_by_duration " \
+ "name=filterstr,type=CephString,n=N,req=false",
+ asok_hook,
+ "show slowest recent ops, sorted by duration");
+ ceph_assert(r == 0);
+ r = admin_socket->register_command("dump_op_pq_state",
+ asok_hook,
+ "dump op priority queue state");
+ ceph_assert(r == 0);
+ r = admin_socket->register_command("dump_blocklist",
+ asok_hook,
+ "dump blocklisted clients and times");
+ ceph_assert(r == 0);
+ r = admin_socket->register_command("dump_watchers",
+ asok_hook,
+ "show clients which have active watches,"
+ " and on which objects");
+ ceph_assert(r == 0);
+ r = admin_socket->register_command("dump_recovery_reservations",
+ asok_hook,
+ "show recovery reservations");
+ ceph_assert(r == 0);
+ r = admin_socket->register_command("dump_scrub_reservations",
+ asok_hook,
+ "show scrub reservations");
+ ceph_assert(r == 0);
+ r = admin_socket->register_command("get_latest_osdmap",
+ asok_hook,
+ "force osd to update the latest map from "
+ "the mon");
+ ceph_assert(r == 0);
+
+ r = admin_socket->register_command("set_heap_property " \
+ "name=property,type=CephString " \
+ "name=value,type=CephInt",
+ asok_hook,
+ "update malloc extension heap property");
+ ceph_assert(r == 0);
+
+ r = admin_socket->register_command("get_heap_property " \
+ "name=property,type=CephString",
+ asok_hook,
+ "get malloc extension heap property");
+ ceph_assert(r == 0);
+
+ r = admin_socket->register_command("dump_objectstore_kv_stats",
+ asok_hook,
+ "print statistics of kvdb which used by bluestore");
+ ceph_assert(r == 0);
+
+ r = admin_socket->register_command("dump_scrubs",
+ asok_hook,
+ "print scheduled scrubs");
+ ceph_assert(r == 0);
+
+ r = admin_socket->register_command("calc_objectstore_db_histogram",
+ asok_hook,
+ "Generate key value histogram of kvdb(rocksdb) which used by bluestore");
+ ceph_assert(r == 0);
+
+ r = admin_socket->register_command("flush_store_cache",
+ asok_hook,
+ "Flush bluestore internal cache");
+ ceph_assert(r == 0);
+ r = admin_socket->register_command("dump_pgstate_history",
+ asok_hook,
+ "show recent state history");
+ ceph_assert(r == 0);
+
+ r = admin_socket->register_command("compact",
+ asok_hook,
+ "Commpact object store's omap."
+ " WARNING: Compaction probably slows your requests");
+ ceph_assert(r == 0);
+
+ r = admin_socket->register_command("get_mapped_pools",
+ asok_hook,
+ "dump pools whose PG(s) are mapped to this OSD.");
+
+ ceph_assert(r == 0);
+
+ r = admin_socket->register_command("smart name=devid,type=CephString,req=false",
+ asok_hook,
+ "probe OSD devices for SMART data.");
+
+ ceph_assert(r == 0);
+
+ r = admin_socket->register_command("list_devices",
+ asok_hook,
+ "list OSD devices.");
+ r = admin_socket->register_command("send_beacon",
+ asok_hook,
+ "send OSD beacon to mon immediately");
+
+ r = admin_socket->register_command(
+ "dump_osd_network name=value,type=CephInt,req=false", asok_hook,
+ "Dump osd heartbeat network ping times");
+ ceph_assert(r == 0);
+
+ test_ops_hook = new TestOpsSocketHook(&(this->service), this->store);
+ // Note: pools are CephString instead of CephPoolname because
+ // these commands traditionally support both pool names and numbers
+ r = admin_socket->register_command(
+ "setomapval " \
+ "name=pool,type=CephString " \
+ "name=objname,type=CephObjectname " \
+ "name=key,type=CephString "\
+ "name=val,type=CephString",
+ test_ops_hook,
+ "set omap key");
+ ceph_assert(r == 0);
+ r = admin_socket->register_command(
+ "rmomapkey " \
+ "name=pool,type=CephString " \
+ "name=objname,type=CephObjectname " \
+ "name=key,type=CephString",
+ test_ops_hook,
+ "remove omap key");
+ ceph_assert(r == 0);
+ r = admin_socket->register_command(
+ "setomapheader " \
+ "name=pool,type=CephString " \
+ "name=objname,type=CephObjectname " \
+ "name=header,type=CephString",
+ test_ops_hook,
+ "set omap header");
+ ceph_assert(r == 0);
+
+ r = admin_socket->register_command(
+ "getomap " \
+ "name=pool,type=CephString " \
+ "name=objname,type=CephObjectname",
+ test_ops_hook,
+ "output entire object map");
+ ceph_assert(r == 0);
+
+ r = admin_socket->register_command(
+ "truncobj " \
+ "name=pool,type=CephString " \
+ "name=objname,type=CephObjectname " \
+ "name=len,type=CephInt",
+ test_ops_hook,
+ "truncate object to length");
+ ceph_assert(r == 0);
+
+ r = admin_socket->register_command(
+ "injectdataerr " \
+ "name=pool,type=CephString " \
+ "name=objname,type=CephObjectname " \
+ "name=shardid,type=CephInt,req=false,range=0|255",
+ test_ops_hook,
+ "inject data error to an object");
+ ceph_assert(r == 0);
+
+ r = admin_socket->register_command(
+ "injectmdataerr " \
+ "name=pool,type=CephString " \
+ "name=objname,type=CephObjectname " \
+ "name=shardid,type=CephInt,req=false,range=0|255",
+ test_ops_hook,
+ "inject metadata error to an object");
+ ceph_assert(r == 0);
+ r = admin_socket->register_command(
+ "set_recovery_delay " \
+ "name=utime,type=CephInt,req=false",
+ test_ops_hook,
+ "Delay osd recovery by specified seconds");
+ ceph_assert(r == 0);
+ r = admin_socket->register_command(
+ "injectfull " \
+ "name=type,type=CephString,req=false " \
+ "name=count,type=CephInt,req=false ",
+ test_ops_hook,
+ "Inject a full disk (optional count times)");
+ ceph_assert(r == 0);
+ r = admin_socket->register_command(
+ "bench " \
+ "name=count,type=CephInt,req=false " \
+ "name=size,type=CephInt,req=false " \
+ "name=object_size,type=CephInt,req=false " \
+ "name=object_num,type=CephInt,req=false ",
+ asok_hook,
+ "OSD benchmark: write <count> <size>-byte objects(with <obj_size> <obj_num>), " \
+ "(default count=1G default size=4MB). Results in log.");
+ ceph_assert(r == 0);
+ r = admin_socket->register_command(
+ "cluster_log " \
+ "name=level,type=CephChoices,strings=error,warning,info,debug " \
+ "name=message,type=CephString,n=N",
+ asok_hook,
+ "log a message to the cluster log");
+ ceph_assert(r == 0);
+ r = admin_socket->register_command(
+ "flush_pg_stats",
+ asok_hook,
+ "flush pg stats");
+ ceph_assert(r == 0);
+ r = admin_socket->register_command(
+ "heap " \
+ "name=heapcmd,type=CephChoices,strings=" \
+ "dump|start_profiler|stop_profiler|release|get_release_rate|set_release_rate|stats " \
+ "name=value,type=CephString,req=false",
+ asok_hook,
+ "show heap usage info (available only if compiled with tcmalloc)");
+ ceph_assert(r == 0);
+ r = admin_socket->register_command(
+ "debug dump_missing " \
+ "name=filename,type=CephFilepath",
+ asok_hook,
+ "dump missing objects to a named file");
+ ceph_assert(r == 0);
+ r = admin_socket->register_command(
+ "debug kick_recovery_wq " \
+ "name=delay,type=CephInt,range=0",
+ asok_hook,
+ "set osd_recovery_delay_start to <val>");
+ ceph_assert(r == 0);
+ r = admin_socket->register_command(
+ "cpu_profiler " \
+ "name=arg,type=CephChoices,strings=status|flush",
+ asok_hook,
+ "run cpu profiling on daemon");
+ ceph_assert(r == 0);
+ r = admin_socket->register_command(
+ "dump_pg_recovery_stats",
+ asok_hook,
+ "dump pg recovery statistics");
+ ceph_assert(r == 0);
+ r = admin_socket->register_command(
+ "reset_pg_recovery_stats",
+ asok_hook,
+ "reset pg recovery statistics");
+ ceph_assert(r == 0);
+ r = admin_socket->register_command(
+ "cache drop",
+ asok_hook,
+ "Drop all OSD caches");
+ ceph_assert(r == 0);
+ r = admin_socket->register_command(
+ "cache status",
+ asok_hook,
+ "Get OSD caches statistics");
+ ceph_assert(r == 0);
+ r = admin_socket->register_command(
+ "scrub_purged_snaps",
+ asok_hook,
+ "Scrub purged_snaps vs snapmapper index");
+ ceph_assert(r == 0);
+
+ // -- pg commands --
+ // old form: ceph pg <pgid> command ...
+ r = admin_socket->register_command(
+ "pg " \
+ "name=pgid,type=CephPgid " \
+ "name=cmd,type=CephChoices,strings=query",
+ asok_hook,
+ "");
+ ceph_assert(r == 0);
+ r = admin_socket->register_command(
+ "pg " \
+ "name=pgid,type=CephPgid " \
+ "name=cmd,type=CephChoices,strings=mark_unfound_lost " \
+ "name=mulcmd,type=CephChoices,strings=revert|delete",
+ asok_hook,
+ "");
+ ceph_assert(r == 0);
+ r = admin_socket->register_command(
+ "pg " \
+ "name=pgid,type=CephPgid " \
+ "name=cmd,type=CephChoices,strings=list_unfound " \
+ "name=offset,type=CephString,req=false",
+ asok_hook,
+ "");
+ ceph_assert(r == 0);
+ r = admin_socket->register_command(
+ "pg " \
+ "name=pgid,type=CephPgid " \
+ "name=cmd,type=CephChoices,strings=scrub " \
+ "name=time,type=CephInt,req=false",
+ asok_hook,
+ "");
+ ceph_assert(r == 0);
+ r = admin_socket->register_command(
+ "pg " \
+ "name=pgid,type=CephPgid " \
+ "name=cmd,type=CephChoices,strings=deep_scrub " \
+ "name=time,type=CephInt,req=false",
+ asok_hook,
+ "");
+ ceph_assert(r == 0);
+ // new form: tell <pgid> <cmd> for both cli and rest
+ r = admin_socket->register_command(
+ "query",
+ asok_hook,
+ "show details of a specific pg");
+ ceph_assert(r == 0);
+ r = admin_socket->register_command(
+ "mark_unfound_lost " \
+ "name=pgid,type=CephPgid,req=false " \
+ "name=mulcmd,type=CephChoices,strings=revert|delete",
+ asok_hook,
+ "mark all unfound objects in this pg as lost, either removing or reverting to a prior version if one is available");
+ ceph_assert(r == 0);
+ r = admin_socket->register_command(
+ "list_unfound " \
+ "name=pgid,type=CephPgid,req=false " \
+ "name=offset,type=CephString,req=false",
+ asok_hook,
+ "list unfound objects on this pg, perhaps starting at an offset given in JSON");
+ ceph_assert(r == 0);
+ r = admin_socket->register_command(
+ "scrub " \
+ "name=pgid,type=CephPgid,req=false " \
+ "name=time,type=CephInt,req=false",
+ asok_hook,
+ "Trigger a scheduled scrub ");
+ ceph_assert(r == 0);
+ r = admin_socket->register_command(
+ "deep_scrub " \
+ "name=pgid,type=CephPgid,req=false " \
+ "name=time,type=CephInt,req=false",
+ asok_hook,
+ "Trigger a scheduled deep scrub ");
+ ceph_assert(r == 0);
+}
+
+PerfCounters* OSD::create_logger()
+{
+ PerfCounters* logger = build_osd_logger(cct);
+ cct->get_perfcounters_collection()->add(logger);
+ return logger;
+}
+
+PerfCounters* OSD::create_recoverystate_perf()
+{
+ PerfCounters* recoverystate_perf = build_recoverystate_perf(cct);
+ cct->get_perfcounters_collection()->add(recoverystate_perf);
+ return recoverystate_perf;
+}
+
+int OSD::shutdown()
+{
+ if (cct->_conf->osd_fast_shutdown) {
+ derr << "*** Immediate shutdown (osd_fast_shutdown=true) ***" << dendl;
+ if (cct->_conf->osd_fast_shutdown_notify_mon)
+ service.prepare_to_stop();
+ cct->_log->flush();
+ _exit(0);
+ }
+
+ if (!service.prepare_to_stop())
+ return 0; // already shutting down
+ osd_lock.lock();
+ if (is_stopping()) {
+ osd_lock.unlock();
+ return 0;
+ }
+ dout(0) << "shutdown" << dendl;
+
+ set_state(STATE_STOPPING);
+
+ // Debugging
+ if (cct->_conf.get_val<bool>("osd_debug_shutdown")) {
+ cct->_conf.set_val("debug_osd", "100");
+ cct->_conf.set_val("debug_journal", "100");
+ cct->_conf.set_val("debug_filestore", "100");
+ cct->_conf.set_val("debug_bluestore", "100");
+ cct->_conf.set_val("debug_ms", "100");
+ cct->_conf.apply_changes(nullptr);
+ }
+
+ // stop MgrClient earlier as it's more like an internal consumer of OSD
+ mgrc.shutdown();
+
+ service.start_shutdown();
+
+ // stop sending work to pgs. this just prevents any new work in _process
+ // from racing with on_shutdown and potentially entering the pg after.
+ op_shardedwq.drain();
+
+ // Shutdown PGs
+ {
+ vector<PGRef> pgs;
+ _get_pgs(&pgs);
+ for (auto pg : pgs) {
+ pg->shutdown();
+ }
+ }
+
+ // drain op queue again (in case PGs requeued something)
+ op_shardedwq.drain();
+ {
+ finished.clear(); // zap waiters (bleh, this is messy)
+ waiting_for_osdmap.clear();
+ }
+
+ // unregister commands
+ cct->get_admin_socket()->unregister_commands(asok_hook);
+ delete asok_hook;
+ asok_hook = NULL;
+
+ cct->get_admin_socket()->unregister_commands(test_ops_hook);
+ delete test_ops_hook;
+ test_ops_hook = NULL;
+
+ osd_lock.unlock();
+
+ {
+ std::lock_guard l{heartbeat_lock};
+ heartbeat_stop = true;
+ heartbeat_cond.notify_all();
+ heartbeat_peers.clear();
+ }
+ heartbeat_thread.join();
+
+ hb_back_server_messenger->mark_down_all();
+ hb_front_server_messenger->mark_down_all();
+ hb_front_client_messenger->mark_down_all();
+ hb_back_client_messenger->mark_down_all();
+
+ osd_op_tp.drain();
+ osd_op_tp.stop();
+ dout(10) << "op sharded tp stopped" << dendl;
+
+ dout(10) << "stopping agent" << dendl;
+ service.agent_stop();
+
+ boot_finisher.wait_for_empty();
+
+ osd_lock.lock();
+
+ boot_finisher.stop();
+ reset_heartbeat_peers(true);
+
+ tick_timer.shutdown();
+
+ {
+ std::lock_guard l(tick_timer_lock);
+ tick_timer_without_osd_lock.shutdown();
+ }
+
+ // note unmount epoch
+ dout(10) << "noting clean unmount in epoch " << get_osdmap_epoch() << dendl;
+ superblock.mounted = service.get_boot_epoch();
+ superblock.clean_thru = get_osdmap_epoch();
+ ObjectStore::Transaction t;
+ write_superblock(t);
+ int r = store->queue_transaction(service.meta_ch, std::move(t));
+ if (r) {
+ derr << "OSD::shutdown: error writing superblock: "
+ << cpp_strerror(r) << dendl;
+ }
+
+
+ service.shutdown_reserver();
+
+ // Remove PGs
+#ifdef PG_DEBUG_REFS
+ service.dump_live_pgids();
+#endif
+ while (true) {
+ vector<PGRef> pgs;
+ _get_pgs(&pgs, true);
+ if (pgs.empty()) {
+ break;
+ }
+ for (auto& pg : pgs) {
+ if (pg->is_deleted()) {
+ continue;
+ }
+ dout(20) << " kicking pg " << pg << dendl;
+ pg->lock();
+ if (pg->get_num_ref() != 1) {
+ derr << "pgid " << pg->get_pgid() << " has ref count of "
+ << pg->get_num_ref() << dendl;
+#ifdef PG_DEBUG_REFS
+ pg->dump_live_ids();
+#endif
+ if (cct->_conf->osd_shutdown_pgref_assert) {
+ ceph_abort();
+ }
+ }
+ pg->ch.reset();
+ pg->unlock();
+ }
+ }
+#ifdef PG_DEBUG_REFS
+ service.dump_live_pgids();
+#endif
+
+ osd_lock.unlock();
+ cct->_conf.remove_observer(this);
+ osd_lock.lock();
+
+ service.meta_ch.reset();
+
+ dout(10) << "syncing store" << dendl;
+ enable_disable_fuse(true);
+
+ if (cct->_conf->osd_journal_flush_on_shutdown) {
+ dout(10) << "flushing journal" << dendl;
+ store->flush_journal();
+ }
+
+ monc->shutdown();
+ osd_lock.unlock();
+ {
+ std::unique_lock l{map_lock};
+ set_osdmap(OSDMapRef());
+ }
+ for (auto s : shards) {
+ std::lock_guard l(s->osdmap_lock);
+ s->shard_osdmap = OSDMapRef();
+ }
+ service.shutdown();
+
+ std::lock_guard lock(osd_lock);
+ store->umount();
+ delete store;
+ store = nullptr;
+ dout(10) << "Store synced" << dendl;
+
+ op_tracker.on_shutdown();
+
+ ClassHandler::get_instance().shutdown();
+ client_messenger->shutdown();
+ cluster_messenger->shutdown();
+ hb_front_client_messenger->shutdown();
+ hb_back_client_messenger->shutdown();
+ objecter_messenger->shutdown();
+ hb_front_server_messenger->shutdown();
+ hb_back_server_messenger->shutdown();
+
+ return r;
+}
+
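+// Run a mon command on behalf of this OSD.  If the mon replies -ENOENT (this
+// osd id is not yet registered), issue an "osd create" for our id/fsid once
+// and then retry the original command.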
+int OSD::mon_cmd_maybe_osd_create(string &cmd)
+{
+ bool created = false;
+ while (true) {
+ dout(10) << __func__ << " cmd: " << cmd << dendl;
+ vector<string> vcmd{cmd};
+ bufferlist inbl;
+ C_SaferCond w;
+ string outs;
+ monc->start_mon_command(vcmd, inbl, NULL, &outs, &w);
+ int r = w.wait();
+ if (r < 0) {
+ if (r == -ENOENT && !created) {
+ string newcmd = "{\"prefix\": \"osd create\", \"id\": " + stringify(whoami)
+ + ", \"uuid\": \"" + stringify(superblock.osd_fsid) + "\"}";
+ vector<string> vnewcmd{newcmd};
+ bufferlist inbl;
+ C_SaferCond w;
+ string outs;
+ monc->start_mon_command(vnewcmd, inbl, NULL, &outs, &w);
+ int r = w.wait();
+ if (r < 0) {
+ derr << __func__ << " fail: osd does not exist and created failed: "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+ created = true;
+ continue;
+ }
+ derr << __func__ << " fail: '" << outs << "': " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ break;
+ }
+
+ return 0;
+}
+
+int OSD::update_crush_location()
+{
+ if (!cct->_conf->osd_crush_update_on_start) {
+ dout(10) << __func__ << " osd_crush_update_on_start = false" << dendl;
+ return 0;
+ }
+
+ char weight[32];
+ if (cct->_conf->osd_crush_initial_weight >= 0) {
+ snprintf(weight, sizeof(weight), "%.4lf", cct->_conf->osd_crush_initial_weight);
+ } else {
+ struct store_statfs_t st;
+ osd_alert_list_t alerts;
+ int r = store->statfs(&st, &alerts);
+ if (r < 0) {
+ derr << "statfs: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ snprintf(weight, sizeof(weight), "%.4lf",
+ std::max(.00001,
+ double(st.total) /
+ double(1ull << 40 /* TB */)));
+ }
+
+ dout(10) << __func__ << " crush location is " << cct->crush_location << dendl;
+
+ string cmd =
+ string("{\"prefix\": \"osd crush create-or-move\", ") +
+ string("\"id\": ") + stringify(whoami) + ", " +
+ string("\"weight\":") + weight + ", " +
+ string("\"args\": [") + stringify(cct->crush_location) + "]}";
+ return mon_cmd_maybe_osd_create(cmd);
+}
+
+int OSD::update_crush_device_class()
+{
+ if (!cct->_conf->osd_class_update_on_start) {
+ dout(10) << __func__ << " osd_class_update_on_start = false" << dendl;
+ return 0;
+ }
+
+ string device_class;
+ int r = store->read_meta("crush_device_class", &device_class);
+ if (r < 0 || device_class.empty()) {
+ device_class = store->get_default_device_class();
+ }
+
+ if (device_class.empty()) {
+ dout(20) << __func__ << " no device class stored locally" << dendl;
+ return 0;
+ }
+
+ string cmd =
+ string("{\"prefix\": \"osd crush set-device-class\", ") +
+ string("\"class\": \"") + device_class + string("\", ") +
+ string("\"ids\": [\"") + stringify(whoami) + string("\"]}");
+
+ r = mon_cmd_maybe_osd_create(cmd);
+ if (r == -EBUSY) {
+ // good, already bound to a device-class
+ return 0;
+ } else {
+ return r;
+ }
+}
+
+void OSD::write_superblock(ObjectStore::Transaction& t)
+{
+ dout(10) << "write_superblock " << superblock << dendl;
+
+ //hack: at minimum it's using the baseline feature set
+ if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_BASE))
+ superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
+
+ bufferlist bl;
+ encode(superblock, bl);
+ t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
+}
+
+int OSD::read_superblock()
+{
+ bufferlist bl;
+ int r = store->read(service.meta_ch, OSD_SUPERBLOCK_GOBJECT, 0, 0, bl);
+ if (r < 0)
+ return r;
+
+ auto p = bl.cbegin();
+ decode(superblock, p);
+
+ dout(10) << "read_superblock " << superblock << dendl;
+
+ return 0;
+}
+
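+// Scan every PG collection and remove leftover temp objects (including
+// Hammer-era temps that used pool -1), batching the removals by
+// osd_target_transaction_size.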
+void OSD::clear_temp_objects()
+{
+ dout(10) << __func__ << dendl;
+ vector<coll_t> ls;
+ store->list_collections(ls);
+ for (vector<coll_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
+ spg_t pgid;
+ if (!p->is_pg(&pgid))
+ continue;
+
+ // list temp objects
+ dout(20) << " clearing temps in " << *p << " pgid " << pgid << dendl;
+
+ vector<ghobject_t> temps;
+ ghobject_t next;
+ while (1) {
+ vector<ghobject_t> objects;
+ auto ch = store->open_collection(*p);
+ ceph_assert(ch);
+ store->collection_list(ch, next, ghobject_t::get_max(),
+ store->get_ideal_list_max(),
+ &objects, &next);
+ if (objects.empty())
+ break;
+ vector<ghobject_t>::iterator q;
+ for (q = objects.begin(); q != objects.end(); ++q) {
+ // Hammer set pool for temps to -1, so check for clean-up
+ if (q->hobj.is_temp() || (q->hobj.pool == -1)) {
+ temps.push_back(*q);
+ } else {
+ break;
+ }
+ }
+ // If we saw a non-temp object and hit the break above we can
+ // break out of the while loop too.
+ if (q != objects.end())
+ break;
+ }
+ if (!temps.empty()) {
+ ObjectStore::Transaction t;
+ int removed = 0;
+ for (vector<ghobject_t>::iterator q = temps.begin(); q != temps.end(); ++q) {
+ dout(20) << " removing " << *p << " object " << *q << dendl;
+ t.remove(*p, *q);
+ if (++removed > cct->_conf->osd_target_transaction_size) {
+ store->queue_transaction(service.meta_ch, std::move(t));
+ t = ObjectStore::Transaction();
+ removed = 0;
+ }
+ }
+ if (removed) {
+ store->queue_transaction(service.meta_ch, std::move(t));
+ }
+ }
+ }
+}
+
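+// Remove every object in a collection in transaction-sized batches, clearing
+// the matching snap mapper entries as we go, then remove the collection
+// itself and wait for the removal to commit.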
+void OSD::recursive_remove_collection(CephContext* cct,
+ ObjectStore *store, spg_t pgid,
+ coll_t tmp)
+{
+ OSDriver driver(
+ store,
+ coll_t(),
+ make_snapmapper_oid());
+
+ ObjectStore::CollectionHandle ch = store->open_collection(tmp);
+ ObjectStore::Transaction t;
+ SnapMapper mapper(cct, &driver, 0, 0, 0, pgid.shard);
+
+ ghobject_t next;
+ int max = cct->_conf->osd_target_transaction_size;
+ vector<ghobject_t> objects;
+ objects.reserve(max);
+ while (true) {
+ objects.clear();
+ store->collection_list(ch, next, ghobject_t::get_max(),
+ max, &objects, &next);
+ generic_dout(10) << __func__ << " " << objects << dendl;
+ if (objects.empty())
+ break;
+ for (auto& p: objects) {
+ OSDriver::OSTransaction _t(driver.get_transaction(&t));
+ int r = mapper.remove_oid(p.hobj, &_t);
+ if (r != 0 && r != -ENOENT)
+ ceph_abort();
+ t.remove(tmp, p);
+ }
+ int r = store->queue_transaction(ch, std::move(t));
+ ceph_assert(r == 0);
+ t = ObjectStore::Transaction();
+ }
+ t.remove_collection(tmp);
+ int r = store->queue_transaction(ch, std::move(t));
+ ceph_assert(r == 0);
+
+ C_SaferCond waiter;
+ if (!ch->flush_commit(&waiter)) {
+ waiter.wait();
+ }
+}
+
+
+// ======================================================
+// PG's
+
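+// Construct the in-memory PG object for pgid, taking the pool info from the
+// given map, or from the on-disk final-pool-info tombstone if the pool has
+// already been deleted.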
+PG* OSD::_make_pg(
+ OSDMapRef createmap,
+ spg_t pgid)
+{
+ dout(10) << __func__ << " " << pgid << dendl;
+ pg_pool_t pi;
+ map<string,string> ec_profile;
+ string name;
+ if (createmap->have_pg_pool(pgid.pool())) {
+ pi = *createmap->get_pg_pool(pgid.pool());
+ name = createmap->get_pool_name(pgid.pool());
+ if (pi.is_erasure()) {
+ ec_profile = createmap->get_erasure_code_profile(pi.erasure_code_profile);
+ }
+ } else {
+ // pool was deleted; grab final pg_pool_t off disk.
+ ghobject_t oid = make_final_pool_info_oid(pgid.pool());
+ bufferlist bl;
+ int r = store->read(service.meta_ch, oid, 0, 0, bl);
+ if (r < 0) {
+ derr << __func__ << " missing pool " << pgid.pool() << " tombstone"
+ << dendl;
+ return nullptr;
+ }
+ ceph_assert(r >= 0);
+ auto p = bl.cbegin();
+ decode(pi, p);
+ decode(name, p);
+ if (p.end()) { // dev release v13.0.2 did not include ec_profile
+ derr << __func__ << " missing ec_profile from pool " << pgid.pool()
+ << " tombstone" << dendl;
+ return nullptr;
+ }
+ decode(ec_profile, p);
+ }
+ PGPool pool(createmap, pgid.pool(), pi, name);
+ PG *pg;
+ if (pi.type == pg_pool_t::TYPE_REPLICATED ||
+ pi.type == pg_pool_t::TYPE_ERASURE)
+ pg = new PrimaryLogPG(&service, createmap, pool, ec_profile, pgid);
+ else
+ ceph_abort();
+ return pg;
+}
+
+void OSD::_get_pgs(vector<PGRef> *v, bool clear_too)
+{
+ v->clear();
+ v->reserve(get_num_pgs());
+ for (auto& s : shards) {
+ std::lock_guard l(s->shard_lock);
+ for (auto& j : s->pg_slots) {
+ if (j.second->pg &&
+ !j.second->pg->is_deleted()) {
+ v->push_back(j.second->pg);
+ if (clear_too) {
+ s->_detach_pg(j.second.get());
+ }
+ }
+ }
+ }
+}
+
+void OSD::_get_pgids(vector<spg_t> *v)
+{
+ v->clear();
+ v->reserve(get_num_pgs());
+ for (auto& s : shards) {
+ std::lock_guard l(s->shard_lock);
+ for (auto& j : s->pg_slots) {
+ if (j.second->pg &&
+ !j.second->pg->is_deleted()) {
+ v->push_back(j.first);
+ }
+ }
+ }
+}
+
+void OSD::register_pg(PGRef pg)
+{
+ spg_t pgid = pg->get_pgid();
+ uint32_t shard_index = pgid.hash_to_shard(num_shards);
+ auto sdata = shards[shard_index];
+ std::lock_guard l(sdata->shard_lock);
+ auto r = sdata->pg_slots.emplace(pgid, make_unique<OSDShardPGSlot>());
+ ceph_assert(r.second);
+ auto *slot = r.first->second.get();
+ dout(20) << __func__ << " " << pgid << " " << pg << dendl;
+ sdata->_attach_pg(slot, pg.get());
+}
+
+bool OSD::try_finish_pg_delete(PG *pg, unsigned old_pg_num)
+{
+ auto sdata = pg->osd_shard;
+ ceph_assert(sdata);
+ {
+ std::lock_guard l(sdata->shard_lock);
+ auto p = sdata->pg_slots.find(pg->pg_id);
+ if (p == sdata->pg_slots.end() ||
+ !p->second->pg) {
+ dout(20) << __func__ << " " << pg->pg_id << " not found" << dendl;
+ return false;
+ }
+ if (p->second->waiting_for_merge_epoch) {
+ dout(20) << __func__ << " " << pg->pg_id << " waiting for merge" << dendl;
+ return false;
+ }
+ dout(20) << __func__ << " " << pg->pg_id << " " << pg << dendl;
+ sdata->_detach_pg(p->second.get());
+ }
+
+ for (auto shard : shards) {
+ shard->unprime_split_children(pg->pg_id, old_pg_num);
+ }
+
+ // update pg count now since we might not get an osdmap any time soon.
+ if (pg->is_primary())
+ service.logger->dec(l_osd_pg_primary);
+ else if (pg->is_nonprimary())
+ service.logger->dec(l_osd_pg_replica); // misnomer
+ else
+ service.logger->dec(l_osd_pg_stray);
+
+ return true;
+}
+
+PGRef OSD::_lookup_pg(spg_t pgid)
+{
+ uint32_t shard_index = pgid.hash_to_shard(num_shards);
+ auto sdata = shards[shard_index];
+ std::lock_guard l(sdata->shard_lock);
+ auto p = sdata->pg_slots.find(pgid);
+ if (p == sdata->pg_slots.end()) {
+ return nullptr;
+ }
+ return p->second->pg;
+}
+
+PGRef OSD::_lookup_lock_pg(spg_t pgid)
+{
+ PGRef pg = _lookup_pg(pgid);
+ if (!pg) {
+ return nullptr;
+ }
+ pg->lock();
+ if (!pg->is_deleted()) {
+ return pg;
+ }
+ pg->unlock();
+ return nullptr;
+}
+
+PGRef OSD::lookup_lock_pg(spg_t pgid)
+{
+ return _lookup_lock_pg(pgid);
+}
+
+void OSD::load_pgs()
+{
+ ceph_assert(ceph_mutex_is_locked(osd_lock));
+ dout(0) << "load_pgs" << dendl;
+
+ {
+ auto pghist = make_pg_num_history_oid();
+ bufferlist bl;
+ int r = store->read(service.meta_ch, pghist, 0, 0, bl, 0);
+ if (r >= 0 && bl.length() > 0) {
+ auto p = bl.cbegin();
+ decode(pg_num_history, p);
+ }
+ dout(20) << __func__ << " pg_num_history " << pg_num_history << dendl;
+ }
+
+ vector<coll_t> ls;
+ int r = store->list_collections(ls);
+ if (r < 0) {
+ derr << "failed to list pgs: " << cpp_strerror(-r) << dendl;
+ }
+
+ int num = 0;
+ for (vector<coll_t>::iterator it = ls.begin();
+ it != ls.end();
+ ++it) {
+ spg_t pgid;
+ if (it->is_temp(&pgid) ||
+ (it->is_pg(&pgid) && PG::_has_removal_flag(store, pgid))) {
+ dout(10) << "load_pgs " << *it
+ << " removing, legacy or flagged for removal pg" << dendl;
+ recursive_remove_collection(cct, store, pgid, *it);
+ continue;
+ }
+
+ if (!it->is_pg(&pgid)) {
+ dout(10) << "load_pgs ignoring unrecognized " << *it << dendl;
+ continue;
+ }
+
+ dout(10) << "pgid " << pgid << " coll " << coll_t(pgid) << dendl;
+ epoch_t map_epoch = 0;
+ int r = PG::peek_map_epoch(store, pgid, &map_epoch);
+ if (r < 0) {
+ derr << __func__ << " unable to peek at " << pgid << " metadata, skipping"
+ << dendl;
+ continue;
+ }
+
+ PGRef pg;
+ if (map_epoch > 0) {
+ OSDMapRef pgosdmap = service.try_get_map(map_epoch);
+ if (!pgosdmap) {
+ if (!get_osdmap()->have_pg_pool(pgid.pool())) {
+ derr << __func__ << ": could not find map for epoch " << map_epoch
+ << " on pg " << pgid << ", but the pool is not present in the "
+ << "current map, so this is probably a result of bug 10617. "
+ << "Skipping the pg for now, you can use ceph-objectstore-tool "
+ << "to clean it up later." << dendl;
+ continue;
+ } else {
+ derr << __func__ << ": have pgid " << pgid << " at epoch "
+ << map_epoch << ", but missing map. Crashing."
+ << dendl;
+ ceph_abort_msg("Missing map in load_pgs");
+ }
+ }
+ pg = _make_pg(pgosdmap, pgid);
+ } else {
+ pg = _make_pg(get_osdmap(), pgid);
+ }
+ if (!pg) {
+ recursive_remove_collection(cct, store, pgid, *it);
+ continue;
+ }
+
+ // there can be no waiters here, so we don't call _wake_pg_slot
+
+ pg->lock();
+ pg->ch = store->open_collection(pg->coll);
+
+ // read pg state, log
+ pg->read_state(store);
+
+ if (pg->dne()) {
+ dout(10) << "load_pgs " << *it << " deleting dne" << dendl;
+ pg->ch = nullptr;
+ pg->unlock();
+ recursive_remove_collection(cct, store, pgid, *it);
+ continue;
+ }
+ {
+ uint32_t shard_index = pgid.hash_to_shard(shards.size());
+ assert(NULL != shards[shard_index]);
+ store->set_collection_commit_queue(pg->coll, &(shards[shard_index]->context_queue));
+ }
+
+ pg->reg_next_scrub();
+
+ dout(10) << __func__ << " loaded " << *pg << dendl;
+ pg->unlock();
+
+ register_pg(pg);
+ ++num;
+ }
+ dout(0) << __func__ << " opened " << num << " pgs" << dendl;
+}
+
+
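+// Instantiate and initialize a new PG in response to a create request,
+// unless we are over the PG limit or the pool no longer wants the create.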
+PGRef OSD::handle_pg_create_info(const OSDMapRef& osdmap,
+ const PGCreateInfo *info)
+{
+ spg_t pgid = info->pgid;
+
+ if (maybe_wait_for_max_pg(osdmap, pgid, info->by_mon)) {
+ dout(10) << __func__ << " hit max pg, dropping" << dendl;
+ return nullptr;
+ }
+
+ PeeringCtx rctx = create_context();
+
+ OSDMapRef startmap = get_map(info->epoch);
+
+ if (info->by_mon) {
+ int64_t pool_id = pgid.pgid.pool();
+ const pg_pool_t *pool = osdmap->get_pg_pool(pool_id);
+ if (!pool) {
+ dout(10) << __func__ << " ignoring " << pgid << ", pool dne" << dendl;
+ return nullptr;
+ }
+ if (osdmap->require_osd_release >= ceph_release_t::nautilus &&
+ !pool->has_flag(pg_pool_t::FLAG_CREATING)) {
+ // this ensures we do not process old creating messages after the
+ // pool's initial pgs have been created (and pg are subsequently
+ // allowed to split or merge).
+ dout(20) << __func__ << " dropping " << pgid
+ << "create, pool does not have CREATING flag set" << dendl;
+ return nullptr;
+ }
+ }
+
+ int up_primary, acting_primary;
+ vector<int> up, acting;
+ startmap->pg_to_up_acting_osds(
+ pgid.pgid, &up, &up_primary, &acting, &acting_primary);
+
+ const pg_pool_t* pp = startmap->get_pg_pool(pgid.pool());
+ if (pp->has_flag(pg_pool_t::FLAG_EC_OVERWRITES) &&
+ store->get_type() != "bluestore") {
+ clog->warn() << "pg " << pgid
+ << " is at risk of silent data corruption: "
+ << "the pool allows ec overwrites but is not stored in "
+ << "bluestore, so deep scrubbing will not detect bitrot";
+ }
+ create_pg_collection(
+ rctx.transaction, pgid, pgid.get_split_bits(pp->get_pg_num()));
+ init_pg_ondisk(rctx.transaction, pgid, pp);
+
+ int role = startmap->calc_pg_role(pg_shard_t(whoami, pgid.shard), acting);
+
+ PGRef pg = _make_pg(startmap, pgid);
+ pg->ch = store->create_new_collection(pg->coll);
+
+ {
+ uint32_t shard_index = pgid.hash_to_shard(shards.size());
+ assert(NULL != shards[shard_index]);
+ store->set_collection_commit_queue(pg->coll, &(shards[shard_index]->context_queue));
+ }
+
+ pg->lock(true);
+
+ // we are holding the shard lock
+ ceph_assert(!pg->is_deleted());
+
+ pg->init(
+ role,
+ up,
+ up_primary,
+ acting,
+ acting_primary,
+ info->history,
+ info->past_intervals,
+ false,
+ rctx.transaction);
+
+ pg->init_collection_pool_opts();
+
+ if (pg->is_primary()) {
+ std::lock_guard locker{m_perf_queries_lock};
+ pg->set_dynamic_perf_stats_queries(m_perf_queries);
+ }
+
+ pg->handle_initialize(rctx);
+ pg->handle_activate_map(rctx);
+
+ dispatch_context(rctx, pg.get(), osdmap, nullptr);
+
+ dout(10) << __func__ << " new pg " << *pg << dendl;
+ return pg;
+}
+
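+// Return true (and remember the pending create) if this OSD already holds
+// mon_max_pg_per_osd * osd_max_pg_per_osd_hard_ratio PGs; the creation is
+// retried later from resume_creating_pg().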
+bool OSD::maybe_wait_for_max_pg(const OSDMapRef& osdmap,
+ spg_t pgid,
+ bool is_mon_create)
+{
+ const auto max_pgs_per_osd =
+ (cct->_conf.get_val<uint64_t>("mon_max_pg_per_osd") *
+ cct->_conf.get_val<double>("osd_max_pg_per_osd_hard_ratio"));
+
+ if (num_pgs < max_pgs_per_osd) {
+ return false;
+ }
+
+ std::lock_guard l(pending_creates_lock);
+ if (is_mon_create) {
+ pending_creates_from_mon++;
+ } else {
+ bool is_primary = osdmap->get_pg_acting_role(pgid, whoami) == 0;
+ pending_creates_from_osd.emplace(pgid, is_primary);
+ }
+ dout(1) << __func__ << " withhold creation of pg " << pgid
+ << ": " << num_pgs << " >= "<< max_pgs_per_osd << dendl;
+ return true;
+}
+
+// To re-trigger peering, we have to twiddle the pg mapping a little bit; see
+// PG::should_restart_peering(). OSDMap::pg_to_up_acting_osds() falls back to
+// the up set if pg_temp is empty, so an empty pg_temp won't work.
+static vector<int32_t> twiddle(const vector<int>& acting) {
+ if (acting.size() > 1) {
+ return {acting[0]};
+ } else {
+ vector<int32_t> twiddled(acting.begin(), acting.end());
+ twiddled.push_back(-1);
+ return twiddled;
+ }
+}
+
+void OSD::resume_creating_pg()
+{
+ bool do_sub_pg_creates = false;
+ bool have_pending_creates = false;
+ {
+ const auto max_pgs_per_osd =
+ (cct->_conf.get_val<uint64_t>("mon_max_pg_per_osd") *
+ cct->_conf.get_val<double>("osd_max_pg_per_osd_hard_ratio"));
+ if (max_pgs_per_osd <= num_pgs) {
+ // this could happen if admin decreases this setting before a PG is removed
+ return;
+ }
+ unsigned spare_pgs = max_pgs_per_osd - num_pgs;
+ std::lock_guard l(pending_creates_lock);
+ if (pending_creates_from_mon > 0) {
+ dout(20) << __func__ << " pending_creates_from_mon "
+ << pending_creates_from_mon << dendl;
+ do_sub_pg_creates = true;
+ if (pending_creates_from_mon >= spare_pgs) {
+ spare_pgs = pending_creates_from_mon = 0;
+ } else {
+ spare_pgs -= pending_creates_from_mon;
+ pending_creates_from_mon = 0;
+ }
+ }
+ auto pg = pending_creates_from_osd.cbegin();
+ while (spare_pgs > 0 && pg != pending_creates_from_osd.cend()) {
+ dout(20) << __func__ << " pg " << pg->first << dendl;
+ vector<int> acting;
+ get_osdmap()->pg_to_up_acting_osds(pg->first.pgid, nullptr, nullptr, &acting, nullptr);
+ service.queue_want_pg_temp(pg->first.pgid, twiddle(acting), true);
+ pg = pending_creates_from_osd.erase(pg);
+ do_sub_pg_creates = true;
+ spare_pgs--;
+ }
+ have_pending_creates = (pending_creates_from_mon > 0 ||
+ !pending_creates_from_osd.empty());
+ }
+
+ bool do_renew_subs = false;
+ if (do_sub_pg_creates) {
+ if (monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0)) {
+ dout(4) << __func__ << ": resolicit pg creates from mon since "
+ << last_pg_create_epoch << dendl;
+ do_renew_subs = true;
+ }
+ }
+ version_t start = get_osdmap_epoch() + 1;
+ if (have_pending_creates) {
+ // don't miss any new osdmap deleting PGs
+ if (monc->sub_want("osdmap", start, 0)) {
+ dout(4) << __func__ << ": resolicit osdmap from mon since "
+ << start << dendl;
+ do_renew_subs = true;
+ }
+ } else if (do_sub_pg_creates) {
+ // no need to subscribe the osdmap continuously anymore
+ // once the pgtemp and/or mon_subscribe(pg_creates) is sent
+ if (monc->sub_want_increment("osdmap", start, CEPH_SUBSCRIBE_ONETIME)) {
+ dout(4) << __func__ << ": re-subscribe osdmap(onetime) since "
+ << start << dendl;
+ do_renew_subs = true;
+ }
+ }
+
+ if (do_renew_subs) {
+ monc->renew_subs();
+ }
+
+ service.send_pg_temp();
+}
+
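+// Build a new PG's history and past_intervals by replaying every osdmap from
+// its creation epoch up to the current epoch and recording each interval
+// boundary.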
+void OSD::build_initial_pg_history(
+ spg_t pgid,
+ epoch_t created,
+ utime_t created_stamp,
+ pg_history_t *h,
+ PastIntervals *pi)
+{
+ dout(10) << __func__ << " " << pgid << " created " << created << dendl;
+ *h = pg_history_t(created, created_stamp);
+
+ OSDMapRef lastmap = service.get_map(created);
+ int up_primary, acting_primary;
+ vector<int> up, acting;
+ lastmap->pg_to_up_acting_osds(
+ pgid.pgid, &up, &up_primary, &acting, &acting_primary);
+
+ ostringstream debug;
+ for (epoch_t e = created + 1; e <= get_osdmap_epoch(); ++e) {
+ OSDMapRef osdmap = service.get_map(e);
+ int new_up_primary, new_acting_primary;
+ vector<int> new_up, new_acting;
+ osdmap->pg_to_up_acting_osds(
+ pgid.pgid, &new_up, &new_up_primary, &new_acting, &new_acting_primary);
+
+ // this is a bit imprecise, but sufficient?
+ struct min_size_predicate_t : public IsPGRecoverablePredicate {
+ const pg_pool_t *pi;
+ bool operator()(const set<pg_shard_t> &have) const {
+ return have.size() >= pi->min_size;
+ }
+ explicit min_size_predicate_t(const pg_pool_t *i) : pi(i) {}
+ } min_size_predicate(osdmap->get_pg_pool(pgid.pgid.pool()));
+
+ bool new_interval = PastIntervals::check_new_interval(
+ acting_primary,
+ new_acting_primary,
+ acting, new_acting,
+ up_primary,
+ new_up_primary,
+ up, new_up,
+ h->same_interval_since,
+ h->last_epoch_clean,
+ osdmap.get(),
+ lastmap.get(),
+ pgid.pgid,
+ min_size_predicate,
+ pi,
+ &debug);
+ if (new_interval) {
+ h->same_interval_since = e;
+ if (up != new_up) {
+ h->same_up_since = e;
+ }
+ if (acting_primary != new_acting_primary) {
+ h->same_primary_since = e;
+ }
+ if (pgid.pgid.is_split(lastmap->get_pg_num(pgid.pgid.pool()),
+ osdmap->get_pg_num(pgid.pgid.pool()),
+ nullptr)) {
+ h->last_epoch_split = e;
+ }
+ up = new_up;
+ acting = new_acting;
+ up_primary = new_up_primary;
+ acting_primary = new_acting_primary;
+ }
+ lastmap = osdmap;
+ }
+ dout(20) << __func__ << " " << debug.str() << dendl;
+ dout(10) << __func__ << " " << *h << " " << *pi
+ << " [" << (pi->empty() ? pair<epoch_t,epoch_t>(0,0) :
+ pi->get_bounds()) << ")"
+ << dendl;
+}
+
+void OSD::_add_heartbeat_peer(int p)
+{
+ if (p == whoami)
+ return;
+ HeartbeatInfo *hi;
+
+ map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(p);
+ if (i == heartbeat_peers.end()) {
+ pair<ConnectionRef,ConnectionRef> cons = service.get_con_osd_hb(p, get_osdmap_epoch());
+ if (!cons.first)
+ return;
+ assert(cons.second);
+
+ hi = &heartbeat_peers[p];
+ hi->peer = p;
+
+ auto stamps = service.get_hb_stamps(p);
+
+ auto sb = ceph::make_ref<Session>(cct, cons.first.get());
+ sb->peer = p;
+ sb->stamps = stamps;
+ hi->hb_interval_start = ceph_clock_now();
+ hi->con_back = cons.first.get();
+ hi->con_back->set_priv(sb);
+
+ auto sf = ceph::make_ref<Session>(cct, cons.second.get());
+ sf->peer = p;
+ sf->stamps = stamps;
+ hi->con_front = cons.second.get();
+ hi->con_front->set_priv(sf);
+
+ dout(10) << "_add_heartbeat_peer: new peer osd." << p
+ << " " << hi->con_back->get_peer_addr()
+ << " " << hi->con_front->get_peer_addr()
+ << dendl;
+ } else {
+ hi = &i->second;
+ }
+ hi->epoch = get_osdmap_epoch();
+}
+
+void OSD::_remove_heartbeat_peer(int n)
+{
+ map<int,HeartbeatInfo>::iterator q = heartbeat_peers.find(n);
+ ceph_assert(q != heartbeat_peers.end());
+ dout(20) << " removing heartbeat peer osd." << n
+ << " " << q->second.con_back->get_peer_addr()
+ << " " << (q->second.con_front ? q->second.con_front->get_peer_addr() : entity_addr_t())
+ << dendl;
+ q->second.clear_mark_down();
+ heartbeat_peers.erase(q);
+}
+
+void OSD::need_heartbeat_peer_update()
+{
+ if (is_stopping())
+ return;
+ dout(20) << "need_heartbeat_peer_update" << dendl;
+ heartbeat_set_peers_need_update();
+}
+
+void OSD::maybe_update_heartbeat_peers()
+{
+ ceph_assert(ceph_mutex_is_locked(osd_lock));
+
+ if (is_waiting_for_healthy() || is_active()) {
+ utime_t now = ceph_clock_now();
+ if (last_heartbeat_resample == utime_t()) {
+ last_heartbeat_resample = now;
+ heartbeat_set_peers_need_update();
+ } else if (!heartbeat_peers_need_update()) {
+ utime_t dur = now - last_heartbeat_resample;
+ if (dur > cct->_conf->osd_heartbeat_grace) {
+ dout(10) << "maybe_update_heartbeat_peers forcing update after " << dur << " seconds" << dendl;
+ heartbeat_set_peers_need_update();
+ last_heartbeat_resample = now;
+ // automatically clean up any stale heartbeat peers
+ // if we are unhealthy, then clean all
+ reset_heartbeat_peers(is_waiting_for_healthy());
+ }
+ }
+ }
+
+ if (!heartbeat_peers_need_update())
+ return;
+ heartbeat_clear_peers_need_update();
+
+ std::lock_guard l(heartbeat_lock);
+
+ dout(10) << "maybe_update_heartbeat_peers updating" << dendl;
+
+
+ // build heartbeat from set
+ if (is_active()) {
+ vector<PGRef> pgs;
+ _get_pgs(&pgs);
+ for (auto& pg : pgs) {
+ pg->with_heartbeat_peers([&](int peer) {
+ if (get_osdmap()->is_up(peer)) {
+ _add_heartbeat_peer(peer);
+ }
+ });
+ }
+ }
+
+ // include next and previous up osds to ensure we have a fully-connected set
+ set<int> want, extras;
+ const int next = get_osdmap()->get_next_up_osd_after(whoami);
+ if (next >= 0)
+ want.insert(next);
+ int prev = get_osdmap()->get_previous_up_osd_before(whoami);
+ if (prev >= 0 && prev != next)
+ want.insert(prev);
+
+ // make sure we have at least **min_down** osds coming from different
+ // subtree level (e.g., hosts) for fast failure detection.
+ auto min_down = cct->_conf.get_val<uint64_t>("mon_osd_min_down_reporters");
+ auto subtree = cct->_conf.get_val<string>("mon_osd_reporter_subtree_level");
+ auto limit = std::max(min_down, (uint64_t)cct->_conf->osd_heartbeat_min_peers);
+ get_osdmap()->get_random_up_osds_by_subtree(
+ whoami, subtree, limit, want, &want);
+
+ for (set<int>::iterator p = want.begin(); p != want.end(); ++p) {
+ dout(10) << " adding neighbor peer osd." << *p << dendl;
+ extras.insert(*p);
+ _add_heartbeat_peer(*p);
+ }
+
+ // remove down peers; enumerate extras
+ map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
+ while (p != heartbeat_peers.end()) {
+ if (!get_osdmap()->is_up(p->first)) {
+ int o = p->first;
+ ++p;
+ _remove_heartbeat_peer(o);
+ continue;
+ }
+ if (p->second.epoch < get_osdmap_epoch()) {
+ extras.insert(p->first);
+ }
+ ++p;
+ }
+
+ // too few?
+ for (int n = next; n >= 0; ) {
+ if ((int)heartbeat_peers.size() >= cct->_conf->osd_heartbeat_min_peers)
+ break;
+ if (!extras.count(n) && !want.count(n) && n != whoami) {
+ dout(10) << " adding random peer osd." << n << dendl;
+ extras.insert(n);
+ _add_heartbeat_peer(n);
+ }
+ n = get_osdmap()->get_next_up_osd_after(n);
+ if (n == next)
+ break; // came full circle; stop
+ }
+
+ // too many?
+ for (set<int>::iterator p = extras.begin();
+ (int)heartbeat_peers.size() > cct->_conf->osd_heartbeat_min_peers && p != extras.end();
+ ++p) {
+ if (want.count(*p))
+ continue;
+ _remove_heartbeat_peer(*p);
+ }
+
+ dout(10) << "maybe_update_heartbeat_peers " << heartbeat_peers.size() << " peers, extras " << extras << dendl;
+
+ // clean up stale failure pending
+ for (auto it = failure_pending.begin(); it != failure_pending.end();) {
+ if (heartbeat_peers.count(it->first) == 0) {
+ send_still_alive(get_osdmap_epoch(), it->first, it->second.second);
+ failure_pending.erase(it++);
+ } else {
+ it++;
+ }
+ }
+}
+
+void OSD::reset_heartbeat_peers(bool all)
+{
+ ceph_assert(ceph_mutex_is_locked(osd_lock));
+ dout(10) << "reset_heartbeat_peers" << dendl;
+ utime_t stale = ceph_clock_now();
+ stale -= cct->_conf.get_val<int64_t>("osd_heartbeat_stale");
+ std::lock_guard l(heartbeat_lock);
+ for (auto it = heartbeat_peers.begin(); it != heartbeat_peers.end();) {
+ auto& [peer, hi] = *it;
+ if (all || hi.is_stale(stale)) {
+ hi.clear_mark_down();
+ // stop sending failure_report to mon too
+ failure_queue.erase(peer);
+ failure_pending.erase(peer);
+ it = heartbeat_peers.erase(it);
+ } else {
+ ++it;
+ }
+ }
+}
+
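+// Handle an incoming MOSDPing: reply to PINGs, update ping history and
+// network ping-time statistics on PING_REPLY, and resubscribe to osdmaps on
+// YOU_DIED.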
+void OSD::handle_osd_ping(MOSDPing *m)
+{
+ if (superblock.cluster_fsid != m->fsid) {
+ dout(20) << "handle_osd_ping from " << m->get_source_inst()
+ << " bad fsid " << m->fsid << " != " << superblock.cluster_fsid
+ << dendl;
+ m->put();
+ return;
+ }
+
+ int from = m->get_source().num();
+
+ heartbeat_lock.lock();
+ if (is_stopping()) {
+ heartbeat_lock.unlock();
+ m->put();
+ return;
+ }
+
+ utime_t now = ceph_clock_now();
+ auto mnow = service.get_mnow();
+ ConnectionRef con(m->get_connection());
+ OSDMapRef curmap = service.get_osdmap();
+ if (!curmap) {
+ heartbeat_lock.unlock();
+ m->put();
+ return;
+ }
+
+ auto sref = con->get_priv();
+ Session *s = static_cast<Session*>(sref.get());
+ if (!s) {
+ heartbeat_lock.unlock();
+ m->put();
+ return;
+ }
+ if (!s->stamps) {
+ s->peer = from;
+ s->stamps = service.get_hb_stamps(from);
+ }
+
+ switch (m->op) {
+
+ case MOSDPing::PING:
+ {
+ if (cct->_conf->osd_debug_drop_ping_probability > 0) {
+ auto heartbeat_drop = debug_heartbeat_drops_remaining.find(from);
+ if (heartbeat_drop != debug_heartbeat_drops_remaining.end()) {
+ if (heartbeat_drop->second == 0) {
+ debug_heartbeat_drops_remaining.erase(heartbeat_drop);
+ } else {
+ --heartbeat_drop->second;
+ dout(5) << "Dropping heartbeat from " << from
+ << ", " << heartbeat_drop->second
+ << " remaining to drop" << dendl;
+ break;
+ }
+ } else if (cct->_conf->osd_debug_drop_ping_probability >
+ ((((double)(rand()%100))/100.0))) {
+ heartbeat_drop =
+ debug_heartbeat_drops_remaining.insert(std::make_pair(from,
+ cct->_conf->osd_debug_drop_ping_duration)).first;
+ dout(5) << "Dropping heartbeat from " << from
+ << ", " << heartbeat_drop->second
+ << " remaining to drop" << dendl;
+ break;
+ }
+ }
+
+ ceph::signedspan sender_delta_ub{};
+ s->stamps->got_ping(
+ m->up_from,
+ mnow,
+ m->mono_send_stamp,
+ m->delta_ub,
+ &sender_delta_ub);
+ dout(20) << __func__ << " new stamps " << *s->stamps << dendl;
+
+ if (!cct->get_heartbeat_map()->is_healthy()) {
+ dout(10) << "internal heartbeat not healthy, dropping ping request"
+ << dendl;
+ break;
+ }
+
+ Message *r = new MOSDPing(monc->get_fsid(),
+ curmap->get_epoch(),
+ MOSDPing::PING_REPLY,
+ m->ping_stamp,
+ m->mono_ping_stamp,
+ mnow,
+ service.get_up_epoch(),
+ cct->_conf->osd_heartbeat_min_size,
+ sender_delta_ub);
+ con->send_message(r);
+
+ if (curmap->is_up(from)) {
+ if (is_active()) {
+ ConnectionRef cluster_con = service.get_con_osd_cluster(
+ from, curmap->get_epoch());
+ if (cluster_con) {
+ service.maybe_share_map(cluster_con.get(), curmap, m->map_epoch);
+ }
+ }
+ } else if (!curmap->exists(from) ||
+ curmap->get_down_at(from) > m->map_epoch) {
+ // tell them they have died
+ Message *r = new MOSDPing(monc->get_fsid(),
+ curmap->get_epoch(),
+ MOSDPing::YOU_DIED,
+ m->ping_stamp,
+ m->mono_ping_stamp,
+ mnow,
+ service.get_up_epoch(),
+ cct->_conf->osd_heartbeat_min_size);
+ con->send_message(r);
+ }
+ }
+ break;
+
+ case MOSDPing::PING_REPLY:
+ {
+ map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(from);
+ if (i != heartbeat_peers.end()) {
+ auto acked = i->second.ping_history.find(m->ping_stamp);
+ if (acked != i->second.ping_history.end()) {
+ int &unacknowledged = acked->second.second;
+ if (con == i->second.con_back) {
+ dout(25) << "handle_osd_ping got reply from osd." << from
+ << " first_tx " << i->second.first_tx
+ << " last_tx " << i->second.last_tx
+ << " last_rx_back " << i->second.last_rx_back
+ << " -> " << now
+ << " last_rx_front " << i->second.last_rx_front
+ << dendl;
+ i->second.last_rx_back = now;
+ ceph_assert(unacknowledged > 0);
+ --unacknowledged;
+ // if there is no front con, set both stamps.
+ if (i->second.con_front == NULL) {
+ i->second.last_rx_front = now;
+ ceph_assert(unacknowledged > 0);
+ --unacknowledged;
+ }
+ } else if (con == i->second.con_front) {
+ dout(25) << "handle_osd_ping got reply from osd." << from
+ << " first_tx " << i->second.first_tx
+ << " last_tx " << i->second.last_tx
+ << " last_rx_back " << i->second.last_rx_back
+ << " last_rx_front " << i->second.last_rx_front
+ << " -> " << now
+ << dendl;
+ i->second.last_rx_front = now;
+ ceph_assert(unacknowledged > 0);
+ --unacknowledged;
+ }
+
+ if (unacknowledged == 0) {
+ // succeeded in getting all replies
+ dout(25) << "handle_osd_ping got all replies from osd." << from
+ << " , erase pending ping(sent at " << m->ping_stamp << ")"
+ << " and older pending ping(s)"
+ << dendl;
+
+#define ROUND_S_TO_USEC(sec) (uint32_t)((sec) * 1000 * 1000 + 0.5)
+ ++i->second.hb_average_count;
+ uint32_t back_pingtime = ROUND_S_TO_USEC(i->second.last_rx_back - m->ping_stamp);
+ i->second.hb_total_back += back_pingtime;
+ if (back_pingtime < i->second.hb_min_back)
+ i->second.hb_min_back = back_pingtime;
+ if (back_pingtime > i->second.hb_max_back)
+ i->second.hb_max_back = back_pingtime;
+ uint32_t front_pingtime = ROUND_S_TO_USEC(i->second.last_rx_front - m->ping_stamp);
+ i->second.hb_total_front += front_pingtime;
+ if (front_pingtime < i->second.hb_min_front)
+ i->second.hb_min_front = front_pingtime;
+ if (front_pingtime > i->second.hb_max_front)
+ i->second.hb_max_front = front_pingtime;
+
+ ceph_assert(i->second.hb_interval_start != utime_t());
+ if (i->second.hb_interval_start == utime_t())
+ i->second.hb_interval_start = now;
+ int64_t hb_avg_time_period = 60;
+ if (cct->_conf.get_val<int64_t>("debug_heartbeat_testing_span")) {
+ hb_avg_time_period = cct->_conf.get_val<int64_t>("debug_heartbeat_testing_span");
+ }
+ if (now - i->second.hb_interval_start >= utime_t(hb_avg_time_period, 0)) {
+ uint32_t back_avg = i->second.hb_total_back / i->second.hb_average_count;
+ uint32_t back_min = i->second.hb_min_back;
+ uint32_t back_max = i->second.hb_max_back;
+ uint32_t front_avg = i->second.hb_total_front / i->second.hb_average_count;
+ uint32_t front_min = i->second.hb_min_front;
+ uint32_t front_max = i->second.hb_max_front;
+
+ // Reset for new interval
+ i->second.hb_average_count = 0;
+ i->second.hb_interval_start = now;
+ i->second.hb_total_back = i->second.hb_max_back = 0;
+ i->second.hb_min_back = UINT_MAX;
+ i->second.hb_total_front = i->second.hb_max_front = 0;
+ i->second.hb_min_front = UINT_MAX;
+
+ // Record per-osd interface ping times.
+ // Based on osd_heartbeat_interval, ignoring that it is randomly shorter than this interval.
+ if (i->second.hb_back_pingtime.size() == 0) {
+ ceph_assert(i->second.hb_front_pingtime.size() == 0);
+ for (unsigned k = 0 ; k < hb_vector_size; ++k) {
+ i->second.hb_back_pingtime.push_back(back_avg);
+ i->second.hb_back_min.push_back(back_min);
+ i->second.hb_back_max.push_back(back_max);
+ i->second.hb_front_pingtime.push_back(front_avg);
+ i->second.hb_front_min.push_back(front_min);
+ i->second.hb_front_max.push_back(front_max);
+ ++i->second.hb_index;
+ }
+ } else {
+ int index = i->second.hb_index & (hb_vector_size - 1);
+ i->second.hb_back_pingtime[index] = back_avg;
+ i->second.hb_back_min[index] = back_min;
+ i->second.hb_back_max[index] = back_max;
+ i->second.hb_front_pingtime[index] = front_avg;
+ i->second.hb_front_min[index] = front_min;
+ i->second.hb_front_max[index] = front_max;
+ ++i->second.hb_index;
+ }
+
+ {
+ std::lock_guard l(service.stat_lock);
+ service.osd_stat.hb_pingtime[from].last_update = now.sec();
+ service.osd_stat.hb_pingtime[from].back_last = back_pingtime;
+
+ uint32_t total = 0;
+ uint32_t min = UINT_MAX;
+ uint32_t max = 0;
+ uint32_t count = 0;
+ uint32_t which = 0;
+ uint32_t size = (uint32_t)i->second.hb_back_pingtime.size();
+ for (int32_t k = size - 1 ; k >= 0; --k) {
+ ++count;
+ int index = (i->second.hb_index + k) % size;
+ total += i->second.hb_back_pingtime[index];
+ if (i->second.hb_back_min[index] < min)
+ min = i->second.hb_back_min[index];
+ if (i->second.hb_back_max[index] > max)
+ max = i->second.hb_back_max[index];
+ if (count == 1 || count == 5 || count == 15) {
+ service.osd_stat.hb_pingtime[from].back_pingtime[which] = total / count;
+ service.osd_stat.hb_pingtime[from].back_min[which] = min;
+ service.osd_stat.hb_pingtime[from].back_max[which] = max;
+ which++;
+ if (count == 15)
+ break;
+ }
+ }
+
+ if (i->second.con_front != NULL) {
+ service.osd_stat.hb_pingtime[from].front_last = front_pingtime;
+
+ total = 0;
+ min = UINT_MAX;
+ max = 0;
+ count = 0;
+ which = 0;
+ for (int32_t k = size - 1 ; k >= 0; --k) {
+ ++count;
+ int index = (i->second.hb_index + k) % size;
+ total += i->second.hb_front_pingtime[index];
+ if (i->second.hb_front_min[index] < min)
+ min = i->second.hb_front_min[index];
+ if (i->second.hb_front_max[index] > max)
+ max = i->second.hb_front_max[index];
+ if (count == 1 || count == 5 || count == 15) {
+ service.osd_stat.hb_pingtime[from].front_pingtime[which] = total / count;
+ service.osd_stat.hb_pingtime[from].front_min[which] = min;
+ service.osd_stat.hb_pingtime[from].front_max[which] = max;
+ which++;
+ if (count == 15)
+ break;
+ }
+ }
+ }
+ }
+ } else {
+ std::lock_guard l(service.stat_lock);
+ service.osd_stat.hb_pingtime[from].back_last = back_pingtime;
+ if (i->second.con_front != NULL)
+ service.osd_stat.hb_pingtime[from].front_last = front_pingtime;
+ }
+ i->second.ping_history.erase(i->second.ping_history.begin(), ++acked);
+ }
+
+ if (i->second.is_healthy(now)) {
+ // Cancel false reports
+ auto failure_queue_entry = failure_queue.find(from);
+ if (failure_queue_entry != failure_queue.end()) {
+ dout(10) << "handle_osd_ping canceling queued "
+ << "failure report for osd." << from << dendl;
+ failure_queue.erase(failure_queue_entry);
+ }
+
+ auto failure_pending_entry = failure_pending.find(from);
+ if (failure_pending_entry != failure_pending.end()) {
+ dout(10) << "handle_osd_ping canceling in-flight "
+ << "failure report for osd." << from << dendl;
+ send_still_alive(curmap->get_epoch(),
+ from,
+ failure_pending_entry->second.second);
+ failure_pending.erase(failure_pending_entry);
+ }
+ }
+ } else {
+ // old replies, deprecated by newly sent pings.
+ dout(10) << "handle_osd_ping no pending ping(sent at " << m->ping_stamp
+ << ") is found, treat as covered by newly sent pings "
+ << "and ignore"
+ << dendl;
+ }
+ }
+
+ if (m->map_epoch &&
+ curmap->is_up(from)) {
+ if (is_active()) {
+ ConnectionRef cluster_con = service.get_con_osd_cluster(
+ from, curmap->get_epoch());
+ if (cluster_con) {
+ service.maybe_share_map(cluster_con.get(), curmap, m->map_epoch);
+ }
+ }
+ }
+
+ s->stamps->got_ping_reply(
+ mnow,
+ m->mono_send_stamp,
+ m->delta_ub);
+ dout(20) << __func__ << " new stamps " << *s->stamps << dendl;
+ }
+ break;
+
+ case MOSDPing::YOU_DIED:
+ dout(10) << "handle_osd_ping " << m->get_source_inst()
+ << " says i am down in " << m->map_epoch << dendl;
+ osdmap_subscribe(curmap->get_epoch()+1, false);
+ break;
+ }
+
+ heartbeat_lock.unlock();
+ m->put();
+}
+
+void OSD::heartbeat_entry()
+{
+ std::unique_lock l(heartbeat_lock);
+ if (is_stopping())
+ return;
+ while (!heartbeat_stop) {
+ heartbeat();
+
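+    // sleep 0.5s plus a random fraction (up to 0.9) of osd_heartbeat_interval,
+    // unless randomization is disabled for testing, so peer pings do not
+    // synchronize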
+ double wait;
+ if (cct->_conf.get_val<bool>("debug_disable_randomized_ping")) {
+ wait = (float)cct->_conf->osd_heartbeat_interval;
+ } else {
+ wait = .5 + ((float)(rand() % 10)/10.0) * (float)cct->_conf->osd_heartbeat_interval;
+ }
+ auto w = ceph::make_timespan(wait);
+ dout(30) << "heartbeat_entry sleeping for " << wait << dendl;
+ heartbeat_cond.wait_for(l, w);
+ if (is_stopping())
+ return;
+ dout(30) << "heartbeat_entry woke up" << dendl;
+ }
+}
+
+void OSD::heartbeat_check()
+{
+ ceph_assert(ceph_mutex_is_locked(heartbeat_lock));
+ utime_t now = ceph_clock_now();
+
+ // check for incoming heartbeats (move me elsewhere?)
+ for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
+ p != heartbeat_peers.end();
+ ++p) {
+
+ if (p->second.first_tx == utime_t()) {
+ dout(25) << "heartbeat_check we haven't sent ping to osd." << p->first
+ << " yet, skipping" << dendl;
+ continue;
+ }
+
+ dout(25) << "heartbeat_check osd." << p->first
+ << " first_tx " << p->second.first_tx
+ << " last_tx " << p->second.last_tx
+ << " last_rx_back " << p->second.last_rx_back
+ << " last_rx_front " << p->second.last_rx_front
+ << dendl;
+ if (p->second.is_unhealthy(now)) {
+ utime_t oldest_deadline = p->second.ping_history.begin()->second.first;
+ if (p->second.last_rx_back == utime_t() ||
+ p->second.last_rx_front == utime_t()) {
+ derr << "heartbeat_check: no reply from "
+ << p->second.con_front->get_peer_addr().get_sockaddr()
+ << " osd." << p->first
+ << " ever on either front or back, first ping sent "
+ << p->second.first_tx
+ << " (oldest deadline " << oldest_deadline << ")"
+ << dendl;
+ // fail
+ failure_queue[p->first] = p->second.first_tx;
+ } else {
+ derr << "heartbeat_check: no reply from "
+ << p->second.con_front->get_peer_addr().get_sockaddr()
+ << " osd." << p->first << " since back " << p->second.last_rx_back
+ << " front " << p->second.last_rx_front
+ << " (oldest deadline " << oldest_deadline << ")"
+ << dendl;
+ // fail
+ failure_queue[p->first] = std::min(p->second.last_rx_back, p->second.last_rx_front);
+ }
+ }
+ }
+}
+
+void OSD::heartbeat()
+{
+ ceph_assert(ceph_mutex_is_locked_by_me(heartbeat_lock));
+ dout(30) << "heartbeat" << dendl;
+
+ // get CPU load avg
+ double loadavgs[1];
+ int hb_interval = cct->_conf->osd_heartbeat_interval;
+ int n_samples = 86400;
+ if (hb_interval > 1) {
+ n_samples /= hb_interval;
+ if (n_samples < 1)
+ n_samples = 1;
+ }
+
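+  // daily_loadavg is an exponential moving average of the 1-minute load,
+  // weighted over roughly one day's worth of heartbeat samples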
+ if (getloadavg(loadavgs, 1) == 1) {
+ logger->set(l_osd_loadavg, 100 * loadavgs[0]);
+ daily_loadavg = (daily_loadavg * (n_samples - 1) + loadavgs[0]) / n_samples;
+ dout(30) << "heartbeat: daily_loadavg " << daily_loadavg << dendl;
+ }
+
+ dout(30) << "heartbeat checking stats" << dendl;
+
+ // refresh peer list and osd stats
+ vector<int> hb_peers;
+ for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
+ p != heartbeat_peers.end();
+ ++p)
+ hb_peers.push_back(p->first);
+
+ auto new_stat = service.set_osd_stat(hb_peers, get_num_pgs());
+ dout(5) << __func__ << " " << new_stat << dendl;
+ ceph_assert(new_stat.statfs.total);
+
+ float pratio;
+ float ratio = service.compute_adjusted_ratio(new_stat, &pratio);
+
+ service.check_full_status(ratio, pratio);
+
+ utime_t now = ceph_clock_now();
+ auto mnow = service.get_mnow();
+ utime_t deadline = now;
+ deadline += cct->_conf->osd_heartbeat_grace;
+
+ // send heartbeats
+ for (map<int,HeartbeatInfo>::iterator i = heartbeat_peers.begin();
+ i != heartbeat_peers.end();
+ ++i) {
+ int peer = i->first;
+ Session *s = static_cast<Session*>(i->second.con_back->get_priv().get());
+ if (!s) {
+ dout(30) << "heartbeat osd." << peer << " has no open con" << dendl;
+ continue;
+ }
+ dout(30) << "heartbeat sending ping to osd." << peer << dendl;
+
+ i->second.last_tx = now;
+ if (i->second.first_tx == utime_t())
+ i->second.first_tx = now;
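+    // record the ack deadline for this ping and how many connection replies
+    // are still outstanding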
+ i->second.ping_history[now] = make_pair(deadline,
+ HeartbeatInfo::HEARTBEAT_MAX_CONN);
+ if (i->second.hb_interval_start == utime_t())
+ i->second.hb_interval_start = now;
+
+ std::optional<ceph::signedspan> delta_ub;
+ s->stamps->sent_ping(&delta_ub);
+
+ i->second.con_back->send_message(
+ new MOSDPing(monc->get_fsid(),
+ service.get_osdmap_epoch(),
+ MOSDPing::PING,
+ now,
+ mnow,
+ mnow,
+ service.get_up_epoch(),
+ cct->_conf->osd_heartbeat_min_size,
+ delta_ub));
+
+ if (i->second.con_front)
+ i->second.con_front->send_message(
+ new MOSDPing(monc->get_fsid(),
+ service.get_osdmap_epoch(),
+ MOSDPing::PING,
+ now,
+ mnow,
+ mnow,
+ service.get_up_epoch(),
+ cct->_conf->osd_heartbeat_min_size,
+ delta_ub));
+ }
+
+ logger->set(l_osd_hb_to, heartbeat_peers.size());
+
+ // hmm.. am i all alone?
+ dout(30) << "heartbeat lonely?" << dendl;
+ if (heartbeat_peers.empty()) {
+ if (now - last_mon_heartbeat > cct->_conf->osd_mon_heartbeat_interval && is_active()) {
+ last_mon_heartbeat = now;
+ dout(10) << "i have no heartbeat peers; checking mon for new map" << dendl;
+ osdmap_subscribe(get_osdmap_epoch() + 1, false);
+ }
+ }
+
+ dout(30) << "heartbeat done" << dendl;
+}
+
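+// reset handler for heartbeat connections: if the dropped connection belongs
+// to a current heartbeat peer, reopen its back/front connections, or drop the
+// peer entirely if the current osdmap no longer gives us an address for it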
+bool OSD::heartbeat_reset(Connection *con)
+{
+ std::lock_guard l(heartbeat_lock);
+ auto s = con->get_priv();
+ dout(20) << __func__ << " con " << con << " s " << s.get() << dendl;
+ con->set_priv(nullptr);
+ if (s) {
+ if (is_stopping()) {
+ return true;
+ }
+ auto session = static_cast<Session*>(s.get());
+ auto p = heartbeat_peers.find(session->peer);
+ if (p != heartbeat_peers.end() &&
+ (p->second.con_back == con ||
+ p->second.con_front == con)) {
+ dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
+ << ", reopening" << dendl;
+ p->second.clear_mark_down(con);
+ pair<ConnectionRef,ConnectionRef> newcon = service.get_con_osd_hb(p->second.peer, p->second.epoch);
+ if (newcon.first) {
+ p->second.con_back = newcon.first.get();
+ p->second.con_back->set_priv(s);
+ if (newcon.second) {
+ p->second.con_front = newcon.second.get();
+ p->second.con_front->set_priv(s);
+ }
+ p->second.ping_history.clear();
+ } else {
+ dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
+ << ", raced with osdmap update, closing out peer" << dendl;
+ heartbeat_peers.erase(p);
+ }
+ } else {
+ dout(10) << "heartbeat_reset closing (old) failed hb con " << con << dendl;
+ }
+ }
+ return true;
+}
+
+
+
+// =========================================
+
+void OSD::tick()
+{
+ ceph_assert(ceph_mutex_is_locked(osd_lock));
+ dout(10) << "tick" << dendl;
+
+ utime_t now = ceph_clock_now();
+ // throw out any obsolete markdown log
+ utime_t grace = utime_t(cct->_conf->osd_max_markdown_period, 0);
+ while (!osd_markdown_log.empty() &&
+ osd_markdown_log.front() + grace < now)
+ osd_markdown_log.pop_front();
+
+ if (is_active() || is_waiting_for_healthy()) {
+ maybe_update_heartbeat_peers();
+ }
+
+ if (is_waiting_for_healthy()) {
+ start_boot();
+ }
+
+ if (is_waiting_for_healthy() || is_booting()) {
+ std::lock_guard l(heartbeat_lock);
+ if (now - last_mon_heartbeat > cct->_conf->osd_mon_heartbeat_interval) {
+ last_mon_heartbeat = now;
+ dout(1) << __func__ << " checking mon for new map" << dendl;
+ osdmap_subscribe(get_osdmap_epoch() + 1, false);
+ }
+ }
+
+ do_waiters();
+
+ // scrub purged_snaps every deep scrub interval
+ {
+ const utime_t last = superblock.last_purged_snaps_scrub;
+ utime_t next = last;
+ next += cct->_conf->osd_scrub_min_interval;
+ std::mt19937 rng;
+ // use a seed that is stable for each scrub interval, but varies
+ // by OSD to avoid any herds.
+ rng.seed(whoami + superblock.last_purged_snaps_scrub.sec());
+ double r = (rng() % 1024) / 1024.0;
+ next +=
+ cct->_conf->osd_scrub_min_interval *
+ cct->_conf->osd_scrub_interval_randomize_ratio * r;
+ if (next < ceph_clock_now()) {
+ dout(20) << __func__ << " last_purged_snaps_scrub " << last
+ << " next " << next << " ... now" << dendl;
+ scrub_purged_snaps();
+ } else {
+ dout(20) << __func__ << " last_purged_snaps_scrub " << last
+ << " next " << next << dendl;
+ }
+ }
+
+ tick_timer.add_event_after(get_tick_interval(), new C_Tick(this));
+}
+
+void OSD::tick_without_osd_lock()
+{
+ ceph_assert(ceph_mutex_is_locked(tick_timer_lock));
+ dout(10) << "tick_without_osd_lock" << dendl;
+
+ logger->set(l_osd_cached_crc, ceph::buffer::get_cached_crc());
+ logger->set(l_osd_cached_crc_adjusted, ceph::buffer::get_cached_crc_adjusted());
+ logger->set(l_osd_missed_crc, ceph::buffer::get_missed_crc());
+
+ // refresh osd stats
+ struct store_statfs_t stbuf;
+ osd_alert_list_t alerts;
+ int r = store->statfs(&stbuf, &alerts);
+ ceph_assert(r == 0);
+ service.set_statfs(stbuf, alerts);
+
+ // osd_lock is not being held, which means the OSD state
+ // might change when doing the monitor report
+ if (is_active() || is_waiting_for_healthy()) {
+ {
+ std::lock_guard l{heartbeat_lock};
+ heartbeat_check();
+ }
+ map_lock.lock_shared();
+ std::lock_guard l(mon_report_lock);
+
+ // mon report?
+ utime_t now = ceph_clock_now();
+ if (service.need_fullness_update() ||
+ now - last_mon_report > cct->_conf->osd_mon_report_interval) {
+ last_mon_report = now;
+ send_full_update();
+ send_failures();
+ }
+ map_lock.unlock_shared();
+
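+    // if any shard has queued work that needs a newer osdmap than we hold,
+    // subscribe for more maps now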
+ epoch_t max_waiting_epoch = 0;
+ for (auto s : shards) {
+ max_waiting_epoch = std::max(max_waiting_epoch,
+ s->get_max_waiting_epoch());
+ }
+ if (max_waiting_epoch > get_osdmap()->get_epoch()) {
+ dout(20) << __func__ << " max_waiting_epoch " << max_waiting_epoch
+ << ", requesting new map" << dendl;
+ osdmap_subscribe(superblock.newest_map + 1, false);
+ }
+ }
+
+ if (is_active()) {
+ if (!scrub_random_backoff()) {
+ sched_scrub();
+ }
+ service.promote_throttle_recalibrate();
+ resume_creating_pg();
+ bool need_send_beacon = false;
+ const auto now = ceph::coarse_mono_clock::now();
+ {
+      // borrow lec lock to protect last_sent_beacon from changing
+ std::lock_guard l{min_last_epoch_clean_lock};
+ const auto elapsed = now - last_sent_beacon;
+ if (std::chrono::duration_cast<std::chrono::seconds>(elapsed).count() >
+ cct->_conf->osd_beacon_report_interval) {
+ need_send_beacon = true;
+ }
+ }
+ if (need_send_beacon) {
+ send_beacon(now);
+ }
+ }
+
+ mgrc.update_daemon_health(get_health_metrics());
+ service.kick_recovery_queue();
+ tick_timer_without_osd_lock.add_event_after(get_tick_interval(),
+ new C_Tick_WithoutOSDLock(this));
+}
+
+// Usage:
+// setomapval <pool-id> [namespace/]<obj-name> <key> <val>
+// rmomapkey <pool-id> [namespace/]<obj-name> <key>
+// setomapheader <pool-id> [namespace/]<obj-name> <header>
+// getomap <pool> [namespace/]<obj-name>
+// truncobj <pool-id> [namespace/]<obj-name> <newlen>
+// injectmdataerr [namespace/]<obj-name> [shardid]
+// injectdataerr [namespace/]<obj-name> [shardid]
+//
+// set_recovery_delay [utime]
+void TestOpsSocketHook::test_ops(OSDService *service, ObjectStore *store,
+ std::string_view command,
+ const cmdmap_t& cmdmap, ostream &ss)
+{
+ //Test support
+ //Support changing the omap on a single osd by using the Admin Socket to
+ //directly request the osd make a change.
+ if (command == "setomapval" || command == "rmomapkey" ||
+ command == "setomapheader" || command == "getomap" ||
+ command == "truncobj" || command == "injectmdataerr" ||
+ command == "injectdataerr"
+ ) {
+ pg_t rawpg;
+ int64_t pool;
+ OSDMapRef curmap = service->get_osdmap();
+ int r = -1;
+
+ string poolstr;
+
+ cmd_getval(cmdmap, "pool", poolstr);
+ pool = curmap->lookup_pg_pool_name(poolstr);
+ //If we can't find it by name then maybe id specified
+ if (pool < 0 && isdigit(poolstr[0]))
+ pool = atoll(poolstr.c_str());
+ if (pool < 0) {
+      ss << "Invalid pool '" << poolstr << "'";
+ return;
+ }
+
+ string objname, nspace;
+ cmd_getval(cmdmap, "objname", objname);
+ std::size_t found = objname.find_first_of('/');
+ if (found != string::npos) {
+ nspace = objname.substr(0, found);
+ objname = objname.substr(found+1);
+ }
+ object_locator_t oloc(pool, nspace);
+ r = curmap->object_locator_to_pg(object_t(objname), oloc, rawpg);
+
+ if (r < 0) {
+ ss << "Invalid namespace/objname";
+ return;
+ }
+
+ int64_t shardid;
+ cmd_getval(cmdmap, "shardid", shardid, int64_t(shard_id_t::NO_SHARD));
+ hobject_t obj(object_t(objname), string(""), CEPH_NOSNAP, rawpg.ps(), pool, nspace);
+ ghobject_t gobj(obj, ghobject_t::NO_GEN, shard_id_t(uint8_t(shardid)));
+ spg_t pgid(curmap->raw_pg_to_pg(rawpg), shard_id_t(shardid));
+ if (curmap->pg_is_ec(rawpg)) {
+ if ((command != "injectdataerr") && (command != "injectmdataerr")) {
+        ss << "Must not be called on an EC pool, except for injectdataerr or injectmdataerr";
+ return;
+ }
+ }
+
+ ObjectStore::Transaction t;
+
+ if (command == "setomapval") {
+ map<string, bufferlist> newattrs;
+ bufferlist val;
+ string key, valstr;
+ cmd_getval(cmdmap, "key", key);
+ cmd_getval(cmdmap, "val", valstr);
+
+ val.append(valstr);
+ newattrs[key] = val;
+ t.omap_setkeys(coll_t(pgid), ghobject_t(obj), newattrs);
+ r = store->queue_transaction(service->meta_ch, std::move(t));
+ if (r < 0)
+ ss << "error=" << r;
+ else
+ ss << "ok";
+ } else if (command == "rmomapkey") {
+ string key;
+ cmd_getval(cmdmap, "key", key);
+
+ t.omap_rmkey(coll_t(pgid), ghobject_t(obj), key);
+ r = store->queue_transaction(service->meta_ch, std::move(t));
+ if (r < 0)
+ ss << "error=" << r;
+ else
+ ss << "ok";
+ } else if (command == "setomapheader") {
+ bufferlist newheader;
+ string headerstr;
+
+ cmd_getval(cmdmap, "header", headerstr);
+ newheader.append(headerstr);
+ t.omap_setheader(coll_t(pgid), ghobject_t(obj), newheader);
+ r = store->queue_transaction(service->meta_ch, std::move(t));
+ if (r < 0)
+ ss << "error=" << r;
+ else
+ ss << "ok";
+ } else if (command == "getomap") {
+ //Debug: Output entire omap
+ bufferlist hdrbl;
+ map<string, bufferlist> keyvals;
+ auto ch = store->open_collection(coll_t(pgid));
+ if (!ch) {
+ ss << "unable to open collection for " << pgid;
+ r = -ENOENT;
+ } else {
+ r = store->omap_get(ch, ghobject_t(obj), &hdrbl, &keyvals);
+ if (r >= 0) {
+ ss << "header=" << string(hdrbl.c_str(), hdrbl.length());
+ for (map<string, bufferlist>::iterator it = keyvals.begin();
+ it != keyvals.end(); ++it)
+ ss << " key=" << (*it).first << " val="
+ << string((*it).second.c_str(), (*it).second.length());
+ } else {
+ ss << "error=" << r;
+ }
+ }
+ } else if (command == "truncobj") {
+ int64_t trunclen;
+ cmd_getval(cmdmap, "len", trunclen);
+ t.truncate(coll_t(pgid), ghobject_t(obj), trunclen);
+ r = store->queue_transaction(service->meta_ch, std::move(t));
+ if (r < 0)
+ ss << "error=" << r;
+ else
+ ss << "ok";
+ } else if (command == "injectdataerr") {
+ store->inject_data_error(gobj);
+ ss << "ok";
+ } else if (command == "injectmdataerr") {
+ store->inject_mdata_error(gobj);
+ ss << "ok";
+ }
+ return;
+ }
+ if (command == "set_recovery_delay") {
+ int64_t delay;
+ cmd_getval(cmdmap, "utime", delay, (int64_t)0);
+ ostringstream oss;
+ oss << delay;
+ int r = service->cct->_conf.set_val("osd_recovery_delay_start",
+ oss.str().c_str());
+ if (r != 0) {
+ ss << "set_recovery_delay: error setting "
+ << "osd_recovery_delay_start to '" << delay << "': error "
+ << r;
+ return;
+ }
+ service->cct->_conf.apply_changes(nullptr);
+ ss << "set_recovery_delay: set osd_recovery_delay_start "
+ << "to " << service->cct->_conf->osd_recovery_delay_start;
+ return;
+ }
+ if (command == "injectfull") {
+ int64_t count;
+ string type;
+ OSDService::s_names state;
+ cmd_getval(cmdmap, "type", type, string("full"));
+ cmd_getval(cmdmap, "count", count, (int64_t)-1);
+ if (type == "none" || count == 0) {
+ type = "none";
+ count = 0;
+ }
+ state = service->get_full_state(type);
+ if (state == OSDService::s_names::INVALID) {
+      ss << "Invalid type; use one of: none, nearfull, backfillfull, full, failsafe";
+ return;
+ }
+ service->set_injectfull(state, count);
+ return;
+ }
+ ss << "Internal error - command=" << command;
+}
+
+// =========================================
+
+void OSD::ms_handle_connect(Connection *con)
+{
+ dout(10) << __func__ << " con " << con << dendl;
+ if (con->get_peer_type() == CEPH_ENTITY_TYPE_MON) {
+ std::lock_guard l(osd_lock);
+ if (is_stopping())
+ return;
+ dout(10) << __func__ << " on mon" << dendl;
+
+ if (is_preboot()) {
+ start_boot();
+ } else if (is_booting()) {
+ _send_boot(); // resend boot message
+ } else {
+ map_lock.lock_shared();
+ std::lock_guard l2(mon_report_lock);
+
+ utime_t now = ceph_clock_now();
+ last_mon_report = now;
+
+ // resend everything, it's a new session
+ send_full_update();
+ send_alive();
+ service.requeue_pg_temp();
+ service.clear_sent_ready_to_merge();
+ service.send_pg_temp();
+ service.send_ready_to_merge();
+ service.send_pg_created();
+ requeue_failures();
+ send_failures();
+
+ map_lock.unlock_shared();
+ if (is_active()) {
+ send_beacon(ceph::coarse_mono_clock::now());
+ }
+ }
+
+ // full map requests may happen while active or pre-boot
+ if (requested_full_first) {
+ rerequest_full_maps();
+ }
+ }
+}
+
+void OSD::ms_handle_fast_connect(Connection *con)
+{
+ if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON &&
+ con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) {
+ if (auto s = ceph::ref_cast<Session>(con->get_priv()); !s) {
+ s = ceph::make_ref<Session>(cct, con);
+ con->set_priv(s);
+ dout(10) << " new session (outgoing) " << s << " con=" << s->con
+ << " addr=" << s->con->get_peer_addr() << dendl;
+ // we don't connect to clients
+ ceph_assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD);
+ s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD);
+ }
+ }
+}
+
+void OSD::ms_handle_fast_accept(Connection *con)
+{
+ if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON &&
+ con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) {
+ if (auto s = ceph::ref_cast<Session>(con->get_priv()); !s) {
+ s = ceph::make_ref<Session>(cct, con);
+ con->set_priv(s);
+      dout(10) << "new session (incoming) " << s << " con=" << con
+ << " addr=" << con->get_peer_addr()
+ << " must have raced with connect" << dendl;
+ ceph_assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD);
+ s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD);
+ }
+ }
+}
+
+bool OSD::ms_handle_reset(Connection *con)
+{
+ auto session = ceph::ref_cast<Session>(con->get_priv());
+ dout(2) << "ms_handle_reset con " << con << " session " << session.get() << dendl;
+ if (!session)
+ return false;
+ session->wstate.reset(con);
+ session->con->set_priv(nullptr);
+ session->con.reset(); // break con <-> session ref cycle
+ // note that we break session->con *before* the session_handle_reset
+ // cleanup below. this avoids a race between us and
+ // PG::add_backoff, Session::check_backoff, etc.
+ session_handle_reset(session);
+ return true;
+}
+
+bool OSD::ms_handle_refused(Connection *con)
+{
+ if (!cct->_conf->osd_fast_fail_on_connection_refused)
+ return false;
+
+ auto session = ceph::ref_cast<Session>(con->get_priv());
+ dout(2) << "ms_handle_refused con " << con << " session " << session.get() << dendl;
+ if (!session)
+ return false;
+ int type = con->get_peer_type();
+ // handle only OSD failures here
+ if (monc && (type == CEPH_ENTITY_TYPE_OSD)) {
+ OSDMapRef osdmap = get_osdmap();
+ if (osdmap) {
+ int id = osdmap->identify_osd_on_all_channels(con->get_peer_addr());
+ if (id >= 0 && osdmap->is_up(id)) {
+ // I'm cheating mon heartbeat grace logic, because we know it's not going
+ // to respawn alone. +1 so we won't hit any boundary case.
+ monc->send_mon_message(
+ new MOSDFailure(
+ monc->get_fsid(),
+ id,
+ osdmap->get_addrs(id),
+ cct->_conf->osd_heartbeat_grace + 1,
+ osdmap->get_epoch(),
+ MOSDFailure::FLAG_IMMEDIATE | MOSDFailure::FLAG_FAILED
+ ));
+ }
+ }
+ }
+ return true;
+}
+
+struct CB_OSD_GetVersion {
+ OSD *osd;
+ explicit CB_OSD_GetVersion(OSD *o) : osd(o) {}
+ void operator ()(boost::system::error_code ec, version_t newest,
+ version_t oldest) {
+ if (!ec)
+ osd->_got_mon_epochs(oldest, newest);
+ }
+};
+
+void OSD::start_boot()
+{
+ if (!_is_healthy()) {
+ // if we are not healthy, do not mark ourselves up (yet)
+ dout(1) << "not healthy; waiting to boot" << dendl;
+ if (!is_waiting_for_healthy())
+ start_waiting_for_healthy();
+ // send pings sooner rather than later
+ heartbeat_kick();
+ return;
+ }
+ dout(1) << __func__ << dendl;
+ set_state(STATE_PREBOOT);
+ dout(10) << "start_boot - have maps " << superblock.oldest_map
+ << ".." << superblock.newest_map << dendl;
+ monc->get_version("osdmap", CB_OSD_GetVersion(this));
+}
+
+void OSD::_got_mon_epochs(epoch_t oldest, epoch_t newest)
+{
+ std::lock_guard l(osd_lock);
+ if (is_preboot()) {
+ _preboot(oldest, newest);
+ }
+}
+
+void OSD::_preboot(epoch_t oldest, epoch_t newest)
+{
+ ceph_assert(is_preboot());
+  dout(10) << __func__ << " mon has osdmaps "
+ << oldest << ".." << newest << dendl;
+
+ // ensure our local fullness awareness is accurate
+ {
+ std::lock_guard l(heartbeat_lock);
+ heartbeat();
+ }
+
+ const auto& monmap = monc->monmap;
+ const auto osdmap = get_osdmap();
+ // if our map within recent history, try to add ourselves to the osdmap.
+ if (osdmap->get_epoch() == 0) {
+ derr << "waiting for initial osdmap" << dendl;
+ } else if (osdmap->is_destroyed(whoami)) {
+ derr << "osdmap says I am destroyed" << dendl;
+ // provide a small margin so we don't livelock seeing if we
+ // un-destroyed ourselves.
+ if (osdmap->get_epoch() > newest - 1) {
+ exit(0);
+ }
+ } else if (osdmap->is_noup(whoami)) {
+ derr << "osdmap NOUP flag is set, waiting for it to clear" << dendl;
+ } else if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE)) {
+ derr << "osdmap SORTBITWISE OSDMap flag is NOT set; please set it"
+ << dendl;
+ } else if (service.need_fullness_update()) {
+ derr << "osdmap fullness state needs update" << dendl;
+ send_full_update();
+ } else if (monmap.min_mon_release >= ceph_release_t::octopus &&
+ superblock.purged_snaps_last < superblock.current_epoch) {
+ dout(10) << __func__ << " purged_snaps_last " << superblock.purged_snaps_last
+ << " < newest_map " << superblock.current_epoch << dendl;
+ _get_purged_snaps();
+ } else if (osdmap->get_epoch() >= oldest - 1 &&
+ osdmap->get_epoch() + cct->_conf->osd_map_message_max > newest) {
+
+ // wait for pgs to fully catch up in a different thread, since
+ // this thread might be required for splitting and merging PGs to
+ // make progress.
+ boot_finisher.queue(
+ new LambdaContext(
+ [this](int r) {
+ std::unique_lock l(osd_lock);
+ if (is_preboot()) {
+ dout(10) << __func__ << " waiting for peering work to drain"
+ << dendl;
+ l.unlock();
+ for (auto shard : shards) {
+ shard->wait_min_pg_epoch(get_osdmap_epoch());
+ }
+ l.lock();
+ }
+ if (is_preboot()) {
+ _send_boot();
+ }
+ }));
+ return;
+ }
+
+ // get all the latest maps
+ if (osdmap->get_epoch() + 1 >= oldest)
+ osdmap_subscribe(osdmap->get_epoch() + 1, false);
+ else
+ osdmap_subscribe(oldest - 1, true);
+}
+
+void OSD::_get_purged_snaps()
+{
+  // NOTE: this is a naive, stateless implementation. It may send multiple
+ // overlapping requests to the mon, which will be somewhat inefficient, but
+ // it should be reliable.
+ dout(10) << __func__ << " purged_snaps_last " << superblock.purged_snaps_last
+ << ", newest_map " << superblock.current_epoch << dendl;
+ MMonGetPurgedSnaps *m = new MMonGetPurgedSnaps(
+ superblock.purged_snaps_last + 1,
+ superblock.current_epoch + 1);
+ monc->send_mon_message(m);
+}
+
+void OSD::handle_get_purged_snaps_reply(MMonGetPurgedSnapsReply *m)
+{
+ dout(10) << __func__ << " " << *m << dendl;
+ ObjectStore::Transaction t;
+ if (!is_preboot() ||
+ m->last < superblock.purged_snaps_last) {
+ goto out;
+ }
+ SnapMapper::record_purged_snaps(cct, store, service.meta_ch,
+ make_purged_snaps_oid(), &t,
+ m->purged_snaps);
+ superblock.purged_snaps_last = m->last;
+ write_superblock(t);
+ store->queue_transaction(
+ service.meta_ch,
+ std::move(t));
+ service.publish_superblock(superblock);
+ if (m->last < superblock.current_epoch) {
+ _get_purged_snaps();
+ } else {
+ start_boot();
+ }
+out:
+ m->put();
+}
+
+void OSD::send_full_update()
+{
+ if (!service.need_fullness_update())
+ return;
+ unsigned state = 0;
+ if (service.is_full()) {
+ state = CEPH_OSD_FULL;
+ } else if (service.is_backfillfull()) {
+ state = CEPH_OSD_BACKFILLFULL;
+ } else if (service.is_nearfull()) {
+ state = CEPH_OSD_NEARFULL;
+ }
+ set<string> s;
+ OSDMap::calc_state_set(state, s);
+ dout(10) << __func__ << " want state " << s << dendl;
+ monc->send_mon_message(new MOSDFull(get_osdmap_epoch(), state));
+}
+
+void OSD::start_waiting_for_healthy()
+{
+ dout(1) << "start_waiting_for_healthy" << dendl;
+ set_state(STATE_WAITING_FOR_HEALTHY);
+ last_heartbeat_resample = utime_t();
+
+ // subscribe to osdmap updates, in case our peers really are known to be dead
+ osdmap_subscribe(get_osdmap_epoch() + 1, false);
+}
+
+bool OSD::_is_healthy()
+{
+ if (!cct->get_heartbeat_map()->is_healthy()) {
+ dout(1) << "is_healthy false -- internal heartbeat failed" << dendl;
+ return false;
+ }
+
+ if (is_waiting_for_healthy()) {
+ utime_t now = ceph_clock_now();
+ if (osd_markdown_log.empty()) {
+ dout(5) << __func__ << " force returning true since last markdown"
+ << " was " << cct->_conf->osd_max_markdown_period
+ << "s ago" << dendl;
+ return true;
+ }
+ std::lock_guard l(heartbeat_lock);
+ int num = 0, up = 0;
+ for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
+ p != heartbeat_peers.end();
+ ++p) {
+ if (p->second.is_healthy(now))
+ ++up;
+ ++num;
+ }
+ if ((float)up < (float)num * cct->_conf->osd_heartbeat_min_healthy_ratio) {
+ dout(1) << "is_healthy false -- only " << up << "/" << num << " up peers (less than "
+ << int(cct->_conf->osd_heartbeat_min_healthy_ratio * 100.0) << "%)" << dendl;
+ return false;
+ }
+ }
+
+ return true;
+}
+
+void OSD::_send_boot()
+{
+ dout(10) << "_send_boot" << dendl;
+ Connection *local_connection =
+ cluster_messenger->get_loopback_connection().get();
+ entity_addrvec_t client_addrs = client_messenger->get_myaddrs();
+ entity_addrvec_t cluster_addrs = cluster_messenger->get_myaddrs();
+ entity_addrvec_t hb_back_addrs = hb_back_server_messenger->get_myaddrs();
+ entity_addrvec_t hb_front_addrs = hb_front_server_messenger->get_myaddrs();
+
+ dout(20) << " initial client_addrs " << client_addrs
+ << ", cluster_addrs " << cluster_addrs
+ << ", hb_back_addrs " << hb_back_addrs
+ << ", hb_front_addrs " << hb_front_addrs
+ << dendl;
+ if (cluster_messenger->set_addr_unknowns(client_addrs)) {
+ dout(10) << " assuming cluster_addrs match client_addrs "
+ << client_addrs << dendl;
+ cluster_addrs = cluster_messenger->get_myaddrs();
+ }
+ if (auto session = local_connection->get_priv(); !session) {
+ cluster_messenger->ms_deliver_handle_fast_connect(local_connection);
+ }
+
+ local_connection = hb_back_server_messenger->get_loopback_connection().get();
+ if (hb_back_server_messenger->set_addr_unknowns(cluster_addrs)) {
+ dout(10) << " assuming hb_back_addrs match cluster_addrs "
+ << cluster_addrs << dendl;
+ hb_back_addrs = hb_back_server_messenger->get_myaddrs();
+ }
+ if (auto session = local_connection->get_priv(); !session) {
+ hb_back_server_messenger->ms_deliver_handle_fast_connect(local_connection);
+ }
+
+ local_connection = hb_front_server_messenger->get_loopback_connection().get();
+ if (hb_front_server_messenger->set_addr_unknowns(client_addrs)) {
+ dout(10) << " assuming hb_front_addrs match client_addrs "
+ << client_addrs << dendl;
+ hb_front_addrs = hb_front_server_messenger->get_myaddrs();
+ }
+ if (auto session = local_connection->get_priv(); !session) {
+ hb_front_server_messenger->ms_deliver_handle_fast_connect(local_connection);
+ }
+
+ // we now know what our front and back addrs will be, and we are
+ // about to tell the mon what our metadata (including numa bindings)
+ // are, so now is a good time!
+ set_numa_affinity();
+
+ MOSDBoot *mboot = new MOSDBoot(
+ superblock, get_osdmap_epoch(), service.get_boot_epoch(),
+ hb_back_addrs, hb_front_addrs, cluster_addrs,
+ CEPH_FEATURES_ALL);
+ dout(10) << " final client_addrs " << client_addrs
+ << ", cluster_addrs " << cluster_addrs
+ << ", hb_back_addrs " << hb_back_addrs
+ << ", hb_front_addrs " << hb_front_addrs
+ << dendl;
+ _collect_metadata(&mboot->metadata);
+ monc->send_mon_message(mboot);
+ set_state(STATE_BOOTING);
+}
+
+void OSD::_collect_metadata(map<string,string> *pm)
+{
+ // config info
+ (*pm)["osd_data"] = dev_path;
+ if (store->get_type() == "filestore") {
+ // not applicable for bluestore
+ (*pm)["osd_journal"] = journal_path;
+ }
+ (*pm)["front_addr"] = stringify(client_messenger->get_myaddrs());
+ (*pm)["back_addr"] = stringify(cluster_messenger->get_myaddrs());
+ (*pm)["hb_front_addr"] = stringify(hb_front_server_messenger->get_myaddrs());
+ (*pm)["hb_back_addr"] = stringify(hb_back_server_messenger->get_myaddrs());
+
+ // backend
+ (*pm)["osd_objectstore"] = store->get_type();
+ (*pm)["rotational"] = store_is_rotational ? "1" : "0";
+ (*pm)["journal_rotational"] = journal_is_rotational ? "1" : "0";
+ (*pm)["default_device_class"] = store->get_default_device_class();
+ string osdspec_affinity;
+ int r = store->read_meta("osdspec_affinity", &osdspec_affinity);
+ if (r < 0 || osdspec_affinity.empty()) {
+ osdspec_affinity = "";
+ }
+ (*pm)["osdspec_affinity"] = osdspec_affinity;
+ string ceph_version_when_created;
+ r = store->read_meta("ceph_version_when_created", &ceph_version_when_created);
+  if (r < 0 || ceph_version_when_created.empty()) {
+ ceph_version_when_created = "";
+ }
+ (*pm)["ceph_version_when_created"] = ceph_version_when_created;
+ string created_at;
+ r = store->read_meta("created_at", &created_at);
+ if (r < 0 || created_at.empty()) {
+ created_at = "";
+ }
+ (*pm)["created_at"] = created_at;
+ store->collect_metadata(pm);
+
+ collect_sys_info(pm, cct);
+
+ (*pm)["front_iface"] = pick_iface(
+ cct,
+ client_messenger->get_myaddrs().front().get_sockaddr_storage());
+ (*pm)["back_iface"] = pick_iface(
+ cct,
+ cluster_messenger->get_myaddrs().front().get_sockaddr_storage());
+
+ // network numa
+ {
+ int node = -1;
+ set<int> nodes;
+ set<string> unknown;
+ for (auto nm : { "front_iface", "back_iface" }) {
+ if (!(*pm)[nm].size()) {
+ unknown.insert(nm);
+ continue;
+ }
+ int n = -1;
+ int r = get_iface_numa_node((*pm)[nm], &n);
+ if (r < 0) {
+ unknown.insert((*pm)[nm]);
+ continue;
+ }
+ nodes.insert(n);
+ if (node < 0) {
+ node = n;
+ }
+ }
+ if (unknown.size()) {
+ (*pm)["network_numa_unknown_ifaces"] = stringify(unknown);
+ }
+ if (!nodes.empty()) {
+ (*pm)["network_numa_nodes"] = stringify(nodes);
+ }
+ if (node >= 0 && nodes.size() == 1 && unknown.empty()) {
+ (*pm)["network_numa_node"] = stringify(node);
+ }
+ }
+
+ if (numa_node >= 0) {
+ (*pm)["numa_node"] = stringify(numa_node);
+ (*pm)["numa_node_cpus"] = cpu_set_to_str_list(numa_cpu_set_size,
+ &numa_cpu_set);
+ }
+
+ set<string> devnames;
+ store->get_devices(&devnames);
+ map<string,string> errs;
+ get_device_metadata(devnames, pm, &errs);
+ for (auto& i : errs) {
+ dout(1) << __func__ << " " << i.first << ": " << i.second << dendl;
+ }
+ dout(10) << __func__ << " " << *pm << dendl;
+}
+
+void OSD::queue_want_up_thru(epoch_t want)
+{
+ std::shared_lock map_locker{map_lock};
+ epoch_t cur = get_osdmap()->get_up_thru(whoami);
+ std::lock_guard report_locker(mon_report_lock);
+ if (want > up_thru_wanted) {
+ dout(10) << "queue_want_up_thru now " << want << " (was " << up_thru_wanted << ")"
+ << ", currently " << cur
+ << dendl;
+ up_thru_wanted = want;
+ send_alive();
+ } else {
+ dout(10) << "queue_want_up_thru want " << want << " <= queued " << up_thru_wanted
+ << ", currently " << cur
+ << dendl;
+ }
+}
+
+void OSD::send_alive()
+{
+ ceph_assert(ceph_mutex_is_locked(mon_report_lock));
+ const auto osdmap = get_osdmap();
+ if (!osdmap->exists(whoami))
+ return;
+ epoch_t up_thru = osdmap->get_up_thru(whoami);
+ dout(10) << "send_alive up_thru currently " << up_thru << " want " << up_thru_wanted << dendl;
+ if (up_thru_wanted > up_thru) {
+ dout(10) << "send_alive want " << up_thru_wanted << dendl;
+ monc->send_mon_message(new MOSDAlive(osdmap->get_epoch(), up_thru_wanted));
+ }
+}
+
+void OSD::request_full_map(epoch_t first, epoch_t last)
+{
+ dout(10) << __func__ << " " << first << ".." << last
+ << ", previously requested "
+ << requested_full_first << ".." << requested_full_last << dendl;
+ ceph_assert(ceph_mutex_is_locked(osd_lock));
+ ceph_assert(first > 0 && last > 0);
+ ceph_assert(first <= last);
+ ceph_assert(first >= requested_full_first); // we shouldn't ever ask for older maps
+ if (requested_full_first == 0) {
+ // first request
+ requested_full_first = first;
+ requested_full_last = last;
+ } else if (last <= requested_full_last) {
+ // dup
+ return;
+ } else {
+ // additional request
+ first = requested_full_last + 1;
+ requested_full_last = last;
+ }
+ MMonGetOSDMap *req = new MMonGetOSDMap;
+ req->request_full(first, last);
+ monc->send_mon_message(req);
+}
+
+void OSD::got_full_map(epoch_t e)
+{
+ ceph_assert(requested_full_first <= requested_full_last);
+ ceph_assert(ceph_mutex_is_locked(osd_lock));
+ if (requested_full_first == 0) {
+ dout(20) << __func__ << " " << e << ", nothing requested" << dendl;
+ return;
+ }
+ if (e < requested_full_first) {
+ dout(10) << __func__ << " " << e << ", requested " << requested_full_first
+ << ".." << requested_full_last
+ << ", ignoring" << dendl;
+ return;
+ }
+ if (e >= requested_full_last) {
+ dout(10) << __func__ << " " << e << ", requested " << requested_full_first
+ << ".." << requested_full_last << ", resetting" << dendl;
+ requested_full_first = requested_full_last = 0;
+ return;
+ }
+
+ requested_full_first = e + 1;
+
+ dout(10) << __func__ << " " << e << ", requested " << requested_full_first
+ << ".." << requested_full_last
+ << ", still need more" << dendl;
+}
+
+void OSD::requeue_failures()
+{
+ std::lock_guard l(heartbeat_lock);
+ unsigned old_queue = failure_queue.size();
+ unsigned old_pending = failure_pending.size();
+ for (auto p = failure_pending.begin(); p != failure_pending.end(); ) {
+ failure_queue[p->first] = p->second.first;
+ failure_pending.erase(p++);
+ }
+ dout(10) << __func__ << " " << old_queue << " + " << old_pending << " -> "
+ << failure_queue.size() << dendl;
+}
+
+void OSD::send_failures()
+{
+ ceph_assert(ceph_mutex_is_locked(map_lock));
+ ceph_assert(ceph_mutex_is_locked(mon_report_lock));
+ std::lock_guard l(heartbeat_lock);
+ utime_t now = ceph_clock_now();
+ const auto osdmap = get_osdmap();
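+  // report each queued failure to the mon once, and remember it in
+  // failure_pending so a later still-alive message can cancel the report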
+ while (!failure_queue.empty()) {
+ int osd = failure_queue.begin()->first;
+ if (!failure_pending.count(osd)) {
+ int failed_for = (int)(double)(now - failure_queue.begin()->second);
+ monc->send_mon_message(
+ new MOSDFailure(
+ monc->get_fsid(),
+ osd,
+ osdmap->get_addrs(osd),
+ failed_for,
+ osdmap->get_epoch()));
+ failure_pending[osd] = make_pair(failure_queue.begin()->second,
+ osdmap->get_addrs(osd));
+ }
+ failure_queue.erase(osd);
+ }
+}
+
+void OSD::send_still_alive(epoch_t epoch, int osd, const entity_addrvec_t &addrs)
+{
+ MOSDFailure *m = new MOSDFailure(monc->get_fsid(), osd, addrs, 0, epoch,
+ MOSDFailure::FLAG_ALIVE);
+ monc->send_mon_message(m);
+}
+
+void OSD::cancel_pending_failures()
+{
+ std::lock_guard l(heartbeat_lock);
+ auto it = failure_pending.begin();
+ while (it != failure_pending.end()) {
+ dout(10) << __func__ << " canceling in-flight failure report for osd."
+ << it->first << dendl;
+ send_still_alive(get_osdmap_epoch(), it->first, it->second.second);
+ failure_pending.erase(it++);
+ }
+}
+
+void OSD::send_beacon(const ceph::coarse_mono_clock::time_point& now)
+{
+ const auto& monmap = monc->monmap;
+ // send beacon to mon even if we are just connected, and the monmap is not
+ // initialized yet by then.
+ if (monmap.epoch > 0 &&
+ monmap.get_required_features().contains_all(
+ ceph::features::mon::FEATURE_LUMINOUS)) {
+ dout(20) << __func__ << " sending" << dendl;
+ MOSDBeacon* beacon = nullptr;
+ {
+ std::lock_guard l{min_last_epoch_clean_lock};
+ beacon = new MOSDBeacon(get_osdmap_epoch(),
+ min_last_epoch_clean,
+ superblock.last_purged_snaps_scrub,
+ cct->_conf->osd_beacon_report_interval);
+ beacon->pgs = min_last_epoch_clean_pgs;
+ last_sent_beacon = now;
+ }
+ monc->send_mon_message(beacon);
+ } else {
+ dout(20) << __func__ << " not sending" << dendl;
+ }
+}
+
+void OSD::handle_command(MCommand *m)
+{
+ ConnectionRef con = m->get_connection();
+ auto session = ceph::ref_cast<Session>(con->get_priv());
+ if (!session) {
+ con->send_message(new MCommandReply(m, -EACCES));
+ m->put();
+ return;
+ }
+ if (!session->caps.allow_all()) {
+ con->send_message(new MCommandReply(m, -EACCES));
+ m->put();
+ return;
+ }
+ cct->get_admin_socket()->queue_tell_command(m);
+ m->put();
+}
+
+namespace {
+ class unlock_guard {
+ ceph::mutex& m;
+ public:
+ explicit unlock_guard(ceph::mutex& mutex)
+ : m(mutex)
+ {
+ m.unlock();
+ }
+ unlock_guard(unlock_guard&) = delete;
+ ~unlock_guard() {
+ m.lock();
+ }
+ };
+}
+
+void OSD::scrub_purged_snaps()
+{
+ dout(10) << __func__ << dendl;
+ ceph_assert(ceph_mutex_is_locked(osd_lock));
+ SnapMapper::Scrubber s(cct, store, service.meta_ch,
+ make_snapmapper_oid(),
+ make_purged_snaps_oid());
+ clog->debug() << "purged_snaps scrub starts";
+ osd_lock.unlock();
+ s.run();
+ if (s.stray.size()) {
+ clog->debug() << "purged_snaps scrub found " << s.stray.size() << " strays";
+ } else {
+ clog->debug() << "purged_snaps scrub ok";
+ }
+ set<pair<spg_t,snapid_t>> queued;
+ for (auto& [pool, snap, hash, shard] : s.stray) {
+ const pg_pool_t *pi = get_osdmap()->get_pg_pool(pool);
+ if (!pi) {
+ dout(20) << __func__ << " pool " << pool << " dne" << dendl;
+ continue;
+ }
+ pg_t pgid(pi->raw_hash_to_pg(hash), pool);
+ spg_t spgid(pgid, shard);
+ pair<spg_t,snapid_t> p(spgid, snap);
+ if (queued.count(p)) {
+ dout(20) << __func__ << " pg " << spgid << " snap " << snap
+ << " already queued" << dendl;
+ continue;
+ }
+ PGRef pg = lookup_lock_pg(spgid);
+ if (!pg) {
+ dout(20) << __func__ << " pg " << spgid << " not found" << dendl;
+ continue;
+ }
+ queued.insert(p);
+ dout(10) << __func__ << " requeue pg " << spgid << " " << pg << " snap "
+ << snap << dendl;
+ pg->queue_snap_retrim(snap);
+ pg->unlock();
+ }
+ osd_lock.lock();
+ if (is_stopping()) {
+ return;
+ }
+ dout(10) << __func__ << " done queueing pgs, updating superblock" << dendl;
+ ObjectStore::Transaction t;
+ superblock.last_purged_snaps_scrub = ceph_clock_now();
+ write_superblock(t);
+ int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
+ ceph_assert(tr == 0);
+ if (is_active()) {
+ send_beacon(ceph::coarse_mono_clock::now());
+ }
+ dout(10) << __func__ << " done" << dendl;
+}
+
+void OSD::probe_smart(const string& only_devid, ostream& ss)
+{
+ set<string> devnames;
+ store->get_devices(&devnames);
+ uint64_t smart_timeout = cct->_conf.get_val<uint64_t>(
+ "osd_smart_report_timeout");
+
+ // == typedef std::map<std::string, mValue> mObject;
+ json_spirit::mObject json_map;
+
+ for (auto dev : devnames) {
+ // smartctl works only on physical devices; filter out any logical device
+ if (dev.find("dm-") == 0) {
+ continue;
+ }
+
+ string err;
+ string devid = get_device_id(dev, &err);
+ if (devid.size() == 0) {
+ dout(10) << __func__ << " no unique id for dev " << dev << " ("
+ << err << "), skipping" << dendl;
+ continue;
+ }
+ if (only_devid.size() && devid != only_devid) {
+ continue;
+ }
+
+ json_spirit::mValue smart_json;
+ if (block_device_get_metrics(dev, smart_timeout,
+ &smart_json)) {
+ dout(10) << "block_device_get_metrics failed for /dev/" << dev << dendl;
+ continue;
+ }
+ json_map[devid] = smart_json;
+ }
+ json_spirit::write(json_map, ss, json_spirit::pretty_print);
+}
+
+bool OSD::heartbeat_dispatch(Message *m)
+{
+ dout(30) << "heartbeat_dispatch " << m << dendl;
+ switch (m->get_type()) {
+
+ case CEPH_MSG_PING:
+ dout(10) << "ping from " << m->get_source_inst() << dendl;
+ m->put();
+ break;
+
+ case MSG_OSD_PING:
+ handle_osd_ping(static_cast<MOSDPing*>(m));
+ break;
+
+ default:
+ dout(0) << "dropping unexpected message " << *m << " from " << m->get_source_inst() << dendl;
+ m->put();
+ }
+
+ return true;
+}
+
+bool OSD::ms_dispatch(Message *m)
+{
+ dout(20) << "OSD::ms_dispatch: " << *m << dendl;
+ if (m->get_type() == MSG_OSD_MARK_ME_DOWN) {
+ service.got_stop_ack();
+ m->put();
+ return true;
+ }
+
+ // lock!
+
+ osd_lock.lock();
+ if (is_stopping()) {
+ osd_lock.unlock();
+ m->put();
+ return true;
+ }
+
+ do_waiters();
+ _dispatch(m);
+
+ osd_lock.unlock();
+
+ return true;
+}
+
+void OSDService::maybe_share_map(
+ Connection *con,
+ const OSDMapRef& osdmap,
+ epoch_t peer_epoch_lb)
+{
+ // NOTE: we assume caller hold something that keeps the Connection itself
+ // pinned (e.g., an OpRequest's MessageRef).
+ auto session = ceph::ref_cast<Session>(con->get_priv());
+ if (!session) {
+ return;
+ }
+
+ // assume the peer has the newer of the op's sent_epoch and what
+ // we think we sent them.
+ session->sent_epoch_lock.lock();
+ if (peer_epoch_lb > session->last_sent_epoch) {
+ dout(10) << __func__ << " con " << con
+ << " " << con->get_peer_addr()
+ << " map epoch " << session->last_sent_epoch
+ << " -> " << peer_epoch_lb << " (as per caller)" << dendl;
+ session->last_sent_epoch = peer_epoch_lb;
+ }
+ epoch_t last_sent_epoch = session->last_sent_epoch;
+ session->sent_epoch_lock.unlock();
+
+ if (osdmap->get_epoch() <= last_sent_epoch) {
+ return;
+ }
+
+ send_incremental_map(last_sent_epoch, con, osdmap);
+ last_sent_epoch = osdmap->get_epoch();
+
+ session->sent_epoch_lock.lock();
+ if (session->last_sent_epoch < last_sent_epoch) {
+ dout(10) << __func__ << " con " << con
+ << " " << con->get_peer_addr()
+ << " map epoch " << session->last_sent_epoch
+ << " -> " << last_sent_epoch << " (shared)" << dendl;
+ session->last_sent_epoch = last_sent_epoch;
+ }
+ session->sent_epoch_lock.unlock();
+}
+
+void OSD::dispatch_session_waiting(const ceph::ref_t<Session>& session, OSDMapRef osdmap)
+{
+ ceph_assert(ceph_mutex_is_locked(session->session_dispatch_lock));
+
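+  // replay this session's queued ops in arrival order, stopping at the first
+  // op whose min epoch is newer than the osdmap we currently have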
+ auto i = session->waiting_on_map.begin();
+ while (i != session->waiting_on_map.end()) {
+ OpRequestRef op = &(*i);
+ ceph_assert(ms_can_fast_dispatch(op->get_req()));
+ auto m = op->get_req<MOSDFastDispatchOp>();
+ if (m->get_min_epoch() > osdmap->get_epoch()) {
+ break;
+ }
+ session->waiting_on_map.erase(i++);
+ op->put();
+
+ spg_t pgid;
+ if (m->get_type() == CEPH_MSG_OSD_OP) {
+ pg_t actual_pgid = osdmap->raw_pg_to_pg(
+ static_cast<const MOSDOp*>(m)->get_pg());
+ if (!osdmap->get_primary_shard(actual_pgid, &pgid)) {
+ continue;
+ }
+ } else {
+ pgid = m->get_spg();
+ }
+ enqueue_op(pgid, std::move(op), m->get_map_epoch());
+ }
+
+ if (session->waiting_on_map.empty()) {
+ clear_session_waiting_on_map(session);
+ } else {
+ register_session_waiting_on_map(session);
+ }
+}
+
+void OSD::ms_fast_dispatch(Message *m)
+{
+
+#ifdef HAVE_JAEGER
+ jaeger_tracing::init_tracer("osd-services-reinit");
+ dout(10) << "jaeger tracer after " << opentracing::Tracer::Global() << dendl;
+ auto dispatch_span = jaeger_tracing::new_span(__func__);
+#endif
+ FUNCTRACE(cct);
+ if (service.is_stopping()) {
+ m->put();
+ return;
+ }
+
+ // peering event?
+ switch (m->get_type()) {
+ case CEPH_MSG_PING:
+ dout(10) << "ping from " << m->get_source() << dendl;
+ m->put();
+ return;
+ case MSG_OSD_FORCE_RECOVERY:
+ handle_fast_force_recovery(static_cast<MOSDForceRecovery*>(m));
+ return;
+ case MSG_OSD_SCRUB2:
+ handle_fast_scrub(static_cast<MOSDScrub2*>(m));
+ return;
+
+ case MSG_OSD_PG_CREATE2:
+ return handle_fast_pg_create(static_cast<MOSDPGCreate2*>(m));
+ case MSG_OSD_PG_QUERY:
+ return handle_fast_pg_query(static_cast<MOSDPGQuery*>(m));
+ case MSG_OSD_PG_NOTIFY:
+ return handle_fast_pg_notify(static_cast<MOSDPGNotify*>(m));
+ case MSG_OSD_PG_INFO:
+ return handle_fast_pg_info(static_cast<MOSDPGInfo*>(m));
+ case MSG_OSD_PG_REMOVE:
+ return handle_fast_pg_remove(static_cast<MOSDPGRemove*>(m));
+
+ // these are single-pg messages that handle themselves
+ case MSG_OSD_PG_LOG:
+ case MSG_OSD_PG_TRIM:
+ case MSG_OSD_PG_NOTIFY2:
+ case MSG_OSD_PG_QUERY2:
+ case MSG_OSD_PG_INFO2:
+ case MSG_OSD_BACKFILL_RESERVE:
+ case MSG_OSD_RECOVERY_RESERVE:
+ case MSG_OSD_PG_LEASE:
+ case MSG_OSD_PG_LEASE_ACK:
+ {
+ MOSDPeeringOp *pm = static_cast<MOSDPeeringOp*>(m);
+ if (require_osd_peer(pm)) {
+ enqueue_peering_evt(
+ pm->get_spg(),
+ PGPeeringEventRef(pm->get_event()));
+ }
+ pm->put();
+ return;
+ }
+ }
+
+ OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m);
+ {
+#ifdef WITH_LTTNG
+ osd_reqid_t reqid = op->get_reqid();
+#endif
+ tracepoint(osd, ms_fast_dispatch, reqid.name._type,
+ reqid.name._num, reqid.tid, reqid.inc);
+ }
+#ifdef HAVE_JAEGER
+ op->set_osd_parent_span(dispatch_span);
+ if (op->osd_parent_span) {
+ auto op_req_span = jaeger_tracing::child_span("op-request-created", op->osd_parent_span);
+ op->set_osd_parent_span(op_req_span);
+ }
+#endif
+ if (m->trace)
+ op->osd_trace.init("osd op", &trace_endpoint, &m->trace);
+
+ // note sender epoch, min req's epoch
+ op->sent_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch();
+ op->min_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_min_epoch();
+ ceph_assert(op->min_epoch <= op->sent_epoch); // sanity check!
+
+ service.maybe_inject_dispatch_delay();
+
+ if (m->get_connection()->has_features(CEPH_FEATUREMASK_RESEND_ON_SPLIT) ||
+ m->get_type() != CEPH_MSG_OSD_OP) {
+ // queue it directly
+ enqueue_op(
+ static_cast<MOSDFastDispatchOp*>(m)->get_spg(),
+ std::move(op),
+ static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch());
+ } else {
+ // legacy client, and this is an MOSDOp (the *only* fast dispatch
+ // message that didn't have an explicit spg_t); we need to map
+ // them to an spg_t while preserving delivery order.
+ auto priv = m->get_connection()->get_priv();
+ if (auto session = static_cast<Session*>(priv.get()); session) {
+ std::lock_guard l{session->session_dispatch_lock};
+ op->get();
+ session->waiting_on_map.push_back(*op);
+ OSDMapRef nextmap = service.get_nextmap_reserved();
+ dispatch_session_waiting(session, nextmap);
+ service.release_map(nextmap);
+ }
+ }
+ OID_EVENT_TRACE_WITH_MSG(m, "MS_FAST_DISPATCH_END", false);
+}
+
+int OSD::ms_handle_authentication(Connection *con)
+{
+ int ret = 0;
+ auto s = ceph::ref_cast<Session>(con->get_priv());
+ if (!s) {
+ s = ceph::make_ref<Session>(cct, con);
+ con->set_priv(s);
+ s->entity_name = con->get_peer_entity_name();
+ dout(10) << __func__ << " new session " << s << " con " << s->con
+ << " entity " << s->entity_name
+ << " addr " << con->get_peer_addrs() << dendl;
+ } else {
+ dout(10) << __func__ << " existing session " << s << " con " << s->con
+ << " entity " << s->entity_name
+ << " addr " << con->get_peer_addrs() << dendl;
+ }
+
+ AuthCapsInfo &caps_info = con->get_peer_caps_info();
+ if (caps_info.allow_all) {
+ s->caps.set_allow_all();
+ } else if (caps_info.caps.length() > 0) {
+ bufferlist::const_iterator p = caps_info.caps.cbegin();
+ string str;
+ try {
+ decode(str, p);
+ }
+ catch (ceph::buffer::error& e) {
+ dout(10) << __func__ << " session " << s << " " << s->entity_name
+ << " failed to decode caps string" << dendl;
+ ret = -EACCES;
+ }
+ if (!ret) {
+ bool success = s->caps.parse(str);
+ if (success) {
+ dout(10) << __func__ << " session " << s
+ << " " << s->entity_name
+ << " has caps " << s->caps << " '" << str << "'" << dendl;
+ ret = 1;
+ } else {
+ dout(10) << __func__ << " session " << s << " " << s->entity_name
+ << " failed to parse caps '" << str << "'" << dendl;
+ ret = -EACCES;
+ }
+ }
+ }
+ return ret;
+}
+
+void OSD::do_waiters()
+{
+ ceph_assert(ceph_mutex_is_locked(osd_lock));
+
+ dout(10) << "do_waiters -- start" << dendl;
+ while (!finished.empty()) {
+ OpRequestRef next = finished.front();
+ finished.pop_front();
+ dispatch_op(next);
+ }
+ dout(10) << "do_waiters -- finish" << dendl;
+}
+
+void OSD::dispatch_op(OpRequestRef op)
+{
+ switch (op->get_req()->get_type()) {
+
+ case MSG_OSD_PG_CREATE:
+ handle_pg_create(op);
+ break;
+ }
+}
+
+void OSD::_dispatch(Message *m)
+{
+ ceph_assert(ceph_mutex_is_locked(osd_lock));
+ dout(20) << "_dispatch " << m << " " << *m << dendl;
+
+ switch (m->get_type()) {
+ // -- don't need OSDMap --
+
+ // map and replication
+ case CEPH_MSG_OSD_MAP:
+ handle_osd_map(static_cast<MOSDMap*>(m));
+ break;
+ case MSG_MON_GET_PURGED_SNAPS_REPLY:
+ handle_get_purged_snaps_reply(static_cast<MMonGetPurgedSnapsReply*>(m));
+ break;
+
+ // osd
+ case MSG_OSD_SCRUB:
+ handle_scrub(static_cast<MOSDScrub*>(m));
+ break;
+
+ case MSG_COMMAND:
+ handle_command(static_cast<MCommand*>(m));
+ return;
+
+ // -- need OSDMap --
+
+ case MSG_OSD_PG_CREATE:
+ {
+ OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m);
+ if (m->trace)
+ op->osd_trace.init("osd op", &trace_endpoint, &m->trace);
+ // no map? starting up?
+ if (!get_osdmap()) {
+ dout(7) << "no OSDMap, not booted" << dendl;
+ logger->inc(l_osd_waiting_for_map);
+ waiting_for_osdmap.push_back(op);
+ op->mark_delayed("no osdmap");
+ break;
+ }
+
+ // need OSDMap
+ dispatch_op(op);
+ }
+ }
+}
+
+// remove me post-nautilus
+void OSD::handle_scrub(MOSDScrub *m)
+{
+ dout(10) << "handle_scrub " << *m << dendl;
+ if (!require_mon_or_mgr_peer(m)) {
+ m->put();
+ return;
+ }
+ if (m->fsid != monc->get_fsid()) {
+ dout(0) << "handle_scrub fsid " << m->fsid << " != " << monc->get_fsid()
+ << dendl;
+ m->put();
+ return;
+ }
+
+ vector<spg_t> spgs;
+ _get_pgids(&spgs);
+
+ if (!m->scrub_pgs.empty()) {
+ vector<spg_t> v;
+ for (auto pgid : m->scrub_pgs) {
+ spg_t pcand;
+ if (get_osdmap()->get_primary_shard(pgid, &pcand) &&
+ std::find(spgs.begin(), spgs.end(), pcand) != spgs.end()) {
+ v.push_back(pcand);
+ }
+ }
+ spgs.swap(v);
+ }
+
+ for (auto pgid : spgs) {
+ enqueue_peering_evt(
+ pgid,
+ PGPeeringEventRef(
+ std::make_shared<PGPeeringEvent>(
+ get_osdmap_epoch(),
+ get_osdmap_epoch(),
+ PeeringState::RequestScrub(m->deep, m->repair))));
+ }
+
+ m->put();
+}
+
+void OSD::handle_fast_scrub(MOSDScrub2 *m)
+{
+ dout(10) << __func__ << " " << *m << dendl;
+ if (!require_mon_or_mgr_peer(m)) {
+ m->put();
+ return;
+ }
+ if (m->fsid != monc->get_fsid()) {
+ dout(0) << __func__ << " fsid " << m->fsid << " != " << monc->get_fsid()
+ << dendl;
+ m->put();
+ return;
+ }
+ for (auto pgid : m->scrub_pgs) {
+ enqueue_peering_evt(
+ pgid,
+ PGPeeringEventRef(
+ std::make_shared<PGPeeringEvent>(
+ m->epoch,
+ m->epoch,
+ PeeringState::RequestScrub(m->deep, m->repair))));
+ }
+ m->put();
+}
+
+bool OSD::scrub_random_backoff()
+{
+ bool coin_flip = (rand() / (double)RAND_MAX >=
+ cct->_conf->osd_scrub_backoff_ratio);
+ if (!coin_flip) {
+ dout(20) << "scrub_random_backoff lost coin flip, randomly backing off" << dendl;
+ return true;
+ }
+ return false;
+}
+
+OSDService::ScrubJob::ScrubJob(CephContext* cct,
+ const spg_t& pg, const utime_t& timestamp,
+ double pool_scrub_min_interval,
+ double pool_scrub_max_interval, bool must)
+ : cct(cct),
+ pgid(pg),
+ sched_time(timestamp),
+ deadline(timestamp)
+{
+ // if not explicitly requested, postpone the scrub with a random delay
+ if (!must) {
+ double scrub_min_interval = pool_scrub_min_interval > 0 ?
+ pool_scrub_min_interval : cct->_conf->osd_scrub_min_interval;
+ double scrub_max_interval = pool_scrub_max_interval > 0 ?
+ pool_scrub_max_interval : cct->_conf->osd_scrub_max_interval;
+
+ sched_time += scrub_min_interval;
+ double r = rand() / (double)RAND_MAX;
+ sched_time +=
+ scrub_min_interval * cct->_conf->osd_scrub_interval_randomize_ratio * r;
+ if (scrub_max_interval == 0) {
+ deadline = utime_t();
+ } else {
+ deadline += scrub_max_interval;
+ }
+
+ }
+}
+
+bool OSDService::ScrubJob::operator<(const OSDService::ScrubJob& rhs) const {
+ if (sched_time < rhs.sched_time)
+ return true;
+ if (sched_time > rhs.sched_time)
+ return false;
+ return pgid < rhs.pgid;
+}
+
+void OSDService::dumps_scrub(ceph::Formatter *f)
+{
+ ceph_assert(f != nullptr);
+ std::lock_guard l(sched_scrub_lock);
+
+ f->open_array_section("scrubs");
+ for (const auto &i: sched_scrub_pg) {
+ f->open_object_section("scrub");
+ f->dump_stream("pgid") << i.pgid;
+ f->dump_stream("sched_time") << i.sched_time;
+ f->dump_stream("deadline") << i.deadline;
+ f->dump_bool("forced", i.sched_time == PgScrubber::scrub_must_stamp());
+ f->close_section();
+ }
+ f->close_section();
+}
+
+double OSD::scrub_sleep_time(bool must_scrub)
+{
+ if (must_scrub) {
+ return cct->_conf->osd_scrub_sleep;
+ }
+ utime_t now = ceph_clock_now();
+ if (scrub_time_permit(now)) {
+ return cct->_conf->osd_scrub_sleep;
+ }
+ double normal_sleep = cct->_conf->osd_scrub_sleep;
+ double extended_sleep = cct->_conf->osd_scrub_extended_sleep;
+ return std::max(extended_sleep, normal_sleep);
+}
+
+bool OSD::scrub_time_permit(utime_t now)
+{
+ struct tm bdt;
+ time_t tt = now.sec();
+ localtime_r(&tt, &bdt);
+
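+  // both the day-of-week and hour windows may wrap (begin > end), e.g. a
+  // window from Friday through Monday or from 22:00 through 06:00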
+ bool day_permit = false;
+ if (cct->_conf->osd_scrub_begin_week_day < cct->_conf->osd_scrub_end_week_day) {
+ if (bdt.tm_wday >= cct->_conf->osd_scrub_begin_week_day && bdt.tm_wday < cct->_conf->osd_scrub_end_week_day) {
+ day_permit = true;
+ }
+ } else {
+ if (bdt.tm_wday >= cct->_conf->osd_scrub_begin_week_day || bdt.tm_wday < cct->_conf->osd_scrub_end_week_day) {
+ day_permit = true;
+ }
+ }
+
+ if (!day_permit) {
+ dout(20) << __func__ << " should run between week day " << cct->_conf->osd_scrub_begin_week_day
+ << " - " << cct->_conf->osd_scrub_end_week_day
+ << " now " << bdt.tm_wday << " = no" << dendl;
+ return false;
+ }
+
+ bool time_permit = false;
+ if (cct->_conf->osd_scrub_begin_hour < cct->_conf->osd_scrub_end_hour) {
+ if (bdt.tm_hour >= cct->_conf->osd_scrub_begin_hour && bdt.tm_hour < cct->_conf->osd_scrub_end_hour) {
+ time_permit = true;
+ }
+ } else {
+ if (bdt.tm_hour >= cct->_conf->osd_scrub_begin_hour || bdt.tm_hour < cct->_conf->osd_scrub_end_hour) {
+ time_permit = true;
+ }
+ }
+ if (time_permit) {
+ dout(20) << __func__ << " should run between " << cct->_conf->osd_scrub_begin_hour
+ << " - " << cct->_conf->osd_scrub_end_hour
+ << " now " << bdt.tm_hour << " = yes" << dendl;
+ } else {
+ dout(20) << __func__ << " should run between " << cct->_conf->osd_scrub_begin_hour
+ << " - " << cct->_conf->osd_scrub_end_hour
+ << " now " << bdt.tm_hour << " = no" << dendl;
+ }
+ return time_permit;
+}
+
+bool OSD::scrub_load_below_threshold()
+{
+ double loadavgs[3];
+ if (getloadavg(loadavgs, 3) != 3) {
+    dout(10) << __func__ << " couldn't read loadavgs" << dendl;
+ return false;
+ }
+
+ // allow scrub if below configured threshold
+ long cpus = sysconf(_SC_NPROCESSORS_ONLN);
+ double loadavg_per_cpu = cpus > 0 ? loadavgs[0] / cpus : loadavgs[0];
+ if (loadavg_per_cpu < cct->_conf->osd_scrub_load_threshold) {
+ dout(20) << __func__ << " loadavg per cpu " << loadavg_per_cpu
+ << " < max " << cct->_conf->osd_scrub_load_threshold
+ << " = yes" << dendl;
+ return true;
+ }
+
+ // allow scrub if below daily avg and currently decreasing
+ if (loadavgs[0] < daily_loadavg && loadavgs[0] < loadavgs[2]) {
+ dout(20) << __func__ << " loadavg " << loadavgs[0]
+ << " < daily_loadavg " << daily_loadavg
+ << " and < 15m avg " << loadavgs[2]
+ << " = yes" << dendl;
+ return true;
+ }
+
+ dout(20) << __func__ << " loadavg " << loadavgs[0]
+ << " >= max " << cct->_conf->osd_scrub_load_threshold
+ << " and ( >= daily_loadavg " << daily_loadavg
+ << " or >= 15m avg " << loadavgs[2]
+ << ") = no" << dendl;
+ return false;
+}
+
+void OSD::sched_scrub()
+{
+ dout(20) << __func__ << " sched_scrub starts" << dendl;
+
+ // if not permitted, fail fast
+ if (!service.can_inc_scrubs()) {
+ dout(20) << __func__ << ": OSD cannot inc scrubs" << dendl;
+ return;
+ }
+ bool allow_requested_repair_only = false;
+ if (service.is_recovery_active() && !cct->_conf->osd_scrub_during_recovery) {
+ if (!cct->_conf->osd_repair_during_recovery) {
+ dout(15) << __func__ << ": not scheduling scrubs due to active recovery" << dendl;
+ return;
+ }
+ dout(10) << __func__
+ << " will only schedule explicitly requested repair due to active recovery"
+ << dendl;
+ allow_requested_repair_only = true;
+ }
+
+ utime_t now = ceph_clock_now();
+ bool time_permit = scrub_time_permit(now);
+ bool load_is_low = scrub_load_below_threshold();
+ dout(20) << "sched_scrub load_is_low=" << (int)load_is_low << dendl;
+
+ OSDService::ScrubJob scrub_job;
+ if (service.first_scrub_stamp(&scrub_job)) {
+ do {
+ dout(30) << "sched_scrub examine " << scrub_job.pgid << " at " << scrub_job.sched_time << dendl;
+
+ if (scrub_job.sched_time > now) {
+ // save ourselves some effort
+ dout(20) << "sched_scrub " << scrub_job.pgid << " scheduled at " << scrub_job.sched_time
+ << " > " << now << dendl;
+ break;
+ }
+
+ if ((scrub_job.deadline.is_zero() || scrub_job.deadline >= now) && !(time_permit && load_is_low)) {
+      dout(15) << __func__ << " not scheduling scrub for " << scrub_job.pgid << " due to "
+		 << (!time_permit ? "time not permitted" : "high load") << dendl;
+ continue;
+ }
+
+ PGRef pg = _lookup_lock_pg(scrub_job.pgid);
+ if (!pg) {
+ dout(20) << __func__ << " pg " << scrub_job.pgid << " not found" << dendl;
+ continue;
+ }
+
+ // This has already started, so go on to the next scrub job
+ if (pg->is_scrub_queued_or_active()) {
+ pg->unlock();
+ dout(20) << __func__ << ": already in progress pgid " << scrub_job.pgid << dendl;
+ continue;
+ }
+      // Skip other kinds of scrubbing if only explicitly requested repair is allowed
+ if (allow_requested_repair_only && !pg->m_planned_scrub.must_repair) {
+ pg->unlock();
+ dout(10) << __func__ << " skip " << scrub_job.pgid
+ << " because repairing is not explicitly requested on it"
+ << dendl;
+ continue;
+ }
+
+      // If it is reserving, let the reservation resolve before scheduling any more scrubs
+ if (pg->m_scrubber->is_reserving()) {
+ pg->unlock();
+ dout(10) << __func__ << ": reserve in progress pgid " << scrub_job.pgid << dendl;
+ break;
+ }
+ dout(15) << "sched_scrub scrubbing " << scrub_job.pgid << " at " << scrub_job.sched_time
+ << (pg->get_must_scrub() ? ", explicitly requested" :
+ (load_is_low ? ", load_is_low" : " deadline < now"))
+ << dendl;
+ if (pg->sched_scrub()) {
+ pg->unlock();
+ dout(10) << __func__ << " scheduled a scrub!" << " (~" << scrub_job.pgid << "~)" << dendl;
+ break;
+ }
+ pg->unlock();
+ } while (service.next_scrub_stamp(scrub_job, &scrub_job));
+ }
+ dout(20) << "sched_scrub done" << dendl;
+}
+
+void OSD::resched_all_scrubs()
+{
+ dout(10) << __func__ << ": start" << dendl;
+ const vector<spg_t> pgs = [this] {
+ vector<spg_t> pgs;
+ OSDService::ScrubJob job;
+ if (service.first_scrub_stamp(&job)) {
+ do {
+ pgs.push_back(job.pgid);
+ } while (service.next_scrub_stamp(job, &job));
+ }
+ return pgs;
+ }();
+ for (auto& pgid : pgs) {
+ dout(20) << __func__ << ": examine " << pgid << dendl;
+ PGRef pg = _lookup_lock_pg(pgid);
+ if (!pg)
+ continue;
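+    // only reschedule pgs whose next scrub is purely periodic; must_scrub and
+    // need_auto scrubs keep their current schedule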
+ if (!pg->m_planned_scrub.must_scrub && !pg->m_planned_scrub.need_auto) {
+ dout(15) << __func__ << ": reschedule " << pgid << dendl;
+ pg->on_info_history_change();
+ }
+ pg->unlock();
+ }
+ dout(10) << __func__ << ": done" << dendl;
+}
+
+MPGStats* OSD::collect_pg_stats()
+{
+ // This implementation unconditionally sends every is_primary PG's
+ // stats every time we're called. This has equivalent cost to the
+ // previous implementation's worst case where all PGs are busy and
+ // their stats are always enqueued for sending.
+ std::shared_lock l{map_lock};
+
+ osd_stat_t cur_stat = service.get_osd_stat();
+ cur_stat.os_perf_stat = store->get_cur_stats();
+
+ auto m = new MPGStats(monc->get_fsid(), get_osdmap_epoch());
+ m->osd_stat = cur_stat;
+
+ std::lock_guard lec{min_last_epoch_clean_lock};
+ min_last_epoch_clean = get_osdmap_epoch();
+ min_last_epoch_clean_pgs.clear();
+
+ std::set<int64_t> pool_set;
+ vector<PGRef> pgs;
+ _get_pgs(&pgs);
+ for (auto& pg : pgs) {
+ auto pool = pg->pg_id.pgid.pool();
+ pool_set.emplace((int64_t)pool);
+ if (!pg->is_primary()) {
+ continue;
+ }
+ pg->get_pg_stats([&](const pg_stat_t& s, epoch_t lec) {
+ m->pg_stat[pg->pg_id.pgid] = s;
+ min_last_epoch_clean = std::min(min_last_epoch_clean, lec);
+ min_last_epoch_clean_pgs.push_back(pg->pg_id.pgid);
+ });
+ }
+ store_statfs_t st;
+ bool per_pool_stats = true;
+ bool per_pool_omap_stats = false;
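+  // ask the objectstore for per-pool usage; if it does not support per-pool
+  // accounting (-ENOTSUP), fall back to reporting only whole-OSD stats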
+ for (auto p : pool_set) {
+ int r = store->pool_statfs(p, &st, &per_pool_omap_stats);
+ if (r == -ENOTSUP) {
+ per_pool_stats = false;
+ break;
+ } else {
+ assert(r >= 0);
+ m->pool_stat[p] = st;
+ }
+ }
+
+ // indicate whether we are reporting per-pool stats
+ m->osd_stat.num_osds = 1;
+ m->osd_stat.num_per_pool_osds = per_pool_stats ? 1 : 0;
+ m->osd_stat.num_per_pool_omap_osds = per_pool_omap_stats ? 1 : 0;
+
+ return m;
+}
+
+vector<DaemonHealthMetric> OSD::get_health_metrics()
+{
+ vector<DaemonHealthMetric> metrics;
+ {
+ utime_t oldest_secs;
+ const utime_t now = ceph_clock_now();
+ auto too_old = now;
+ too_old -= cct->_conf.get_val<double>("osd_op_complaint_time");
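+    // ops initiated before this cutoff are reported as slow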
+ int slow = 0;
+ TrackedOpRef oldest_op;
+ OSDMapRef osdmap = get_osdmap();
+    // map of slow op counts by slow op event type, for aggregated logging to
+    // the cluster log.
+ map<uint8_t, int> slow_op_types;
+    // map of slow op counts by pool, for reporting the pool name with the
+    // highest slow op count.
+ map<uint64_t, int> slow_op_pools;
+ bool log_aggregated_slow_op =
+ cct->_conf.get_val<bool>("osd_aggregated_slow_ops_logging");
+ auto count_slow_ops = [&](TrackedOp& op) {
+ if (op.get_initiated() < too_old) {
+ stringstream ss;
+ ss << "slow request " << op.get_desc()
+ << " initiated "
+ << op.get_initiated()
+ << " currently "
+ << op.state_string();
+ lgeneric_subdout(cct,osd,20) << ss.str() << dendl;
+ if (log_aggregated_slow_op) {
+ if (const OpRequest *req = dynamic_cast<const OpRequest *>(&op)) {
+ uint8_t op_type = req->state_flag();
+ auto m = req->get_req<MOSDFastDispatchOp>();
+ uint64_t poolid = m->get_spg().pgid.m_pool;
+ slow_op_types[op_type]++;
+ if (poolid > 0 && poolid <= (uint64_t) osdmap->get_pool_max()) {
+ slow_op_pools[poolid]++;
+ }
+ }
+ } else {
+ clog->warn() << ss.str();
+ }
+ slow++;
+ if (!oldest_op || op.get_initiated() < oldest_op->get_initiated()) {
+ oldest_op = &op;
+ }
+ return true;
+ } else {
+ return false;
+ }
+ };
+ if (op_tracker.visit_ops_in_flight(&oldest_secs, count_slow_ops)) {
+ if (slow) {
+ derr << __func__ << " reporting " << slow << " slow ops, oldest is "
+ << oldest_op->get_desc() << dendl;
+ if (log_aggregated_slow_op &&
+ slow_op_types.size() > 0) {
+ stringstream ss;
+ ss << slow << " slow requests (by type [ ";
+ for (const auto& [op_type, count] : slow_op_types) {
+ ss << "'" << OpRequest::get_state_string(op_type)
+ << "' : " << count
+ << " ";
+ }
+ auto slow_pool_it = std::max_element(slow_op_pools.begin(), slow_op_pools.end(),
+ [](std::pair<uint64_t, int> p1, std::pair<uint64_t, int> p2) {
+ return p1.second < p2.second;
+ });
+ if (osdmap->get_pools().find(slow_pool_it->first) != osdmap->get_pools().end()) {
+ string pool_name = osdmap->get_pool_name(slow_pool_it->first);
+ ss << "] most affected pool [ '"
+ << pool_name
+ << "' : "
+ << slow_pool_it->second
+ << " ])";
+ } else {
+ ss << "])";
+ }
+ lgeneric_subdout(cct,osd,20) << ss.str() << dendl;
+ clog->warn() << ss.str();
+ }
+ }
+ metrics.emplace_back(daemon_metric::SLOW_OPS, slow, oldest_secs);
+ } else {
+ // no news is not good news.
+ metrics.emplace_back(daemon_metric::SLOW_OPS, 0, 0);
+ }
+ }
+ {
+ std::lock_guard l(pending_creates_lock);
+ auto n_primaries = pending_creates_from_mon;
+ for (const auto& create : pending_creates_from_osd) {
+ if (create.second) {
+ n_primaries++;
+ }
+ }
+ metrics.emplace_back(daemon_metric::PENDING_CREATING_PGS, n_primaries);
+ }
+ return metrics;
+}
+
+// =====================================================
+// MAP
+
+void OSD::wait_for_new_map(OpRequestRef op)
+{
+  // subscribe to the next map if we haven't already asked for one
+ if (waiting_for_osdmap.empty()) {
+ osdmap_subscribe(get_osdmap_epoch() + 1, false);
+ }
+
+ logger->inc(l_osd_waiting_for_map);
+ waiting_for_osdmap.push_back(op);
+ op->mark_delayed("wait for new map");
+}
+
+
+/** update_map
+ * assimilate new OSDMap(s). scan pgs, etc.
+ */
+
+void OSD::note_down_osd(int peer)
+{
+ ceph_assert(ceph_mutex_is_locked(osd_lock));
+ cluster_messenger->mark_down_addrs(get_osdmap()->get_cluster_addrs(peer));
+
+ std::lock_guard l{heartbeat_lock};
+ failure_queue.erase(peer);
+ failure_pending.erase(peer);
+ map<int,HeartbeatInfo>::iterator p = heartbeat_peers.find(peer);
+ if (p != heartbeat_peers.end()) {
+ p->second.clear_mark_down();
+ heartbeat_peers.erase(p);
+ }
+}
+
+void OSD::note_up_osd(int peer)
+{
+ heartbeat_set_peers_need_update();
+}
+
+struct C_OnMapCommit : public Context {
+ OSD *osd;
+ epoch_t first, last;
+ MOSDMap *msg;
+ C_OnMapCommit(OSD *o, epoch_t f, epoch_t l, MOSDMap *m)
+ : osd(o), first(f), last(l), msg(m) {}
+ void finish(int r) override {
+ osd->_committed_osd_maps(first, last, msg);
+ msg->put();
+ }
+};
+
+void OSD::osdmap_subscribe(version_t epoch, bool force_request)
+{
+ std::lock_guard l(osdmap_subscribe_lock);
+ if (latest_subscribed_epoch >= epoch && !force_request)
+ return;
+
+ latest_subscribed_epoch = std::max<uint64_t>(epoch, latest_subscribed_epoch);
+
+ if (monc->sub_want_increment("osdmap", epoch, CEPH_SUBSCRIBE_ONETIME) ||
+ force_request) {
+ monc->renew_subs();
+ }
+}
+
+void OSD::trim_maps(epoch_t oldest, int nreceived, bool skip_maps)
+{
+ epoch_t min = std::min(oldest, service.map_cache.cached_key_lower_bound());
+ if (min <= superblock.oldest_map)
+ return;
+
+ int num = 0;
+ ObjectStore::Transaction t;
+ for (epoch_t e = superblock.oldest_map; e < min; ++e) {
+ dout(20) << " removing old osdmap epoch " << e << dendl;
+ t.remove(coll_t::meta(), get_osdmap_pobject_name(e));
+ t.remove(coll_t::meta(), get_inc_osdmap_pobject_name(e));
+ superblock.oldest_map = e + 1;
+ num++;
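+    // commit removals in batches so a single transaction stays bounded, and
+    // only flush once we have trimmed at least as many maps as we just received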
+ if (num >= cct->_conf->osd_target_transaction_size && num >= nreceived) {
+ service.publish_superblock(superblock);
+ write_superblock(t);
+ int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
+ ceph_assert(tr == 0);
+ num = 0;
+      if (!skip_maps) {
+	// skip_maps leaves us with a range of old maps if we fail to remove all
+	// of them before moving superblock.oldest_map forward to the first map
+	// in the incoming MOSDMap msg, so we should keep removing them in that
+	// case, even though it may mean a huge series of delete transactions
+	// all at once.
+ break;
+ }
+ }
+ }
+ if (num > 0) {
+ service.publish_superblock(superblock);
+ write_superblock(t);
+ int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
+ ceph_assert(tr == 0);
+ }
+ // we should not remove the cached maps
+ ceph_assert(min <= service.map_cache.cached_key_lower_bound());
+}
+
+void OSD::handle_osd_map(MOSDMap *m)
+{
+ // wait for pgs to catch up
+ {
+    // we extend the map cache pins to accommodate pgs that are slow to consume
+    // maps for some period, until we hit the max_lag_factor bound, at which
+    // point we block here to stop ingesting more maps than they are able to
+    // keep up with.
+ epoch_t max_lag = cct->_conf->osd_map_cache_size *
+ m_osd_pg_epoch_max_lag_factor;
+ ceph_assert(max_lag > 0);
+ epoch_t osd_min = 0;
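+    // find the oldest map epoch that any PG (via its shard) is still on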
+ for (auto shard : shards) {
+ epoch_t min = shard->get_min_pg_epoch();
+ if (osd_min == 0 || min < osd_min) {
+ osd_min = min;
+ }
+ }
+ epoch_t osdmap_epoch = get_osdmap_epoch();
+ if (osd_min > 0 &&
+ osdmap_epoch > max_lag &&
+ osdmap_epoch - max_lag > osd_min) {
+ epoch_t need = osdmap_epoch - max_lag;
+ dout(10) << __func__ << " waiting for pgs to catch up (need " << need
+ << " max_lag " << max_lag << ")" << dendl;
+ for (auto shard : shards) {
+ epoch_t min = shard->get_min_pg_epoch();
+ if (need > min) {
+ dout(10) << __func__ << " waiting for pgs to consume " << need
+ << " (shard " << shard->shard_id << " min " << min
+ << ", map cache is " << cct->_conf->osd_map_cache_size
+ << ", max_lag_factor " << m_osd_pg_epoch_max_lag_factor
+ << ")" << dendl;
+ unlock_guard unlock{osd_lock};
+ shard->wait_min_pg_epoch(need);
+ }
+ }
+ }
+ }
+
+ ceph_assert(ceph_mutex_is_locked(osd_lock));
+ map<epoch_t,OSDMapRef> added_maps;
+ map<epoch_t,bufferlist> added_maps_bl;
+ if (m->fsid != monc->get_fsid()) {
+ dout(0) << "handle_osd_map fsid " << m->fsid << " != "
+ << monc->get_fsid() << dendl;
+ m->put();
+ return;
+ }
+ if (is_initializing()) {
+ dout(0) << "ignoring osdmap until we have initialized" << dendl;
+ m->put();
+ return;
+ }
+
+ auto session = ceph::ref_cast<Session>(m->get_connection()->get_priv());
+ if (session && !(session->entity_name.is_mon() ||
+ session->entity_name.is_osd())) {
+ //not enough perms!
+ dout(10) << "got osd map from Session " << session
+ << " which we can't take maps from (not a mon or osd)" << dendl;
+ m->put();
+ return;
+ }
+
+ // share with the objecter
+ if (!is_preboot())
+ service.objecter->handle_osd_map(m);
+
+ epoch_t first = m->get_first();
+ epoch_t last = m->get_last();
+ dout(3) << "handle_osd_map epochs [" << first << "," << last << "], i have "
+ << superblock.newest_map
+ << ", src has [" << m->oldest_map << "," << m->newest_map << "]"
+ << dendl;
+
+ logger->inc(l_osd_map);
+ logger->inc(l_osd_mape, last - first + 1);
+ if (first <= superblock.newest_map)
+ logger->inc(l_osd_mape_dup, superblock.newest_map - first + 1);
+ if (service.max_oldest_map < m->oldest_map) {
+ service.max_oldest_map = m->oldest_map;
+ ceph_assert(service.max_oldest_map >= superblock.oldest_map);
+ }
+
+ // make sure there is something new, here, before we bother flushing
+ // the queues and such
+ if (last <= superblock.newest_map) {
+ dout(10) << " no new maps here, dropping" << dendl;
+ m->put();
+ return;
+ }
+
+ // missing some?
+ bool skip_maps = false;
+ if (first > superblock.newest_map + 1) {
+ dout(10) << "handle_osd_map message skips epochs "
+ << superblock.newest_map + 1 << ".." << (first-1) << dendl;
+ if (m->oldest_map <= superblock.newest_map + 1) {
+ osdmap_subscribe(superblock.newest_map + 1, false);
+ m->put();
+ return;
+ }
+ // always try to get the full range of maps--as many as we can. this
+ // 1- is good to have
+ // 2- is at present the only way to ensure that we get a *full* map as
+ // the first map!
+ if (m->oldest_map < first) {
+ osdmap_subscribe(m->oldest_map - 1, true);
+ m->put();
+ return;
+ }
+ skip_maps = true;
+ }
+
+ ObjectStore::Transaction t;
+ uint64_t txn_size = 0;
+
+ map<epoch_t,mempool::osdmap::map<int64_t,snap_interval_set_t>> purged_snaps;
+
+ // store new maps: queue for disk and put in the osdmap cache
+ epoch_t start = std::max(superblock.newest_map + 1, first);
+ for (epoch_t e = start; e <= last; e++) {
+ if (txn_size >= t.get_num_bytes()) {
+ derr << __func__ << " transaction size overflowed" << dendl;
+ ceph_assert(txn_size < t.get_num_bytes());
+ }
+ txn_size = t.get_num_bytes();
+ map<epoch_t,bufferlist>::iterator p;
+ p = m->maps.find(e);
+ if (p != m->maps.end()) {
+ dout(10) << "handle_osd_map got full map for epoch " << e << dendl;
+ OSDMap *o = new OSDMap;
+ bufferlist& bl = p->second;
+
+ o->decode(bl);
+
+ purged_snaps[e] = o->get_new_purged_snaps();
+
+ ghobject_t fulloid = get_osdmap_pobject_name(e);
+ t.write(coll_t::meta(), fulloid, 0, bl.length(), bl);
+ added_maps[e] = add_map(o);
+ added_maps_bl[e] = bl;
+ got_full_map(e);
+ continue;
+ }
+
+ p = m->incremental_maps.find(e);
+ if (p != m->incremental_maps.end()) {
+ dout(10) << "handle_osd_map got inc map for epoch " << e << dendl;
+ bufferlist& bl = p->second;
+ ghobject_t oid = get_inc_osdmap_pobject_name(e);
+ t.write(coll_t::meta(), oid, 0, bl.length(), bl);
+
+ OSDMap *o = new OSDMap;
+ if (e > 1) {
+ bufferlist obl;
+ bool got = get_map_bl(e - 1, obl);
+ if (!got) {
+ auto p = added_maps_bl.find(e - 1);
+ ceph_assert(p != added_maps_bl.end());
+ obl = p->second;
+ }
+ o->decode(obl);
+ }
+
+ OSDMap::Incremental inc;
+ auto p = bl.cbegin();
+ inc.decode(p);
+
+ if (o->apply_incremental(inc) < 0) {
+ derr << "ERROR: bad fsid? i have " << get_osdmap()->get_fsid() << " and inc has " << inc.fsid << dendl;
+ ceph_abort_msg("bad fsid");
+ }
+
+ bufferlist fbl;
+ o->encode(fbl, inc.encode_features | CEPH_FEATURE_RESERVED);
+
+ bool injected_failure = false;
+ if (cct->_conf->osd_inject_bad_map_crc_probability > 0 &&
+ (rand() % 10000) < cct->_conf->osd_inject_bad_map_crc_probability*10000.0) {
+ derr << __func__ << " injecting map crc failure" << dendl;
+ injected_failure = true;
+ }
+
+ if ((inc.have_crc && o->get_crc() != inc.full_crc) || injected_failure) {
+ dout(2) << "got incremental " << e
+ << " but failed to encode full with correct crc; requesting"
+ << dendl;
+ clog->warn() << "failed to encode map e" << e << " with expected crc";
+ dout(20) << "my encoded map was:\n";
+ fbl.hexdump(*_dout);
+ *_dout << dendl;
+ delete o;
+ request_full_map(e, last);
+ last = e - 1;
+
+ // don't continue committing if we failed to enc the first inc map
+ if (last < start) {
+ dout(10) << __func__ << " bailing because last < start (" << last << "<" << start << ")" << dendl;
+ m->put();
+ return;
+ }
+ break;
+ }
+ got_full_map(e);
+ purged_snaps[e] = o->get_new_purged_snaps();
+
+ ghobject_t fulloid = get_osdmap_pobject_name(e);
+ t.write(coll_t::meta(), fulloid, 0, fbl.length(), fbl);
+ added_maps[e] = add_map(o);
+ added_maps_bl[e] = fbl;
+ continue;
+ }
+
+ ceph_abort_msg("MOSDMap lied about what maps it had?");
+ }
+
+ // even if this map isn't from a mon, we may have satisfied our subscription
+ monc->sub_got("osdmap", last);
+
+ if (!m->maps.empty() && requested_full_first) {
+ dout(10) << __func__ << " still missing full maps " << requested_full_first
+ << ".." << requested_full_last << dendl;
+ rerequest_full_maps();
+ }
+
+ if (superblock.oldest_map) {
+ // make sure we at least keep pace with incoming maps
+ trim_maps(m->oldest_map, last - first + 1, skip_maps);
+ pg_num_history.prune(superblock.oldest_map);
+ }
+
+ if (!superblock.oldest_map || skip_maps)
+ superblock.oldest_map = first;
+ superblock.newest_map = last;
+ superblock.current_epoch = last;
+
+ // note in the superblock that we were clean thru the prior epoch
+ epoch_t boot_epoch = service.get_boot_epoch();
+ if (boot_epoch && boot_epoch >= superblock.mounted) {
+ superblock.mounted = boot_epoch;
+ superblock.clean_thru = last;
+ }
+
+ // check for pg_num changes and deleted pools
+ OSDMapRef lastmap;
+ for (auto& i : added_maps) {
+ if (!lastmap) {
+ if (!(lastmap = service.try_get_map(i.first - 1))) {
+ dout(10) << __func__ << " can't get previous map " << i.first - 1
+ << " probably first start of this osd" << dendl;
+ continue;
+ }
+ }
+ ceph_assert(lastmap->get_epoch() + 1 == i.second->get_epoch());
+ for (auto& j : lastmap->get_pools()) {
+ if (!i.second->have_pg_pool(j.first)) {
+ pg_num_history.log_pool_delete(i.first, j.first);
+ dout(10) << __func__ << " recording final pg_pool_t for pool "
+ << j.first << dendl;
+	// this information is needed by _make_pg() if we have to restart before
+	// the pool is deleted and need to instantiate a new (zombie) PG[Pool].
+ ghobject_t obj = make_final_pool_info_oid(j.first);
+ bufferlist bl;
+ encode(j.second, bl, CEPH_FEATURES_ALL);
+ string name = lastmap->get_pool_name(j.first);
+ encode(name, bl);
+ map<string,string> profile;
+ if (lastmap->get_pg_pool(j.first)->is_erasure()) {
+ profile = lastmap->get_erasure_code_profile(
+ lastmap->get_pg_pool(j.first)->erasure_code_profile);
+ }
+ encode(profile, bl);
+ t.write(coll_t::meta(), obj, 0, bl.length(), bl);
+ } else if (unsigned new_pg_num = i.second->get_pg_num(j.first);
+ new_pg_num != j.second.get_pg_num()) {
+ dout(10) << __func__ << " recording pool " << j.first << " pg_num "
+ << j.second.get_pg_num() << " -> " << new_pg_num << dendl;
+ pg_num_history.log_pg_num_change(i.first, j.first, new_pg_num);
+ }
+ }
+ for (auto& j : i.second->get_pools()) {
+ if (!lastmap->have_pg_pool(j.first)) {
+ dout(10) << __func__ << " recording new pool " << j.first << " pg_num "
+ << j.second.get_pg_num() << dendl;
+ pg_num_history.log_pg_num_change(i.first, j.first,
+ j.second.get_pg_num());
+ }
+ }
+ lastmap = i.second;
+ }
+ pg_num_history.epoch = last;
+ {
+ bufferlist bl;
+ ::encode(pg_num_history, bl);
+ t.write(coll_t::meta(), make_pg_num_history_oid(), 0, bl.length(), bl);
+ dout(20) << __func__ << " pg_num_history " << pg_num_history << dendl;
+ }
+
+ // record new purged_snaps
+ if (superblock.purged_snaps_last == start - 1) {
+ SnapMapper::record_purged_snaps(cct, store, service.meta_ch,
+ make_purged_snaps_oid(), &t,
+ purged_snaps);
+ superblock.purged_snaps_last = last;
+ } else {
+ dout(10) << __func__ << " superblock purged_snaps_last is "
+ << superblock.purged_snaps_last
+ << ", not recording new purged_snaps" << dendl;
+ }
+
+ // superblock and commit
+ write_superblock(t);
+ t.register_on_commit(new C_OnMapCommit(this, start, last, m));
+ store->queue_transaction(
+ service.meta_ch,
+ std::move(t));
+ service.publish_superblock(superblock);
+}
+
+void OSD::_committed_osd_maps(epoch_t first, epoch_t last, MOSDMap *m)
+{
+ dout(10) << __func__ << " " << first << ".." << last << dendl;
+ if (is_stopping()) {
+ dout(10) << __func__ << " bailing, we are shutting down" << dendl;
+ return;
+ }
+ std::lock_guard l(osd_lock);
+ if (is_stopping()) {
+ dout(10) << __func__ << " bailing, we are shutting down" << dendl;
+ return;
+ }
+ map_lock.lock();
+
+ ceph_assert(first <= last);
+
+ bool do_shutdown = false;
+ bool do_restart = false;
+ bool network_error = false;
+ OSDMapRef osdmap = get_osdmap();
+
+ // advance through the new maps
+ for (epoch_t cur = first; cur <= last; cur++) {
+ dout(10) << " advance to epoch " << cur
+ << " (<= last " << last
+ << " <= newest_map " << superblock.newest_map
+ << ")" << dendl;
+
+ OSDMapRef newmap = get_map(cur);
+ ceph_assert(newmap); // we just cached it above!
+
+ // start blocklisting messages sent to peers that go down.
+ service.pre_publish_map(newmap);
+
+ // kill connections to newly down osds
+ bool waited_for_reservations = false;
+ set<int> old;
+ osdmap = get_osdmap();
+ osdmap->get_all_osds(old);
+ for (set<int>::iterator p = old.begin(); p != old.end(); ++p) {
+ if (*p != whoami &&
+ osdmap->is_up(*p) && // in old map
+ newmap->is_down(*p)) { // but not the new one
+ if (!waited_for_reservations) {
+ service.await_reserved_maps();
+ waited_for_reservations = true;
+ }
+ note_down_osd(*p);
+ } else if (*p != whoami &&
+ osdmap->is_down(*p) &&
+ newmap->is_up(*p)) {
+ note_up_osd(*p);
+ }
+ }
+
+ if (osdmap->is_noup(whoami) != newmap->is_noup(whoami)) {
+ dout(10) << __func__ << " NOUP flag changed in " << newmap->get_epoch()
+ << dendl;
+ if (is_booting()) {
+ // this captures the case where we sent the boot message while
+ // NOUP was being set on the mon and our boot request was
+ // dropped, and then later it is cleared. it imperfectly
+ // handles the case where our original boot message was not
+ // dropped and we restart even though we might have booted, but
+ // that is harmless (boot will just take slightly longer).
+ do_restart = true;
+ }
+ }
+
+ osdmap = std::move(newmap);
+ set_osdmap(osdmap);
+ epoch_t up_epoch;
+ epoch_t boot_epoch;
+ service.retrieve_epochs(&boot_epoch, &up_epoch, NULL);
+ if (!up_epoch &&
+ osdmap->is_up(whoami) &&
+ osdmap->get_addrs(whoami) == client_messenger->get_myaddrs()) {
+ up_epoch = osdmap->get_epoch();
+ dout(10) << "up_epoch is " << up_epoch << dendl;
+ if (!boot_epoch) {
+ boot_epoch = osdmap->get_epoch();
+ dout(10) << "boot_epoch is " << boot_epoch << dendl;
+ }
+ service.set_epochs(&boot_epoch, &up_epoch, NULL);
+ }
+ }
+
+ epoch_t _bind_epoch = service.get_bind_epoch();
+ if (osdmap->is_up(whoami) &&
+ osdmap->get_addrs(whoami).legacy_equals(
+ client_messenger->get_myaddrs()) &&
+ _bind_epoch < osdmap->get_up_from(whoami)) {
+
+ if (is_booting()) {
+ dout(1) << "state: booting -> active" << dendl;
+ set_state(STATE_ACTIVE);
+ do_restart = false;
+
+ // set incarnation so that osd_reqid_t's we generate for our
+ // objecter requests are unique across restarts.
+ service.objecter->set_client_incarnation(osdmap->get_epoch());
+ cancel_pending_failures();
+ }
+ }
+
+ if (osdmap->get_epoch() > 0 &&
+ is_active()) {
+ if (!osdmap->exists(whoami)) {
+ derr << "map says i do not exist. shutting down." << dendl;
+ do_shutdown = true; // don't call shutdown() while we have
+ // everything paused
+ } else if (osdmap->is_stop(whoami)) {
+ derr << "map says i am stopped by admin. shutting down." << dendl;
+ do_shutdown = true;
+ } else if (!osdmap->is_up(whoami) ||
+ !osdmap->get_addrs(whoami).legacy_equals(
+ client_messenger->get_myaddrs()) ||
+ !osdmap->get_cluster_addrs(whoami).legacy_equals(
+ cluster_messenger->get_myaddrs()) ||
+ !osdmap->get_hb_back_addrs(whoami).legacy_equals(
+ hb_back_server_messenger->get_myaddrs()) ||
+ !osdmap->get_hb_front_addrs(whoami).legacy_equals(
+ hb_front_server_messenger->get_myaddrs())) {
+ if (!osdmap->is_up(whoami)) {
+ if (service.is_preparing_to_stop() || service.is_stopping()) {
+ service.got_stop_ack();
+ } else {
+ clog->warn() << "Monitor daemon marked osd." << whoami << " down, "
+ "but it is still running";
+ clog->debug() << "map e" << osdmap->get_epoch()
+ << " wrongly marked me down at e"
+ << osdmap->get_down_at(whoami);
+ }
+ if (monc->monmap.min_mon_release >= ceph_release_t::octopus) {
+ // note that this is best-effort...
+ monc->send_mon_message(
+ new MOSDMarkMeDead(
+ monc->get_fsid(),
+ whoami,
+ osdmap->get_epoch()));
+ }
+ } else if (!osdmap->get_addrs(whoami).legacy_equals(
+ client_messenger->get_myaddrs())) {
+ clog->error() << "map e" << osdmap->get_epoch()
+ << " had wrong client addr (" << osdmap->get_addrs(whoami)
+ << " != my " << client_messenger->get_myaddrs() << ")";
+ } else if (!osdmap->get_cluster_addrs(whoami).legacy_equals(
+ cluster_messenger->get_myaddrs())) {
+ clog->error() << "map e" << osdmap->get_epoch()
+ << " had wrong cluster addr ("
+ << osdmap->get_cluster_addrs(whoami)
+ << " != my " << cluster_messenger->get_myaddrs() << ")";
+ } else if (!osdmap->get_hb_back_addrs(whoami).legacy_equals(
+ hb_back_server_messenger->get_myaddrs())) {
+ clog->error() << "map e" << osdmap->get_epoch()
+ << " had wrong heartbeat back addr ("
+ << osdmap->get_hb_back_addrs(whoami)
+ << " != my " << hb_back_server_messenger->get_myaddrs()
+ << ")";
+ } else if (!osdmap->get_hb_front_addrs(whoami).legacy_equals(
+ hb_front_server_messenger->get_myaddrs())) {
+ clog->error() << "map e" << osdmap->get_epoch()
+ << " had wrong heartbeat front addr ("
+ << osdmap->get_hb_front_addrs(whoami)
+ << " != my " << hb_front_server_messenger->get_myaddrs()
+ << ")";
+ }
+
+ if (!service.is_stopping()) {
+ epoch_t up_epoch = 0;
+ epoch_t bind_epoch = osdmap->get_epoch();
+ service.set_epochs(NULL,&up_epoch, &bind_epoch);
+ do_restart = true;
+
+ //add markdown log
+ utime_t now = ceph_clock_now();
+ utime_t grace = utime_t(cct->_conf->osd_max_markdown_period, 0);
+ osd_markdown_log.push_back(now);
+ if ((int)osd_markdown_log.size() > cct->_conf->osd_max_markdown_count) {
+ derr << __func__ << " marked down "
+ << osd_markdown_log.size()
+ << " > osd_max_markdown_count "
+ << cct->_conf->osd_max_markdown_count
+ << " in last " << grace << " seconds, shutting down"
+ << dendl;
+ do_restart = false;
+ do_shutdown = true;
+ }
+
+ start_waiting_for_healthy();
+
+ set<int> avoid_ports;
+#if defined(__FreeBSD__)
+        // prevent FreeBSD from grabbing the client_messenger port during
+        // rebinding; otherwise the cluster_messenger may end up connecting
+        // to the same port
+ client_messenger->get_myaddrs().get_ports(&avoid_ports);
+#endif
+ cluster_messenger->get_myaddrs().get_ports(&avoid_ports);
+
+ int r = cluster_messenger->rebind(avoid_ports);
+ if (r != 0) {
+ do_shutdown = true; // FIXME: do_restart?
+ network_error = true;
+ derr << __func__ << " marked down:"
+ << " rebind cluster_messenger failed" << dendl;
+ }
+
+ hb_back_server_messenger->mark_down_all();
+ hb_front_server_messenger->mark_down_all();
+ hb_front_client_messenger->mark_down_all();
+ hb_back_client_messenger->mark_down_all();
+
+ reset_heartbeat_peers(true);
+ }
+ }
+ } else if (osdmap->get_epoch() > 0 && osdmap->is_stop(whoami)) {
+ derr << "map says i am stopped by admin. shutting down." << dendl;
+ do_shutdown = true;
+ }
+
+ map_lock.unlock();
+
+ check_osdmap_features();
+
+ // yay!
+ consume_map();
+
+ if (is_active() || is_waiting_for_healthy())
+ maybe_update_heartbeat_peers();
+
+ if (is_active()) {
+ activate_map();
+ }
+
+ if (do_shutdown) {
+ if (network_error) {
+ cancel_pending_failures();
+ }
+ // trigger shutdown in a different thread
+ dout(0) << __func__ << " shutdown OSD via async signal" << dendl;
+ queue_async_signal(SIGINT);
+ }
+ else if (m->newest_map && m->newest_map > last) {
+ dout(10) << " msg say newest map is " << m->newest_map
+ << ", requesting more" << dendl;
+ osdmap_subscribe(osdmap->get_epoch()+1, false);
+ }
+ else if (is_preboot()) {
+ if (m->get_source().is_mon())
+ _preboot(m->oldest_map, m->newest_map);
+ else
+ start_boot();
+ }
+ else if (do_restart)
+ start_boot();
+
+}
+
+void OSD::check_osdmap_features()
+{
+ // adjust required feature bits?
+
+  // we have to be a bit careful here, because we are accessing the
+  // Policy structures without taking any lock.  in particular, only
+  // modify integer values that can safely be read by a racing CPU.
+  // since we are only accessing existing Policy structures at their
+  // current memory location, and setting or clearing bits in integer
+  // fields, and we are the only writer, this is not a problem.
+
+ const auto osdmap = get_osdmap();
+ {
+ Messenger::Policy p = client_messenger->get_default_policy();
+ uint64_t mask;
+ uint64_t features = osdmap->get_features(entity_name_t::TYPE_CLIENT, &mask);
+ if ((p.features_required & mask) != features) {
+ dout(0) << "crush map has features " << features
+ << ", adjusting msgr requires for clients" << dendl;
+ p.features_required = (p.features_required & ~mask) | features;
+ client_messenger->set_default_policy(p);
+ }
+ }
+ {
+ Messenger::Policy p = client_messenger->get_policy(entity_name_t::TYPE_MON);
+ uint64_t mask;
+ uint64_t features = osdmap->get_features(entity_name_t::TYPE_MON, &mask);
+ if ((p.features_required & mask) != features) {
+ dout(0) << "crush map has features " << features
+ << " was " << p.features_required
+ << ", adjusting msgr requires for mons" << dendl;
+ p.features_required = (p.features_required & ~mask) | features;
+ client_messenger->set_policy(entity_name_t::TYPE_MON, p);
+ }
+ }
+ {
+ Messenger::Policy p = cluster_messenger->get_policy(entity_name_t::TYPE_OSD);
+ uint64_t mask;
+ uint64_t features = osdmap->get_features(entity_name_t::TYPE_OSD, &mask);
+
+ if ((p.features_required & mask) != features) {
+ dout(0) << "crush map has features " << features
+ << ", adjusting msgr requires for osds" << dendl;
+ p.features_required = (p.features_required & ~mask) | features;
+ cluster_messenger->set_policy(entity_name_t::TYPE_OSD, p);
+ }
+
+ if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_SHARDS)) {
+ dout(0) << __func__ << " enabling on-disk ERASURE CODES compat feature" << dendl;
+ superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
+ ObjectStore::Transaction t;
+ write_superblock(t);
+ int err = store->queue_transaction(service.meta_ch, std::move(t), NULL);
+ ceph_assert(err == 0);
+ }
+ }
+
+ if (osdmap->require_osd_release < ceph_release_t::nautilus) {
+ hb_front_server_messenger->set_require_authorizer(false);
+ hb_back_server_messenger->set_require_authorizer(false);
+ } else {
+ hb_front_server_messenger->set_require_authorizer(true);
+ hb_back_server_messenger->set_require_authorizer(true);
+ }
+
+ if (osdmap->require_osd_release != last_require_osd_release) {
+ dout(1) << __func__ << " require_osd_release " << last_require_osd_release
+ << " -> " << to_string(osdmap->require_osd_release) << dendl;
+ store->write_meta("require_osd_release",
+ stringify((int)osdmap->require_osd_release));
+ last_require_osd_release = osdmap->require_osd_release;
+ }
+}
+
+struct C_FinishSplits : public Context {
+ OSD *osd;
+ set<PGRef> pgs;
+ C_FinishSplits(OSD *osd, const set<PGRef> &in)
+ : osd(osd), pgs(in) {}
+ void finish(int r) override {
+ osd->_finish_splits(pgs);
+ }
+};
+
+void OSD::_finish_splits(set<PGRef>& pgs)
+{
+ dout(10) << __func__ << " " << pgs << dendl;
+ if (is_stopping())
+ return;
+ for (set<PGRef>::iterator i = pgs.begin();
+ i != pgs.end();
+ ++i) {
+ PG *pg = i->get();
+
+ PeeringCtx rctx = create_context();
+ pg->lock();
+ dout(10) << __func__ << " " << *pg << dendl;
+ epoch_t e = pg->get_osdmap_epoch();
+ pg->handle_initialize(rctx);
+ pg->queue_null(e, e);
+ dispatch_context(rctx, pg, service.get_osdmap());
+ pg->unlock();
+
+ unsigned shard_index = pg->pg_id.hash_to_shard(num_shards);
+ shards[shard_index]->register_and_wake_split_child(pg);
+ }
+}
+
+bool OSD::add_merge_waiter(OSDMapRef nextmap, spg_t target, PGRef src,
+ unsigned need)
+{
+ std::lock_guard l(merge_lock);
+ auto& p = merge_waiters[nextmap->get_epoch()][target];
+ p[src->pg_id] = src;
+ dout(10) << __func__ << " added merge_waiter " << src->pg_id
+ << " for " << target << ", have " << p.size() << "/" << need
+ << dendl;
+ return p.size() == need;
+}
+
+bool OSD::advance_pg(
+ epoch_t osd_epoch,
+ PG *pg,
+ ThreadPool::TPHandle &handle,
+ PeeringCtx &rctx)
+{
+ if (osd_epoch <= pg->get_osdmap_epoch()) {
+ return true;
+ }
+ ceph_assert(pg->is_locked());
+ OSDMapRef lastmap = pg->get_osdmap();
+ set<PGRef> new_pgs; // any split children
+ bool ret = true;
+
+ unsigned old_pg_num = lastmap->have_pg_pool(pg->pg_id.pool()) ?
+ lastmap->get_pg_num(pg->pg_id.pool()) : 0;
+ for (epoch_t next_epoch = pg->get_osdmap_epoch() + 1;
+ next_epoch <= osd_epoch;
+ ++next_epoch) {
+ OSDMapRef nextmap = service.try_get_map(next_epoch);
+ if (!nextmap) {
+ dout(20) << __func__ << " missing map " << next_epoch << dendl;
+ continue;
+ }
+
+ unsigned new_pg_num =
+ (old_pg_num && nextmap->have_pg_pool(pg->pg_id.pool())) ?
+ nextmap->get_pg_num(pg->pg_id.pool()) : 0;
+ if (old_pg_num && new_pg_num && old_pg_num != new_pg_num) {
+ // check for merge
+ if (nextmap->have_pg_pool(pg->pg_id.pool())) {
+ spg_t parent;
+ if (pg->pg_id.is_merge_source(
+ old_pg_num,
+ new_pg_num,
+ &parent)) {
+ // we are merge source
+ PGRef spg = pg; // carry a ref
+ dout(1) << __func__ << " " << pg->pg_id
+ << " is merge source, target is " << parent
+ << dendl;
+ pg->write_if_dirty(rctx);
+ if (!new_pgs.empty()) {
+ rctx.transaction.register_on_applied(new C_FinishSplits(this,
+ new_pgs));
+ new_pgs.clear();
+ }
+ dispatch_context(rctx, pg, pg->get_osdmap(), &handle);
+ pg->ch->flush();
+ // release backoffs explicitly, since the on_shutdown path
+ // aggressively tears down backoff state.
+ if (pg->is_primary()) {
+ pg->release_pg_backoffs();
+ }
+ pg->on_shutdown();
+ OSDShard *sdata = pg->osd_shard;
+ {
+ std::lock_guard l(sdata->shard_lock);
+ if (pg->pg_slot) {
+ sdata->_detach_pg(pg->pg_slot);
+ // update pg count now since we might not get an osdmap
+ // any time soon.
+ if (pg->is_primary())
+ logger->dec(l_osd_pg_primary);
+ else if (pg->is_nonprimary())
+ logger->dec(l_osd_pg_replica); // misnomer
+ else
+ logger->dec(l_osd_pg_stray);
+ }
+ }
+ pg->unlock();
+
+ set<spg_t> children;
+ parent.is_split(new_pg_num, old_pg_num, &children);
+ if (add_merge_waiter(nextmap, parent, pg, children.size())) {
+ enqueue_peering_evt(
+ parent,
+ PGPeeringEventRef(
+ std::make_shared<PGPeeringEvent>(
+ nextmap->get_epoch(),
+ nextmap->get_epoch(),
+ NullEvt())));
+ }
+ ret = false;
+ goto out;
+ } else if (pg->pg_id.is_merge_target(old_pg_num, new_pg_num)) {
+ // we are merge target
+ set<spg_t> children;
+ pg->pg_id.is_split(new_pg_num, old_pg_num, &children);
+ dout(20) << __func__ << " " << pg->pg_id
+ << " is merge target, sources are " << children
+ << dendl;
+ map<spg_t,PGRef> sources;
+ {
+ std::lock_guard l(merge_lock);
+ auto& s = merge_waiters[nextmap->get_epoch()][pg->pg_id];
+ unsigned need = children.size();
+ dout(20) << __func__ << " have " << s.size() << "/"
+ << need << dendl;
+ if (s.size() == need) {
+ sources.swap(s);
+ merge_waiters[nextmap->get_epoch()].erase(pg->pg_id);
+ if (merge_waiters[nextmap->get_epoch()].empty()) {
+ merge_waiters.erase(nextmap->get_epoch());
+ }
+ }
+ }
+ if (!sources.empty()) {
+ unsigned new_pg_num = nextmap->get_pg_num(pg->pg_id.pool());
+ unsigned split_bits = pg->pg_id.get_split_bits(new_pg_num);
+ dout(1) << __func__ << " merging " << pg->pg_id << dendl;
+ pg->merge_from(
+ sources, rctx, split_bits,
+ nextmap->get_pg_pool(
+ pg->pg_id.pool())->last_pg_merge_meta);
+ pg->pg_slot->waiting_for_merge_epoch = 0;
+ } else {
+ dout(20) << __func__ << " not ready to merge yet" << dendl;
+ pg->write_if_dirty(rctx);
+ if (!new_pgs.empty()) {
+ rctx.transaction.register_on_applied(new C_FinishSplits(this,
+ new_pgs));
+ new_pgs.clear();
+ }
+ dispatch_context(rctx, pg, pg->get_osdmap(), &handle);
+ pg->unlock();
+ // kick source(s) to get them ready
+ for (auto& i : children) {
+ dout(20) << __func__ << " kicking source " << i << dendl;
+ enqueue_peering_evt(
+ i,
+ PGPeeringEventRef(
+ std::make_shared<PGPeeringEvent>(
+ nextmap->get_epoch(),
+ nextmap->get_epoch(),
+ NullEvt())));
+ }
+ ret = false;
+ goto out;
+ }
+ }
+ }
+ }
+
+ vector<int> newup, newacting;
+ int up_primary, acting_primary;
+ nextmap->pg_to_up_acting_osds(
+ pg->pg_id.pgid,
+ &newup, &up_primary,
+ &newacting, &acting_primary);
+ pg->handle_advance_map(
+ nextmap, lastmap, newup, up_primary,
+ newacting, acting_primary, rctx);
+
+ auto oldpool = lastmap->get_pools().find(pg->pg_id.pool());
+ auto newpool = nextmap->get_pools().find(pg->pg_id.pool());
+ if (oldpool != lastmap->get_pools().end()
+ && newpool != nextmap->get_pools().end()) {
+ dout(20) << __func__
+ << " new pool opts " << newpool->second.opts
+ << " old pool opts " << oldpool->second.opts
+ << dendl;
+
+ double old_min_interval = 0, new_min_interval = 0;
+ oldpool->second.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &old_min_interval);
+ newpool->second.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &new_min_interval);
+
+ double old_max_interval = 0, new_max_interval = 0;
+ oldpool->second.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &old_max_interval);
+ newpool->second.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &new_max_interval);
+
+      // Assume that if an interval changes from set to unset or vice versa the
+      // actual config is different.  Keep it simple even if it is possible to
+      // call resched_all_scrubs() unnecessarily.
+ if (old_min_interval != new_min_interval || old_max_interval != new_max_interval) {
+ pg->on_info_history_change();
+ }
+ }
+
+ if (new_pg_num && old_pg_num != new_pg_num) {
+ // check for split
+ set<spg_t> children;
+ if (pg->pg_id.is_split(
+ old_pg_num,
+ new_pg_num,
+ &children)) {
+ split_pgs(
+ pg, children, &new_pgs, lastmap, nextmap,
+ rctx);
+ }
+ }
+
+ lastmap = nextmap;
+ old_pg_num = new_pg_num;
+ handle.reset_tp_timeout();
+ }
+ pg->handle_activate_map(rctx);
+
+ ret = true;
+ out:
+ if (!new_pgs.empty()) {
+ rctx.transaction.register_on_applied(new C_FinishSplits(this, new_pgs));
+ }
+ return ret;
+}
+
+void OSD::consume_map()
+{
+ ceph_assert(ceph_mutex_is_locked(osd_lock));
+ auto osdmap = get_osdmap();
+ dout(7) << "consume_map version " << osdmap->get_epoch() << dendl;
+
+ /** make sure the cluster is speaking in SORTBITWISE, because we don't
+ * speak the older sorting version any more. Be careful not to force
+ * a shutdown if we are merely processing old maps, though.
+ */
+ if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE) && is_active()) {
+ derr << __func__ << " SORTBITWISE flag is not set" << dendl;
+ ceph_abort();
+ }
+
+ service.pre_publish_map(osdmap);
+ service.await_reserved_maps();
+ service.publish_map(osdmap);
+
+ // prime splits and merges
+ set<pair<spg_t,epoch_t>> newly_split; // splits, and when
+ set<pair<spg_t,epoch_t>> merge_pgs; // merge participants, and when
+ for (auto& shard : shards) {
+ shard->identify_splits_and_merges(osdmap, &newly_split, &merge_pgs);
+ }
+ if (!newly_split.empty()) {
+ for (auto& shard : shards) {
+ shard->prime_splits(osdmap, &newly_split);
+ }
+ ceph_assert(newly_split.empty());
+ }
+
+ // prune sent_ready_to_merge
+ service.prune_sent_ready_to_merge(osdmap);
+
+ // FIXME, maybe: We could race against an incoming peering message
+ // that instantiates a merge PG after identify_merges() below and
+ // never set up its peer to complete the merge. An OSD restart
+ // would clear it up. This is a hard race to resolve,
+ // extraordinarily rare (we only merge PGs that are stable and
+ // clean, so it'd have to be an imported PG to an OSD with a
+ // slightly stale OSDMap...), so I'm ignoring it for now. We plan to
+  // replace all of this with seastar-based code soon anyway.
+ if (!merge_pgs.empty()) {
+ // mark the pgs we already have, or create new and empty merge
+ // participants for those we are missing. do this all under the
+ // shard lock so we don't have to worry about racing pg creates
+ // via _process.
+ for (auto& shard : shards) {
+ shard->prime_merges(osdmap, &merge_pgs);
+ }
+ ceph_assert(merge_pgs.empty());
+ }
+
+ service.prune_pg_created();
+
+ unsigned pushes_to_free = 0;
+ for (auto& shard : shards) {
+ shard->consume_map(osdmap, &pushes_to_free);
+ }
+
+ vector<spg_t> pgids;
+ _get_pgids(&pgids);
+
+ // count (FIXME, probably during seastar rewrite)
+ int num_pg_primary = 0, num_pg_replica = 0, num_pg_stray = 0;
+ vector<PGRef> pgs;
+ _get_pgs(&pgs);
+ for (auto& pg : pgs) {
+ // FIXME (probably during seastar rewrite): this is lockless and
+ // racy, but we don't want to take pg lock here.
+ if (pg->is_primary())
+ num_pg_primary++;
+ else if (pg->is_nonprimary())
+ num_pg_replica++; // misnomer
+ else
+ num_pg_stray++;
+ }
+
+ {
+ // FIXME (as part of seastar rewrite): move to OSDShard
+ std::lock_guard l(pending_creates_lock);
+ for (auto pg = pending_creates_from_osd.begin();
+ pg != pending_creates_from_osd.end();) {
+ if (osdmap->get_pg_acting_role(pg->first, whoami) < 0) {
+ dout(10) << __func__ << " pg " << pg->first << " doesn't map here, "
+ << "discarding pending_create_from_osd" << dendl;
+ pg = pending_creates_from_osd.erase(pg);
+ } else {
+ ++pg;
+ }
+ }
+ }
+
+ service.maybe_inject_dispatch_delay();
+
+ dispatch_sessions_waiting_on_map();
+
+ service.maybe_inject_dispatch_delay();
+
+ service.release_reserved_pushes(pushes_to_free);
+
+ // queue null events to push maps down to individual PGs
+ for (auto pgid : pgids) {
+ enqueue_peering_evt(
+ pgid,
+ PGPeeringEventRef(
+ std::make_shared<PGPeeringEvent>(
+ osdmap->get_epoch(),
+ osdmap->get_epoch(),
+ NullEvt())));
+ }
+ logger->set(l_osd_pg, pgids.size());
+ logger->set(l_osd_pg_primary, num_pg_primary);
+ logger->set(l_osd_pg_replica, num_pg_replica);
+ logger->set(l_osd_pg_stray, num_pg_stray);
+}
+
+void OSD::activate_map()
+{
+ ceph_assert(ceph_mutex_is_locked(osd_lock));
+ auto osdmap = get_osdmap();
+
+ dout(7) << "activate_map version " << osdmap->get_epoch() << dendl;
+
+ // norecover?
+ if (osdmap->test_flag(CEPH_OSDMAP_NORECOVER)) {
+ if (!service.recovery_is_paused()) {
+ dout(1) << "pausing recovery (NORECOVER flag set)" << dendl;
+ service.pause_recovery();
+ }
+ } else {
+ if (service.recovery_is_paused()) {
+ dout(1) << "unpausing recovery (NORECOVER flag unset)" << dendl;
+ service.unpause_recovery();
+ }
+ }
+
+ service.activate_map();
+
+ // process waiters
+ take_waiters(waiting_for_osdmap);
+}
+
+bool OSD::require_mon_peer(const Message *m)
+{
+ if (!m->get_connection()->peer_is_mon()) {
+ dout(0) << "require_mon_peer received from non-mon "
+ << m->get_connection()->get_peer_addr()
+ << " " << *m << dendl;
+ return false;
+ }
+ return true;
+}
+
+bool OSD::require_mon_or_mgr_peer(const Message *m)
+{
+ if (!m->get_connection()->peer_is_mon() &&
+ !m->get_connection()->peer_is_mgr()) {
+ dout(0) << "require_mon_or_mgr_peer received from non-mon, non-mgr "
+ << m->get_connection()->get_peer_addr()
+ << " " << *m << dendl;
+ return false;
+ }
+ return true;
+}
+
+bool OSD::require_osd_peer(const Message *m)
+{
+ if (!m->get_connection()->peer_is_osd()) {
+ dout(0) << "require_osd_peer received from non-osd "
+ << m->get_connection()->get_peer_addr()
+ << " " << *m << dendl;
+ return false;
+ }
+ return true;
+}
+
+bool OSD::require_self_aliveness(const Message *m, epoch_t epoch)
+{
+ epoch_t up_epoch = service.get_up_epoch();
+ if (epoch < up_epoch) {
+ dout(7) << "from pre-up epoch " << epoch << " < " << up_epoch << dendl;
+ return false;
+ }
+
+ if (!is_active()) {
+ dout(7) << "still in boot state, dropping message " << *m << dendl;
+ return false;
+ }
+
+ return true;
+}
+
+bool OSD::require_same_peer_instance(const Message *m, const OSDMapRef& map,
+ bool is_fast_dispatch)
+{
+ int from = m->get_source().num();
+
+ if (map->is_down(from) ||
+ (map->get_cluster_addrs(from) != m->get_source_addrs())) {
+ dout(5) << "from dead osd." << from << ", marking down, "
+ << " msg was " << m->get_source_inst().addr
+ << " expected "
+ << (map->is_up(from) ?
+ map->get_cluster_addrs(from) : entity_addrvec_t())
+ << dendl;
+ ConnectionRef con = m->get_connection();
+ con->mark_down();
+ if (auto s = ceph::ref_cast<Session>(con->get_priv()); s) {
+ if (!is_fast_dispatch)
+ s->session_dispatch_lock.lock();
+ clear_session_waiting_on_map(s);
+ con->set_priv(nullptr); // break ref <-> session cycle, if any
+ s->con.reset();
+ if (!is_fast_dispatch)
+ s->session_dispatch_lock.unlock();
+ }
+ return false;
+ }
+ return true;
+}
+
+
+/*
+ * require that we have the same (or newer) map, and that
+ * the source is the pg primary.
+ */
+bool OSD::require_same_or_newer_map(OpRequestRef& op, epoch_t epoch,
+ bool is_fast_dispatch)
+{
+ const Message *m = op->get_req();
+ const auto osdmap = get_osdmap();
+ dout(15) << "require_same_or_newer_map " << epoch
+ << " (i am " << osdmap->get_epoch() << ") " << m << dendl;
+
+ ceph_assert(ceph_mutex_is_locked(osd_lock));
+
+ // do they have a newer map?
+ if (epoch > osdmap->get_epoch()) {
+ dout(7) << "waiting for newer map epoch " << epoch
+ << " > my " << osdmap->get_epoch() << " with " << m << dendl;
+ wait_for_new_map(op);
+ return false;
+ }
+
+ if (!require_self_aliveness(op->get_req(), epoch)) {
+ return false;
+ }
+
+ // ok, our map is same or newer.. do they still exist?
+ if (m->get_connection()->get_messenger() == cluster_messenger &&
+ !require_same_peer_instance(op->get_req(), osdmap, is_fast_dispatch)) {
+ return false;
+ }
+
+ return true;
+}
+
+
+
+
+
+// ----------------------------------------
+// pg creation
+
+void OSD::split_pgs(
+ PG *parent,
+ const set<spg_t> &childpgids, set<PGRef> *out_pgs,
+ OSDMapRef curmap,
+ OSDMapRef nextmap,
+ PeeringCtx &rctx)
+{
+ unsigned pg_num = nextmap->get_pg_num(parent->pg_id.pool());
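+  // switch the parent's snap mapper prefix to the post-split hash bit count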
+ parent->update_snap_mapper_bits(parent->get_pgid().get_split_bits(pg_num));
+
+ vector<object_stat_sum_t> updated_stats;
+ parent->start_split_stats(childpgids, &updated_stats);
+
+ vector<object_stat_sum_t>::iterator stat_iter = updated_stats.begin();
+ for (set<spg_t>::const_iterator i = childpgids.begin();
+ i != childpgids.end();
+ ++i, ++stat_iter) {
+ ceph_assert(stat_iter != updated_stats.end());
+ dout(10) << __func__ << " splitting " << *parent << " into " << *i << dendl;
+ PG* child = _make_pg(nextmap, *i);
+ child->lock(true);
+ out_pgs->insert(child);
+ child->ch = store->create_new_collection(child->coll);
+
+ {
+ uint32_t shard_index = i->hash_to_shard(shards.size());
+ assert(NULL != shards[shard_index]);
+ store->set_collection_commit_queue(child->coll, &(shards[shard_index]->context_queue));
+ }
+
+ unsigned split_bits = i->get_split_bits(pg_num);
+ dout(10) << " pg_num is " << pg_num
+ << ", m_seed " << i->ps()
+ << ", split_bits is " << split_bits << dendl;
+ parent->split_colls(
+ *i,
+ split_bits,
+ i->ps(),
+ &child->get_pool().info,
+ rctx.transaction);
+ parent->split_into(
+ i->pgid,
+ child,
+ split_bits);
+
+ child->init_collection_pool_opts();
+
+ child->finish_split_stats(*stat_iter, rctx.transaction);
+ child->unlock();
+ }
+ ceph_assert(stat_iter != updated_stats.end());
+ parent->finish_split_stats(*stat_iter, rctx.transaction);
+}
+
+/*
+ * holding osd_lock
+ */
+void OSD::handle_pg_create(OpRequestRef op)
+{
+ // NOTE: this can be removed in P release (mimic is the last version to
+ // send MOSDPGCreate messages).
+
+ auto m = op->get_req<MOSDPGCreate>();
+ ceph_assert(m->get_type() == MSG_OSD_PG_CREATE);
+
+ dout(10) << "handle_pg_create " << *m << dendl;
+
+ if (!require_mon_peer(op->get_req())) {
+ return;
+ }
+
+ if (!require_same_or_newer_map(op, m->epoch, false))
+ return;
+
+ op->mark_started();
+
+ const auto osdmap = get_osdmap();
+ map<pg_t,utime_t>::const_iterator ci = m->ctimes.begin();
+ for (map<pg_t,pg_create_t>::const_iterator p = m->mkpg.begin();
+ p != m->mkpg.end();
+ ++p, ++ci) {
+ ceph_assert(ci != m->ctimes.end() && ci->first == p->first);
+ epoch_t created = p->second.created;
+ if (p->second.split_bits) // Skip split pgs
+ continue;
+ pg_t on = p->first;
+
+ if (!osdmap->have_pg_pool(on.pool())) {
+ dout(20) << "ignoring pg on deleted pool " << on << dendl;
+ continue;
+ }
+
+ dout(20) << "mkpg " << on << " e" << created << "@" << ci->second << dendl;
+
+ spg_t pgid;
+ bool mapped = osdmap->get_primary_shard(on, &pgid);
+ ceph_assert(mapped);
+
+ // is it still ours?
+ vector<int> up, acting;
+ int up_primary = -1;
+ int acting_primary = -1;
+ osdmap->pg_to_up_acting_osds(on, &up, &up_primary, &acting, &acting_primary);
+ int role = osdmap->calc_pg_role(pg_shard_t(whoami, pgid.shard), acting);
+
+ if (acting_primary != whoami) {
+ dout(10) << "mkpg " << on << " not acting_primary (" << acting_primary
+ << "), my role=" << role << ", skipping" << dendl;
+ continue;
+ }
+
+
+ PastIntervals pi;
+ pg_history_t history;
+ build_initial_pg_history(pgid, created, ci->second, &history, &pi);
+
+ // The mon won't resend unless the primary changed, so we ignore
+ // same_interval_since. We'll pass this history with the current
+ // epoch as the event.
+ if (history.same_primary_since > m->epoch) {
+ dout(10) << __func__ << ": got obsolete pg create on pgid "
+ << pgid << " from epoch " << m->epoch
+ << ", primary changed in " << history.same_primary_since
+ << dendl;
+ continue;
+ }
+ enqueue_peering_evt(
+ pgid,
+ PGPeeringEventRef(
+ std::make_shared<PGPeeringEvent>(
+ osdmap->get_epoch(),
+ osdmap->get_epoch(),
+ NullEvt(),
+ true,
+ new PGCreateInfo(
+ pgid,
+ osdmap->get_epoch(),
+ history,
+ pi,
+ true)
+ )));
+ }
+
+ {
+ std::lock_guard l(pending_creates_lock);
+ if (pending_creates_from_mon == 0) {
+ last_pg_create_epoch = m->epoch;
+ }
+ }
+
+ maybe_update_heartbeat_peers();
+}
+
+
+// ----------------------------------------
+// peering and recovery
+
+PeeringCtx OSD::create_context()
+{
+ return PeeringCtx(get_osdmap()->require_osd_release);
+}
+
+void OSD::dispatch_context(PeeringCtx &ctx, PG *pg, OSDMapRef curmap,
+ ThreadPool::TPHandle *handle)
+{
+ if (!service.get_osdmap()->is_up(whoami)) {
+ dout(20) << __func__ << " not up in osdmap" << dendl;
+ } else if (!is_active()) {
+ dout(20) << __func__ << " not active" << dendl;
+ } else {
+ for (auto& [osd, ls] : ctx.message_map) {
+ if (!curmap->is_up(osd)) {
+ dout(20) << __func__ << " skipping down osd." << osd << dendl;
+ continue;
+ }
+ ConnectionRef con = service.get_con_osd_cluster(
+ osd, curmap->get_epoch());
+ if (!con) {
+ dout(20) << __func__ << " skipping osd." << osd << " (NULL con)"
+ << dendl;
+ continue;
+ }
+ service.maybe_share_map(con.get(), curmap);
+ for (auto m : ls) {
+ con->send_message2(m);
+ }
+ ls.clear();
+ }
+ }
+ if ((!ctx.transaction.empty() || ctx.transaction.has_contexts()) && pg) {
+ int tr = store->queue_transaction(
+ pg->ch,
+ std::move(ctx.transaction), TrackedOpRef(),
+ handle);
+ ceph_assert(tr == 0);
+ }
+}
+
+void OSD::handle_fast_pg_create(MOSDPGCreate2 *m)
+{
+ dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
+ if (!require_mon_peer(m)) {
+ m->put();
+ return;
+ }
+ for (auto& p : m->pgs) {
+ spg_t pgid = p.first;
+ epoch_t created = p.second.first;
+ utime_t created_stamp = p.second.second;
+ auto q = m->pg_extra.find(pgid);
+ if (q == m->pg_extra.end()) {
+ dout(20) << __func__ << " " << pgid << " e" << created
+ << "@" << created_stamp
+ << " (no history or past_intervals)" << dendl;
+ // pre-octopus ... no pg history. this can be removed in Q release.
+ enqueue_peering_evt(
+ pgid,
+ PGPeeringEventRef(
+ std::make_shared<PGPeeringEvent>(
+ m->epoch,
+ m->epoch,
+ NullEvt(),
+ true,
+ new PGCreateInfo(
+ pgid,
+ created,
+ pg_history_t(created, created_stamp),
+ PastIntervals(),
+ true)
+ )));
+ } else {
+ dout(20) << __func__ << " " << pgid << " e" << created
+ << "@" << created_stamp
+ << " history " << q->second.first
+ << " pi " << q->second.second << dendl;
+ if (!q->second.second.empty() &&
+ m->epoch < q->second.second.get_bounds().second) {
+ clog->error() << "got pg_create on " << pgid << " epoch " << m->epoch
+ << " and unmatched past_intervals " << q->second.second
+ << " (history " << q->second.first << ")";
+ } else {
+ enqueue_peering_evt(
+ pgid,
+ PGPeeringEventRef(
+ std::make_shared<PGPeeringEvent>(
+ m->epoch,
+ m->epoch,
+ NullEvt(),
+ true,
+ new PGCreateInfo(
+ pgid,
+ m->epoch,
+ q->second.first,
+ q->second.second,
+ true)
+ )));
+ }
+ }
+ }
+
+ {
+ std::lock_guard l(pending_creates_lock);
+ if (pending_creates_from_mon == 0) {
+ last_pg_create_epoch = m->epoch;
+ }
+ }
+
+ m->put();
+}
+
+void OSD::handle_fast_pg_query(MOSDPGQuery *m)
+{
+ dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
+ if (!require_osd_peer(m)) {
+ m->put();
+ return;
+ }
+ int from = m->get_source().num();
+ for (auto& p : m->pg_list) {
+ enqueue_peering_evt(
+ p.first,
+ PGPeeringEventRef(
+ std::make_shared<PGPeeringEvent>(
+ p.second.epoch_sent, p.second.epoch_sent,
+ MQuery(
+ p.first,
+ pg_shard_t(from, p.second.from),
+ p.second,
+ p.second.epoch_sent),
+ false))
+ );
+ }
+ m->put();
+}
+
+void OSD::handle_fast_pg_notify(MOSDPGNotify* m)
+{
+ dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
+ if (!require_osd_peer(m)) {
+ m->put();
+ return;
+ }
+ int from = m->get_source().num();
+ for (auto& p : m->get_pg_list()) {
+ spg_t pgid(p.info.pgid.pgid, p.to);
+ enqueue_peering_evt(
+ pgid,
+ PGPeeringEventRef(
+ std::make_shared<PGPeeringEvent>(
+ p.epoch_sent,
+ p.query_epoch,
+ MNotifyRec(
+ pgid, pg_shard_t(from, p.from),
+ p,
+ m->get_connection()->get_features()),
+ true,
+ new PGCreateInfo(
+ pgid,
+ p.query_epoch,
+ p.info.history,
+ p.past_intervals,
+ false)
+ )));
+ }
+ m->put();
+}
+
+void OSD::handle_fast_pg_info(MOSDPGInfo* m)
+{
+ dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
+ if (!require_osd_peer(m)) {
+ m->put();
+ return;
+ }
+ int from = m->get_source().num();
+ for (auto& p : m->pg_list) {
+ enqueue_peering_evt(
+ spg_t(p.info.pgid.pgid, p.to),
+ PGPeeringEventRef(
+ std::make_shared<PGPeeringEvent>(
+ p.epoch_sent, p.query_epoch,
+ MInfoRec(
+ pg_shard_t(from, p.from),
+ p.info,
+ p.epoch_sent)))
+ );
+ }
+ m->put();
+}
+
+void OSD::handle_fast_pg_remove(MOSDPGRemove *m)
+{
+ dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
+ if (!require_osd_peer(m)) {
+ m->put();
+ return;
+ }
+ for (auto& pgid : m->pg_list) {
+ enqueue_peering_evt(
+ pgid,
+ PGPeeringEventRef(
+ std::make_shared<PGPeeringEvent>(
+ m->get_epoch(), m->get_epoch(),
+ PeeringState::DeleteStart())));
+ }
+ m->put();
+}
+
+void OSD::handle_fast_force_recovery(MOSDForceRecovery *m)
+{
+ dout(10) << __func__ << " " << *m << dendl;
+ if (!require_mon_or_mgr_peer(m)) {
+ m->put();
+ return;
+ }
+ epoch_t epoch = get_osdmap_epoch();
+ for (auto pgid : m->forced_pgs) {
+ if (m->options & OFR_BACKFILL) {
+ if (m->options & OFR_CANCEL) {
+ enqueue_peering_evt(
+ pgid,
+ PGPeeringEventRef(
+ std::make_shared<PGPeeringEvent>(
+ epoch, epoch,
+ PeeringState::UnsetForceBackfill())));
+ } else {
+ enqueue_peering_evt(
+ pgid,
+ PGPeeringEventRef(
+ std::make_shared<PGPeeringEvent>(
+ epoch, epoch,
+ PeeringState::SetForceBackfill())));
+ }
+ } else if (m->options & OFR_RECOVERY) {
+ if (m->options & OFR_CANCEL) {
+ enqueue_peering_evt(
+ pgid,
+ PGPeeringEventRef(
+ std::make_shared<PGPeeringEvent>(
+ epoch, epoch,
+ PeeringState::UnsetForceRecovery())));
+ } else {
+ enqueue_peering_evt(
+ pgid,
+ PGPeeringEventRef(
+ std::make_shared<PGPeeringEvent>(
+ epoch, epoch,
+ PeeringState::SetForceRecovery())));
+ }
+ }
+ }
+ m->put();
+}
+
+void OSD::handle_pg_query_nopg(const MQuery& q)
+{
+ spg_t pgid = q.pgid;
+ dout(10) << __func__ << " " << pgid << dendl;
+
+ OSDMapRef osdmap = get_osdmap();
+ if (!osdmap->have_pg_pool(pgid.pool()))
+ return;
+
+ dout(10) << " pg " << pgid << " dne" << dendl;
+ pg_info_t empty(spg_t(pgid.pgid, q.query.to));
+ ConnectionRef con = service.get_con_osd_cluster(q.from.osd, osdmap->get_epoch());
+ if (con) {
+ Message *m;
+ if (q.query.type == pg_query_t::LOG ||
+ q.query.type == pg_query_t::FULLLOG) {
+ m = new MOSDPGLog(
+ q.query.from, q.query.to,
+ osdmap->get_epoch(), empty,
+ q.query.epoch_sent);
+ } else {
+ vector<pg_notify_t> ls;
+ ls.push_back(
+ pg_notify_t(
+ q.query.from, q.query.to,
+ q.query.epoch_sent,
+ osdmap->get_epoch(),
+ empty,
+ PastIntervals()));
+ m = new MOSDPGNotify(osdmap->get_epoch(), std::move(ls));
+ }
+ service.maybe_share_map(con.get(), osdmap);
+ con->send_message(m);
+ }
+}
+
+void OSDService::queue_check_readable(spg_t spgid,
+ epoch_t lpr,
+ ceph::signedspan delay)
+{
+ if (delay == ceph::signedspan::zero()) {
+ osd->enqueue_peering_evt(
+ spgid,
+ PGPeeringEventRef(
+ std::make_shared<PGPeeringEvent>(
+ lpr, lpr,
+ PeeringState::CheckReadable())));
+ } else {
+ mono_timer.add_event(
+ delay,
+ [this, spgid, lpr]() {
+ queue_check_readable(spgid, lpr);
+ });
+ }
+}
+
+
+// =========================================================
+// RECOVERY
+
+void OSDService::_maybe_queue_recovery() {
+ ceph_assert(ceph_mutex_is_locked_by_me(recovery_lock));
+ uint64_t available_pushes;
+ while (!awaiting_throttle.empty() &&
+ _recover_now(&available_pushes)) {
+ uint64_t to_start = std::min(
+ available_pushes,
+ cct->_conf->osd_recovery_max_single_start);
+ _queue_for_recovery(awaiting_throttle.front(), to_start);
+ awaiting_throttle.pop_front();
+ dout(10) << __func__ << " starting " << to_start
+ << ", recovery_ops_reserved " << recovery_ops_reserved
+ << " -> " << (recovery_ops_reserved + to_start) << dendl;
+ recovery_ops_reserved += to_start;
+ }
+}
+
+bool OSDService::_recover_now(uint64_t *available_pushes)
+{
+ if (available_pushes)
+ *available_pushes = 0;
+
+ if (ceph_clock_now() < defer_recovery_until) {
+ dout(15) << __func__ << " defer until " << defer_recovery_until << dendl;
+ return false;
+ }
+
+ if (recovery_paused) {
+ dout(15) << __func__ << " paused" << dendl;
+ return false;
+ }
+
+ uint64_t max = osd->get_recovery_max_active();
+ if (max <= recovery_ops_active + recovery_ops_reserved) {
+ dout(15) << __func__ << " active " << recovery_ops_active
+ << " + reserved " << recovery_ops_reserved
+ << " >= max " << max << dendl;
+ return false;
+ }
+
+ if (available_pushes)
+ *available_pushes = max - recovery_ops_active - recovery_ops_reserved;
+
+ return true;
+}
+
+unsigned OSDService::get_target_pg_log_entries() const
+{
+ auto num_pgs = osd->get_num_pgs();
+ auto target = cct->_conf->osd_target_pg_log_entries_per_osd;
+ if (num_pgs > 0 && target > 0) {
+ // target an even spread of our budgeted log entries across all
+ // PGs. note that while we only get to control the entry count
+ // for primary PGs, we'll normally be responsible for a mix of
+ // primary and replica PGs (for the same pool(s) even), so this
+ // will work out.
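+    // e.g. (hypothetical numbers) with osd_target_pg_log_entries_per_osd =
+    // 300000 and 100 PGs on this OSD, each PG is budgeted 3000 entries,
+    // clamped to [osd_min_pg_log_entries, osd_max_pg_log_entries].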
+ return std::max<unsigned>(
+ std::min<unsigned>(target / num_pgs,
+ cct->_conf->osd_max_pg_log_entries),
+ cct->_conf->osd_min_pg_log_entries);
+ } else {
+ // fall back to a per-pg value.
+ return cct->_conf->osd_min_pg_log_entries;
+ }
+}
+
+void OSD::do_recovery(
+ PG *pg, epoch_t queued, uint64_t reserved_pushes,
+ ThreadPool::TPHandle &handle)
+{
+ uint64_t started = 0;
+
+ /*
+ * When the value of osd_recovery_sleep is set greater than zero, recovery
+ * ops are scheduled after osd_recovery_sleep amount of time from the previous
+ * recovery event's schedule time. This is done by adding a
+ * recovery_requeue_callback event, which re-queues the recovery op using
+ * queue_recovery_after_sleep.
+ */
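+  // For example (hypothetical value): with osd_recovery_sleep = 0.1, each
+  // recovery op is re-queued no earlier than 100ms after the previous
+  // recovery event's schedule time.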
+ float recovery_sleep = get_osd_recovery_sleep();
+ {
+ std::lock_guard l(service.sleep_lock);
+ if (recovery_sleep > 0 && service.recovery_needs_sleep) {
+ PGRef pgref(pg);
+ auto recovery_requeue_callback = new LambdaContext([this, pgref, queued, reserved_pushes](int r) {
+ dout(20) << "do_recovery wake up at "
+ << ceph_clock_now()
+ << ", re-queuing recovery" << dendl;
+ std::lock_guard l(service.sleep_lock);
+ service.recovery_needs_sleep = false;
+ service.queue_recovery_after_sleep(pgref.get(), queued, reserved_pushes);
+ });
+
+      // The condition below is true for the first recovery op, and whenever
+      // the previous recovery op was scheduled in the past. In that case the
+      // next recovery op is scheduled one sleep interval from now.
+
+ if (auto now = ceph::real_clock::now();
+ service.recovery_schedule_time < now) {
+ service.recovery_schedule_time = now;
+ }
+ service.recovery_schedule_time += ceph::make_timespan(recovery_sleep);
+ service.sleep_timer.add_event_at(service.recovery_schedule_time,
+ recovery_requeue_callback);
+ dout(20) << "Recovery event scheduled at "
+ << service.recovery_schedule_time << dendl;
+ return;
+ }
+ }
+
+ {
+ {
+ std::lock_guard l(service.sleep_lock);
+ service.recovery_needs_sleep = true;
+ }
+
+ if (pg->pg_has_reset_since(queued)) {
+ goto out;
+ }
+
+ dout(10) << "do_recovery starting " << reserved_pushes << " " << *pg << dendl;
+#ifdef DEBUG_RECOVERY_OIDS
+ dout(20) << " active was " << service.recovery_oids[pg->pg_id] << dendl;
+#endif
+
+ bool do_unfound = pg->start_recovery_ops(reserved_pushes, handle, &started);
+ dout(10) << "do_recovery started " << started << "/" << reserved_pushes
+ << " on " << *pg << dendl;
+
+ if (do_unfound) {
+ PeeringCtx rctx = create_context();
+ rctx.handle = &handle;
+ pg->find_unfound(queued, rctx);
+ dispatch_context(rctx, pg, pg->get_osdmap());
+ }
+ }
+
+ out:
+ ceph_assert(started <= reserved_pushes);
+ service.release_reserved_pushes(reserved_pushes);
+}
+
+void OSDService::start_recovery_op(PG *pg, const hobject_t& soid)
+{
+ std::lock_guard l(recovery_lock);
+ dout(10) << "start_recovery_op " << *pg << " " << soid
+ << " (" << recovery_ops_active << "/"
+ << osd->get_recovery_max_active() << " rops)"
+ << dendl;
+ recovery_ops_active++;
+
+#ifdef DEBUG_RECOVERY_OIDS
+ dout(20) << " active was " << recovery_oids[pg->pg_id] << dendl;
+ ceph_assert(recovery_oids[pg->pg_id].count(soid) == 0);
+ recovery_oids[pg->pg_id].insert(soid);
+#endif
+}
+
+void OSDService::finish_recovery_op(PG *pg, const hobject_t& soid, bool dequeue)
+{
+ std::lock_guard l(recovery_lock);
+ dout(10) << "finish_recovery_op " << *pg << " " << soid
+ << " dequeue=" << dequeue
+ << " (" << recovery_ops_active << "/"
+ << osd->get_recovery_max_active() << " rops)"
+ << dendl;
+
+ // adjust count
+ ceph_assert(recovery_ops_active > 0);
+ recovery_ops_active--;
+
+#ifdef DEBUG_RECOVERY_OIDS
+ dout(20) << " active oids was " << recovery_oids[pg->pg_id] << dendl;
+ ceph_assert(recovery_oids[pg->pg_id].count(soid));
+ recovery_oids[pg->pg_id].erase(soid);
+#endif
+
+ _maybe_queue_recovery();
+}
+
+bool OSDService::is_recovery_active()
+{
+ if (cct->_conf->osd_debug_pretend_recovery_active) {
+ return true;
+ }
+ return local_reserver.has_reservation() || remote_reserver.has_reservation();
+}
+
+void OSDService::release_reserved_pushes(uint64_t pushes)
+{
+ std::lock_guard l(recovery_lock);
+ dout(10) << __func__ << "(" << pushes << "), recovery_ops_reserved "
+ << recovery_ops_reserved << " -> " << (recovery_ops_reserved-pushes)
+ << dendl;
+ ceph_assert(recovery_ops_reserved >= pushes);
+ recovery_ops_reserved -= pushes;
+ _maybe_queue_recovery();
+}
+
+// =========================================================
+// OPS
+
+bool OSD::op_is_discardable(const MOSDOp *op)
+{
+  // drop the client request if the client is no longer connected and
+  // can't receive the reply anyway.
+ if (!op->get_connection()->is_connected()) {
+ return true;
+ }
+ return false;
+}
+
+void OSD::enqueue_op(spg_t pg, OpRequestRef&& op, epoch_t epoch)
+{
+ const utime_t stamp = op->get_req()->get_recv_stamp();
+ const utime_t latency = ceph_clock_now() - stamp;
+ const unsigned priority = op->get_req()->get_priority();
+ const int cost = op->get_req()->get_cost();
+ const uint64_t owner = op->get_req()->get_source().num();
+ const int type = op->get_req()->get_type();
+
+ dout(15) << "enqueue_op " << op << " prio " << priority
+ << " type " << type
+ << " cost " << cost
+ << " latency " << latency
+ << " epoch " << epoch
+ << " " << *(op->get_req()) << dendl;
+ op->osd_trace.event("enqueue op");
+ op->osd_trace.keyval("priority", priority);
+ op->osd_trace.keyval("cost", cost);
+#ifdef HAVE_JAEGER
+ if (op->osd_parent_span) {
+ auto enqueue_span = jaeger_tracing::child_span(__func__, op->osd_parent_span);
+ enqueue_span->Log({
+ {"priority", priority},
+ {"cost", cost},
+ {"epoch", epoch},
+ {"owner", owner},
+ {"type", type}
+ });
+ }
+#endif
+ op->mark_queued_for_pg();
+ logger->tinc(l_osd_op_before_queue_op_lat, latency);
+ if (type == MSG_OSD_PG_PUSH ||
+ type == MSG_OSD_PG_PUSH_REPLY) {
+ op_shardedwq.queue(
+ OpSchedulerItem(
+ unique_ptr<OpSchedulerItem::OpQueueable>(new PGRecoveryMsg(pg, std::move(op))),
+ cost, priority, stamp, owner, epoch));
+ } else {
+ op_shardedwq.queue(
+ OpSchedulerItem(
+ unique_ptr<OpSchedulerItem::OpQueueable>(new PGOpItem(pg, std::move(op))),
+ cost, priority, stamp, owner, epoch));
+ }
+}
+
+void OSD::enqueue_peering_evt(spg_t pgid, PGPeeringEventRef evt)
+{
+ dout(15) << __func__ << " " << pgid << " " << evt->get_desc() << dendl;
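+  // peering items are queued with a nominal cost of 10, the configured
+  // osd_peering_op_priority, an empty receive stamp, owner 0, and the epoch
+  // the event was sent in as the item's map epoch.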
+ op_shardedwq.queue(
+ OpSchedulerItem(
+ unique_ptr<OpSchedulerItem::OpQueueable>(new PGPeeringItem(pgid, evt)),
+ 10,
+ cct->_conf->osd_peering_op_priority,
+ utime_t(),
+ 0,
+ evt->get_epoch_sent()));
+}
+
+/*
+ * NOTE: dequeue called in worker thread, with pg lock
+ */
+void OSD::dequeue_op(
+ PGRef pg, OpRequestRef op,
+ ThreadPool::TPHandle &handle)
+{
+ const Message *m = op->get_req();
+
+ FUNCTRACE(cct);
+ OID_EVENT_TRACE_WITH_MSG(m, "DEQUEUE_OP_BEGIN", false);
+
+ utime_t now = ceph_clock_now();
+ op->set_dequeued_time(now);
+
+ utime_t latency = now - m->get_recv_stamp();
+ dout(10) << "dequeue_op " << op << " prio " << m->get_priority()
+ << " cost " << m->get_cost()
+ << " latency " << latency
+ << " " << *m
+ << " pg " << *pg << dendl;
+
+ logger->tinc(l_osd_op_before_dequeue_op_lat, latency);
+
+ service.maybe_share_map(m->get_connection().get(),
+ pg->get_osdmap(),
+ op->sent_epoch);
+
+ if (pg->is_deleting())
+ return;
+
+ op->mark_reached_pg();
+ op->osd_trace.event("dequeue_op");
+
+ pg->do_request(op, handle);
+
+ // finish
+ dout(10) << "dequeue_op " << op << " finish" << dendl;
+ OID_EVENT_TRACE_WITH_MSG(m, "DEQUEUE_OP_END", false);
+}
+
+
+void OSD::dequeue_peering_evt(
+ OSDShard *sdata,
+ PG *pg,
+ PGPeeringEventRef evt,
+ ThreadPool::TPHandle& handle)
+{
+ PeeringCtx rctx = create_context();
+ auto curmap = sdata->get_osdmap();
+ bool need_up_thru = false;
+ epoch_t same_interval_since = 0;
+ if (!pg) {
+ if (const MQuery *q = dynamic_cast<const MQuery*>(evt->evt.get())) {
+ handle_pg_query_nopg(*q);
+ } else {
+ derr << __func__ << " unrecognized pg-less event " << evt->get_desc() << dendl;
+ ceph_abort();
+ }
+ } else if (advance_pg(curmap->get_epoch(), pg, handle, rctx)) {
+ pg->do_peering_event(evt, rctx);
+ if (pg->is_deleted()) {
+ pg->unlock();
+ return;
+ }
+ dispatch_context(rctx, pg, curmap, &handle);
+ need_up_thru = pg->get_need_up_thru();
+ same_interval_since = pg->get_same_interval_since();
+ pg->unlock();
+ }
+
+ if (need_up_thru) {
+ queue_want_up_thru(same_interval_since);
+ }
+
+ service.send_pg_temp();
+}
+
+void OSD::dequeue_delete(
+ OSDShard *sdata,
+ PG *pg,
+ epoch_t e,
+ ThreadPool::TPHandle& handle)
+{
+ dequeue_peering_evt(
+ sdata,
+ pg,
+ PGPeeringEventRef(
+ std::make_shared<PGPeeringEvent>(
+ e, e,
+ PeeringState::DeleteSome())),
+ handle);
+}
+
+
+
+// --------------------------------
+
+const char** OSD::get_tracked_conf_keys() const
+{
+ static const char* KEYS[] = {
+ "osd_max_backfills",
+ "osd_min_recovery_priority",
+ "osd_max_trimming_pgs",
+ "osd_op_complaint_time",
+ "osd_op_log_threshold",
+ "osd_op_history_size",
+ "osd_op_history_duration",
+ "osd_op_history_slow_op_size",
+ "osd_op_history_slow_op_threshold",
+ "osd_enable_op_tracker",
+ "osd_map_cache_size",
+ "osd_pg_epoch_max_lag_factor",
+ "osd_pg_epoch_persisted_max_stale",
+ "osd_recovery_sleep",
+ "osd_recovery_sleep_hdd",
+ "osd_recovery_sleep_ssd",
+ "osd_recovery_sleep_hybrid",
+ "osd_delete_sleep",
+ "osd_delete_sleep_hdd",
+ "osd_delete_sleep_ssd",
+ "osd_delete_sleep_hybrid",
+ "osd_snap_trim_sleep",
+ "osd_snap_trim_sleep_hdd",
+ "osd_snap_trim_sleep_ssd",
+ "osd_snap_trim_sleep_hybrid"
+ "osd_scrub_sleep",
+ "osd_recovery_max_active",
+ "osd_recovery_max_active_hdd",
+ "osd_recovery_max_active_ssd",
+ // clog & admin clog
+ "clog_to_monitors",
+ "clog_to_syslog",
+ "clog_to_syslog_facility",
+ "clog_to_syslog_level",
+ "osd_objectstore_fuse",
+ "clog_to_graylog",
+ "clog_to_graylog_host",
+ "clog_to_graylog_port",
+ "host",
+ "fsid",
+ "osd_recovery_delay_start",
+ "osd_client_message_size_cap",
+ "osd_client_message_cap",
+ "osd_heartbeat_min_size",
+ "osd_heartbeat_interval",
+ "osd_object_clean_region_max_num_intervals",
+ "osd_scrub_min_interval",
+ "osd_scrub_max_interval",
+ NULL
+ };
+ return KEYS;
+}
+
+void OSD::handle_conf_change(const ConfigProxy& conf,
+ const std::set <std::string> &changed)
+{
+ std::lock_guard l{osd_lock};
+
+ if (changed.count("osd_max_backfills") ||
+ changed.count("osd_delete_sleep") ||
+ changed.count("osd_delete_sleep_hdd") ||
+ changed.count("osd_delete_sleep_ssd") ||
+ changed.count("osd_delete_sleep_hybrid") ||
+ changed.count("osd_snap_trim_sleep") ||
+ changed.count("osd_snap_trim_sleep_hdd") ||
+ changed.count("osd_snap_trim_sleep_ssd") ||
+ changed.count("osd_snap_trim_sleep_hybrid") ||
+ changed.count("osd_scrub_sleep") ||
+ changed.count("osd_recovery_sleep") ||
+ changed.count("osd_recovery_sleep_hdd") ||
+ changed.count("osd_recovery_sleep_ssd") ||
+ changed.count("osd_recovery_sleep_hybrid") ||
+ changed.count("osd_recovery_max_active") ||
+ changed.count("osd_recovery_max_active_hdd") ||
+ changed.count("osd_recovery_max_active_ssd")) {
+ if (!maybe_override_options_for_qos() &&
+ changed.count("osd_max_backfills")) {
+ // Scheduler is not "mclock". Fallback to earlier behavior
+ service.local_reserver.set_max(cct->_conf->osd_max_backfills);
+ service.remote_reserver.set_max(cct->_conf->osd_max_backfills);
+ }
+ }
+ if (changed.count("osd_min_recovery_priority")) {
+ service.local_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority);
+ service.remote_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority);
+ }
+ if (changed.count("osd_max_trimming_pgs")) {
+ service.snap_reserver.set_max(cct->_conf->osd_max_trimming_pgs);
+ }
+ if (changed.count("osd_op_complaint_time") ||
+ changed.count("osd_op_log_threshold")) {
+ op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
+ cct->_conf->osd_op_log_threshold);
+ }
+ if (changed.count("osd_op_history_size") ||
+ changed.count("osd_op_history_duration")) {
+ op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
+ cct->_conf->osd_op_history_duration);
+ }
+ if (changed.count("osd_op_history_slow_op_size") ||
+ changed.count("osd_op_history_slow_op_threshold")) {
+ op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size,
+ cct->_conf->osd_op_history_slow_op_threshold);
+ }
+ if (changed.count("osd_enable_op_tracker")) {
+ op_tracker.set_tracking(cct->_conf->osd_enable_op_tracker);
+ }
+ if (changed.count("osd_map_cache_size")) {
+ service.map_cache.set_size(cct->_conf->osd_map_cache_size);
+ service.map_bl_cache.set_size(cct->_conf->osd_map_cache_size);
+ service.map_bl_inc_cache.set_size(cct->_conf->osd_map_cache_size);
+ }
+ if (changed.count("clog_to_monitors") ||
+ changed.count("clog_to_syslog") ||
+ changed.count("clog_to_syslog_level") ||
+ changed.count("clog_to_syslog_facility") ||
+ changed.count("clog_to_graylog") ||
+ changed.count("clog_to_graylog_host") ||
+ changed.count("clog_to_graylog_port") ||
+ changed.count("host") ||
+ changed.count("fsid")) {
+ update_log_config();
+ }
+ if (changed.count("osd_pg_epoch_max_lag_factor")) {
+ m_osd_pg_epoch_max_lag_factor = conf.get_val<double>(
+ "osd_pg_epoch_max_lag_factor");
+ }
+
+#ifdef HAVE_LIBFUSE
+ if (changed.count("osd_objectstore_fuse")) {
+ if (store) {
+ enable_disable_fuse(false);
+ }
+ }
+#endif
+
+ if (changed.count("osd_recovery_delay_start")) {
+ service.defer_recovery(cct->_conf->osd_recovery_delay_start);
+ service.kick_recovery_queue();
+ }
+
+ if (changed.count("osd_client_message_cap")) {
+ uint64_t newval = cct->_conf->osd_client_message_cap;
+ Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT);
+ if (pol.throttler_messages && newval > 0) {
+ pol.throttler_messages->reset_max(newval);
+ }
+ }
+ if (changed.count("osd_client_message_size_cap")) {
+ uint64_t newval = cct->_conf->osd_client_message_size_cap;
+ Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT);
+ if (pol.throttler_bytes && newval > 0) {
+ pol.throttler_bytes->reset_max(newval);
+ }
+ }
+ if (changed.count("osd_object_clean_region_max_num_intervals")) {
+ ObjectCleanRegions::set_max_num_intervals(cct->_conf->osd_object_clean_region_max_num_intervals);
+ }
+
+ if (changed.count("osd_scrub_min_interval") ||
+ changed.count("osd_scrub_max_interval")) {
+ resched_all_scrubs();
+ dout(0) << __func__ << ": scrub interval change" << dendl;
+ }
+ check_config();
+ if (changed.count("osd_asio_thread_count")) {
+ service.poolctx.stop();
+ service.poolctx.start(conf.get_val<std::uint64_t>("osd_asio_thread_count"));
+ }
+}
+
+void OSD::maybe_override_max_osd_capacity_for_qos()
+{
+  // If the enabled scheduler is mclock, override the default
+  // osd capacity with the value obtained from running the
+  // osd bench test. This is later used to set up mclock.
+ if ((cct->_conf.get_val<std::string>("osd_op_queue") == "mclock_scheduler") &&
+ (cct->_conf.get_val<bool>("osd_mclock_skip_benchmark") == false)) {
+ std::string max_capacity_iops_config;
+ bool force_run_benchmark =
+ cct->_conf.get_val<bool>("osd_mclock_force_run_benchmark_on_init");
+
+ if (store_is_rotational) {
+ max_capacity_iops_config = "osd_mclock_max_capacity_iops_hdd";
+ } else {
+ max_capacity_iops_config = "osd_mclock_max_capacity_iops_ssd";
+ }
+
+ if (!force_run_benchmark) {
+ double default_iops = 0.0;
+
+ // Get the current osd iops capacity
+ double cur_iops = cct->_conf.get_val<double>(max_capacity_iops_config);
+
+ // Get the default max iops capacity
+ auto val = cct->_conf.get_val_default(max_capacity_iops_config);
+ if (!val.has_value()) {
+ derr << __func__ << " Unable to determine default value of "
+ << max_capacity_iops_config << dendl;
+ // Cannot determine default iops. Force a run of the OSD benchmark.
+ force_run_benchmark = true;
+ } else {
+ // Default iops
+ default_iops = std::stod(val.value());
+ }
+
+ // Determine if we really need to run the osd benchmark
+ if (!force_run_benchmark && (default_iops != cur_iops)) {
+ dout(1) << __func__ << std::fixed << std::setprecision(2)
+ << " default_iops: " << default_iops
+ << " cur_iops: " << cur_iops
+ << ". Skip OSD benchmark test." << dendl;
+ return;
+ }
+ }
+
+    // Run osd bench: write 12288000 bytes in 4 KiB blocks, spread across
+    // 100 objects of 4 MiB each
+ int64_t count = 12288000; // Count of bytes to write
+ int64_t bsize = 4096; // Block size
+ int64_t osize = 4194304; // Object size
+ int64_t onum = 100; // Count of objects to write
+ double elapsed = 0.0; // Time taken to complete the test
+ double iops = 0.0;
+ stringstream ss;
+ int ret = run_osd_bench_test(count, bsize, osize, onum, &elapsed, ss);
+ if (ret != 0) {
+ derr << __func__
+ << " osd bench err: " << ret
+ << " osd bench errstr: " << ss.str()
+ << dendl;
+ return;
+ }
+
+ double rate = count / elapsed;
+ iops = rate / bsize;
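+    // e.g. (hypothetical result) count = 12288000 bytes written over
+    // elapsed = 1.5 sec gives rate = 8192000 bytes/sec and
+    // iops = 8192000 / 4096 = 2000 4 KiB writes/sec.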
+ dout(1) << __func__
+ << " osd bench result -"
+ << std::fixed << std::setprecision(3)
+ << " bandwidth (MiB/sec): " << rate / (1024 * 1024)
+ << " iops: " << iops
+ << " elapsed_sec: " << elapsed
+ << dendl;
+
+ // Persist iops to the MON store
+ ret = mon_cmd_set_config(max_capacity_iops_config, std::to_string(iops));
+ if (ret < 0) {
+ // Fallback to setting the config within the in-memory "values" map.
+ cct->_conf.set_val(max_capacity_iops_config, std::to_string(iops));
+ }
+
+ // Override the max osd capacity for all shards
+ for (auto& shard : shards) {
+ shard->update_scheduler_config();
+ }
+ }
+}
+
+bool OSD::maybe_override_options_for_qos()
+{
+  // If the enabled scheduler is mclock, override the recovery, backfill
+  // and sleep options so that mclock can meet the QoS goals.
+ if (cct->_conf.get_val<std::string>("osd_op_queue") == "mclock_scheduler") {
+ dout(1) << __func__
+ << ": Changing recovery/backfill/sleep settings for QoS" << dendl;
+
+ // Set high value for recovery max active
+ uint32_t rec_max_active = 1000;
+ cct->_conf.set_val(
+ "osd_recovery_max_active", std::to_string(rec_max_active));
+ cct->_conf.set_val(
+ "osd_recovery_max_active_hdd", std::to_string(rec_max_active));
+ cct->_conf.set_val(
+ "osd_recovery_max_active_ssd", std::to_string(rec_max_active));
+
+ // Set high value for osd_max_backfill
+ uint32_t max_backfills = 1000;
+ cct->_conf.set_val("osd_max_backfills", std::to_string(max_backfills));
+ service.local_reserver.set_max(max_backfills);
+ service.remote_reserver.set_max(max_backfills);
+
+ // Disable recovery sleep
+ cct->_conf.set_val("osd_recovery_sleep", std::to_string(0));
+ cct->_conf.set_val("osd_recovery_sleep_hdd", std::to_string(0));
+ cct->_conf.set_val("osd_recovery_sleep_ssd", std::to_string(0));
+ cct->_conf.set_val("osd_recovery_sleep_hybrid", std::to_string(0));
+
+ // Disable delete sleep
+ cct->_conf.set_val("osd_delete_sleep", std::to_string(0));
+ cct->_conf.set_val("osd_delete_sleep_hdd", std::to_string(0));
+ cct->_conf.set_val("osd_delete_sleep_ssd", std::to_string(0));
+ cct->_conf.set_val("osd_delete_sleep_hybrid", std::to_string(0));
+
+ // Disable snap trim sleep
+ cct->_conf.set_val("osd_snap_trim_sleep", std::to_string(0));
+ cct->_conf.set_val("osd_snap_trim_sleep_hdd", std::to_string(0));
+ cct->_conf.set_val("osd_snap_trim_sleep_ssd", std::to_string(0));
+ cct->_conf.set_val("osd_snap_trim_sleep_hybrid", std::to_string(0));
+
+ // Disable scrub sleep
+ cct->_conf.set_val("osd_scrub_sleep", std::to_string(0));
+ return true;
+ }
+ return false;
+}
+
+int OSD::mon_cmd_set_config(const std::string &key, const std::string &val)
+{
+ std::string cmd =
+ "{"
+ "\"prefix\": \"config set\", "
+ "\"who\": \"osd." + std::to_string(whoami) + "\", "
+ "\"name\": \"" + key + "\", "
+ "\"value\": \"" + val + "\""
+ "}";
+
+ vector<std::string> vcmd{cmd};
+ bufferlist inbl;
+ std::string outs;
+ C_SaferCond cond;
+ monc->start_mon_command(vcmd, inbl, nullptr, &outs, &cond);
+ int r = cond.wait();
+ if (r < 0) {
+ derr << __func__ << " Failed to set config key " << key
+ << " err: " << cpp_strerror(r)
+ << " errstr: " << outs << dendl;
+ return r;
+ }
+
+ return 0;
+}
+
+void OSD::update_log_config()
+{
+ map<string,string> log_to_monitors;
+ map<string,string> log_to_syslog;
+ map<string,string> log_channel;
+ map<string,string> log_prio;
+ map<string,string> log_to_graylog;
+ map<string,string> log_to_graylog_host;
+ map<string,string> log_to_graylog_port;
+ uuid_d fsid;
+ string host;
+
+ if (parse_log_client_options(cct, log_to_monitors, log_to_syslog,
+ log_channel, log_prio, log_to_graylog,
+ log_to_graylog_host, log_to_graylog_port,
+ fsid, host) == 0)
+ clog->update_config(log_to_monitors, log_to_syslog,
+ log_channel, log_prio, log_to_graylog,
+ log_to_graylog_host, log_to_graylog_port,
+ fsid, host);
+ derr << "log_to_monitors " << log_to_monitors << dendl;
+}
+
+void OSD::check_config()
+{
+ // some sanity checks
+ if (cct->_conf->osd_map_cache_size <= (int)cct->_conf->osd_pg_epoch_persisted_max_stale + 2) {
+ clog->warn() << "osd_map_cache_size (" << cct->_conf->osd_map_cache_size << ")"
+ << " is not > osd_pg_epoch_persisted_max_stale ("
+ << cct->_conf->osd_pg_epoch_persisted_max_stale << ")";
+ }
+ if (cct->_conf->osd_object_clean_region_max_num_intervals < 0) {
+ clog->warn() << "osd_object_clean_region_max_num_intervals ("
+ << cct->_conf->osd_object_clean_region_max_num_intervals
+ << ") is < 0";
+ }
+}
+
+// --------------------------------
+
+void OSD::get_latest_osdmap()
+{
+ dout(10) << __func__ << " -- start" << dendl;
+
+ boost::system::error_code ec;
+ service.objecter->wait_for_latest_osdmap(ceph::async::use_blocked[ec]);
+
+ dout(10) << __func__ << " -- finish" << dendl;
+}
+
+// --------------------------------
+
+void OSD::set_perf_queries(const ConfigPayload &config_payload) {
+ const OSDConfigPayload &osd_config_payload = boost::get<OSDConfigPayload>(config_payload);
+ const std::map<OSDPerfMetricQuery, OSDPerfMetricLimits> &queries = osd_config_payload.config;
+ dout(10) << "setting " << queries.size() << " queries" << dendl;
+
+ std::list<OSDPerfMetricQuery> supported_queries;
+ for (auto &it : queries) {
+ auto &query = it.first;
+ if (!query.key_descriptor.empty()) {
+ supported_queries.push_back(query);
+ }
+ }
+ if (supported_queries.size() < queries.size()) {
+ dout(1) << queries.size() - supported_queries.size()
+ << " unsupported queries" << dendl;
+ }
+ {
+ std::lock_guard locker{m_perf_queries_lock};
+ m_perf_queries = supported_queries;
+ m_perf_limits = queries;
+ }
+ std::vector<PGRef> pgs;
+ _get_pgs(&pgs);
+ for (auto& pg : pgs) {
+ std::scoped_lock l{*pg};
+ pg->set_dynamic_perf_stats_queries(supported_queries);
+ }
+}
+
+MetricPayload OSD::get_perf_reports() {
+ OSDMetricPayload payload;
+ std::map<OSDPerfMetricQuery, OSDPerfMetricReport> &reports = payload.report;
+
+ std::vector<PGRef> pgs;
+ _get_pgs(&pgs);
+ DynamicPerfStats dps;
+ for (auto& pg : pgs) {
+    // m_perf_queries can be modified only in set_perf_queries by an mgr
+    // client request, and it is protected by the mgr client's lock, which is
+    // held when set_perf_queries/get_perf_reports are called, so we need not
+    // hold m_perf_queries_lock here.
+ DynamicPerfStats pg_dps(m_perf_queries);
+ pg->lock();
+ pg->get_dynamic_perf_stats(&pg_dps);
+ pg->unlock();
+ dps.merge(pg_dps);
+ }
+ dps.add_to_reports(m_perf_limits, &reports);
+ dout(20) << "reports for " << reports.size() << " queries" << dendl;
+
+ return payload;
+}
+
+// =============================================================
+
+#undef dout_context
+#define dout_context cct
+#undef dout_prefix
+#define dout_prefix *_dout << "osd." << osd->get_nodeid() << ":" << shard_id << "." << __func__ << " "
+
+void OSDShard::_attach_pg(OSDShardPGSlot *slot, PG *pg)
+{
+ dout(10) << pg->pg_id << " " << pg << dendl;
+ slot->pg = pg;
+ pg->osd_shard = this;
+ pg->pg_slot = slot;
+ osd->inc_num_pgs();
+
+ slot->epoch = pg->get_osdmap_epoch();
+ pg_slots_by_epoch.insert(*slot);
+}
+
+void OSDShard::_detach_pg(OSDShardPGSlot *slot)
+{
+ dout(10) << slot->pg->pg_id << " " << slot->pg << dendl;
+ slot->pg->osd_shard = nullptr;
+ slot->pg->pg_slot = nullptr;
+ slot->pg = nullptr;
+ osd->dec_num_pgs();
+
+ pg_slots_by_epoch.erase(pg_slots_by_epoch.iterator_to(*slot));
+ slot->epoch = 0;
+ if (waiting_for_min_pg_epoch) {
+ min_pg_epoch_cond.notify_all();
+ }
+}
+
+void OSDShard::update_pg_epoch(OSDShardPGSlot *slot, epoch_t e)
+{
+ std::lock_guard l(shard_lock);
+ dout(30) << "min was " << pg_slots_by_epoch.begin()->epoch
+ << " on " << pg_slots_by_epoch.begin()->pg->pg_id << dendl;
+ pg_slots_by_epoch.erase(pg_slots_by_epoch.iterator_to(*slot));
+ dout(20) << slot->pg->pg_id << " " << slot->epoch << " -> " << e << dendl;
+ slot->epoch = e;
+ pg_slots_by_epoch.insert(*slot);
+ dout(30) << "min is now " << pg_slots_by_epoch.begin()->epoch
+ << " on " << pg_slots_by_epoch.begin()->pg->pg_id << dendl;
+ if (waiting_for_min_pg_epoch) {
+ min_pg_epoch_cond.notify_all();
+ }
+}
+
+epoch_t OSDShard::get_min_pg_epoch()
+{
+ std::lock_guard l(shard_lock);
+ auto p = pg_slots_by_epoch.begin();
+ if (p == pg_slots_by_epoch.end()) {
+ return 0;
+ }
+ return p->epoch;
+}
+
+void OSDShard::wait_min_pg_epoch(epoch_t need)
+{
+ std::unique_lock l{shard_lock};
+ ++waiting_for_min_pg_epoch;
+ min_pg_epoch_cond.wait(l, [need, this] {
+ if (pg_slots_by_epoch.empty()) {
+ return true;
+ } else if (pg_slots_by_epoch.begin()->epoch >= need) {
+ return true;
+ } else {
+ dout(10) << need << " waiting on "
+ << pg_slots_by_epoch.begin()->epoch << dendl;
+ return false;
+ }
+ });
+ --waiting_for_min_pg_epoch;
+}
+
+epoch_t OSDShard::get_max_waiting_epoch()
+{
+ std::lock_guard l(shard_lock);
+ epoch_t r = 0;
+ for (auto& i : pg_slots) {
+ if (!i.second->waiting_peering.empty()) {
+ r = std::max(r, i.second->waiting_peering.rbegin()->first);
+ }
+ }
+ return r;
+}
+
+void OSDShard::consume_map(
+ const OSDMapRef& new_osdmap,
+ unsigned *pushes_to_free)
+{
+ std::lock_guard l(shard_lock);
+ OSDMapRef old_osdmap;
+ {
+ std::lock_guard l(osdmap_lock);
+ old_osdmap = std::move(shard_osdmap);
+ shard_osdmap = new_osdmap;
+ }
+ dout(10) << new_osdmap->get_epoch()
+ << " (was " << (old_osdmap ? old_osdmap->get_epoch() : 0) << ")"
+ << dendl;
+ bool queued = false;
+
+ // check slots
+ auto p = pg_slots.begin();
+ while (p != pg_slots.end()) {
+ OSDShardPGSlot *slot = p->second.get();
+ const spg_t& pgid = p->first;
+ dout(20) << __func__ << " " << pgid << dendl;
+ if (!slot->waiting_for_split.empty()) {
+ dout(20) << __func__ << " " << pgid
+ << " waiting for split " << slot->waiting_for_split << dendl;
+ ++p;
+ continue;
+ }
+ if (slot->waiting_for_merge_epoch > new_osdmap->get_epoch()) {
+ dout(20) << __func__ << " " << pgid
+ << " waiting for merge by epoch " << slot->waiting_for_merge_epoch
+ << dendl;
+ ++p;
+ continue;
+ }
+ if (!slot->waiting_peering.empty()) {
+ epoch_t first = slot->waiting_peering.begin()->first;
+ if (first <= new_osdmap->get_epoch()) {
+ dout(20) << __func__ << " " << pgid
+ << " pending_peering first epoch " << first
+ << " <= " << new_osdmap->get_epoch() << ", requeueing" << dendl;
+ _wake_pg_slot(pgid, slot);
+ queued = true;
+ }
+ ++p;
+ continue;
+ }
+ if (!slot->waiting.empty()) {
+ if (new_osdmap->is_up_acting_osd_shard(pgid, osd->get_nodeid())) {
+ dout(20) << __func__ << " " << pgid << " maps to us, keeping"
+ << dendl;
+ ++p;
+ continue;
+ }
+ while (!slot->waiting.empty() &&
+ slot->waiting.front().get_map_epoch() <= new_osdmap->get_epoch()) {
+ auto& qi = slot->waiting.front();
+ dout(20) << __func__ << " " << pgid
+ << " waiting item " << qi
+ << " epoch " << qi.get_map_epoch()
+ << " <= " << new_osdmap->get_epoch()
+ << ", "
+ << (qi.get_map_epoch() < new_osdmap->get_epoch() ? "stale" :
+ "misdirected")
+ << ", dropping" << dendl;
+ *pushes_to_free += qi.get_reserved_pushes();
+ slot->waiting.pop_front();
+ }
+ }
+ if (slot->waiting.empty() &&
+ slot->num_running == 0 &&
+ slot->waiting_for_split.empty() &&
+ !slot->pg) {
+ dout(20) << __func__ << " " << pgid << " empty, pruning" << dendl;
+ p = pg_slots.erase(p);
+ continue;
+ }
+
+ ++p;
+ }
+ if (queued) {
+ std::lock_guard l{sdata_wait_lock};
+ sdata_cond.notify_one();
+ }
+}
+
+void OSDShard::_wake_pg_slot(
+ spg_t pgid,
+ OSDShardPGSlot *slot)
+{
+ dout(20) << __func__ << " " << pgid
+ << " to_process " << slot->to_process
+ << " waiting " << slot->waiting
+ << " waiting_peering " << slot->waiting_peering << dendl;
+ for (auto i = slot->to_process.rbegin();
+ i != slot->to_process.rend();
+ ++i) {
+ scheduler->enqueue_front(std::move(*i));
+ }
+ slot->to_process.clear();
+ for (auto i = slot->waiting.rbegin();
+ i != slot->waiting.rend();
+ ++i) {
+ scheduler->enqueue_front(std::move(*i));
+ }
+ slot->waiting.clear();
+ for (auto i = slot->waiting_peering.rbegin();
+ i != slot->waiting_peering.rend();
+ ++i) {
+ // this is overkill; we requeue everything, even if some of these
+ // items are waiting for maps we don't have yet. FIXME, maybe,
+ // someday, if we decide this inefficiency matters
+ for (auto j = i->second.rbegin(); j != i->second.rend(); ++j) {
+ scheduler->enqueue_front(std::move(*j));
+ }
+ }
+ slot->waiting_peering.clear();
+ ++slot->requeue_seq;
+}
+
+void OSDShard::identify_splits_and_merges(
+ const OSDMapRef& as_of_osdmap,
+ set<pair<spg_t,epoch_t>> *split_pgs,
+ set<pair<spg_t,epoch_t>> *merge_pgs)
+{
+ std::lock_guard l(shard_lock);
+ if (shard_osdmap) {
+ for (auto& i : pg_slots) {
+ const spg_t& pgid = i.first;
+ auto *slot = i.second.get();
+ if (slot->pg) {
+ osd->service.identify_splits_and_merges(
+ shard_osdmap, as_of_osdmap, pgid,
+ split_pgs, merge_pgs);
+ } else if (!slot->waiting_for_split.empty()) {
+ osd->service.identify_splits_and_merges(
+ shard_osdmap, as_of_osdmap, pgid,
+ split_pgs, nullptr);
+ } else {
+ dout(20) << __func__ << " slot " << pgid
+ << " has no pg and waiting_for_split " << dendl;
+ }
+ }
+ }
+}
+
+void OSDShard::prime_splits(const OSDMapRef& as_of_osdmap,
+ set<pair<spg_t,epoch_t>> *pgids)
+{
+ std::lock_guard l(shard_lock);
+ _prime_splits(pgids);
+ if (shard_osdmap->get_epoch() > as_of_osdmap->get_epoch()) {
+ set<pair<spg_t,epoch_t>> newer_children;
+ for (auto i : *pgids) {
+ osd->service.identify_splits_and_merges(
+ as_of_osdmap, shard_osdmap, i.first,
+ &newer_children, nullptr);
+ }
+ newer_children.insert(pgids->begin(), pgids->end());
+ dout(10) << "as_of_osdmap " << as_of_osdmap->get_epoch() << " < shard "
+ << shard_osdmap->get_epoch() << ", new children " << newer_children
+ << dendl;
+ _prime_splits(&newer_children);
+ // note: we don't care what is left over here for other shards.
+ // if this shard is ahead of us and one isn't, e.g., one thread is
+ // calling into prime_splits via _process (due to a newly created
+ // pg) and this shard has a newer map due to a racing consume_map,
+ // then any grandchildren left here will be identified (or were
+ // identified) when the slower shard's osdmap is advanced.
+ // _prime_splits() will tolerate the case where the pgid is
+ // already primed.
+ }
+}
+
+void OSDShard::_prime_splits(set<pair<spg_t,epoch_t>> *pgids)
+{
+ dout(10) << *pgids << dendl;
+ auto p = pgids->begin();
+ while (p != pgids->end()) {
+ unsigned shard_index = p->first.hash_to_shard(osd->num_shards);
+ if (shard_index == shard_id) {
+ auto r = pg_slots.emplace(p->first, nullptr);
+ if (r.second) {
+ dout(10) << "priming slot " << p->first << " e" << p->second << dendl;
+ r.first->second = make_unique<OSDShardPGSlot>();
+ r.first->second->waiting_for_split.insert(p->second);
+ } else {
+ auto q = r.first;
+ ceph_assert(q != pg_slots.end());
+ dout(10) << "priming (existing) slot " << p->first << " e" << p->second
+ << dendl;
+ q->second->waiting_for_split.insert(p->second);
+ }
+ p = pgids->erase(p);
+ } else {
+ ++p;
+ }
+ }
+}
+
+void OSDShard::prime_merges(const OSDMapRef& as_of_osdmap,
+ set<pair<spg_t,epoch_t>> *merge_pgs)
+{
+ std::lock_guard l(shard_lock);
+ dout(20) << __func__ << " checking shard " << shard_id
+ << " for remaining merge pgs " << merge_pgs << dendl;
+ auto p = merge_pgs->begin();
+ while (p != merge_pgs->end()) {
+ spg_t pgid = p->first;
+ epoch_t epoch = p->second;
+ unsigned shard_index = pgid.hash_to_shard(osd->num_shards);
+ if (shard_index != shard_id) {
+ ++p;
+ continue;
+ }
+ OSDShardPGSlot *slot;
+ auto r = pg_slots.emplace(pgid, nullptr);
+ if (r.second) {
+ r.first->second = make_unique<OSDShardPGSlot>();
+ }
+ slot = r.first->second.get();
+ if (slot->pg) {
+ // already have pg
+ dout(20) << __func__ << " have merge participant pg " << pgid
+ << " " << slot->pg << dendl;
+ } else if (!slot->waiting_for_split.empty() &&
+ *slot->waiting_for_split.begin() < epoch) {
+ dout(20) << __func__ << " pending split on merge participant pg " << pgid
+ << " " << slot->waiting_for_split << dendl;
+ } else {
+ dout(20) << __func__ << " creating empty merge participant " << pgid
+ << " for merge in " << epoch << dendl;
+ // leave history zeroed; PG::merge_from() will fill it in.
+ pg_history_t history;
+ PGCreateInfo cinfo(pgid, epoch - 1,
+ history, PastIntervals(), false);
+ PGRef pg = osd->handle_pg_create_info(shard_osdmap, &cinfo);
+ _attach_pg(r.first->second.get(), pg.get());
+ _wake_pg_slot(pgid, slot);
+ pg->unlock();
+ }
+ // mark slot for merge
+ dout(20) << __func__ << " marking merge participant " << pgid << dendl;
+ slot->waiting_for_merge_epoch = epoch;
+ p = merge_pgs->erase(p);
+ }
+}
+
+void OSDShard::register_and_wake_split_child(PG *pg)
+{
+ epoch_t epoch;
+ {
+ std::lock_guard l(shard_lock);
+ dout(10) << pg->pg_id << " " << pg << dendl;
+ auto p = pg_slots.find(pg->pg_id);
+ ceph_assert(p != pg_slots.end());
+ auto *slot = p->second.get();
+ dout(20) << pg->pg_id << " waiting_for_split " << slot->waiting_for_split
+ << dendl;
+ ceph_assert(!slot->pg);
+ ceph_assert(!slot->waiting_for_split.empty());
+ _attach_pg(slot, pg);
+
+ epoch = pg->get_osdmap_epoch();
+ ceph_assert(slot->waiting_for_split.count(epoch));
+ slot->waiting_for_split.erase(epoch);
+ if (slot->waiting_for_split.empty()) {
+ _wake_pg_slot(pg->pg_id, slot);
+ } else {
+ dout(10) << __func__ << " still waiting for split on "
+ << slot->waiting_for_split << dendl;
+ }
+ }
+
+ // kick child to ensure it pulls up to the latest osdmap
+ osd->enqueue_peering_evt(
+ pg->pg_id,
+ PGPeeringEventRef(
+ std::make_shared<PGPeeringEvent>(
+ epoch,
+ epoch,
+ NullEvt())));
+
+ std::lock_guard l{sdata_wait_lock};
+ sdata_cond.notify_one();
+}
+
+void OSDShard::unprime_split_children(spg_t parent, unsigned old_pg_num)
+{
+ std::lock_guard l(shard_lock);
+ vector<spg_t> to_delete;
+ for (auto& i : pg_slots) {
+ if (i.first != parent &&
+ i.first.get_ancestor(old_pg_num) == parent) {
+ dout(10) << __func__ << " parent " << parent << " clearing " << i.first
+ << dendl;
+ _wake_pg_slot(i.first, i.second.get());
+ to_delete.push_back(i.first);
+ }
+ }
+ for (auto pgid : to_delete) {
+ pg_slots.erase(pgid);
+ }
+}
+
+void OSDShard::update_scheduler_config()
+{
+ std::lock_guard l(shard_lock);
+ scheduler->update_configuration();
+}
+
+OSDShard::OSDShard(
+ int id,
+ CephContext *cct,
+ OSD *osd)
+ : shard_id(id),
+ cct(cct),
+ osd(osd),
+ shard_name(string("OSDShard.") + stringify(id)),
+ sdata_wait_lock_name(shard_name + "::sdata_wait_lock"),
+ sdata_wait_lock{make_mutex(sdata_wait_lock_name)},
+ osdmap_lock{make_mutex(shard_name + "::osdmap_lock")},
+ shard_lock_name(shard_name + "::shard_lock"),
+ shard_lock{make_mutex(shard_lock_name)},
+ scheduler(ceph::osd::scheduler::make_scheduler(
+ cct, osd->num_shards, osd->store->is_rotational())),
+ context_queue(sdata_wait_lock, sdata_cond)
+{
+ dout(0) << "using op scheduler " << *scheduler << dendl;
+}
+
+
+// =============================================================
+
+#undef dout_context
+#define dout_context osd->cct
+#undef dout_prefix
+#define dout_prefix *_dout << "osd." << osd->whoami << " op_wq "
+
+void OSD::ShardedOpWQ::_add_slot_waiter(
+ spg_t pgid,
+ OSDShardPGSlot *slot,
+ OpSchedulerItem&& qi)
+{
+ if (qi.is_peering()) {
+ dout(20) << __func__ << " " << pgid
+ << " peering, item epoch is "
+ << qi.get_map_epoch()
+ << ", will wait on " << qi << dendl;
+ slot->waiting_peering[qi.get_map_epoch()].push_back(std::move(qi));
+ } else {
+ dout(20) << __func__ << " " << pgid
+ << " item epoch is "
+ << qi.get_map_epoch()
+ << ", will wait on " << qi << dendl;
+ slot->waiting.push_back(std::move(qi));
+ }
+}
+
+#undef dout_prefix
+#define dout_prefix *_dout << "osd." << osd->whoami << " op_wq(" << shard_index << ") "
+
+void OSD::ShardedOpWQ::_process(uint32_t thread_index, heartbeat_handle_d *hb)
+{
+ uint32_t shard_index = thread_index % osd->num_shards;
+ auto& sdata = osd->shards[shard_index];
+ ceph_assert(sdata);
+
+  // If every thread of a shard ran oncommit callbacks, they could complete
+  // out of order.  So, per shard, only the thread with the smallest
+  // thread_index (thread_index < num_shards) runs the oncommit callbacks.
+ bool is_smallest_thread_index = thread_index < osd->num_shards;
+
+ // peek at spg_t
+ sdata->shard_lock.lock();
+ if (sdata->scheduler->empty() &&
+ (!is_smallest_thread_index || sdata->context_queue.empty())) {
+ std::unique_lock wait_lock{sdata->sdata_wait_lock};
+ if (is_smallest_thread_index && !sdata->context_queue.empty()) {
+ // we raced with a context_queue addition, don't wait
+ wait_lock.unlock();
+ } else if (!sdata->stop_waiting) {
+ dout(20) << __func__ << " empty q, waiting" << dendl;
+ osd->cct->get_heartbeat_map()->clear_timeout(hb);
+ sdata->shard_lock.unlock();
+ sdata->sdata_cond.wait(wait_lock);
+ wait_lock.unlock();
+ sdata->shard_lock.lock();
+ if (sdata->scheduler->empty() &&
+ !(is_smallest_thread_index && !sdata->context_queue.empty())) {
+ sdata->shard_lock.unlock();
+ return;
+ }
+ // found a work item; reapply default wq timeouts
+ osd->cct->get_heartbeat_map()->reset_timeout(hb,
+ timeout_interval, suicide_interval);
+ } else {
+ dout(20) << __func__ << " need return immediately" << dendl;
+ wait_lock.unlock();
+ sdata->shard_lock.unlock();
+ return;
+ }
+ }
+
+ list<Context *> oncommits;
+ if (is_smallest_thread_index) {
+ sdata->context_queue.move_to(oncommits);
+ }
+
+ WorkItem work_item;
+ while (!std::get_if<OpSchedulerItem>(&work_item)) {
+ if (sdata->scheduler->empty()) {
+ if (osd->is_stopping()) {
+ sdata->shard_lock.unlock();
+ for (auto c : oncommits) {
+ dout(10) << __func__ << " discarding in-flight oncommit " << c << dendl;
+ delete c;
+ }
+ return; // OSD shutdown, discard.
+ }
+ sdata->shard_lock.unlock();
+ handle_oncommits(oncommits);
+ return;
+ }
+
+ work_item = sdata->scheduler->dequeue();
+ if (osd->is_stopping()) {
+ sdata->shard_lock.unlock();
+ for (auto c : oncommits) {
+ dout(10) << __func__ << " discarding in-flight oncommit " << c << dendl;
+ delete c;
+ }
+ return; // OSD shutdown, discard.
+ }
+
+ // If the work item is scheduled in the future, wait until
+ // the time returned in the dequeue response before retrying.
+ if (auto when_ready = std::get_if<double>(&work_item)) {
+ if (is_smallest_thread_index) {
+ sdata->shard_lock.unlock();
+ handle_oncommits(oncommits);
+ return;
+ }
+ std::unique_lock wait_lock{sdata->sdata_wait_lock};
+ auto future_time = ceph::real_clock::from_double(*when_ready);
+ dout(10) << __func__ << " dequeue future request at " << future_time << dendl;
+ // Disable heartbeat timeout until we find a non-future work item to process.
+ osd->cct->get_heartbeat_map()->clear_timeout(hb);
+ sdata->shard_lock.unlock();
+ ++sdata->waiting_threads;
+ sdata->sdata_cond.wait_until(wait_lock, future_time);
+ --sdata->waiting_threads;
+ wait_lock.unlock();
+ sdata->shard_lock.lock();
+ // Reapply default wq timeouts
+ osd->cct->get_heartbeat_map()->reset_timeout(hb,
+ timeout_interval, suicide_interval);
+ }
+ } // while
+
+ // Access the stored item
+ auto item = std::move(std::get<OpSchedulerItem>(work_item));
+ if (osd->is_stopping()) {
+ sdata->shard_lock.unlock();
+ for (auto c : oncommits) {
+ dout(10) << __func__ << " discarding in-flight oncommit " << c << dendl;
+ delete c;
+ }
+ return; // OSD shutdown, discard.
+ }
+
+ const auto token = item.get_ordering_token();
+ auto r = sdata->pg_slots.emplace(token, nullptr);
+ if (r.second) {
+ r.first->second = make_unique<OSDShardPGSlot>();
+ }
+ OSDShardPGSlot *slot = r.first->second.get();
+ dout(20) << __func__ << " " << token
+ << (r.second ? " (new)" : "")
+ << " to_process " << slot->to_process
+ << " waiting " << slot->waiting
+ << " waiting_peering " << slot->waiting_peering
+ << dendl;
+ slot->to_process.push_back(std::move(item));
+ dout(20) << __func__ << " " << slot->to_process.back()
+ << " queued" << dendl;
+
+ retry_pg:
+ PGRef pg = slot->pg;
+
+ // lock pg (if we have it)
+ if (pg) {
+ // note the requeue seq now...
+ uint64_t requeue_seq = slot->requeue_seq;
+ ++slot->num_running;
+
+ sdata->shard_lock.unlock();
+ osd->service.maybe_inject_dispatch_delay();
+ pg->lock();
+ osd->service.maybe_inject_dispatch_delay();
+ sdata->shard_lock.lock();
+
+ auto q = sdata->pg_slots.find(token);
+ if (q == sdata->pg_slots.end()) {
+ // this can happen if we race with pg removal.
+ dout(20) << __func__ << " slot " << token << " no longer there" << dendl;
+ pg->unlock();
+ sdata->shard_lock.unlock();
+ handle_oncommits(oncommits);
+ return;
+ }
+ slot = q->second.get();
+ --slot->num_running;
+
+ if (slot->to_process.empty()) {
+ // raced with _wake_pg_slot or consume_map
+ dout(20) << __func__ << " " << token
+ << " nothing queued" << dendl;
+ pg->unlock();
+ sdata->shard_lock.unlock();
+ handle_oncommits(oncommits);
+ return;
+ }
+ if (requeue_seq != slot->requeue_seq) {
+ dout(20) << __func__ << " " << token
+ << " requeue_seq " << slot->requeue_seq << " > our "
+ << requeue_seq << ", we raced with _wake_pg_slot"
+ << dendl;
+ pg->unlock();
+ sdata->shard_lock.unlock();
+ handle_oncommits(oncommits);
+ return;
+ }
+ if (slot->pg != pg) {
+ // this can happen if we race with pg removal.
+ dout(20) << __func__ << " slot " << token << " no longer attached to "
+ << pg << dendl;
+ pg->unlock();
+ goto retry_pg;
+ }
+ }
+
+ dout(20) << __func__ << " " << token
+ << " to_process " << slot->to_process
+ << " waiting " << slot->waiting
+ << " waiting_peering " << slot->waiting_peering << dendl;
+
+ ThreadPool::TPHandle tp_handle(osd->cct, hb, timeout_interval,
+ suicide_interval);
+
+ // take next item
+ auto qi = std::move(slot->to_process.front());
+ slot->to_process.pop_front();
+ dout(20) << __func__ << " " << qi << " pg " << pg << dendl;
+ set<pair<spg_t,epoch_t>> new_children;
+ OSDMapRef osdmap;
+
+ while (!pg) {
+ // should this pg shard exist on this osd in this (or a later) epoch?
+ osdmap = sdata->shard_osdmap;
+ const PGCreateInfo *create_info = qi.creates_pg();
+ if (!slot->waiting_for_split.empty()) {
+ dout(20) << __func__ << " " << token
+ << " splitting " << slot->waiting_for_split << dendl;
+ _add_slot_waiter(token, slot, std::move(qi));
+ } else if (qi.get_map_epoch() > osdmap->get_epoch()) {
+ dout(20) << __func__ << " " << token
+ << " map " << qi.get_map_epoch() << " > "
+ << osdmap->get_epoch() << dendl;
+ _add_slot_waiter(token, slot, std::move(qi));
+ } else if (qi.is_peering()) {
+ if (!qi.peering_requires_pg()) {
+ // for pg-less events, we run them under the ordering lock, since
+ // we don't have the pg lock to keep them ordered.
+ qi.run(osd, sdata, pg, tp_handle);
+ } else if (osdmap->is_up_acting_osd_shard(token, osd->whoami)) {
+ if (create_info) {
+ if (create_info->by_mon &&
+ osdmap->get_pg_acting_primary(token.pgid) != osd->whoami) {
+ dout(20) << __func__ << " " << token
+ << " no pg, no longer primary, ignoring mon create on "
+ << qi << dendl;
+ } else {
+ dout(20) << __func__ << " " << token
+ << " no pg, should create on " << qi << dendl;
+ pg = osd->handle_pg_create_info(osdmap, create_info);
+ if (pg) {
+ // we created the pg! drop out and continue "normally"!
+ sdata->_attach_pg(slot, pg.get());
+ sdata->_wake_pg_slot(token, slot);
+
+ // identify split children between create epoch and shard epoch.
+ osd->service.identify_splits_and_merges(
+ pg->get_osdmap(), osdmap, pg->pg_id, &new_children, nullptr);
+ sdata->_prime_splits(&new_children);
+ // distribute remaining split children to other shards below!
+ break;
+ }
+ dout(20) << __func__ << " ignored create on " << qi << dendl;
+ }
+ } else {
+ dout(20) << __func__ << " " << token
+ << " no pg, peering, !create, discarding " << qi << dendl;
+ }
+ } else {
+ dout(20) << __func__ << " " << token
+ << " no pg, peering, doesn't map here e" << osdmap->get_epoch()
+ << ", discarding " << qi
+ << dendl;
+ }
+ } else if (osdmap->is_up_acting_osd_shard(token, osd->whoami)) {
+ dout(20) << __func__ << " " << token
+ << " no pg, should exist e" << osdmap->get_epoch()
+ << ", will wait on " << qi << dendl;
+ _add_slot_waiter(token, slot, std::move(qi));
+ } else {
+ dout(20) << __func__ << " " << token
+ << " no pg, shouldn't exist e" << osdmap->get_epoch()
+ << ", dropping " << qi << dendl;
+ // share map with client?
+ if (std::optional<OpRequestRef> _op = qi.maybe_get_op()) {
+ osd->service.maybe_share_map((*_op)->get_req()->get_connection().get(),
+ sdata->shard_osdmap,
+ (*_op)->sent_epoch);
+ }
+ unsigned pushes_to_free = qi.get_reserved_pushes();
+ if (pushes_to_free > 0) {
+ sdata->shard_lock.unlock();
+ osd->service.release_reserved_pushes(pushes_to_free);
+ handle_oncommits(oncommits);
+ return;
+ }
+ }
+ sdata->shard_lock.unlock();
+ handle_oncommits(oncommits);
+ return;
+ }
+ if (qi.is_peering()) {
+ OSDMapRef osdmap = sdata->shard_osdmap;
+ if (qi.get_map_epoch() > osdmap->get_epoch()) {
+ _add_slot_waiter(token, slot, std::move(qi));
+ sdata->shard_lock.unlock();
+ pg->unlock();
+ handle_oncommits(oncommits);
+ return;
+ }
+ }
+ sdata->shard_lock.unlock();
+
+ if (!new_children.empty()) {
+ for (auto shard : osd->shards) {
+ shard->prime_splits(osdmap, &new_children);
+ }
+ ceph_assert(new_children.empty());
+ }
+
+ // osd_opwq_process marks the point at which an operation has been dequeued
+ // and will begin to be handled by a worker thread.
+ {
+#ifdef WITH_LTTNG
+ osd_reqid_t reqid;
+ if (std::optional<OpRequestRef> _op = qi.maybe_get_op()) {
+ reqid = (*_op)->get_reqid();
+ }
+#endif
+ tracepoint(osd, opwq_process_start, reqid.name._type,
+ reqid.name._num, reqid.tid, reqid.inc);
+ }
+
+ lgeneric_subdout(osd->cct, osd, 30) << "dequeue status: ";
+ Formatter *f = Formatter::create("json");
+ f->open_object_section("q");
+ dump(f);
+ f->close_section();
+ f->flush(*_dout);
+ delete f;
+ *_dout << dendl;
+
+ qi.run(osd, sdata, pg, tp_handle);
+
+ {
+#ifdef WITH_LTTNG
+ osd_reqid_t reqid;
+ if (std::optional<OpRequestRef> _op = qi.maybe_get_op()) {
+ reqid = (*_op)->get_reqid();
+ }
+#endif
+ tracepoint(osd, opwq_process_finish, reqid.name._type,
+ reqid.name._num, reqid.tid, reqid.inc);
+ }
+
+ handle_oncommits(oncommits);
+}
+
+void OSD::ShardedOpWQ::_enqueue(OpSchedulerItem&& item) {
+ uint32_t shard_index =
+ item.get_ordering_token().hash_to_shard(osd->shards.size());
+
+ dout(20) << __func__ << " " << item << dendl;
+
+ OSDShard* sdata = osd->shards[shard_index];
+ assert (NULL != sdata);
+
+ bool empty = true;
+ {
+ std::lock_guard l{sdata->shard_lock};
+ empty = sdata->scheduler->empty();
+ sdata->scheduler->enqueue(std::move(item));
+ }
+
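+  // if the queue was empty, worker threads may be parked on the empty-queue
+  // wait, so wake them all; otherwise, if some threads are waiting on a
+  // future-dated item, waking one of them is enough to service the new item.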
+ {
+ std::lock_guard l{sdata->sdata_wait_lock};
+ if (empty) {
+ sdata->sdata_cond.notify_all();
+ } else if (sdata->waiting_threads) {
+ sdata->sdata_cond.notify_one();
+ }
+ }
+}
+
+void OSD::ShardedOpWQ::_enqueue_front(OpSchedulerItem&& item)
+{
+ auto shard_index = item.get_ordering_token().hash_to_shard(osd->shards.size());
+ auto& sdata = osd->shards[shard_index];
+ ceph_assert(sdata);
+ sdata->shard_lock.lock();
+ auto p = sdata->pg_slots.find(item.get_ordering_token());
+ if (p != sdata->pg_slots.end() &&
+ !p->second->to_process.empty()) {
+ // we may be racing with _process, which has dequeued a new item
+ // from scheduler, put it on to_process, and is now busy taking the
+ // pg lock. ensure this old requeued item is ordered before any
+ // such newer item in to_process.
+ p->second->to_process.push_front(std::move(item));
+ item = std::move(p->second->to_process.back());
+ p->second->to_process.pop_back();
+ dout(20) << __func__
+ << " " << p->second->to_process.front()
+ << " shuffled w/ " << item << dendl;
+ } else {
+ dout(20) << __func__ << " " << item << dendl;
+ }
+ sdata->scheduler->enqueue_front(std::move(item));
+ sdata->shard_lock.unlock();
+ std::lock_guard l{sdata->sdata_wait_lock};
+ sdata->sdata_cond.notify_one();
+}
+
+namespace ceph::osd_cmds {
+
+int heap(CephContext& cct,
+ const cmdmap_t& cmdmap,
+ std::ostream& outos,
+ std::ostream& erros)
+{
+ if (!ceph_using_tcmalloc()) {
+ erros << "could not issue heap profiler command -- not using tcmalloc!";
+ return -EOPNOTSUPP;
+ }
+
+ string cmd;
+ if (!cmd_getval(cmdmap, "heapcmd", cmd)) {
+ erros << "unable to get value for command \"" << cmd << "\"";
+ return -EINVAL;
+ }
+
+ std::vector<std::string> cmd_vec;
+ get_str_vec(cmd, cmd_vec);
+
+ string val;
+ if (cmd_getval(cmdmap, "value", val)) {
+ cmd_vec.push_back(val);
+ }
+
+ ceph_heap_profiler_handle_command(cmd_vec, outos);
+
+ return 0;
+}
+
+} // namespace ceph::osd_cmds
diff --git a/src/osd/OSD.h b/src/osd/OSD.h
new file mode 100644
index 000000000..efbcb40f7
--- /dev/null
+++ b/src/osd/OSD.h
@@ -0,0 +1,2152 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_OSD_H
+#define CEPH_OSD_H
+
+#include "PG.h"
+
+#include "msg/Dispatcher.h"
+
+#include "common/async/context_pool.h"
+#include "common/Timer.h"
+#include "common/WorkQueue.h"
+#include "common/AsyncReserver.h"
+#include "common/ceph_context.h"
+#include "common/config_cacher.h"
+#include "common/zipkin_trace.h"
+#include "common/ceph_timer.h"
+
+#include "mgr/MgrClient.h"
+
+#include "os/ObjectStore.h"
+
+#include "include/CompatSet.h"
+#include "include/common_fwd.h"
+
+#include "OpRequest.h"
+#include "Session.h"
+
+#include "osd/scheduler/OpScheduler.h"
+
+#include <atomic>
+#include <map>
+#include <memory>
+#include <string>
+
+#include "include/unordered_map.h"
+
+#include "common/shared_cache.hpp"
+#include "common/simple_cache.hpp"
+#include "messages/MOSDOp.h"
+#include "common/EventTrace.h"
+#include "osd/osd_perf_counters.h"
+#include "common/Finisher.h"
+
+#define CEPH_OSD_PROTOCOL 10 /* cluster internal */
+
+/*
+
+ lock ordering for pg map
+
+ PG::lock
+ ShardData::lock
+ OSD::pg_map_lock
+
+ */
+
+class Messenger;
+class Message;
+class MonClient;
+class ObjectStore;
+class FuseStore;
+class OSDMap;
+class MLog;
+class Objecter;
+class KeyStore;
+
+class Watch;
+class PrimaryLogPG;
+
+class TestOpsSocketHook;
+struct C_FinishSplits;
+struct C_OpenPGs;
+class LogChannel;
+
+class MOSDPGCreate2;
+class MOSDPGQuery;
+class MOSDPGNotify;
+class MOSDPGInfo;
+class MOSDPGRemove;
+class MOSDForceRecovery;
+class MMonGetPurgedSnapsReply;
+
+class OSD;
+
+class OSDService {
+ using OpSchedulerItem = ceph::osd::scheduler::OpSchedulerItem;
+public:
+ OSD *osd;
+ CephContext *cct;
+ ObjectStore::CollectionHandle meta_ch;
+ const int whoami;
+ ObjectStore *&store;
+ LogClient &log_client;
+ LogChannelRef clog;
+ PGRecoveryStats &pg_recovery_stats;
+private:
+ Messenger *&cluster_messenger;
+ Messenger *&client_messenger;
+public:
+ PerfCounters *&logger;
+ PerfCounters *&recoverystate_perf;
+ MonClient *&monc;
+
+ md_config_cacher_t<Option::size_t> osd_max_object_size;
+ md_config_cacher_t<bool> osd_skip_data_digest;
+
+ void enqueue_back(OpSchedulerItem&& qi);
+ void enqueue_front(OpSchedulerItem&& qi);
+
+ void maybe_inject_dispatch_delay() {
+ if (g_conf()->osd_debug_inject_dispatch_delay_probability > 0) {
+ if (rand() % 10000 <
+ g_conf()->osd_debug_inject_dispatch_delay_probability * 10000) {
+ utime_t t;
+ t.set_from_double(g_conf()->osd_debug_inject_dispatch_delay_duration);
+ t.sleep();
+ }
+ }
+ }
+
+ ceph::signedspan get_mnow();
+
+private:
+ // -- superblock --
+ ceph::mutex publish_lock, pre_publish_lock; // pre-publish orders before publish
+ OSDSuperblock superblock;
+
+public:
+ OSDSuperblock get_superblock() {
+ std::lock_guard l(publish_lock);
+ return superblock;
+ }
+ void publish_superblock(const OSDSuperblock &block) {
+ std::lock_guard l(publish_lock);
+ superblock = block;
+ }
+
+ int get_nodeid() const { return whoami; }
+
+ std::atomic<epoch_t> max_oldest_map;
+private:
+ OSDMapRef osdmap;
+
+public:
+ OSDMapRef get_osdmap() {
+ std::lock_guard l(publish_lock);
+ return osdmap;
+ }
+ epoch_t get_osdmap_epoch() {
+ std::lock_guard l(publish_lock);
+ return osdmap ? osdmap->get_epoch() : 0;
+ }
+ void publish_map(OSDMapRef map) {
+ std::lock_guard l(publish_lock);
+ osdmap = map;
+ }
+
+ /*
+   * osdmap - the currently published map
+   * next_osdmap - the pre-published map that is about to be published.
+ *
+ * We use the next_osdmap to send messages and initiate connections,
+   * but only if the target is the same instance as the one in the map
+ * epoch the current user is working from (i.e., the result is
+ * equivalent to what is in next_osdmap).
+ *
+ * This allows the helpers to start ignoring osds that are about to
+ * go down, and let OSD::handle_osd_map()/note_down_osd() mark them
+ * down, without worrying about reopening connections from threads
+ * working from old maps.
+ */
+private:
+ OSDMapRef next_osdmap;
+ ceph::condition_variable pre_publish_cond;
+ int pre_publish_waiter = 0;
+
+public:
+ void pre_publish_map(OSDMapRef map) {
+ std::lock_guard l(pre_publish_lock);
+ next_osdmap = std::move(map);
+ }
+
+ void activate_map();
+ /// map epochs reserved below
+ std::map<epoch_t, unsigned> map_reservations;
+
+ /// gets ref to next_osdmap and registers the epoch as reserved
+ OSDMapRef get_nextmap_reserved() {
+ std::lock_guard l(pre_publish_lock);
+ epoch_t e = next_osdmap->get_epoch();
+ std::map<epoch_t, unsigned>::iterator i =
+ map_reservations.insert(std::make_pair(e, 0)).first;
+ i->second++;
+ return next_osdmap;
+ }
+ /// releases reservation on map
+ void release_map(OSDMapRef osdmap) {
+ std::lock_guard l(pre_publish_lock);
+ std::map<epoch_t, unsigned>::iterator i =
+ map_reservations.find(osdmap->get_epoch());
+ ceph_assert(i != map_reservations.end());
+ ceph_assert(i->second > 0);
+ if (--(i->second) == 0) {
+ map_reservations.erase(i);
+ }
+ if (pre_publish_waiter) {
+ pre_publish_cond.notify_all();
+ }
+ }
+ /// blocks until there are no reserved maps prior to next_osdmap
+ void await_reserved_maps() {
+ std::unique_lock l{pre_publish_lock};
+ ceph_assert(next_osdmap);
+ pre_publish_waiter++;
+ pre_publish_cond.wait(l, [this] {
+ auto i = map_reservations.cbegin();
+ return (i == map_reservations.cend() ||
+ i->first >= next_osdmap->get_epoch());
+ });
+ pre_publish_waiter--;
+ }
+ OSDMapRef get_next_osdmap() {
+ std::lock_guard l(pre_publish_lock);
+ return next_osdmap;
+ }
+
+ void maybe_share_map(Connection *con,
+ const OSDMapRef& osdmap,
+ epoch_t peer_epoch_lb=0);
+
+ void send_map(class MOSDMap *m, Connection *con);
+ void send_incremental_map(epoch_t since, Connection *con,
+ const OSDMapRef& osdmap);
+ MOSDMap *build_incremental_map_msg(epoch_t from, epoch_t to,
+ OSDSuperblock& superblock);
+
+ ConnectionRef get_con_osd_cluster(int peer, epoch_t from_epoch);
+ std::pair<ConnectionRef,ConnectionRef> get_con_osd_hb(int peer, epoch_t from_epoch); // (back, front)
+ void send_message_osd_cluster(int peer, Message *m, epoch_t from_epoch);
+ void send_message_osd_cluster(std::vector<std::pair<int, Message*>>& messages, epoch_t from_epoch);
+ void send_message_osd_cluster(MessageRef m, Connection *con) {
+ con->send_message2(std::move(m));
+ }
+ void send_message_osd_cluster(Message *m, const ConnectionRef& con) {
+ con->send_message(m);
+ }
+ void send_message_osd_client(Message *m, const ConnectionRef& con) {
+ con->send_message(m);
+ }
+ entity_name_t get_cluster_msgr_name() const;
+
+private:
+ // -- scrub scheduling --
+ ceph::mutex sched_scrub_lock = ceph::make_mutex("OSDService::sched_scrub_lock");
+ int scrubs_local;
+ int scrubs_remote;
+
+public:
+ struct ScrubJob {
+ CephContext* cct;
+ /// pg to be scrubbed
+ spg_t pgid;
+ /// the time scheduled for the scrub. The scrub could still be delayed if the
+ /// system load is too high, or if the time does not fall within the allowed scrub hours
+ utime_t sched_time;
+ /// the hard upper bound of scrub time
+ utime_t deadline;
+ ScrubJob() : cct(nullptr) {}
+ explicit ScrubJob(CephContext* cct, const spg_t& pg,
+ const utime_t& timestamp,
+ double pool_scrub_min_interval = 0,
+ double pool_scrub_max_interval = 0, bool must = true);
+ /// order the jobs by sched_time
+ bool operator<(const ScrubJob& rhs) const;
+ };
+ std::set<ScrubJob> sched_scrub_pg;
+
+ /// @returns the scrub_reg_stamp used for unregistering the scrub job
+ utime_t reg_pg_scrub(spg_t pgid,
+ utime_t t,
+ double pool_scrub_min_interval,
+ double pool_scrub_max_interval,
+ bool must) {
+ ScrubJob scrub_job(cct, pgid, t, pool_scrub_min_interval, pool_scrub_max_interval,
+ must);
+ std::lock_guard l(OSDService::sched_scrub_lock);
+ sched_scrub_pg.insert(scrub_job);
+ return scrub_job.sched_time;
+ }
+
+ void unreg_pg_scrub(spg_t pgid, utime_t t) {
+ std::lock_guard l(sched_scrub_lock);
+ size_t removed = sched_scrub_pg.erase(ScrubJob(cct, pgid, t));
+ ceph_assert(removed);
+ }
+
+ bool first_scrub_stamp(ScrubJob *out) {
+ std::lock_guard l(sched_scrub_lock);
+ if (sched_scrub_pg.empty())
+ return false;
+ std::set<ScrubJob>::iterator iter = sched_scrub_pg.begin();
+ *out = *iter;
+ return true;
+ }
+ bool next_scrub_stamp(const ScrubJob& next,
+ ScrubJob *out) {
+ std::lock_guard l(sched_scrub_lock);
+ if (sched_scrub_pg.empty())
+ return false;
+ std::set<ScrubJob>::const_iterator iter = sched_scrub_pg.upper_bound(next);
+ if (iter == sched_scrub_pg.cend())
+ return false;
+ *out = *iter;
+ return true;
+ }
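+
+ /*
+  * Illustrative only (editorial sketch, not upstream code): callers can walk
+  * the scrub schedule in sched_time order by chaining the two accessors above;
+  * `svc` is assumed to name this OSDService:
+  *
+  *   OSDService::ScrubJob job;
+  *   for (bool ok = svc.first_scrub_stamp(&job); ok;
+  *        ok = svc.next_scrub_stamp(job, &job)) {
+  *     // inspect job.pgid, job.sched_time, job.deadline
+  *   }
+  */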
+
+ void dumps_scrub(ceph::Formatter* f);
+
+ bool can_inc_scrubs();
+ bool inc_scrubs_local();
+ void dec_scrubs_local();
+ bool inc_scrubs_remote();
+ void dec_scrubs_remote();
+ void dump_scrub_reservations(ceph::Formatter *f);
+
+ void reply_op_error(OpRequestRef op, int err);
+ void reply_op_error(OpRequestRef op, int err, eversion_t v, version_t uv,
+ std::vector<pg_log_op_return_item_t> op_returns);
+ void handle_misdirected_op(PG *pg, OpRequestRef op);
+
+
+private:
+ // -- agent shared state --
+ ceph::mutex agent_lock = ceph::make_mutex("OSDService::agent_lock");
+ ceph::condition_variable agent_cond;
+ std::map<uint64_t, std::set<PGRef> > agent_queue;
+ std::set<PGRef>::iterator agent_queue_pos;
+ bool agent_valid_iterator;
+ int agent_ops;
+ int flush_mode_high_count; // once at least one pg is in FLUSH_MODE_HIGH, flush objects at high speed
+ std::set<hobject_t> agent_oids;
+ bool agent_active;
+ struct AgentThread : public Thread {
+ OSDService *osd;
+ explicit AgentThread(OSDService *o) : osd(o) {}
+ void *entry() override {
+ osd->agent_entry();
+ return NULL;
+ }
+ } agent_thread;
+ bool agent_stop_flag;
+ ceph::mutex agent_timer_lock = ceph::make_mutex("OSDService::agent_timer_lock");
+ SafeTimer agent_timer;
+
+public:
+ void agent_entry();
+ void agent_stop();
+
+ void _enqueue(PG *pg, uint64_t priority) {
+ if (!agent_queue.empty() &&
+ agent_queue.rbegin()->first < priority)
+ agent_valid_iterator = false; // inserting higher-priority queue
+ std::set<PGRef>& nq = agent_queue[priority];
+ if (nq.empty())
+ agent_cond.notify_all();
+ nq.insert(pg);
+ }
+
+ void _dequeue(PG *pg, uint64_t old_priority) {
+ std::set<PGRef>& oq = agent_queue[old_priority];
+ std::set<PGRef>::iterator p = oq.find(pg);
+ ceph_assert(p != oq.end());
+ if (p == agent_queue_pos)
+ ++agent_queue_pos;
+ oq.erase(p);
+ if (oq.empty()) {
+ if (agent_queue.rbegin()->first == old_priority)
+ agent_valid_iterator = false;
+ agent_queue.erase(old_priority);
+ }
+ }
+
+ /// enable agent for a pg
+ void agent_enable_pg(PG *pg, uint64_t priority) {
+ std::lock_guard l(agent_lock);
+ _enqueue(pg, priority);
+ }
+
+ /// adjust priority for an enabled pg
+ void agent_adjust_pg(PG *pg, uint64_t old_priority, uint64_t new_priority) {
+ std::lock_guard l(agent_lock);
+ ceph_assert(new_priority != old_priority);
+ _enqueue(pg, new_priority);
+ _dequeue(pg, old_priority);
+ }
+
+ /// disable agent for a pg
+ void agent_disable_pg(PG *pg, uint64_t old_priority) {
+ std::lock_guard l(agent_lock);
+ _dequeue(pg, old_priority);
+ }
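+
+ /*
+  * Illustrative only (editorial sketch): the expected lifecycle of a pg in the
+  * agent queue, using the helpers above (`svc` assumed to name this OSDService):
+  *
+  *   svc.agent_enable_pg(pg, prio);            // start tiering work at prio
+  *   svc.agent_adjust_pg(pg, prio, new_prio);  // re-queue at a new priority
+  *   svc.agent_disable_pg(pg, new_prio);       // stop tiering work
+  */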
+
+ /// note start of an async (evict) op
+ void agent_start_evict_op() {
+ std::lock_guard l(agent_lock);
+ ++agent_ops;
+ }
+
+ /// note finish or cancellation of an async (evict) op
+ void agent_finish_evict_op() {
+ std::lock_guard l(agent_lock);
+ ceph_assert(agent_ops > 0);
+ --agent_ops;
+ agent_cond.notify_all();
+ }
+
+ /// note start of an async (flush) op
+ void agent_start_op(const hobject_t& oid) {
+ std::lock_guard l(agent_lock);
+ ++agent_ops;
+ ceph_assert(agent_oids.count(oid) == 0);
+ agent_oids.insert(oid);
+ }
+
+ /// note finish or cancellation of an async (flush) op
+ void agent_finish_op(const hobject_t& oid) {
+ std::lock_guard l(agent_lock);
+ ceph_assert(agent_ops > 0);
+ --agent_ops;
+ ceph_assert(agent_oids.count(oid) == 1);
+ agent_oids.erase(oid);
+ agent_cond.notify_all();
+ }
+
+ /// check if we are operating on an object
+ bool agent_is_active_oid(const hobject_t& oid) {
+ std::lock_guard l(agent_lock);
+ return agent_oids.count(oid);
+ }
+
+ /// get count of active agent ops
+ int agent_get_num_ops() {
+ std::lock_guard l(agent_lock);
+ return agent_ops;
+ }
+
+ void agent_inc_high_count() {
+ std::lock_guard l(agent_lock);
+ flush_mode_high_count ++;
+ }
+
+ void agent_dec_high_count() {
+ std::lock_guard l(agent_lock);
+ flush_mode_high_count --;
+ }
+
+private:
+ /// throttle promotion attempts
+ std::atomic<unsigned int> promote_probability_millis{1000}; ///< probability in thousandths. one machine word.
+ PromoteCounter promote_counter;
+ utime_t last_recalibrate;
+ unsigned long promote_max_objects, promote_max_bytes;
+
+public:
+ bool promote_throttle() {
+ // NOTE: lockless! we rely on the probability being a single word.
+ promote_counter.attempt();
+ if ((unsigned)rand() % 1000 > promote_probability_millis)
+ return true; // yes throttle (no promote)
+ if (promote_max_objects &&
+ promote_counter.objects > promote_max_objects)
+ return true; // yes throttle
+ if (promote_max_bytes &&
+ promote_counter.bytes > promote_max_bytes)
+ return true; // yes throttle
+ return false; // no throttle (promote)
+ }
+ void promote_finish(uint64_t bytes) {
+ promote_counter.finish(bytes);
+ }
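+
+ /*
+  * Illustrative only (editorial sketch): a typical promotion path checks the
+  * throttle first and reports the promoted bytes once the copy completes:
+  *
+  *   if (!svc.promote_throttle()) {   // false == promotion allowed
+  *     // ... perform the promote ...
+  *     svc.promote_finish(bytes_promoted);
+  *   }
+  */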
+ void promote_throttle_recalibrate();
+ unsigned get_num_shards() const {
+ return m_objecter_finishers;
+ }
+ Finisher* get_objecter_finisher(int shard) {
+ return objecter_finishers[shard].get();
+ }
+
+ // -- Objecter, for tiering reads/writes from/to other OSDs --
+ ceph::async::io_context_pool& poolctx;
+ std::unique_ptr<Objecter> objecter;
+ int m_objecter_finishers;
+ std::vector<std::unique_ptr<Finisher>> objecter_finishers;
+
+ // -- Watch --
+ ceph::mutex watch_lock = ceph::make_mutex("OSDService::watch_lock");
+ SafeTimer watch_timer;
+ uint64_t next_notif_id;
+ uint64_t get_next_id(epoch_t cur_epoch) {
+ std::lock_guard l(watch_lock);
+ return (((uint64_t)cur_epoch) << 32) | ((uint64_t)(next_notif_id++));
+ }
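+
+ /*
+  * Illustrative only: the notify id packs the epoch into the high 32 bits and a
+  * per-OSD counter into the low 32 bits, so e.g. epoch 7 with counter 3 yields
+  * (7ull << 32) | 3 == 0x0000000700000003.
+  */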
+
+ // -- Recovery/Backfill Request Scheduling --
+ ceph::mutex recovery_request_lock = ceph::make_mutex("OSDService::recovery_request_lock");
+ SafeTimer recovery_request_timer;
+
+ // For async recovery sleep
+ bool recovery_needs_sleep = true;
+ ceph::real_clock::time_point recovery_schedule_time;
+
+ // For recovery & scrub & snap
+ ceph::mutex sleep_lock = ceph::make_mutex("OSDService::sleep_lock");
+ SafeTimer sleep_timer;
+
+ // -- tids --
+ // for ops i issue
+ std::atomic<unsigned int> last_tid{0};
+ ceph_tid_t get_tid() {
+ return (ceph_tid_t)last_tid++;
+ }
+
+ // -- backfill_reservation --
+ Finisher reserver_finisher;
+ AsyncReserver<spg_t, Finisher> local_reserver;
+ AsyncReserver<spg_t, Finisher> remote_reserver;
+
+ // -- pg merge --
+ ceph::mutex merge_lock = ceph::make_mutex("OSD::merge_lock");
+ std::map<pg_t,eversion_t> ready_to_merge_source; // pg -> version
+ std::map<pg_t,std::tuple<eversion_t,epoch_t,epoch_t>> ready_to_merge_target; // pg -> (version,les,lec)
+ std::set<pg_t> not_ready_to_merge_source;
+ std::map<pg_t,pg_t> not_ready_to_merge_target;
+ std::set<pg_t> sent_ready_to_merge_source;
+
+ void set_ready_to_merge_source(PG *pg,
+ eversion_t version);
+ void set_ready_to_merge_target(PG *pg,
+ eversion_t version,
+ epoch_t last_epoch_started,
+ epoch_t last_epoch_clean);
+ void set_not_ready_to_merge_source(pg_t source);
+ void set_not_ready_to_merge_target(pg_t target, pg_t source);
+ void clear_ready_to_merge(PG *pg);
+ void send_ready_to_merge();
+ void _send_ready_to_merge();
+ void clear_sent_ready_to_merge();
+ void prune_sent_ready_to_merge(const OSDMapRef& osdmap);
+
+ // -- pg_temp --
+private:
+ ceph::mutex pg_temp_lock = ceph::make_mutex("OSDService::pg_temp_lock");
+ struct pg_temp_t {
+ std::vector<int> acting;
+ bool forced = false;
+ };
+ std::map<pg_t, pg_temp_t> pg_temp_wanted;
+ std::map<pg_t, pg_temp_t> pg_temp_pending;
+ void _sent_pg_temp();
+ friend std::ostream& operator<<(std::ostream&, const pg_temp_t&);
+public:
+ void queue_want_pg_temp(pg_t pgid, const std::vector<int>& want,
+ bool forced = false);
+ void remove_want_pg_temp(pg_t pgid);
+ void requeue_pg_temp();
+ void send_pg_temp();
+
+ ceph::mutex pg_created_lock = ceph::make_mutex("OSDService::pg_created_lock");
+ std::set<pg_t> pg_created;
+ void send_pg_created(pg_t pgid);
+ void prune_pg_created();
+ void send_pg_created();
+
+ AsyncReserver<spg_t, Finisher> snap_reserver;
+ void queue_recovery_context(PG *pg, GenContext<ThreadPool::TPHandle&> *c);
+ void queue_for_snap_trim(PG *pg);
+ void queue_for_scrub(PG* pg, Scrub::scrub_prio_t with_priority);
+
+ void queue_scrub_after_repair(PG* pg, Scrub::scrub_prio_t with_priority);
+
+ /// queue the message (-> event) that all replicas have reserved scrub resources for us
+ void queue_for_scrub_granted(PG* pg, Scrub::scrub_prio_t with_priority);
+
+ /// queue the message (-> event) that some replicas denied our scrub resources request
+ void queue_for_scrub_denied(PG* pg, Scrub::scrub_prio_t with_priority);
+
+ /// Signals either (a) the end of a sleep period, or (b) a recheck of the availability
+ /// of the primary map being created by the backend.
+ void queue_for_scrub_resched(PG* pg, Scrub::scrub_prio_t with_priority);
+
+ /// Signals a change in the number of in-flight recovery writes
+ void queue_scrub_pushes_update(PG* pg, Scrub::scrub_prio_t with_priority);
+
+ /// Signals that all pending updates were applied
+ void queue_scrub_applied_update(PG* pg, Scrub::scrub_prio_t with_priority);
+
+ /// Signals that the selected chunk (objects range) is available for scrubbing
+ void queue_scrub_chunk_free(PG* pg, Scrub::scrub_prio_t with_priority);
+
+ /// The chunk selected is blocked by user operations, and cannot be scrubbed now
+ void queue_scrub_chunk_busy(PG* pg, Scrub::scrub_prio_t with_priority);
+
+ /// The block-range that was locked and prevented the scrubbing - is freed
+ void queue_scrub_unblocking(PG* pg, Scrub::scrub_prio_t with_priority);
+
+ /// Signals that all write OPs are done
+ void queue_scrub_digest_update(PG* pg, Scrub::scrub_prio_t with_priority);
+
+ /// Signals that the local (Primary's) scrub map is ready
+ void queue_scrub_got_local_map(PG* pg, Scrub::scrub_prio_t with_priority);
+
+ /// Signals that we (the Primary) got all waited-for scrub-maps from our replicas
+ void queue_scrub_got_repl_maps(PG* pg, Scrub::scrub_prio_t with_priority);
+
+ /// Signals that all chunks were handled
+ /// Note: always queued with high priority, as it must be acted upon before the
+ /// next scrub request arrives from the Primary (and the Primary is free
+ /// to send that request once the replica's map is received).
+ void queue_scrub_is_finished(PG* pg);
+
+ /// Signals that there are more chunks to handle
+ void queue_scrub_next_chunk(PG* pg, Scrub::scrub_prio_t with_priority);
+
+ /// Signals that we have finished comparing the maps for this chunk
+ /// Note: required, as in Crimson this operation is 'futurized'.
+ void queue_scrub_maps_compared(PG* pg, Scrub::scrub_prio_t with_priority);
+
+ void queue_for_rep_scrub(PG* pg,
+ Scrub::scrub_prio_t with_high_priority,
+ unsigned int qu_priority,
+ Scrub::act_token_t act_token);
+
+ /// Signals a change in the number of in-flight recovery writes
+ void queue_scrub_replica_pushes(PG *pg, Scrub::scrub_prio_t with_priority);
+
+ /// (not in Crimson) Queue a SchedReplica event to be sent to the replica, to
+ /// trigger a re-check of the availability of the scrub map prepared by the
+ /// backend.
+ void queue_for_rep_scrub_resched(PG* pg,
+ Scrub::scrub_prio_t with_high_priority,
+ unsigned int qu_priority,
+ Scrub::act_token_t act_token);
+
+ void queue_for_pg_delete(spg_t pgid, epoch_t e);
+ bool try_finish_pg_delete(PG *pg, unsigned old_pg_num);
+
+private:
+ // -- pg recovery and associated throttling --
+ ceph::mutex recovery_lock = ceph::make_mutex("OSDService::recovery_lock");
+ std::list<std::pair<epoch_t, PGRef> > awaiting_throttle;
+
+ /// queue a scrub-related message for a PG
+ template <class MSG_TYPE>
+ void queue_scrub_event_msg(PG* pg,
+ Scrub::scrub_prio_t with_priority,
+ unsigned int qu_priority,
+ Scrub::act_token_t act_token);
+
+ /// An alternative version of queue_scrub_event_msg(), in which the queuing priority is
+ /// provided by the executing scrub (i.e. taken from PgScrubber::m_flags)
+ template <class MSG_TYPE>
+ void queue_scrub_event_msg(PG* pg, Scrub::scrub_prio_t with_priority);
+
+ utime_t defer_recovery_until;
+ uint64_t recovery_ops_active;
+ uint64_t recovery_ops_reserved;
+ bool recovery_paused;
+#ifdef DEBUG_RECOVERY_OIDS
+ std::map<spg_t, std::set<hobject_t> > recovery_oids;
+#endif
+ bool _recover_now(uint64_t *available_pushes);
+ void _maybe_queue_recovery();
+ void _queue_for_recovery(
+ std::pair<epoch_t, PGRef> p, uint64_t reserved_pushes);
+public:
+ void start_recovery_op(PG *pg, const hobject_t& soid);
+ void finish_recovery_op(PG *pg, const hobject_t& soid, bool dequeue);
+ bool is_recovery_active();
+ void release_reserved_pushes(uint64_t pushes);
+ void defer_recovery(float defer_for) {
+ defer_recovery_until = ceph_clock_now();
+ defer_recovery_until += defer_for;
+ }
+ void pause_recovery() {
+ std::lock_guard l(recovery_lock);
+ recovery_paused = true;
+ }
+ bool recovery_is_paused() {
+ std::lock_guard l(recovery_lock);
+ return recovery_paused;
+ }
+ void unpause_recovery() {
+ std::lock_guard l(recovery_lock);
+ recovery_paused = false;
+ _maybe_queue_recovery();
+ }
+ void kick_recovery_queue() {
+ std::lock_guard l(recovery_lock);
+ _maybe_queue_recovery();
+ }
+ void clear_queued_recovery(PG *pg) {
+ std::lock_guard l(recovery_lock);
+ awaiting_throttle.remove_if(
+ [pg](decltype(awaiting_throttle)::const_reference awaiting ) {
+ return awaiting.second.get() == pg;
+ });
+ }
+
+ unsigned get_target_pg_log_entries() const;
+
+ // delayed pg activation
+ void queue_for_recovery(PG *pg) {
+ std::lock_guard l(recovery_lock);
+
+ if (pg->is_forced_recovery_or_backfill()) {
+ awaiting_throttle.push_front(std::make_pair(pg->get_osdmap()->get_epoch(), pg));
+ } else {
+ awaiting_throttle.push_back(std::make_pair(pg->get_osdmap()->get_epoch(), pg));
+ }
+ _maybe_queue_recovery();
+ }
+ void queue_recovery_after_sleep(PG *pg, epoch_t queued, uint64_t reserved_pushes) {
+ std::lock_guard l(recovery_lock);
+ _queue_for_recovery(std::make_pair(queued, pg), reserved_pushes);
+ }
+
+ void queue_check_readable(spg_t spgid,
+ epoch_t lpr,
+ ceph::signedspan delay = ceph::signedspan::zero());
+
+ // osd map cache (past osd maps)
+ ceph::mutex map_cache_lock = ceph::make_mutex("OSDService::map_cache_lock");
+ SharedLRU<epoch_t, const OSDMap> map_cache;
+ SimpleLRU<epoch_t, ceph::buffer::list> map_bl_cache;
+ SimpleLRU<epoch_t, ceph::buffer::list> map_bl_inc_cache;
+
+ OSDMapRef try_get_map(epoch_t e);
+ OSDMapRef get_map(epoch_t e) {
+ OSDMapRef ret(try_get_map(e));
+ ceph_assert(ret);
+ return ret;
+ }
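+
+ /*
+  * Editorial note: try_get_map() may return a null ref when the epoch cannot be
+  * resolved from the cache or store, while get_map() asserts success; callers
+  * that can tolerate a miss should prefer try_get_map().
+  */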
+ OSDMapRef add_map(OSDMap *o) {
+ std::lock_guard l(map_cache_lock);
+ return _add_map(o);
+ }
+ OSDMapRef _add_map(OSDMap *o);
+
+ void _add_map_bl(epoch_t e, ceph::buffer::list& bl);
+ bool get_map_bl(epoch_t e, ceph::buffer::list& bl) {
+ std::lock_guard l(map_cache_lock);
+ return _get_map_bl(e, bl);
+ }
+ bool _get_map_bl(epoch_t e, ceph::buffer::list& bl);
+
+ void _add_map_inc_bl(epoch_t e, ceph::buffer::list& bl);
+ bool get_inc_map_bl(epoch_t e, ceph::buffer::list& bl);
+
+ /// identify split child pgids over a osdmap interval
+ void identify_splits_and_merges(
+ OSDMapRef old_map,
+ OSDMapRef new_map,
+ spg_t pgid,
+ std::set<std::pair<spg_t,epoch_t>> *new_children,
+ std::set<std::pair<spg_t,epoch_t>> *merge_pgs);
+
+ void need_heartbeat_peer_update();
+
+ void init();
+ void final_init();
+ void start_shutdown();
+ void shutdown_reserver();
+ void shutdown();
+
+ // -- stats --
+ ceph::mutex stat_lock = ceph::make_mutex("OSDService::stat_lock");
+ osd_stat_t osd_stat;
+ uint32_t seq = 0;
+
+ void set_statfs(const struct store_statfs_t &stbuf,
+ osd_alert_list_t& alerts);
+ osd_stat_t set_osd_stat(std::vector<int>& hb_peers, int num_pgs);
+ void inc_osd_stat_repaired(void);
+ float compute_adjusted_ratio(osd_stat_t new_stat, float *pratio, uint64_t adjust_used = 0);
+ osd_stat_t get_osd_stat() {
+ std::lock_guard l(stat_lock);
+ ++seq;
+ osd_stat.up_from = up_epoch;
+ osd_stat.seq = ((uint64_t)osd_stat.up_from << 32) + seq;
+ return osd_stat;
+ }
+ uint64_t get_osd_stat_seq() {
+ std::lock_guard l(stat_lock);
+ return osd_stat.seq;
+ }
+ void get_hb_pingtime(std::map<int, osd_stat_t::Interfaces> *pp)
+ {
+ std::lock_guard l(stat_lock);
+ *pp = osd_stat.hb_pingtime;
+ return;
+ }
+
+ // -- OSD Full Status --
+private:
+ friend TestOpsSocketHook;
+ mutable ceph::mutex full_status_lock = ceph::make_mutex("OSDService::full_status_lock");
+ enum s_names { INVALID = -1, NONE, NEARFULL, BACKFILLFULL, FULL, FAILSAFE } cur_state; // ascending
+ const char *get_full_state_name(s_names s) const {
+ switch (s) {
+ case NONE: return "none";
+ case NEARFULL: return "nearfull";
+ case BACKFILLFULL: return "backfillfull";
+ case FULL: return "full";
+ case FAILSAFE: return "failsafe";
+ default: return "???";
+ }
+ }
+ s_names get_full_state(std::string type) const {
+ if (type == "none")
+ return NONE;
+ else if (type == "failsafe")
+ return FAILSAFE;
+ else if (type == "full")
+ return FULL;
+ else if (type == "backfillfull")
+ return BACKFILLFULL;
+ else if (type == "nearfull")
+ return NEARFULL;
+ else
+ return INVALID;
+ }
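+
+ /*
+  * Illustrative only: get_full_state("backfillfull") == BACKFILLFULL, and any
+  * unrecognized string maps to INVALID. The enum values ascend with severity,
+  * so comparisons such as cur_state >= BACKFILLFULL are meaningful.
+  */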
+ double cur_ratio, physical_ratio; ///< current utilization
+ mutable int64_t injectfull = 0;
+ s_names injectfull_state = NONE;
+ float get_failsafe_full_ratio();
+ bool _check_inject_full(DoutPrefixProvider *dpp, s_names type) const;
+ bool _check_full(DoutPrefixProvider *dpp, s_names type) const;
+public:
+ void check_full_status(float ratio, float pratio);
+ s_names recalc_full_state(float ratio, float pratio, std::string &inject);
+ bool _tentative_full(DoutPrefixProvider *dpp, s_names type, uint64_t adjust_used, osd_stat_t);
+ bool check_failsafe_full(DoutPrefixProvider *dpp) const;
+ bool check_full(DoutPrefixProvider *dpp) const;
+ bool tentative_backfill_full(DoutPrefixProvider *dpp, uint64_t adjust_used, osd_stat_t);
+ bool check_backfill_full(DoutPrefixProvider *dpp) const;
+ bool check_nearfull(DoutPrefixProvider *dpp) const;
+ bool is_failsafe_full() const;
+ bool is_full() const;
+ bool is_backfillfull() const;
+ bool is_nearfull() const;
+ bool need_fullness_update(); ///< osdmap state needs update
+ void set_injectfull(s_names type, int64_t count);
+
+
+ // -- epochs --
+private:
+ // protects access to boot_epoch, up_epoch, bind_epoch
+ mutable ceph::mutex epoch_lock = ceph::make_mutex("OSDService::epoch_lock");
+ epoch_t boot_epoch; // _first_ epoch we were marked up (after this process started)
+ epoch_t up_epoch; // _most_recent_ epoch we were marked up
+ epoch_t bind_epoch; // epoch we last did a bind to new ip:ports
+public:
+ /**
+ * Retrieve the boot_, up_, and bind_ epochs the OSD has set. The params
+ * can be NULL if you don't care about them.
+ */
+ void retrieve_epochs(epoch_t *_boot_epoch, epoch_t *_up_epoch,
+ epoch_t *_bind_epoch) const;
+ /**
+ * Set the boot, up, and bind epochs. Any NULL params will not be set.
+ */
+ void set_epochs(const epoch_t *_boot_epoch, const epoch_t *_up_epoch,
+ const epoch_t *_bind_epoch);
+ epoch_t get_boot_epoch() const {
+ epoch_t ret;
+ retrieve_epochs(&ret, NULL, NULL);
+ return ret;
+ }
+ epoch_t get_up_epoch() const {
+ epoch_t ret;
+ retrieve_epochs(NULL, &ret, NULL);
+ return ret;
+ }
+ epoch_t get_bind_epoch() const {
+ epoch_t ret;
+ retrieve_epochs(NULL, NULL, &ret);
+ return ret;
+ }
+
+ void request_osdmap_update(epoch_t e);
+
+ // -- heartbeats --
+ ceph::mutex hb_stamp_lock = ceph::make_mutex("OSDService::hb_stamp_lock");
+
+ /// osd -> heartbeat stamps
+ std::vector<HeartbeatStampsRef> hb_stamps;
+
+ /// get or create a ref for a peer's HeartbeatStamps
+ HeartbeatStampsRef get_hb_stamps(unsigned osd);
+
+
+ // Timer for readable leases
+ ceph::timer<ceph::mono_clock> mono_timer = ceph::timer<ceph::mono_clock>{ceph::construct_suspended};
+
+ void queue_renew_lease(epoch_t epoch, spg_t spgid);
+
+ // -- stopping --
+ ceph::mutex is_stopping_lock = ceph::make_mutex("OSDService::is_stopping_lock");
+ ceph::condition_variable is_stopping_cond;
+ enum {
+ NOT_STOPPING,
+ PREPARING_TO_STOP,
+ STOPPING };
+ std::atomic<int> state{NOT_STOPPING};
+ int get_state() const {
+ return state;
+ }
+ void set_state(int s) {
+ state = s;
+ }
+ bool is_stopping() const {
+ return state == STOPPING;
+ }
+ bool is_preparing_to_stop() const {
+ return state == PREPARING_TO_STOP;
+ }
+ bool prepare_to_stop();
+ void got_stop_ack();
+
+
+#ifdef PG_DEBUG_REFS
+ ceph::mutex pgid_lock = ceph::make_mutex("OSDService::pgid_lock");
+ std::map<spg_t, int> pgid_tracker;
+ std::map<spg_t, PG*> live_pgs;
+ void add_pgid(spg_t pgid, PG *pg);
+ void remove_pgid(spg_t pgid, PG *pg);
+ void dump_live_pgids();
+#endif
+
+ explicit OSDService(OSD *osd, ceph::async::io_context_pool& poolctx);
+ ~OSDService() = default;
+};
+
+/*
+
+ Each PG slot includes queues for events that are processing and/or waiting
+ for a PG to be materialized in the slot.
+
+ These are the constraints:
+
+ - client ops must remain ordered by client, regardless of map epoch
+ - peering messages/events from peers must remain ordered by peer
+ - peering messages and client ops need not be ordered relative to each other
+
+ - some peering events can create a pg (e.g., notify)
+ - the query peering event can proceed when a PG doesn't exist
+
+ Implementation notes:
+
+ - everybody waits for split. If the OSD has the parent PG it will instantiate
+ the PGSlot early and mark it waiting_for_split. Everything will wait until
+ the parent is able to commit the split operation and the child PGs are
+ materialized in the child slots.
+
+ - every event has an epoch property and will wait for the OSDShard to catch
+ up to that epoch. For example, if we get a peering event from a future
+ epoch, the event will wait in the slot until the local OSD has caught up.
+ (We should be judicious in specifying the required epoch [by, e.g., setting
+ it to the same_interval_since epoch] so that we don't wait for epochs that
+ don't affect the given PG.)
+
+ - we maintain two separate wait lists, *waiting* and *waiting_peering*. The
+ OpSchedulerItem has an is_peering() bool to determine which we use. Waiting
+ peering events are queued up by epoch required.
+
+ - when we wake a PG slot (e.g., we finished split, or got a newer osdmap, or
+ materialized the PG), we wake *all* waiting items. (This could be optimized,
+ probably, but we don't bother.) We always requeue peering items ahead of
+ client ops.
+
+ - some peering events are marked !peering_requires_pg (PGQuery). if we do
+ not have a PG these are processed immediately (under the shard lock).
+
+ - if we do not have a PG present, we check if the slot maps to the current host.
+ if so, we either queue the item and wait for the PG to materialize, or
+ (if the event is a pg creating event like PGNotify), we materialize the PG.
+
+ - when we advance the osdmap on the OSDShard, we scan pg slots and
+ discard any slots with no pg (and not waiting_for_split) that no
+ longer map to the current host.
+
+ */
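+
+/*
+ * Illustrative only (editorial sketch): the routing decision described above
+ * roughly corresponds to
+ *
+ *   if (item.is_peering())
+ *     slot->waiting_peering[item.get_map_epoch()].push_back(std::move(item));
+ *   else
+ *     slot->waiting.push_back(std::move(item));
+ *
+ * where get_map_epoch() is assumed here to name the item's required epoch.
+ */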
+
+struct OSDShardPGSlot {
+ using OpSchedulerItem = ceph::osd::scheduler::OpSchedulerItem;
+ PGRef pg; ///< pg reference
+ std::deque<OpSchedulerItem> to_process; ///< ordered items for this slot
+ int num_running = 0; ///< _process threads doing pg lookup/lock
+
+ std::deque<OpSchedulerItem> waiting; ///< waiting for pg (or map + pg)
+
+ /// waiting for map (peering evt)
+ std::map<epoch_t,std::deque<OpSchedulerItem>> waiting_peering;
+
+ /// incremented by wake_pg_waiters; indicates racing _process threads
+ /// should bail out (their op has been requeued)
+ uint64_t requeue_seq = 0;
+
+ /// waiting for split child to materialize in these epoch(s)
+ std::set<epoch_t> waiting_for_split;
+
+ epoch_t epoch = 0;
+ boost::intrusive::set_member_hook<> pg_epoch_item;
+
+ /// waiting for a merge (source or target) by this epoch
+ epoch_t waiting_for_merge_epoch = 0;
+};
+
+struct OSDShard {
+ const unsigned shard_id;
+ CephContext *cct;
+ OSD *osd;
+
+ std::string shard_name;
+
+ std::string sdata_wait_lock_name;
+ ceph::mutex sdata_wait_lock;
+ ceph::condition_variable sdata_cond;
+ int waiting_threads = 0;
+
+ ceph::mutex osdmap_lock; ///< protect shard_osdmap updates vs users w/o shard_lock
+ OSDMapRef shard_osdmap;
+
+ OSDMapRef get_osdmap() {
+ std::lock_guard l(osdmap_lock);
+ return shard_osdmap;
+ }
+
+ std::string shard_lock_name;
+ ceph::mutex shard_lock; ///< protects remaining members below
+
+ /// map of slots for each spg_t. maintains ordering of items dequeued
+ /// from scheduler while _process thread drops shard lock to acquire the
+ /// pg lock. stale slots are removed by consume_map.
+ std::unordered_map<spg_t,std::unique_ptr<OSDShardPGSlot>> pg_slots;
+
+ struct pg_slot_compare_by_epoch {
+ bool operator()(const OSDShardPGSlot& l, const OSDShardPGSlot& r) const {
+ return l.epoch < r.epoch;
+ }
+ };
+
+ /// maintain an ordering of pg slots by pg epoch
+ boost::intrusive::multiset<
+ OSDShardPGSlot,
+ boost::intrusive::member_hook<
+ OSDShardPGSlot,
+ boost::intrusive::set_member_hook<>,
+ &OSDShardPGSlot::pg_epoch_item>,
+ boost::intrusive::compare<pg_slot_compare_by_epoch>> pg_slots_by_epoch;
+ int waiting_for_min_pg_epoch = 0;
+ ceph::condition_variable min_pg_epoch_cond;
+
+ /// priority queue
+ ceph::osd::scheduler::OpSchedulerRef scheduler;
+
+ bool stop_waiting = false;
+
+ ContextQueue context_queue;
+
+ void _attach_pg(OSDShardPGSlot *slot, PG *pg);
+ void _detach_pg(OSDShardPGSlot *slot);
+
+ void update_pg_epoch(OSDShardPGSlot *slot, epoch_t epoch);
+ epoch_t get_min_pg_epoch();
+ void wait_min_pg_epoch(epoch_t need);
+
+ /// return newest epoch we are waiting for
+ epoch_t get_max_waiting_epoch();
+
+ /// push osdmap into shard
+ void consume_map(
+ const OSDMapRef& osdmap,
+ unsigned *pushes_to_free);
+
+ void _wake_pg_slot(spg_t pgid, OSDShardPGSlot *slot);
+
+ void identify_splits_and_merges(
+ const OSDMapRef& as_of_osdmap,
+ std::set<std::pair<spg_t,epoch_t>> *split_children,
+ std::set<std::pair<spg_t,epoch_t>> *merge_pgs);
+ void _prime_splits(std::set<std::pair<spg_t,epoch_t>> *pgids);
+ void prime_splits(const OSDMapRef& as_of_osdmap,
+ std::set<std::pair<spg_t,epoch_t>> *pgids);
+ void prime_merges(const OSDMapRef& as_of_osdmap,
+ std::set<std::pair<spg_t,epoch_t>> *merge_pgs);
+ void register_and_wake_split_child(PG *pg);
+ void unprime_split_children(spg_t parent, unsigned old_pg_num);
+ void update_scheduler_config();
+
+ OSDShard(
+ int id,
+ CephContext *cct,
+ OSD *osd);
+};
+
+class OSD : public Dispatcher,
+ public md_config_obs_t {
+ using OpSchedulerItem = ceph::osd::scheduler::OpSchedulerItem;
+
+ /** OSD **/
+ // global lock
+ ceph::mutex osd_lock = ceph::make_mutex("OSD::osd_lock");
+ SafeTimer tick_timer; // safe timer (osd_lock)
+
+ // Tick timer for stuff that does not need osd_lock
+ ceph::mutex tick_timer_lock = ceph::make_mutex("OSD::tick_timer_lock");
+ SafeTimer tick_timer_without_osd_lock;
+ std::string gss_ktfile_client{};
+
+public:
+ // config observer bits
+ const char** get_tracked_conf_keys() const override;
+ void handle_conf_change(const ConfigProxy& conf,
+ const std::set <std::string> &changed) override;
+ void update_log_config();
+ void check_config();
+
+protected:
+
+ const double OSD_TICK_INTERVAL = { 1.0 };
+ double get_tick_interval() const;
+
+ Messenger *cluster_messenger;
+ Messenger *client_messenger;
+ Messenger *objecter_messenger;
+ MonClient *monc; // check the "monc helpers" list before accessing directly
+ MgrClient mgrc;
+ PerfCounters *logger;
+ PerfCounters *recoverystate_perf;
+ ObjectStore *store;
+#ifdef HAVE_LIBFUSE
+ FuseStore *fuse_store = nullptr;
+#endif
+ LogClient log_client;
+ LogChannelRef clog;
+
+ int whoami;
+ std::string dev_path, journal_path;
+
+ ceph_release_t last_require_osd_release{ceph_release_t::unknown};
+
+ int numa_node = -1;
+ size_t numa_cpu_set_size = 0;
+ cpu_set_t numa_cpu_set;
+
+ bool store_is_rotational = true;
+ bool journal_is_rotational = true;
+
+ ZTracer::Endpoint trace_endpoint;
+ PerfCounters* create_logger();
+ PerfCounters* create_recoverystate_perf();
+ void tick();
+ void tick_without_osd_lock();
+ void _dispatch(Message *m);
+ void dispatch_op(OpRequestRef op);
+
+ void check_osdmap_features();
+
+ // asok
+ friend class OSDSocketHook;
+ class OSDSocketHook *asok_hook;
+ void asok_command(
+ std::string_view prefix,
+ const cmdmap_t& cmdmap,
+ ceph::Formatter *f,
+ const ceph::buffer::list& inbl,
+ std::function<void(int,const std::string&,ceph::buffer::list&)> on_finish);
+
+public:
+ int get_nodeid() { return whoami; }
+
+ static ghobject_t get_osdmap_pobject_name(epoch_t epoch) {
+ char foo[20];
+ snprintf(foo, sizeof(foo), "osdmap.%d", epoch);
+ return ghobject_t(hobject_t(sobject_t(object_t(foo), 0)));
+ }
+ static ghobject_t get_inc_osdmap_pobject_name(epoch_t epoch) {
+ char foo[22];
+ snprintf(foo, sizeof(foo), "inc_osdmap.%d", epoch);
+ return ghobject_t(hobject_t(sobject_t(object_t(foo), 0)));
+ }
+
+ static ghobject_t make_snapmapper_oid() {
+ return ghobject_t(hobject_t(
+ sobject_t(
+ object_t("snapmapper"),
+ 0)));
+ }
+ static ghobject_t make_purged_snaps_oid() {
+ return ghobject_t(hobject_t(
+ sobject_t(
+ object_t("purged_snaps"),
+ 0)));
+ }
+
+ static ghobject_t make_pg_log_oid(spg_t pg) {
+ std::stringstream ss;
+ ss << "pglog_" << pg;
+ std::string s;
+ getline(ss, s);
+ return ghobject_t(hobject_t(sobject_t(object_t(s.c_str()), 0)));
+ }
+
+ static ghobject_t make_pg_biginfo_oid(spg_t pg) {
+ std::stringstream ss;
+ ss << "pginfo_" << pg;
+ std::string s;
+ getline(ss, s);
+ return ghobject_t(hobject_t(sobject_t(object_t(s.c_str()), 0)));
+ }
+ static ghobject_t make_infos_oid() {
+ hobject_t oid(sobject_t("infos", CEPH_NOSNAP));
+ return ghobject_t(oid);
+ }
+
+ static ghobject_t make_final_pool_info_oid(int64_t pool) {
+ return ghobject_t(
+ hobject_t(
+ sobject_t(
+ object_t(std::string("final_pool_") + stringify(pool)),
+ CEPH_NOSNAP)));
+ }
+
+ static ghobject_t make_pg_num_history_oid() {
+ return ghobject_t(hobject_t(sobject_t("pg_num_history", CEPH_NOSNAP)));
+ }
+
+ static void recursive_remove_collection(CephContext* cct,
+ ObjectStore *store,
+ spg_t pgid,
+ coll_t tmp);
+
+ /**
+ * get_osd_initial_compat_set()
+ *
+ * Get the initial feature set for this OSD. Features
+ * here are automatically upgraded.
+ *
+ * Return value: Initial osd CompatSet
+ */
+ static CompatSet get_osd_initial_compat_set();
+
+ /**
+ * get_osd_compat_set()
+ *
+ * Get all features supported by this OSD
+ *
+ * Return value: CompatSet of all supported features
+ */
+ static CompatSet get_osd_compat_set();
+
+
+private:
+ class C_Tick;
+ class C_Tick_WithoutOSDLock;
+
+ // -- config settings --
+ float m_osd_pg_epoch_max_lag_factor;
+
+ // -- superblock --
+ OSDSuperblock superblock;
+
+ void write_superblock();
+ void write_superblock(ObjectStore::Transaction& t);
+ int read_superblock();
+
+ void clear_temp_objects();
+
+ CompatSet osd_compat;
+
+ // -- state --
+public:
+ typedef enum {
+ STATE_INITIALIZING = 1,
+ STATE_PREBOOT,
+ STATE_BOOTING,
+ STATE_ACTIVE,
+ STATE_STOPPING,
+ STATE_WAITING_FOR_HEALTHY
+ } osd_state_t;
+
+ static const char *get_state_name(int s) {
+ switch (s) {
+ case STATE_INITIALIZING: return "initializing";
+ case STATE_PREBOOT: return "preboot";
+ case STATE_BOOTING: return "booting";
+ case STATE_ACTIVE: return "active";
+ case STATE_STOPPING: return "stopping";
+ case STATE_WAITING_FOR_HEALTHY: return "waiting_for_healthy";
+ default: return "???";
+ }
+ }
+
+private:
+ std::atomic<int> state{STATE_INITIALIZING};
+
+public:
+ int get_state() const {
+ return state;
+ }
+ void set_state(int s) {
+ state = s;
+ }
+ bool is_initializing() const {
+ return state == STATE_INITIALIZING;
+ }
+ bool is_preboot() const {
+ return state == STATE_PREBOOT;
+ }
+ bool is_booting() const {
+ return state == STATE_BOOTING;
+ }
+ bool is_active() const {
+ return state == STATE_ACTIVE;
+ }
+ bool is_stopping() const {
+ return state == STATE_STOPPING;
+ }
+ bool is_waiting_for_healthy() const {
+ return state == STATE_WAITING_FOR_HEALTHY;
+ }
+
+private:
+
+ ShardedThreadPool osd_op_tp;
+
+ void get_latest_osdmap();
+
+ // -- sessions --
+private:
+ void dispatch_session_waiting(const ceph::ref_t<Session>& session, OSDMapRef osdmap);
+
+ ceph::mutex session_waiting_lock = ceph::make_mutex("OSD::session_waiting_lock");
+ std::set<ceph::ref_t<Session>> session_waiting_for_map;
+
+ /// Caller assumes refs for included Sessions
+ void get_sessions_waiting_for_map(std::set<ceph::ref_t<Session>> *out) {
+ std::lock_guard l(session_waiting_lock);
+ out->swap(session_waiting_for_map);
+ }
+ void register_session_waiting_on_map(const ceph::ref_t<Session>& session) {
+ std::lock_guard l(session_waiting_lock);
+ session_waiting_for_map.insert(session);
+ }
+ void clear_session_waiting_on_map(const ceph::ref_t<Session>& session) {
+ std::lock_guard l(session_waiting_lock);
+ session_waiting_for_map.erase(session);
+ }
+ void dispatch_sessions_waiting_on_map() {
+ std::set<ceph::ref_t<Session>> sessions_to_check;
+ get_sessions_waiting_for_map(&sessions_to_check);
+ for (auto i = sessions_to_check.begin();
+ i != sessions_to_check.end();
+ sessions_to_check.erase(i++)) {
+ std::lock_guard l{(*i)->session_dispatch_lock};
+ dispatch_session_waiting(*i, get_osdmap());
+ }
+ }
+ void session_handle_reset(const ceph::ref_t<Session>& session) {
+ std::lock_guard l(session->session_dispatch_lock);
+ clear_session_waiting_on_map(session);
+
+ session->clear_backoffs();
+
+ /* Messages have connection refs, we need to clear the
+ * connection->session->message->connection
+ * cycles which result.
+ * Bug #12338
+ */
+ session->waiting_on_map.clear_and_dispose(TrackedOp::Putter());
+ }
+
+private:
+ /**
+ * @defgroup monc helpers
+ * @{
+ * Right now we only have the one helper below.
+ */
+
+ /**
+ * Ask the Monitors for a sequence of OSDMaps.
+ *
+ * @param epoch The epoch to start with when replying
+ * @param force_request True if this request forces a new subscription to
+ * the monitors; false if an outstanding request that encompasses it is
+ * sufficient.
+ */
+ void osdmap_subscribe(version_t epoch, bool force_request);
+ /** @} monc helpers */
+
+ ceph::mutex osdmap_subscribe_lock = ceph::make_mutex("OSD::osdmap_subscribe_lock");
+ epoch_t latest_subscribed_epoch{0};
+
+ // -- heartbeat --
+ /// information about a heartbeat peer
+ struct HeartbeatInfo {
+ int peer; ///< peer
+ ConnectionRef con_front; ///< peer connection (front)
+ ConnectionRef con_back; ///< peer connection (back)
+ utime_t first_tx; ///< time we sent our first ping request
+ utime_t last_tx; ///< last time we sent a ping request
+ utime_t last_rx_front; ///< last time we got a ping reply on the front side
+ utime_t last_rx_back; ///< last time we got a ping reply on the back side
+ epoch_t epoch; ///< most recent epoch we wanted this peer
+ /// number of connections over which we send and receive heartbeat pings/replies
+ static constexpr int HEARTBEAT_MAX_CONN = 2;
+ /// history of inflight pings, arranged by the timestamp at which we sent them
+ /// send time -> deadline -> remaining replies
+ std::map<utime_t, std::pair<utime_t, int>> ping_history;
+
+ utime_t hb_interval_start;
+ uint32_t hb_average_count = 0;
+ uint32_t hb_index = 0;
+
+ uint32_t hb_total_back = 0;
+ uint32_t hb_min_back = UINT_MAX;
+ uint32_t hb_max_back = 0;
+ std::vector<uint32_t> hb_back_pingtime;
+ std::vector<uint32_t> hb_back_min;
+ std::vector<uint32_t> hb_back_max;
+
+ uint32_t hb_total_front = 0;
+ uint32_t hb_min_front = UINT_MAX;
+ uint32_t hb_max_front = 0;
+ std::vector<uint32_t> hb_front_pingtime;
+ std::vector<uint32_t> hb_front_min;
+ std::vector<uint32_t> hb_front_max;
+
+ bool is_stale(utime_t stale) const {
+ if (ping_history.empty()) {
+ return false;
+ }
+ utime_t oldest_deadline = ping_history.begin()->second.first;
+ return oldest_deadline <= stale;
+ }
+
+ bool is_unhealthy(utime_t now) const {
+ if (ping_history.empty()) {
+ /// we haven't sent a ping yet or we have got all replies;
+ /// either way we are safe and healthy for now
+ return false;
+ }
+
+ utime_t oldest_deadline = ping_history.begin()->second.first;
+ return now > oldest_deadline;
+ }
+
+ bool is_healthy(utime_t now) const {
+ if (last_rx_front == utime_t() || last_rx_back == utime_t()) {
+ // do not declare healthy until we have received the first
+ // replies on both the front and back connections
+ return false;
+ }
+ return !is_unhealthy(now);
+ }
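+
+ /*
+  * Illustrative only: ping_history is keyed by send time, so its first entry
+  * carries the oldest outstanding deadline. If we sent a ping at t=100 with a
+  * deadline of t=120, then is_unhealthy(now=121) is true and is_stale(stale=125)
+  * is also true, while is_unhealthy(now=110) is false.
+  */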
+
+ void clear_mark_down(Connection *except = nullptr) {
+ if (con_back && con_back != except) {
+ con_back->mark_down();
+ con_back->clear_priv();
+ con_back.reset(nullptr);
+ }
+ if (con_front && con_front != except) {
+ con_front->mark_down();
+ con_front->clear_priv();
+ con_front.reset(nullptr);
+ }
+ }
+ };
+
+ ceph::mutex heartbeat_lock = ceph::make_mutex("OSD::heartbeat_lock");
+ std::map<int, int> debug_heartbeat_drops_remaining;
+ ceph::condition_variable heartbeat_cond;
+ bool heartbeat_stop;
+ std::atomic<bool> heartbeat_need_update;
+ std::map<int,HeartbeatInfo> heartbeat_peers; ///< map of osd id to HeartbeatInfo
+ utime_t last_mon_heartbeat;
+ Messenger *hb_front_client_messenger;
+ Messenger *hb_back_client_messenger;
+ Messenger *hb_front_server_messenger;
+ Messenger *hb_back_server_messenger;
+ utime_t last_heartbeat_resample; ///< last time we chose random peers in waiting-for-healthy state
+ double daily_loadavg;
+ ceph::mono_time startup_time;
+
+ // Track ping response times using a vector as a circular buffer
+ // MUST BE A POWER OF 2
+ const uint32_t hb_vector_size = 16;
+
+ void _add_heartbeat_peer(int p);
+ void _remove_heartbeat_peer(int p);
+ bool heartbeat_reset(Connection *con);
+ void maybe_update_heartbeat_peers();
+ void reset_heartbeat_peers(bool all);
+ bool heartbeat_peers_need_update() {
+ return heartbeat_need_update.load();
+ }
+ void heartbeat_set_peers_need_update() {
+ heartbeat_need_update.store(true);
+ }
+ void heartbeat_clear_peers_need_update() {
+ heartbeat_need_update.store(false);
+ }
+ void heartbeat();
+ void heartbeat_check();
+ void heartbeat_entry();
+ void need_heartbeat_peer_update();
+
+ void heartbeat_kick() {
+ std::lock_guard l(heartbeat_lock);
+ heartbeat_cond.notify_all();
+ }
+
+ struct T_Heartbeat : public Thread {
+ OSD *osd;
+ explicit T_Heartbeat(OSD *o) : osd(o) {}
+ void *entry() override {
+ osd->heartbeat_entry();
+ return 0;
+ }
+ } heartbeat_thread;
+
+public:
+ bool heartbeat_dispatch(Message *m);
+
+ struct HeartbeatDispatcher : public Dispatcher {
+ OSD *osd;
+ explicit HeartbeatDispatcher(OSD *o) : Dispatcher(o->cct), osd(o) {}
+
+ bool ms_can_fast_dispatch_any() const override { return true; }
+ bool ms_can_fast_dispatch(const Message *m) const override {
+ switch (m->get_type()) {
+ case CEPH_MSG_PING:
+ case MSG_OSD_PING:
+ return true;
+ default:
+ return false;
+ }
+ }
+ void ms_fast_dispatch(Message *m) override {
+ osd->heartbeat_dispatch(m);
+ }
+ bool ms_dispatch(Message *m) override {
+ return osd->heartbeat_dispatch(m);
+ }
+ bool ms_handle_reset(Connection *con) override {
+ return osd->heartbeat_reset(con);
+ }
+ void ms_handle_remote_reset(Connection *con) override {}
+ bool ms_handle_refused(Connection *con) override {
+ return osd->ms_handle_refused(con);
+ }
+ int ms_handle_authentication(Connection *con) override {
+ return true;
+ }
+ } heartbeat_dispatcher;
+
+private:
+ // -- waiters --
+ std::list<OpRequestRef> finished;
+
+ void take_waiters(std::list<OpRequestRef>& ls) {
+ ceph_assert(ceph_mutex_is_locked(osd_lock));
+ finished.splice(finished.end(), ls);
+ }
+ void do_waiters();
+
+ // -- op tracking --
+ OpTracker op_tracker;
+ void test_ops(std::string command, std::string args, std::ostream& ss);
+ friend class TestOpsSocketHook;
+ TestOpsSocketHook *test_ops_hook;
+ friend struct C_FinishSplits;
+ friend struct C_OpenPGs;
+
+protected:
+
+ /*
+ * The ordered op delivery chain is:
+ *
+ * fast dispatch -> scheduler back
+ * scheduler front <-> to_process back
+ * to_process front -> RunVis(item)
+ * <- queue_front()
+ *
+ * The scheduler is per-shard, and to_process is per pg_slot. Items can be
+ * pushed back up into to_process and/or scheduler while order is preserved.
+ *
+ * Multiple worker threads can operate on each shard.
+ *
+ * Under normal circumstances, num_running == to_process.size(). There are
+ * two times when that is not true: (1) when waiting_for_pg == true and
+ * to_process is accumulating requests that are waiting for the pg to be
+ * instantiated; in that case they will all get requeued together by
+ * wake_pg_waiters, and (2) when wake_pg_waiters just ran, cleared
+ * waiting_for_pg, and already requeued the items.
+ */
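+
+ /*
+  * Illustrative only (editorial sketch of the chain above): an item travels
+  *
+  *   enqueue_op() -> scheduler (per shard) -> slot->to_process -> dequeue_op()
+  *
+  * and a stalled item is pushed back via _enqueue_front() / _add_slot_waiter()
+  * without breaking per-client or per-peer ordering.
+  */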
+ friend class ceph::osd::scheduler::PGOpItem;
+ friend class ceph::osd::scheduler::PGPeeringItem;
+ friend class ceph::osd::scheduler::PGRecovery;
+ friend class ceph::osd::scheduler::PGRecoveryMsg;
+ friend class ceph::osd::scheduler::PGDelete;
+
+ class ShardedOpWQ
+ : public ShardedThreadPool::ShardedWQ<OpSchedulerItem>
+ {
+ OSD *osd;
+
+ public:
+ ShardedOpWQ(OSD *o,
+ ceph::timespan ti,
+ ceph::timespan si,
+ ShardedThreadPool* tp)
+ : ShardedThreadPool::ShardedWQ<OpSchedulerItem>(ti, si, tp),
+ osd(o) {
+ }
+
+ void _add_slot_waiter(
+ spg_t token,
+ OSDShardPGSlot *slot,
+ OpSchedulerItem&& qi);
+
+ /// try to do some work
+ void _process(uint32_t thread_index, ceph::heartbeat_handle_d *hb) override;
+
+ /// enqueue a new item
+ void _enqueue(OpSchedulerItem&& item) override;
+
+ /// requeue an old item (at the front of the line)
+ void _enqueue_front(OpSchedulerItem&& item) override;
+
+ void return_waiting_threads() override {
+ for(uint32_t i = 0; i < osd->num_shards; i++) {
+ OSDShard* sdata = osd->shards[i];
+ assert (NULL != sdata);
+ std::scoped_lock l{sdata->sdata_wait_lock};
+ sdata->stop_waiting = true;
+ sdata->sdata_cond.notify_all();
+ }
+ }
+
+ void stop_return_waiting_threads() override {
+ for(uint32_t i = 0; i < osd->num_shards; i++) {
+ OSDShard* sdata = osd->shards[i];
+ assert (NULL != sdata);
+ std::scoped_lock l{sdata->sdata_wait_lock};
+ sdata->stop_waiting = false;
+ }
+ }
+
+ void dump(ceph::Formatter *f) {
+ for(uint32_t i = 0; i < osd->num_shards; i++) {
+ auto &&sdata = osd->shards[i];
+
+ char queue_name[32] = {0};
+ snprintf(queue_name, sizeof(queue_name), "%s%" PRIu32, "OSD:ShardedOpWQ:", i);
+ ceph_assert(NULL != sdata);
+
+ std::scoped_lock l{sdata->shard_lock};
+ f->open_object_section(queue_name);
+ sdata->scheduler->dump(*f);
+ f->close_section();
+ }
+ }
+
+ bool is_shard_empty(uint32_t thread_index) override {
+ uint32_t shard_index = thread_index % osd->num_shards;
+ auto &&sdata = osd->shards[shard_index];
+ ceph_assert(sdata);
+ std::lock_guard l(sdata->shard_lock);
+ if (thread_index < osd->num_shards) {
+ return sdata->scheduler->empty() && sdata->context_queue.empty();
+ } else {
+ return sdata->scheduler->empty();
+ }
+ }
+
+ void handle_oncommits(std::list<Context*>& oncommits) {
+ for (auto p : oncommits) {
+ p->complete(0);
+ }
+ }
+ } op_shardedwq;
+
+
+ void enqueue_op(spg_t pg, OpRequestRef&& op, epoch_t epoch);
+ void dequeue_op(
+ PGRef pg, OpRequestRef op,
+ ThreadPool::TPHandle &handle);
+
+ void enqueue_peering_evt(
+ spg_t pgid,
+ PGPeeringEventRef ref);
+ void dequeue_peering_evt(
+ OSDShard *sdata,
+ PG *pg,
+ PGPeeringEventRef ref,
+ ThreadPool::TPHandle& handle);
+
+ void dequeue_delete(
+ OSDShard *sdata,
+ PG *pg,
+ epoch_t epoch,
+ ThreadPool::TPHandle& handle);
+
+ friend class PG;
+ friend struct OSDShard;
+ friend class PrimaryLogPG;
+ friend class PgScrubber;
+
+
+ protected:
+
+ // -- osd map --
+ // TODO: switch to std::atomic<OSDMapRef> when C++20 is available.
+ OSDMapRef _osdmap;
+ void set_osdmap(OSDMapRef osdmap) {
+ std::atomic_store(&_osdmap, osdmap);
+ }
+ OSDMapRef get_osdmap() const {
+ return std::atomic_load(&_osdmap);
+ }
+ epoch_t get_osdmap_epoch() const {
+ // XXX: performance?
+ auto osdmap = get_osdmap();
+ return osdmap ? osdmap->get_epoch() : 0;
+ }
+
+ pool_pg_num_history_t pg_num_history;
+
+ ceph::shared_mutex map_lock = ceph::make_shared_mutex("OSD::map_lock");
+ std::list<OpRequestRef> waiting_for_osdmap;
+ std::deque<utime_t> osd_markdown_log;
+
+ friend struct send_map_on_destruct;
+
+ void wait_for_new_map(OpRequestRef op);
+ void handle_osd_map(class MOSDMap *m);
+ void _committed_osd_maps(epoch_t first, epoch_t last, class MOSDMap *m);
+ void trim_maps(epoch_t oldest, int nreceived, bool skip_maps);
+ void note_down_osd(int osd);
+ void note_up_osd(int osd);
+ friend struct C_OnMapCommit;
+
+ bool advance_pg(
+ epoch_t advance_to,
+ PG *pg,
+ ThreadPool::TPHandle &handle,
+ PeeringCtx &rctx);
+ void consume_map();
+ void activate_map();
+
+ // osd map cache (past osd maps)
+ OSDMapRef get_map(epoch_t e) {
+ return service.get_map(e);
+ }
+ OSDMapRef add_map(OSDMap *o) {
+ return service.add_map(o);
+ }
+ bool get_map_bl(epoch_t e, ceph::buffer::list& bl) {
+ return service.get_map_bl(e, bl);
+ }
+
+public:
+ // -- shards --
+ std::vector<OSDShard*> shards;
+ uint32_t num_shards = 0;
+
+ void inc_num_pgs() {
+ ++num_pgs;
+ }
+ void dec_num_pgs() {
+ --num_pgs;
+ }
+ int get_num_pgs() const {
+ return num_pgs;
+ }
+
+protected:
+ ceph::mutex merge_lock = ceph::make_mutex("OSD::merge_lock");
+ /// merge epoch -> target pgid -> source pgid -> pg
+ std::map<epoch_t,std::map<spg_t,std::map<spg_t,PGRef>>> merge_waiters;
+
+ bool add_merge_waiter(OSDMapRef nextmap, spg_t target, PGRef source,
+ unsigned need);
+
+ // -- placement groups --
+ std::atomic<size_t> num_pgs = {0};
+
+ std::mutex pending_creates_lock;
+ using create_from_osd_t = std::pair<spg_t, bool /* is primary*/>;
+ std::set<create_from_osd_t> pending_creates_from_osd;
+ unsigned pending_creates_from_mon = 0;
+
+ PGRecoveryStats pg_recovery_stats;
+
+ PGRef _lookup_pg(spg_t pgid);
+ PGRef _lookup_lock_pg(spg_t pgid);
+ void register_pg(PGRef pg);
+ bool try_finish_pg_delete(PG *pg, unsigned old_pg_num);
+
+ void _get_pgs(std::vector<PGRef> *v, bool clear_too=false);
+ void _get_pgids(std::vector<spg_t> *v);
+
+public:
+ PGRef lookup_lock_pg(spg_t pgid);
+
+ std::set<int64_t> get_mapped_pools();
+
+protected:
+ PG* _make_pg(OSDMapRef createmap, spg_t pgid);
+
+ bool maybe_wait_for_max_pg(const OSDMapRef& osdmap,
+ spg_t pgid, bool is_mon_create);
+ void resume_creating_pg();
+
+ void load_pgs();
+
+ /// build initial pg history and intervals on create
+ void build_initial_pg_history(
+ spg_t pgid,
+ epoch_t created,
+ utime_t created_stamp,
+ pg_history_t *h,
+ PastIntervals *pi);
+
+ epoch_t last_pg_create_epoch;
+
+ void handle_pg_create(OpRequestRef op);
+
+ void split_pgs(
+ PG *parent,
+ const std::set<spg_t> &childpgids, std::set<PGRef> *out_pgs,
+ OSDMapRef curmap,
+ OSDMapRef nextmap,
+ PeeringCtx &rctx);
+ void _finish_splits(std::set<PGRef>& pgs);
+
+ // == monitor interaction ==
+ ceph::mutex mon_report_lock = ceph::make_mutex("OSD::mon_report_lock");
+ utime_t last_mon_report;
+ Finisher boot_finisher;
+
+ // -- boot --
+ void start_boot();
+ void _got_mon_epochs(epoch_t oldest, epoch_t newest);
+ void _preboot(epoch_t oldest, epoch_t newest);
+ void _send_boot();
+ void _collect_metadata(std::map<std::string,std::string> *pmeta);
+ void _get_purged_snaps();
+ void handle_get_purged_snaps_reply(MMonGetPurgedSnapsReply *r);
+
+ void start_waiting_for_healthy();
+ bool _is_healthy();
+
+ void send_full_update();
+
+ friend struct CB_OSD_GetVersion;
+
+ // -- alive --
+ epoch_t up_thru_wanted;
+
+ void queue_want_up_thru(epoch_t want);
+ void send_alive();
+
+ // -- full map requests --
+ epoch_t requested_full_first, requested_full_last;
+
+ void request_full_map(epoch_t first, epoch_t last);
+ void rerequest_full_maps() {
+ epoch_t first = requested_full_first;
+ epoch_t last = requested_full_last;
+ requested_full_first = 0;
+ requested_full_last = 0;
+ request_full_map(first, last);
+ }
+ void got_full_map(epoch_t e);
+
+ // -- failures --
+ std::map<int,utime_t> failure_queue;
+ std::map<int,std::pair<utime_t,entity_addrvec_t> > failure_pending;
+
+ void requeue_failures();
+ void send_failures();
+ void send_still_alive(epoch_t epoch, int osd, const entity_addrvec_t &addrs);
+ void cancel_pending_failures();
+
+ ceph::coarse_mono_clock::time_point last_sent_beacon;
+ ceph::mutex min_last_epoch_clean_lock = ceph::make_mutex("OSD::min_last_epoch_clean_lock");
+ epoch_t min_last_epoch_clean = 0;
+ // which pgs were scanned for min_lec
+ std::vector<pg_t> min_last_epoch_clean_pgs;
+ void send_beacon(const ceph::coarse_mono_clock::time_point& now);
+
+ ceph_tid_t get_tid() {
+ return service.get_tid();
+ }
+
+ double scrub_sleep_time(bool must_scrub);
+
+ // -- generic pg peering --
+ PeeringCtx create_context();
+ void dispatch_context(PeeringCtx &ctx, PG *pg, OSDMapRef curmap,
+ ThreadPool::TPHandle *handle = NULL);
+
+ bool require_mon_peer(const Message *m);
+ bool require_mon_or_mgr_peer(const Message *m);
+ bool require_osd_peer(const Message *m);
+ /***
+ * Verifies that we were alive in the given epoch, and that we
+ * still are.
+ */
+ bool require_self_aliveness(const Message *m, epoch_t alive_since);
+ /**
+ * Verifies that the OSD who sent the given op has the same
+ * address as in the given map.
+ * @pre op was sent by an OSD using the cluster messenger
+ */
+ bool require_same_peer_instance(const Message *m, const OSDMapRef& map,
+ bool is_fast_dispatch);
+
+ bool require_same_or_newer_map(OpRequestRef& op, epoch_t e,
+ bool is_fast_dispatch);
+
+ void handle_fast_pg_create(MOSDPGCreate2 *m);
+ void handle_fast_pg_query(MOSDPGQuery *m);
+ void handle_pg_query_nopg(const MQuery& q);
+ void handle_fast_pg_notify(MOSDPGNotify *m);
+ void handle_pg_notify_nopg(const MNotifyRec& q);
+ void handle_fast_pg_info(MOSDPGInfo *m);
+ void handle_fast_pg_remove(MOSDPGRemove *m);
+
+public:
+ // used by OSDShard
+ PGRef handle_pg_create_info(const OSDMapRef& osdmap, const PGCreateInfo *info);
+protected:
+
+ void handle_fast_force_recovery(MOSDForceRecovery *m);
+
+ // -- commands --
+ void handle_command(class MCommand *m);
+
+
+ // -- pg recovery --
+ void do_recovery(PG *pg, epoch_t epoch_queued, uint64_t pushes_reserved,
+ ThreadPool::TPHandle &handle);
+
+
+ // -- scrubbing --
+ void sched_scrub();
+ void resched_all_scrubs();
+ bool scrub_random_backoff();
+ bool scrub_load_below_threshold();
+ bool scrub_time_permit(utime_t now);
+
+ // -- status reporting --
+ MPGStats *collect_pg_stats();
+ std::vector<DaemonHealthMetric> get_health_metrics();
+
+
+private:
+ bool ms_can_fast_dispatch_any() const override { return true; }
+ bool ms_can_fast_dispatch(const Message *m) const override {
+ switch (m->get_type()) {
+ case CEPH_MSG_PING:
+ case CEPH_MSG_OSD_OP:
+ case CEPH_MSG_OSD_BACKOFF:
+ case MSG_OSD_SCRUB2:
+ case MSG_OSD_FORCE_RECOVERY:
+ case MSG_MON_COMMAND:
+ case MSG_OSD_PG_CREATE2:
+ case MSG_OSD_PG_QUERY:
+ case MSG_OSD_PG_QUERY2:
+ case MSG_OSD_PG_INFO:
+ case MSG_OSD_PG_INFO2:
+ case MSG_OSD_PG_NOTIFY:
+ case MSG_OSD_PG_NOTIFY2:
+ case MSG_OSD_PG_LOG:
+ case MSG_OSD_PG_TRIM:
+ case MSG_OSD_PG_REMOVE:
+ case MSG_OSD_BACKFILL_RESERVE:
+ case MSG_OSD_RECOVERY_RESERVE:
+ case MSG_OSD_REPOP:
+ case MSG_OSD_REPOPREPLY:
+ case MSG_OSD_PG_PUSH:
+ case MSG_OSD_PG_PULL:
+ case MSG_OSD_PG_PUSH_REPLY:
+ case MSG_OSD_PG_SCAN:
+ case MSG_OSD_PG_BACKFILL:
+ case MSG_OSD_PG_BACKFILL_REMOVE:
+ case MSG_OSD_EC_WRITE:
+ case MSG_OSD_EC_WRITE_REPLY:
+ case MSG_OSD_EC_READ:
+ case MSG_OSD_EC_READ_REPLY:
+ case MSG_OSD_SCRUB_RESERVE:
+ case MSG_OSD_REP_SCRUB:
+ case MSG_OSD_REP_SCRUBMAP:
+ case MSG_OSD_PG_UPDATE_LOG_MISSING:
+ case MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY:
+ case MSG_OSD_PG_RECOVERY_DELETE:
+ case MSG_OSD_PG_RECOVERY_DELETE_REPLY:
+ case MSG_OSD_PG_LEASE:
+ case MSG_OSD_PG_LEASE_ACK:
+ return true;
+ default:
+ return false;
+ }
+ }
+ void ms_fast_dispatch(Message *m) override;
+ bool ms_dispatch(Message *m) override;
+ void ms_handle_connect(Connection *con) override;
+ void ms_handle_fast_connect(Connection *con) override;
+ void ms_handle_fast_accept(Connection *con) override;
+ int ms_handle_authentication(Connection *con) override;
+ bool ms_handle_reset(Connection *con) override;
+ void ms_handle_remote_reset(Connection *con) override {}
+ bool ms_handle_refused(Connection *con) override;
+
+ public:
+ /* internal and external can point to the same messenger; they will still
+ * be cleaned up properly */
+ OSD(CephContext *cct_,
+ ObjectStore *store_,
+ int id,
+ Messenger *internal,
+ Messenger *external,
+ Messenger *hb_front_client,
+ Messenger *hb_back_client,
+ Messenger *hb_front_server,
+ Messenger *hb_back_server,
+ Messenger *osdc_messenger,
+ MonClient *mc, const std::string &dev, const std::string &jdev,
+ ceph::async::io_context_pool& poolctx);
+ ~OSD() override;
+
+ // static bits
+ static int mkfs(CephContext *cct, ObjectStore *store, uuid_d fsid, int whoami, std::string osdspec_affinity);
+
+ /* remove any non-user xattrs from a map of them */
+ void filter_xattrs(std::map<std::string, ceph::buffer::ptr>& attrs) {
+ for (std::map<std::string, ceph::buffer::ptr>::iterator iter = attrs.begin();
+ iter != attrs.end();
+ ) {
+ if (('_' != iter->first.at(0)) || (iter->first.size() == 1))
+ attrs.erase(iter++);
+ else ++iter;
+ }
+ }
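+
+ /*
+  * Illustrative only: given attrs {"_user", "_", "ceph.internal"}, the loop
+  * above keeps only "_user" -- entries not starting with '_' and the bare "_"
+  * key are erased.
+  */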
+
+private:
+ int mon_cmd_maybe_osd_create(std::string &cmd);
+ int update_crush_device_class();
+ int update_crush_location();
+
+ static int write_meta(CephContext *cct,
+ ObjectStore *store,
+ uuid_d& cluster_fsid, uuid_d& osd_fsid, int whoami, std::string& osdspec_affinity);
+
+ void handle_scrub(class MOSDScrub *m);
+ void handle_fast_scrub(class MOSDScrub2 *m);
+ void handle_osd_ping(class MOSDPing *m);
+
+ size_t get_num_cache_shards();
+ int get_num_op_shards();
+ int get_num_op_threads();
+
+ float get_osd_recovery_sleep();
+ float get_osd_delete_sleep();
+ float get_osd_snap_trim_sleep();
+
+ int get_recovery_max_active();
+ void maybe_override_max_osd_capacity_for_qos();
+ bool maybe_override_options_for_qos();
+ int run_osd_bench_test(int64_t count,
+ int64_t bsize,
+ int64_t osize,
+ int64_t onum,
+ double *elapsed,
+ std::ostream& ss);
+ int mon_cmd_set_config(const std::string &key, const std::string &val);
+
+ void scrub_purged_snaps();
+ void probe_smart(const std::string& devid, std::ostream& ss);
+
+public:
+ static int peek_meta(ObjectStore *store,
+ std::string *magic,
+ uuid_d *cluster_fsid,
+ uuid_d *osd_fsid,
+ int *whoami,
+ ceph_release_t *min_osd_release);
+
+
+ // startup/shutdown
+ int pre_init();
+ int init();
+ void final_init();
+
+ int enable_disable_fuse(bool stop);
+ int set_numa_affinity();
+
+ void suicide(int exitcode);
+ int shutdown();
+
+ void handle_signal(int signum);
+
+ /// check if we can throw out op from a disconnected client
+ static bool op_is_discardable(const MOSDOp *m);
+
+public:
+ OSDService service;
+ friend class OSDService;
+
+private:
+ void set_perf_queries(const ConfigPayload &config_payload);
+ MetricPayload get_perf_reports();
+
+ ceph::mutex m_perf_queries_lock = ceph::make_mutex("OSD::m_perf_queries_lock");
+ std::list<OSDPerfMetricQuery> m_perf_queries;
+ std::map<OSDPerfMetricQuery, OSDPerfMetricLimits> m_perf_limits;
+};
+
+
+// compatibility of the executable
+extern const CompatSet::Feature ceph_osd_feature_compat[];
+extern const CompatSet::Feature ceph_osd_feature_ro_compat[];
+extern const CompatSet::Feature ceph_osd_feature_incompat[];
+
+#endif // CEPH_OSD_H
diff --git a/src/osd/OSDCap.cc b/src/osd/OSDCap.cc
new file mode 100644
index 000000000..e7bf05827
--- /dev/null
+++ b/src/osd/OSDCap.cc
@@ -0,0 +1,532 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2009-2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <boost/config/warning_disable.hpp>
+#include <boost/spirit/include/qi.hpp>
+#include <boost/spirit/include/phoenix_operator.hpp>
+#include <boost/spirit/include/phoenix.hpp>
+#include <boost/algorithm/string/predicate.hpp>
+
+#include "OSDCap.h"
+#include "common/config.h"
+#include "common/debug.h"
+#include "include/ipaddr.h"
+
+using std::ostream;
+using std::string;
+using std::vector;
+
+ostream& operator<<(ostream& out, const osd_rwxa_t& p)
+{
+ if (p == OSD_CAP_ANY)
+ return out << "*";
+
+ if (p & OSD_CAP_R)
+ out << "r";
+ if (p & OSD_CAP_W)
+ out << "w";
+ if ((p & OSD_CAP_X) == OSD_CAP_X) {
+ out << "x";
+ } else {
+ if (p & OSD_CAP_CLS_R)
+ out << " class-read";
+ if (p & OSD_CAP_CLS_W)
+ out << " class-write";
+ }
+ return out;
+}
+
+ostream& operator<<(ostream& out, const OSDCapSpec& s)
+{
+ if (s.allow)
+ return out << s.allow;
+ if (s.class_name.length()) {
+ out << "class '" << s.class_name << "'";
+ if (!s.method_name.empty()) {
+ out << " '" << s.method_name << "'";
+ }
+ }
+ return out;
+}
+
+ostream& operator<<(ostream& out, const OSDCapPoolNamespace& pns)
+{
+ if (!pns.pool_name.empty()) {
+ out << "pool " << pns.pool_name << " ";
+ }
+ if (pns.nspace) {
+ out << "namespace ";
+ if (pns.nspace->empty()) {
+ out << "\"\"";
+ } else {
+ out << *pns.nspace;
+ }
+ out << " ";
+ }
+ return out;
+}
+
+ostream& operator<<(ostream &out, const OSDCapPoolTag &pt)
+{
+ out << "app " << pt.application << " key " << pt.key << " val " << pt.value
+ << " ";
+ return out;
+}
+
+ostream& operator<<(ostream& out, const OSDCapMatch& m)
+{
+ if (!m.pool_namespace.pool_name.empty() || m.pool_namespace.nspace) {
+ out << m.pool_namespace;
+ }
+
+ if (!m.pool_tag.application.empty()) {
+ out << m.pool_tag;
+ }
+
+ if (m.object_prefix.length()) {
+ out << "object_prefix " << m.object_prefix << " ";
+ }
+ return out;
+}
+
+ostream& operator<<(ostream& out, const OSDCapProfile& m)
+{
+ out << "profile " << m.name;
+ out << m.pool_namespace;
+ return out;
+}
+
+bool OSDCapPoolNamespace::is_match(const std::string& pn,
+ const std::string& ns) const
+{
+ if (!pool_name.empty()) {
+ if (pool_name != pn) {
+ return false;
+ }
+ }
+ if (nspace) {
+ if (!nspace->empty() && nspace->back() == '*' &&
+ boost::starts_with(ns, nspace->substr(0, nspace->length() - 1))) {
+ return true;
+ }
+
+ if (*nspace != ns) {
+ return false;
+ }
+ }
+ return true;
+}
+
+bool OSDCapPoolNamespace::is_match_all() const
+{
+ if (!pool_name.empty())
+ return false;
+ if (nspace)
+ return false;
+ return true;
+}
+
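+// Pool-tag matching, as implemented below: an empty application matches any
+// pool; otherwise the pool must have that application enabled.  "*" can stand
+// in for the key and/or the value, e.g. key "*", value "*" matches any pool
+// with the application, while key "foo", value "*" only requires that the
+// key exists.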
+bool OSDCapPoolTag::is_match(const app_map_t& app_map) const
+{
+ if (application.empty()) {
+ return true;
+ }
+ auto kv_map = app_map.find(application);
+ if (kv_map == app_map.end()) {
+ return false;
+ }
+ if (!key.compare("*") && !value.compare("*")) {
+ return true;
+ }
+ if (!key.compare("*")) {
+ for (auto it : kv_map->second) {
+ if (it.second == value) {
+ return true;
+ }
+ }
+ return false;
+ }
+ auto kv_val = kv_map->second.find(key);
+ if (kv_val == kv_map->second.end()) {
+ return false;
+ }
+ if (!value.compare("*")) {
+ return true;
+ }
+ return kv_val->second == value;
+}
+
+bool OSDCapPoolTag::is_match_all() const {
+ return application.empty();
+}
+
+bool OSDCapMatch::is_match(const string& pn, const string& ns,
+ const OSDCapPoolTag::app_map_t& app_map,
+ const string& object) const
+{
+ if (!pool_namespace.is_match(pn, ns)) {
+ return false;
+ } else if (!pool_tag.is_match(app_map)) {
+ return false;
+ }
+
+ if (object_prefix.length()) {
+ if (object.find(object_prefix) != 0)
+ return false;
+ }
+ return true;
+}
+
+bool OSDCapMatch::is_match_all() const
+{
+ if (!pool_namespace.is_match_all()) {
+ return false;
+ } else if (!pool_tag.is_match_all()) {
+ return false;
+ }
+
+ if (object_prefix.length()) {
+ return false;
+ }
+ return true;
+}
+
+ostream& operator<<(ostream& out, const OSDCapGrant& g)
+{
+ out << "grant(";
+ if (g.profile.is_valid()) {
+ out << g.profile << " [";
+ for (auto it = g.profile_grants.cbegin();
+ it != g.profile_grants.cend(); ++it) {
+ if (it != g.profile_grants.cbegin()) {
+ out << ",";
+ }
+ out << *it;
+ }
+ out << "]";
+ } else {
+ out << g.match << g.spec;
+ }
+ if (g.network.size()) {
+ out << " network " << g.network;
+ }
+ out << ")";
+ return out;
+}
+
+void OSDCapGrant::set_network(const string& n)
+{
+ network = n;
+ network_valid = ::parse_network(n.c_str(), &network_parsed, &network_prefix);
+}
+
+bool OSDCapGrant::allow_all() const
+{
+ if (profile.is_valid()) {
+ return std::any_of(profile_grants.cbegin(), profile_grants.cend(),
+ [](const OSDCapGrant& grant) {
+ return grant.allow_all();
+ });
+ }
+
+ return (match.is_match_all() && spec.allow_all());
+}
+
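+// Checks are applied in order: the optional network restriction first, then
+// (for profile grants) the expanded profile grants, then this grant's rwx
+// bits and per-class rules.  Intended effect, as a sketch: "allow x" admits
+// any allow-listed class method, while "allow class foo bar" admits only
+// method "bar" of class "foo".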
+bool OSDCapGrant::is_capable(
+ const string& pool_name,
+ const string& ns,
+ const OSDCapPoolTag::app_map_t& application_metadata,
+ const string& object,
+ bool op_may_read,
+ bool op_may_write,
+ const std::vector<OpInfo::ClassInfo>& classes,
+ const entity_addr_t& addr,
+ std::vector<bool>* class_allowed) const
+{
+ osd_rwxa_t allow = 0;
+
+ if (network.size() &&
+ (!network_valid ||
+ !network_contains(network_parsed,
+ network_prefix,
+ addr))) {
+ return false;
+ }
+
+ if (profile.is_valid()) {
+ return std::any_of(profile_grants.cbegin(), profile_grants.cend(),
+ [&](const OSDCapGrant& grant) {
+ return grant.is_capable(pool_name, ns,
+ application_metadata,
+ object, op_may_read,
+ op_may_write, classes, addr,
+ class_allowed);
+ });
+ } else {
+ if (match.is_match(pool_name, ns, application_metadata, object)) {
+ allow = allow | spec.allow;
+ if ((op_may_read && !(allow & OSD_CAP_R)) ||
+ (op_may_write && !(allow & OSD_CAP_W))) {
+ return false;
+ }
+ if (!classes.empty()) {
+ // check 'allow *'
+ if (spec.allow_all()) {
+ return true;
+ }
+
+ // compare this grant to each class in the operation
+ for (size_t i = 0; i < classes.size(); ++i) {
+ // check 'allow class foo [method_name]'
+ if (!spec.class_name.empty() &&
+ classes[i].class_name == spec.class_name &&
+ (spec.method_name.empty() ||
+ classes[i].method_name == spec.method_name)) {
+ (*class_allowed)[i] = true;
+ continue;
+ }
+ // check 'allow x | class-{rw}': must be on allow list
+ if (!classes[i].allowed) {
+ continue;
+ }
+ if ((classes[i].read && !(allow & OSD_CAP_CLS_R)) ||
+ (classes[i].write && !(allow & OSD_CAP_CLS_W))) {
+ continue;
+ }
+ (*class_allowed)[i] = true;
+ }
+ if (!std::all_of(class_allowed->cbegin(), class_allowed->cend(),
+ [](bool v) { return v; })) {
+ return false;
+ }
+ }
+ return true;
+ }
+ }
+ return false;
+}
+
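+// Expands a named profile into the explicit grants it stands for.  Based on
+// the cases below: "profile read-only pool=foo" becomes "allow r" on the
+// pool, and "profile rbd-read-only pool=foo" becomes read + class-read on
+// the pool plus the rbd child_attach/child_detach class methods on objects
+// prefixed "rbd_header.".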
+void OSDCapGrant::expand_profile()
+{
+ if (profile.name == "read-only") {
+ // grants READ-ONLY caps to the OSD
+ profile_grants.emplace_back(OSDCapMatch(profile.pool_namespace),
+ OSDCapSpec(osd_rwxa_t(OSD_CAP_R)));
+ return;
+ }
+ if (profile.name == "read-write") {
+ // grants READ-WRITE caps to the OSD
+ profile_grants.emplace_back(OSDCapMatch(profile.pool_namespace),
+ OSDCapSpec(osd_rwxa_t(OSD_CAP_R | OSD_CAP_W)));
+ }
+
+ if (profile.name == "rbd") {
+ // RBD read-write grant
+ profile_grants.emplace_back(OSDCapMatch(string(), "rbd_info"),
+ OSDCapSpec(osd_rwxa_t(OSD_CAP_R)));
+ profile_grants.emplace_back(OSDCapMatch(string(), "rbd_children"),
+ OSDCapSpec(osd_rwxa_t(OSD_CAP_CLS_R)));
+ profile_grants.emplace_back(OSDCapMatch(string(), "rbd_mirroring"),
+ OSDCapSpec(osd_rwxa_t(OSD_CAP_CLS_R)));
+ profile_grants.emplace_back(OSDCapMatch(profile.pool_namespace.pool_name),
+ OSDCapSpec("rbd", "metadata_list"));
+ profile_grants.emplace_back(OSDCapMatch(profile.pool_namespace),
+ OSDCapSpec(osd_rwxa_t(OSD_CAP_R |
+ OSD_CAP_W |
+ OSD_CAP_X)));
+ }
+ if (profile.name == "rbd-read-only") {
+ // RBD read-only grant
+ profile_grants.emplace_back(OSDCapMatch(profile.pool_namespace),
+ OSDCapSpec(osd_rwxa_t(OSD_CAP_R |
+ OSD_CAP_CLS_R)));
+ profile_grants.emplace_back(OSDCapMatch(profile.pool_namespace,
+ "rbd_header."),
+ OSDCapSpec("rbd", "child_attach"));
+ profile_grants.emplace_back(OSDCapMatch(profile.pool_namespace,
+ "rbd_header."),
+ OSDCapSpec("rbd", "child_detach"));
+ }
+}
+
+bool OSDCap::allow_all() const
+{
+ for (auto &grant : grants) {
+ if (grant.allow_all()) {
+ return true;
+ }
+ }
+ return false;
+}
+
+void OSDCap::set_allow_all()
+{
+ grants.clear();
+ grants.push_back(OSDCapGrant(OSDCapMatch(), OSDCapSpec(OSD_CAP_ANY)));
+}
+
+bool OSDCap::is_capable(const string& pool_name, const string& ns,
+ const OSDCapPoolTag::app_map_t& application_metadata,
+ const string& object,
+ bool op_may_read, bool op_may_write,
+ const std::vector<OpInfo::ClassInfo>& classes,
+ const entity_addr_t& addr) const
+{
+ std::vector<bool> class_allowed(classes.size(), false);
+ for (auto &grant : grants) {
+ if (grant.is_capable(pool_name, ns, application_metadata,
+ object, op_may_read, op_may_write, classes, addr,
+ &class_allowed)) {
+ return true;
+ }
+ }
+ return false;
+}
+
+
+// grammar
+namespace qi = boost::spirit::qi;
+namespace ascii = boost::spirit::ascii;
+namespace phoenix = boost::phoenix;
+
+template <typename Iterator>
+struct OSDCapParser : qi::grammar<Iterator, OSDCap()>
+{
+ OSDCapParser() : OSDCapParser::base_type(osdcap)
+ {
+ using qi::char_;
+ using qi::int_;
+ using qi::lexeme;
+ using qi::alnum;
+ using qi::_val;
+ using qi::_1;
+ using qi::_2;
+ using qi::_3;
+ using qi::eps;
+ using qi::lit;
+
+ quoted_string %=
+ lexeme['"' >> +(char_ - '"') >> '"'] |
+ lexeme['\'' >> +(char_ - '\'') >> '\''];
+ equoted_string %=
+ lexeme['"' >> *(char_ - '"') >> '"'] |
+ lexeme['\'' >> *(char_ - '\'') >> '\''];
+ unquoted_word %= +char_("a-zA-Z0-9_./-");
+ str %= quoted_string | unquoted_word;
+ estr %= equoted_string | unquoted_word;
+ network_str %= +char_("/.:a-fA-F0-9][");
+
+ spaces = +ascii::space;
+
+ wildcard = (lit('*') | lit("all")) [_val = "*"];
+
+ pool_name %= -(spaces >> lit("pool") >> (lit('=') | spaces) >> str);
+ nspace %= (spaces >> lit("namespace")
+ >> (lit('=') | spaces)
+ >> estr >> -char_('*'));
+
+ // match := [pool[=]<poolname> [namespace[=]<namespace>]] [object_prefix <prefix>] | tag <application> <key>=<value>
+ object_prefix %= -(spaces >> lit("object_prefix") >> spaces >> str);
+ pooltag %= (spaces >> lit("tag")
+ >> spaces >> str // application
+ >> spaces >> (wildcard | str) // key
+ >> -spaces >> lit('=') >> -spaces >> (wildcard | str)); // value
+
+ match = (
+ pooltag [_val = phoenix::construct<OSDCapMatch>(_1)] |
+ (nspace >> pooltag) [_val = phoenix::construct<OSDCapMatch>(_1, _2)] |
+ (pool_name >> nspace >> object_prefix) [_val = phoenix::construct<OSDCapMatch>(_1, _2, _3)] |
+ (pool_name >> object_prefix) [_val = phoenix::construct<OSDCapMatch>(_1, _2)]
+ );
+
+ // rwxa := * | [r][w][x] [class-read] [class-write]
+ rwxa =
+ (spaces >> wildcard[_val = OSD_CAP_ANY]) |
+ ( eps[_val = 0] >>
+ (
+ spaces >>
+ ( lit('r')[_val |= OSD_CAP_R] ||
+ lit('w')[_val |= OSD_CAP_W] ||
+ lit('x')[_val |= OSD_CAP_X] )) ||
+ ( (spaces >> lit("class-read")[_val |= OSD_CAP_CLS_R]) ||
+ (spaces >> lit("class-write")[_val |= OSD_CAP_CLS_W]) ));
+
+ // capspec := * | rwx | class <name> [<method name>]
+ class_name %= (spaces >> lit("class") >> spaces >> str);
+ method_name %= -(spaces >> str);
+ capspec = (
+ (rwxa) [_val = phoenix::construct<OSDCapSpec>(_1)] |
+ (class_name >> method_name) [_val = phoenix::construct<OSDCapSpec>(_1, _2)]);
+
+ // profile := profile <name> [pool[=]<pool> [namespace[=]<namespace>]]
+ profile_name %= (lit("profile") >> (lit('=') | spaces) >> str);
+ profile = (
+ (profile_name >> pool_name >> nspace) [_val = phoenix::construct<OSDCapProfile>(_1, _2, _3)] |
+ (profile_name >> pool_name) [_val = phoenix::construct<OSDCapProfile>(_1, _2)]);
+
+ // grant := allow match capspec
+ grant = (*ascii::blank >>
+ ((lit("allow") >> capspec >> match >>
+ -(spaces >> lit("network") >> spaces >> network_str))
+ [_val = phoenix::construct<OSDCapGrant>(_2, _1, _3)] |
+ (lit("allow") >> match >> capspec >>
+ -(spaces >> lit("network") >> spaces >> network_str))
+ [_val = phoenix::construct<OSDCapGrant>(_1, _2, _3)] |
+ (profile >> -(spaces >> lit("network") >> spaces >> network_str))
+ [_val = phoenix::construct<OSDCapGrant>(_1, _2)]
+ ) >> *ascii::blank);
+ // osdcap := grant [grant ...]
+ grants %= (grant % (lit(';') | lit(',')));
+ osdcap = grants [_val = phoenix::construct<OSDCap>(_1)];
+ }
+ qi::rule<Iterator> spaces;
+ qi::rule<Iterator, unsigned()> rwxa;
+ qi::rule<Iterator, string()> quoted_string, equoted_string;
+ qi::rule<Iterator, string()> unquoted_word;
+ qi::rule<Iterator, string()> str, estr, network_str;
+ qi::rule<Iterator, string()> wildcard;
+ qi::rule<Iterator, string()> class_name;
+ qi::rule<Iterator, string()> method_name;
+ qi::rule<Iterator, OSDCapSpec()> capspec;
+ qi::rule<Iterator, string()> pool_name;
+ qi::rule<Iterator, string()> nspace;
+ qi::rule<Iterator, string()> object_prefix;
+ qi::rule<Iterator, OSDCapPoolTag()> pooltag;
+ qi::rule<Iterator, OSDCapMatch()> match;
+ qi::rule<Iterator, string()> profile_name;
+ qi::rule<Iterator, OSDCapProfile()> profile;
+ qi::rule<Iterator, OSDCapGrant()> grant;
+ qi::rule<Iterator, std::vector<OSDCapGrant>()> grants;
+ qi::rule<Iterator, OSDCap()> osdcap;
+};
+
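+// Parses a full capability string into grants using the grammar above.  A
+// minimal usage sketch (the cap string is illustrative):
+//
+//   OSDCap cap;
+//   std::ostringstream err;
+//   if (!cap.parse("allow rwx pool=foo namespace=bar object_prefix rbd_", &err))
+//     std::cerr << err.str() << std::endl;
+//
+// On any parse error the grant list is cleared before returning false.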
+bool OSDCap::parse(const string& str, ostream *err)
+{
+ OSDCapParser<string::const_iterator> g;
+ string::const_iterator iter = str.begin();
+ string::const_iterator end = str.end();
+
+ bool r = qi::phrase_parse(iter, end, g, ascii::space, *this);
+ if (r && iter == end)
+ return true;
+
+ // Make sure no grants are kept after parsing failed!
+ grants.clear();
+
+ if (err)
+ *err << "osd capability parse failed, stopped at '" << std::string(iter, end)
+ << "' of '" << str << "'";
+
+ return false;
+}
diff --git a/src/osd/OSDCap.h b/src/osd/OSDCap.h
new file mode 100644
index 000000000..394b1a726
--- /dev/null
+++ b/src/osd/OSDCap.h
@@ -0,0 +1,261 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ * OSDCaps: Hold the capabilities associated with a single authenticated
+ * user key. These are specified by text strings of the form
+ * "allow r" (which allows reading anything on the OSD)
+ * "allow rwx pool foo" (which allows full access to listed pools)
+ * "allow *" (which allows full access to EVERYTHING)
+ *
+ * The full grammar is documented in the parser in OSDCap.cc.
+ *
+ * The OSD assumes that anyone with * caps is an admin and has full
+ * message permissions. This means that only the monitor and the OSDs
+ * should get *
+ */
+
+#ifndef CEPH_OSDCAP_H
+#define CEPH_OSDCAP_H
+
+#include <ostream>
+using std::ostream;
+
+#include "include/types.h"
+#include "OpRequest.h"
+
+#include <list>
+#include <vector>
+#include <boost/optional.hpp>
+#include <boost/fusion/include/adapt_struct.hpp>
+
+static const __u8 OSD_CAP_R = (1 << 1); // read
+static const __u8 OSD_CAP_W = (1 << 2); // write
+static const __u8 OSD_CAP_CLS_R = (1 << 3); // class read
+static const __u8 OSD_CAP_CLS_W = (1 << 4); // class write
+static const __u8 OSD_CAP_X = (OSD_CAP_CLS_R | OSD_CAP_CLS_W); // execute
+static const __u8 OSD_CAP_ANY = 0xff; // *
+
+struct osd_rwxa_t {
+ __u8 val;
+
+ // cppcheck-suppress noExplicitConstructor
+ osd_rwxa_t(__u8 v = 0) : val(v) {}
+ osd_rwxa_t& operator=(__u8 v) {
+ val = v;
+ return *this;
+ }
+ operator __u8() const {
+ return val;
+ }
+};
+
+ostream& operator<<(ostream& out, const osd_rwxa_t& p);
+
+struct OSDCapSpec {
+ osd_rwxa_t allow;
+ std::string class_name;
+ std::string method_name;
+
+ OSDCapSpec() : allow(0) {}
+ explicit OSDCapSpec(osd_rwxa_t v) : allow(v) {}
+ OSDCapSpec(std::string class_name, std::string method_name)
+ : allow(0), class_name(std::move(class_name)),
+ method_name(std::move(method_name)) {}
+
+ bool allow_all() const {
+ return allow == OSD_CAP_ANY;
+ }
+};
+
+ostream& operator<<(ostream& out, const OSDCapSpec& s);
+
+struct OSDCapPoolNamespace {
+ std::string pool_name;
+ boost::optional<std::string> nspace = boost::none;
+
+ OSDCapPoolNamespace() {
+ }
+ OSDCapPoolNamespace(const std::string& pool_name,
+ const boost::optional<std::string>& nspace = boost::none)
+ : pool_name(pool_name), nspace(nspace) {
+ }
+
+ bool is_match(const std::string& pn, const std::string& ns) const;
+ bool is_match_all() const;
+};
+
+ostream& operator<<(ostream& out, const OSDCapPoolNamespace& pns);
+
+struct OSDCapPoolTag {
+ typedef std::map<std::string, std::map<std::string, std::string> > app_map_t;
+ std::string application;
+ std::string key;
+ std::string value;
+
+ OSDCapPoolTag () {}
+ OSDCapPoolTag(const std::string& application, const std::string& key,
+ const std::string& value) :
+ application(application), key(key), value(value) {}
+
+ bool is_match(const app_map_t& app_map) const;
+ bool is_match_all() const;
+};
+// adapt for parsing with boost::spirit::qi in OSDCapParser
+BOOST_FUSION_ADAPT_STRUCT(OSDCapPoolTag,
+ (std::string, application)
+ (std::string, key)
+ (std::string, value))
+
+ostream& operator<<(ostream& out, const OSDCapPoolTag& pt);
+
+struct OSDCapMatch {
+ typedef std::map<std::string, std::map<std::string, std::string> > app_map_t;
+ OSDCapPoolNamespace pool_namespace;
+ OSDCapPoolTag pool_tag;
+ std::string object_prefix;
+
+ OSDCapMatch() {}
+ explicit OSDCapMatch(const OSDCapPoolTag& pt) : pool_tag(pt) {}
+ explicit OSDCapMatch(const OSDCapPoolNamespace& pns) : pool_namespace(pns) {}
+ OSDCapMatch(const OSDCapPoolNamespace& pns, const std::string& pre)
+ : pool_namespace(pns), object_prefix(pre) {}
+ OSDCapMatch(const std::string& pl, const std::string& pre)
+ : pool_namespace(pl), object_prefix(pre) {}
+ OSDCapMatch(const std::string& pl, const std::string& ns,
+ const std::string& pre)
+ : pool_namespace(pl, ns), object_prefix(pre) {}
+ OSDCapMatch(const std::string& dummy, const std::string& app,
+ const std::string& key, const std::string& val)
+ : pool_tag(app, key, val) {}
+ OSDCapMatch(const std::string& ns, const OSDCapPoolTag& pt)
+ : pool_namespace("", ns), pool_tag(pt) {}
+
+ /**
+ * check if given request parameters match our constraints
+ *
+ * @param pool_name pool name
+ * @param nspace_name namespace name
+ * @param app_map application metadata (tags) set on the pool
+ * @param object object name
+ * @return true if we match, false otherwise
+ */
+ bool is_match(const std::string& pool_name, const std::string& nspace_name,
+ const app_map_t& app_map,
+ const std::string& object) const;
+ bool is_match_all() const;
+};
+
+ostream& operator<<(ostream& out, const OSDCapMatch& m);
+
+
+struct OSDCapProfile {
+ std::string name;
+ OSDCapPoolNamespace pool_namespace;
+
+ OSDCapProfile() {
+ }
+ OSDCapProfile(const std::string& name,
+ const std::string& pool_name,
+ const boost::optional<std::string>& nspace = boost::none)
+ : name(name), pool_namespace(pool_name, nspace) {
+ }
+
+ inline bool is_valid() const {
+ return !name.empty();
+ }
+};
+
+ostream& operator<<(ostream& out, const OSDCapProfile& m);
+
+struct OSDCapGrant {
+ OSDCapMatch match;
+ OSDCapSpec spec;
+ OSDCapProfile profile;
+ std::string network;
+ entity_addr_t network_parsed;
+ unsigned network_prefix = 0;
+ bool network_valid = true;
+
+ // explicit grants that a profile grant expands to; populated as
+ // needed by expand_profile() and cached here.
+ std::list<OSDCapGrant> profile_grants;
+
+ OSDCapGrant() {}
+ OSDCapGrant(const OSDCapMatch& m, const OSDCapSpec& s,
+ boost::optional<std::string> n = {})
+ : match(m), spec(s) {
+ if (n) {
+ set_network(*n);
+ }
+ }
+ explicit OSDCapGrant(const OSDCapProfile& profile,
+ boost::optional<std::string> n = {})
+ : profile(profile) {
+ if (n) {
+ set_network(*n);
+ }
+ expand_profile();
+ }
+
+ void set_network(const std::string& n);
+
+ bool allow_all() const;
+ bool is_capable(const std::string& pool_name, const std::string& ns,
+ const OSDCapPoolTag::app_map_t& application_metadata,
+ const std::string& object, bool op_may_read, bool op_may_write,
+ const std::vector<OpInfo::ClassInfo>& classes,
+ const entity_addr_t& addr,
+ std::vector<bool>* class_allowed) const;
+
+ void expand_profile();
+};
+
+ostream& operator<<(ostream& out, const OSDCapGrant& g);
+
+
+struct OSDCap {
+ std::vector<OSDCapGrant> grants;
+
+ OSDCap() {}
+ explicit OSDCap(std::vector<OSDCapGrant> g) : grants(std::move(g)) {}
+
+ bool allow_all() const;
+ void set_allow_all();
+ bool parse(const std::string& str, ostream *err=NULL);
+
+ /**
+ * check if we are capable of something
+ *
+ * This method actually checks a description of a particular operation against
+ * what the capability has specified. Matching can be constrained by pool,
+ * namespace, pool application tags, object name prefix, object class, and
+ * client network.
+ *
+ * @param pool_name name of the pool we are accessing
+ * @param ns name of the namespace we are accessing
+ * @param application_metadata application metadata (tags) set on the pool
+ * @param object name of the object we are accessing
+ * @param op_may_read whether the operation may need to read
+ * @param op_may_write whether the operation may need to write
+ * @param classes (class-name, rd, wr, allowed-flag) tuples
+ * @param addr client address, checked against any "network" restriction
+ * @return true if the operation is allowed, false otherwise
+ */
+ bool is_capable(const std::string& pool_name, const std::string& ns,
+ const OSDCapPoolTag::app_map_t& application_metadata,
+ const std::string& object, bool op_may_read, bool op_may_write,
+ const std::vector<OpInfo::ClassInfo>& classes,
+ const entity_addr_t& addr) const;
+};
+
+inline std::ostream& operator<<(std::ostream& out, const OSDCap& cap)
+{
+ return out << "osdcap" << cap.grants;
+}
+
+#endif
diff --git a/src/osd/OSDMap.cc b/src/osd/OSDMap.cc
new file mode 100644
index 000000000..6e5caf53a
--- /dev/null
+++ b/src/osd/OSDMap.cc
@@ -0,0 +1,6412 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
+ *
+ * Author: Loic Dachary <loic@dachary.org>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <algorithm>
+#include <optional>
+#include <random>
+
+#include <boost/algorithm/string.hpp>
+
+#include "OSDMap.h"
+#include "common/config.h"
+#include "common/errno.h"
+#include "common/Formatter.h"
+#include "common/TextTable.h"
+#include "include/ceph_features.h"
+#include "include/common_fwd.h"
+#include "include/str_map.h"
+
+#include "common/code_environment.h"
+#include "mon/health_check.h"
+
+#include "crush/CrushTreeDumper.h"
+#include "common/Clock.h"
+#include "mon/PGMap.h"
+
+using std::list;
+using std::make_pair;
+using std::map;
+using std::multimap;
+using std::ostream;
+using std::ostringstream;
+using std::pair;
+using std::set;
+using std::string;
+using std::stringstream;
+using std::unordered_map;
+using std::vector;
+
+using ceph::decode;
+using ceph::encode;
+using ceph::Formatter;
+
+#define dout_subsys ceph_subsys_osd
+
+MEMPOOL_DEFINE_OBJECT_FACTORY(OSDMap, osdmap, osdmap);
+MEMPOOL_DEFINE_OBJECT_FACTORY(OSDMap::Incremental, osdmap_inc, osdmap);
+
+
+// ----------------------------------
+// osd_info_t
+
+void osd_info_t::dump(Formatter *f) const
+{
+ f->dump_int("last_clean_begin", last_clean_begin);
+ f->dump_int("last_clean_end", last_clean_end);
+ f->dump_int("up_from", up_from);
+ f->dump_int("up_thru", up_thru);
+ f->dump_int("down_at", down_at);
+ f->dump_int("lost_at", lost_at);
+}
+
+void osd_info_t::encode(ceph::buffer::list& bl) const
+{
+ using ceph::encode;
+ __u8 struct_v = 1;
+ encode(struct_v, bl);
+ encode(last_clean_begin, bl);
+ encode(last_clean_end, bl);
+ encode(up_from, bl);
+ encode(up_thru, bl);
+ encode(down_at, bl);
+ encode(lost_at, bl);
+}
+
+void osd_info_t::decode(ceph::buffer::list::const_iterator& bl)
+{
+ using ceph::decode;
+ __u8 struct_v;
+ decode(struct_v, bl);
+ decode(last_clean_begin, bl);
+ decode(last_clean_end, bl);
+ decode(up_from, bl);
+ decode(up_thru, bl);
+ decode(down_at, bl);
+ decode(lost_at, bl);
+}
+
+void osd_info_t::generate_test_instances(list<osd_info_t*>& o)
+{
+ o.push_back(new osd_info_t);
+ o.push_back(new osd_info_t);
+ o.back()->last_clean_begin = 1;
+ o.back()->last_clean_end = 2;
+ o.back()->up_from = 30;
+ o.back()->up_thru = 40;
+ o.back()->down_at = 5;
+ o.back()->lost_at = 6;
+}
+
+ostream& operator<<(ostream& out, const osd_info_t& info)
+{
+ out << "up_from " << info.up_from
+ << " up_thru " << info.up_thru
+ << " down_at " << info.down_at
+ << " last_clean_interval [" << info.last_clean_begin << "," << info.last_clean_end << ")";
+ if (info.lost_at)
+ out << " lost_at " << info.lost_at;
+ return out;
+}
+
+// ----------------------------------
+// osd_xinfo_t
+
+void osd_xinfo_t::dump(Formatter *f) const
+{
+ f->dump_stream("down_stamp") << down_stamp;
+ f->dump_float("laggy_probability", laggy_probability);
+ f->dump_int("laggy_interval", laggy_interval);
+ f->dump_int("features", features);
+ f->dump_unsigned("old_weight", old_weight);
+ f->dump_stream("last_purged_snaps_scrub") << last_purged_snaps_scrub;
+ f->dump_int("dead_epoch", dead_epoch);
+}
+
+void osd_xinfo_t::encode(ceph::buffer::list& bl, uint64_t enc_features) const
+{
+ uint8_t v = 4;
+ if (!HAVE_FEATURE(enc_features, SERVER_OCTOPUS)) {
+ v = 3;
+ }
+ ENCODE_START(v, 1, bl);
+ encode(down_stamp, bl);
+ __u32 lp = laggy_probability * float(0xfffffffful);
+ encode(lp, bl);
+ encode(laggy_interval, bl);
+ encode(features, bl);
+ encode(old_weight, bl);
+ if (v >= 4) {
+ encode(last_purged_snaps_scrub, bl);
+ encode(dead_epoch, bl);
+ }
+ ENCODE_FINISH(bl);
+}
+
+void osd_xinfo_t::decode(ceph::buffer::list::const_iterator& bl)
+{
+ DECODE_START(4, bl);
+ decode(down_stamp, bl);
+ __u32 lp;
+ decode(lp, bl);
+ laggy_probability = (float)lp / (float)0xffffffff;
+ decode(laggy_interval, bl);
+ if (struct_v >= 2)
+ decode(features, bl);
+ else
+ features = 0;
+ if (struct_v >= 3)
+ decode(old_weight, bl);
+ else
+ old_weight = 0;
+ if (struct_v >= 4) {
+ decode(last_purged_snaps_scrub, bl);
+ decode(dead_epoch, bl);
+ } else {
+ dead_epoch = 0;
+ }
+ DECODE_FINISH(bl);
+}
+
+void osd_xinfo_t::generate_test_instances(list<osd_xinfo_t*>& o)
+{
+ o.push_back(new osd_xinfo_t);
+ o.push_back(new osd_xinfo_t);
+ o.back()->down_stamp = utime_t(2, 3);
+ o.back()->laggy_probability = .123;
+ o.back()->laggy_interval = 123456;
+ o.back()->old_weight = 0x7fff;
+}
+
+ostream& operator<<(ostream& out, const osd_xinfo_t& xi)
+{
+ return out << "down_stamp " << xi.down_stamp
+ << " laggy_probability " << xi.laggy_probability
+ << " laggy_interval " << xi.laggy_interval
+ << " old_weight " << xi.old_weight
+ << " last_purged_snaps_scrub " << xi.last_purged_snaps_scrub
+ << " dead_epoch " << xi.dead_epoch;
+}
+
+// ----------------------------------
+// OSDMap::Incremental
+
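+// Net change in "out" OSDs introduced by this incremental relative to the
+// previous map: +1 for each OSD newly marked out, -1 for each marked back in.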
+int OSDMap::Incremental::get_net_marked_out(const OSDMap *previous) const
+{
+ int n = 0;
+ for (auto &weight : new_weight) {
+ if (weight.second == CEPH_OSD_OUT && !previous->is_out(weight.first))
+ n++; // marked out
+ else if (weight.second != CEPH_OSD_OUT && previous->is_out(weight.first))
+ n--; // marked in
+ }
+ return n;
+}
+
+int OSDMap::Incremental::get_net_marked_down(const OSDMap *previous) const
+{
+ int n = 0;
+ for (auto &state : new_state) {
+ if (state.second & CEPH_OSD_UP) {
+ if (previous->is_up(state.first))
+ n++; // marked down
+ else
+ n--; // marked up
+ }
+ }
+ return n;
+}
+
+int OSDMap::Incremental::identify_osd(uuid_d u) const
+{
+ for (auto &uuid : new_uuid)
+ if (uuid.second == u)
+ return uuid.first;
+ return -1;
+}
+
+int OSDMap::Incremental::propagate_base_properties_to_tiers(CephContext *cct,
+ const OSDMap& osdmap)
+{
+ ceph_assert(epoch == osdmap.get_epoch() + 1);
+
+ for (auto &new_pool : new_pools) {
+ if (!new_pool.second.tiers.empty()) {
+ pg_pool_t& base = new_pool.second;
+
+ auto new_rem_it = new_removed_snaps.find(new_pool.first);
+
+ for (const auto &tier_pool : base.tiers) {
+ const auto &r = new_pools.find(tier_pool);
+ pg_pool_t *tier = 0;
+ if (r == new_pools.end()) {
+ const pg_pool_t *orig = osdmap.get_pg_pool(tier_pool);
+ if (!orig) {
+ lderr(cct) << __func__ << " no pool " << tier_pool << dendl;
+ return -EIO;
+ }
+ tier = get_new_pool(tier_pool, orig);
+ } else {
+ tier = &r->second;
+ }
+ if (tier->tier_of != new_pool.first) {
+ lderr(cct) << __func__ << " " << r->first << " tier_of != " << new_pool.first << dendl;
+ return -EIO;
+ }
+
+ ldout(cct, 10) << __func__ << " from " << new_pool.first << " to "
+ << tier_pool << dendl;
+ tier->snap_seq = base.snap_seq;
+ tier->snap_epoch = base.snap_epoch;
+ tier->snaps = base.snaps;
+ tier->removed_snaps = base.removed_snaps;
+ tier->flags |= base.flags & (pg_pool_t::FLAG_SELFMANAGED_SNAPS|
+ pg_pool_t::FLAG_POOL_SNAPS);
+
+ if (new_rem_it != new_removed_snaps.end()) {
+ new_removed_snaps[tier_pool] = new_rem_it->second;
+ }
+
+ tier->application_metadata = base.application_metadata;
+ }
+ }
+ }
+ return 0;
+}
+
+// ----------------------------------
+// OSDMap
+
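+// A CRUSH subtree is considered down only when every OSD beneath it is down.
+// Buckets found to be down may be memoized in down_cache to avoid
+// re-walking them on later calls.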
+bool OSDMap::subtree_is_down(int id, set<int> *down_cache) const
+{
+ if (id >= 0)
+ return is_down(id);
+
+ if (down_cache &&
+ down_cache->count(id)) {
+ return true;
+ }
+
+ list<int> children;
+ crush->get_children(id, &children);
+ for (const auto &child : children) {
+ if (!subtree_is_down(child, down_cache)) {
+ return false;
+ }
+ }
+ if (down_cache) {
+ down_cache->insert(id);
+ }
+ return true;
+}
+
+bool OSDMap::containing_subtree_is_down(CephContext *cct, int id, int subtree_type, set<int> *down_cache) const
+{
+ // use a stack-local down_cache if we didn't get one from the
+ // caller. then at least this particular call will avoid duplicated
+ // work.
+ set<int> local_down_cache;
+ if (!down_cache) {
+ down_cache = &local_down_cache;
+ }
+
+ int current = id;
+ while (true) {
+ int type;
+ if (current >= 0) {
+ type = 0;
+ } else {
+ type = crush->get_bucket_type(current);
+ }
+ ceph_assert(type >= 0);
+
+ if (!subtree_is_down(current, down_cache)) {
+ ldout(cct, 30) << "containing_subtree_is_down(" << id << ") = false" << dendl;
+ return false;
+ }
+
+ // is this a big enough subtree to be marked as down?
+ if (type >= subtree_type) {
+ ldout(cct, 30) << "containing_subtree_is_down(" << id << ") = true ... " << type << " >= " << subtree_type << dendl;
+ return true;
+ }
+
+ int r = crush->get_immediate_parent_id(current, &current);
+ if (r < 0) {
+ return false;
+ }
+ }
+}
+
+bool OSDMap::subtree_type_is_down(
+ CephContext *cct,
+ int id,
+ int subtree_type,
+ set<int> *down_in_osds,
+ set<int> *up_in_osds,
+ set<int> *subtree_up,
+ unordered_map<int, set<int> > *subtree_type_down) const
+{
+ if (id >= 0) {
+ bool is_down_ret = is_down(id);
+ if (!is_out(id)) {
+ if (is_down_ret) {
+ down_in_osds->insert(id);
+ } else {
+ up_in_osds->insert(id);
+ }
+ }
+ return is_down_ret;
+ }
+
+ if (subtree_type_down &&
+ (*subtree_type_down)[subtree_type].count(id)) {
+ return true;
+ }
+
+ list<int> children;
+ crush->get_children(id, &children);
+ for (const auto &child : children) {
+ if (!subtree_type_is_down(
+ cct, child, crush->get_bucket_type(child),
+ down_in_osds, up_in_osds, subtree_up, subtree_type_down)) {
+ subtree_up->insert(id);
+ return false;
+ }
+ }
+ if (subtree_type_down) {
+ (*subtree_type_down)[subtree_type].insert(id);
+ }
+ return true;
+}
+
+void OSDMap::Incremental::encode_client_old(ceph::buffer::list& bl) const
+{
+ using ceph::encode;
+ __u16 v = 5;
+ encode(v, bl);
+ encode(fsid, bl);
+ encode(epoch, bl);
+ encode(modified, bl);
+ int32_t new_t = new_pool_max;
+ encode(new_t, bl);
+ encode(new_flags, bl);
+ encode(fullmap, bl);
+ encode(crush, bl);
+
+ encode(new_max_osd, bl);
+ // for encode(new_pools, bl);
+ __u32 n = new_pools.size();
+ encode(n, bl);
+ for (const auto &new_pool : new_pools) {
+ n = new_pool.first;
+ encode(n, bl);
+ encode(new_pool.second, bl, 0);
+ }
+ // for encode(new_pool_names, bl);
+ n = new_pool_names.size();
+ encode(n, bl);
+
+ for (const auto &new_pool_name : new_pool_names) {
+ n = new_pool_name.first;
+ encode(n, bl);
+ encode(new_pool_name.second, bl);
+ }
+ // for encode(old_pools, bl);
+ n = old_pools.size();
+ encode(n, bl);
+ for (auto &old_pool : old_pools) {
+ n = old_pool;
+ encode(n, bl);
+ }
+ encode(new_up_client, bl, 0);
+ {
+ // legacy is map<int32_t,uint8_t>
+ map<int32_t, uint8_t> os;
+ for (auto p : new_state) {
+ // new_state may only include some new flags (e.g., CEPH_OSD_NOOUT)
+ // that an old client could not understand.
+ // skip those!
+ uint8_t s = p.second;
+ if (p.second != 0 && s == 0)
+ continue;
+ os[p.first] = s;
+ }
+ uint32_t n = os.size();
+ encode(n, bl);
+ for (auto p : os) {
+ encode(p.first, bl);
+ encode(p.second, bl);
+ }
+ }
+ encode(new_weight, bl);
+ // for encode(new_pg_temp, bl);
+ n = new_pg_temp.size();
+ encode(n, bl);
+
+ for (const auto &pg_temp : new_pg_temp) {
+ old_pg_t opg = pg_temp.first.get_old_pg();
+ encode(opg, bl);
+ encode(pg_temp.second, bl);
+ }
+}
+
+void OSDMap::Incremental::encode_classic(ceph::buffer::list& bl, uint64_t features) const
+{
+ using ceph::encode;
+ if ((features & CEPH_FEATURE_PGID64) == 0) {
+ encode_client_old(bl);
+ return;
+ }
+
+ // base
+ __u16 v = 6;
+ encode(v, bl);
+ encode(fsid, bl);
+ encode(epoch, bl);
+ encode(modified, bl);
+ encode(new_pool_max, bl);
+ encode(new_flags, bl);
+ encode(fullmap, bl);
+ encode(crush, bl);
+
+ encode(new_max_osd, bl);
+ encode(new_pools, bl, features);
+ encode(new_pool_names, bl);
+ encode(old_pools, bl);
+ encode(new_up_client, bl, features);
+ {
+ map<int32_t, uint8_t> os;
+ for (auto p : new_state) {
+ // new_state may only include some new flags (e.g., CEPH_OSD_NOOUT)
+ // that an old client could not understand.
+ // skip those!
+ uint8_t s = p.second;
+ if (p.second != 0 && s == 0)
+ continue;
+ os[p.first] = s;
+ }
+ uint32_t n = os.size();
+ encode(n, bl);
+ for (auto p : os) {
+ encode(p.first, bl);
+ encode(p.second, bl);
+ }
+ }
+ encode(new_weight, bl);
+ encode(new_pg_temp, bl);
+
+ // extended
+ __u16 ev = 10;
+ encode(ev, bl);
+ encode(new_hb_back_up, bl, features);
+ encode(new_up_thru, bl);
+ encode(new_last_clean_interval, bl);
+ encode(new_lost, bl);
+ encode(new_blocklist, bl, features);
+ encode(old_blocklist, bl, features);
+ encode(new_up_cluster, bl, features);
+ encode(cluster_snapshot, bl);
+ encode(new_uuid, bl);
+ encode(new_xinfo, bl, features);
+ encode(new_hb_front_up, bl, features);
+}
+
+template<class T>
+static void encode_addrvec_map_as_addr(const T& m, ceph::buffer::list& bl, uint64_t f)
+{
+ uint32_t n = m.size();
+ encode(n, bl);
+ for (auto& i : m) {
+ encode(i.first, bl);
+ encode(i.second.legacy_addr(), bl, f);
+ }
+}
+
+template<class T>
+static void encode_addrvec_pvec_as_addr(const T& m, ceph::buffer::list& bl, uint64_t f)
+{
+ uint32_t n = m.size();
+ encode(n, bl);
+ for (auto& i : m) {
+ if (i) {
+ encode(i->legacy_addr(), bl, f);
+ } else {
+ encode(entity_addr_t(), bl, f);
+ }
+ }
+}
+
+/* for a description of osdmap incremental versions, and when they were
+ * introduced, please refer to
+ * doc/dev/osd_internals/osdmap_versions.txt
+ */
+void OSDMap::Incremental::encode(ceph::buffer::list& bl, uint64_t features) const
+{
+ using ceph::encode;
+ if ((features & CEPH_FEATURE_OSDMAP_ENC) == 0) {
+ encode_classic(bl, features);
+ return;
+ }
+
+ // only a select set of callers should *ever* be encoding new
+ // OSDMaps. others should be passing around the canonical encoded
+ // buffers from on high. select out those callers by passing in an
+ // "impossible" feature bit.
+ ceph_assert(features & CEPH_FEATURE_RESERVED);
+ features &= ~CEPH_FEATURE_RESERVED;
+
+ size_t start_offset = bl.length();
+ size_t tail_offset;
+ size_t crc_offset;
+ std::optional<ceph::buffer::list::contiguous_filler> crc_filler;
+
+ // meta-encoding: how we include client-used and osd-specific data
+ ENCODE_START(8, 7, bl);
+
+ {
+ uint8_t v = 8;
+ if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
+ v = 3;
+ } else if (!HAVE_FEATURE(features, SERVER_MIMIC)) {
+ v = 5;
+ } else if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {
+ v = 6;
+ }
+ ENCODE_START(v, 1, bl); // client-usable data
+ encode(fsid, bl);
+ encode(epoch, bl);
+ encode(modified, bl);
+ encode(new_pool_max, bl);
+ encode(new_flags, bl);
+ encode(fullmap, bl);
+ encode(crush, bl);
+
+ encode(new_max_osd, bl);
+ encode(new_pools, bl, features);
+ encode(new_pool_names, bl);
+ encode(old_pools, bl);
+ if (v >= 7) {
+ encode(new_up_client, bl, features);
+ } else {
+ encode_addrvec_map_as_addr(new_up_client, bl, features);
+ }
+ if (v >= 5) {
+ encode(new_state, bl);
+ } else {
+ map<int32_t, uint8_t> os;
+ for (auto p : new_state) {
+ // new_state may only include some new flags (e.g., CEPH_OSD_NOOUT)
+ // that an old client could not understand.
+ // skip those!
+ uint8_t s = p.second;
+ if (p.second != 0 && s == 0)
+ continue;
+ os[p.first] = s;
+ }
+ uint32_t n = os.size();
+ encode(n, bl);
+ for (auto p : os) {
+ encode(p.first, bl);
+ encode(p.second, bl);
+ }
+ }
+ encode(new_weight, bl);
+ encode(new_pg_temp, bl);
+ encode(new_primary_temp, bl);
+ encode(new_primary_affinity, bl);
+ encode(new_erasure_code_profiles, bl);
+ encode(old_erasure_code_profiles, bl);
+ if (v >= 4) {
+ encode(new_pg_upmap, bl);
+ encode(old_pg_upmap, bl);
+ encode(new_pg_upmap_items, bl);
+ encode(old_pg_upmap_items, bl);
+ }
+ if (v >= 6) {
+ encode(new_removed_snaps, bl);
+ encode(new_purged_snaps, bl);
+ }
+ if (v >= 8) {
+ encode(new_last_up_change, bl);
+ encode(new_last_in_change, bl);
+ }
+ ENCODE_FINISH(bl); // client-usable data
+ }
+
+ {
+ uint8_t target_v = 9; // if bumping this, be aware of range_blocklist 11
+ if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
+ target_v = 2;
+ } else if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {
+ target_v = 6;
+ }
+ if (change_stretch_mode) {
+ target_v = std::max((uint8_t)10, target_v);
+ }
+ if (!new_range_blocklist.empty() ||
+ !old_range_blocklist.empty()) {
+ target_v = std::max((uint8_t)11, target_v);
+ }
+ ENCODE_START(target_v, 1, bl); // extended, osd-only data
+ if (target_v < 7) {
+ encode_addrvec_map_as_addr(new_hb_back_up, bl, features);
+ } else {
+ encode(new_hb_back_up, bl, features);
+ }
+ encode(new_up_thru, bl);
+ encode(new_last_clean_interval, bl);
+ encode(new_lost, bl);
+ encode(new_blocklist, bl, features);
+ encode(old_blocklist, bl, features);
+ if (target_v < 7) {
+ encode_addrvec_map_as_addr(new_up_cluster, bl, features);
+ } else {
+ encode(new_up_cluster, bl, features);
+ }
+ encode(cluster_snapshot, bl);
+ encode(new_uuid, bl);
+ encode(new_xinfo, bl, features);
+ if (target_v < 7) {
+ encode_addrvec_map_as_addr(new_hb_front_up, bl, features);
+ } else {
+ encode(new_hb_front_up, bl, features);
+ }
+ encode(features, bl); // NOTE: features arg, not the member
+ if (target_v >= 3) {
+ encode(new_nearfull_ratio, bl);
+ encode(new_full_ratio, bl);
+ encode(new_backfillfull_ratio, bl);
+ }
+ // 5 was string-based new_require_min_compat_client
+ if (target_v >= 6) {
+ encode(new_require_min_compat_client, bl);
+ encode(new_require_osd_release, bl);
+ }
+ if (target_v >= 8) {
+ encode(new_crush_node_flags, bl);
+ }
+ if (target_v >= 9) {
+ encode(new_device_class_flags, bl);
+ }
+ if (target_v >= 10) {
+ encode(change_stretch_mode, bl);
+ encode(new_stretch_bucket_count, bl);
+ encode(new_degraded_stretch_mode, bl);
+ encode(new_recovering_stretch_mode, bl);
+ encode(new_stretch_mode_bucket, bl);
+ encode(stretch_mode_enabled, bl);
+ }
+ if (target_v >= 11) {
+ encode(new_range_blocklist, bl, features);
+ encode(old_range_blocklist, bl, features);
+ }
+ ENCODE_FINISH(bl); // osd-only data
+ }
+
+ crc_offset = bl.length();
+ crc_filler = bl.append_hole(sizeof(uint32_t));
+ tail_offset = bl.length();
+
+ encode(full_crc, bl);
+
+ ENCODE_FINISH(bl); // meta-encoding wrapper
+
+ // fill in crc
+ ceph::buffer::list front;
+ front.substr_of(bl, start_offset, crc_offset - start_offset);
+ inc_crc = front.crc32c(-1);
+ ceph::buffer::list tail;
+ tail.substr_of(bl, tail_offset, bl.length() - tail_offset);
+ inc_crc = tail.crc32c(inc_crc);
+ ceph_le32 crc_le;
+ crc_le = inc_crc;
+ crc_filler->copy_in(4u, (char*)&crc_le);
+ have_crc = true;
+}
+
+void OSDMap::Incremental::decode_classic(ceph::buffer::list::const_iterator &p)
+{
+ using ceph::decode;
+ __u32 n, t;
+ // base
+ __u16 v;
+ decode(v, p);
+ decode(fsid, p);
+ decode(epoch, p);
+ decode(modified, p);
+ if (v == 4 || v == 5) {
+ decode(n, p);
+ new_pool_max = n;
+ } else if (v >= 6)
+ decode(new_pool_max, p);
+ decode(new_flags, p);
+ decode(fullmap, p);
+ decode(crush, p);
+
+ decode(new_max_osd, p);
+ if (v < 6) {
+ new_pools.clear();
+ decode(n, p);
+ while (n--) {
+ decode(t, p);
+ decode(new_pools[t], p);
+ }
+ } else {
+ decode(new_pools, p);
+ }
+ if (v == 5) {
+ new_pool_names.clear();
+ decode(n, p);
+ while (n--) {
+ decode(t, p);
+ decode(new_pool_names[t], p);
+ }
+ } else if (v >= 6) {
+ decode(new_pool_names, p);
+ }
+ if (v < 6) {
+ old_pools.clear();
+ decode(n, p);
+ while (n--) {
+ decode(t, p);
+ old_pools.insert(t);
+ }
+ } else {
+ decode(old_pools, p);
+ }
+ decode(new_up_client, p);
+ {
+ map<int32_t,uint8_t> ns;
+ decode(ns, p);
+ for (auto q : ns) {
+ new_state[q.first] = q.second;
+ }
+ }
+ decode(new_weight, p);
+
+ if (v < 6) {
+ new_pg_temp.clear();
+ decode(n, p);
+ while (n--) {
+ old_pg_t opg;
+ ceph::decode_raw(opg, p);
+ decode(new_pg_temp[pg_t(opg)], p);
+ }
+ } else {
+ decode(new_pg_temp, p);
+ }
+
+ // decode short map, too.
+ if (v == 5 && p.end())
+ return;
+
+ // extended
+ __u16 ev = 0;
+ if (v >= 5)
+ decode(ev, p);
+ decode(new_hb_back_up, p);
+ if (v < 5)
+ decode(new_pool_names, p);
+ decode(new_up_thru, p);
+ decode(new_last_clean_interval, p);
+ decode(new_lost, p);
+ decode(new_blocklist, p);
+ decode(old_blocklist, p);
+ if (ev >= 6)
+ decode(new_up_cluster, p);
+ if (ev >= 7)
+ decode(cluster_snapshot, p);
+ if (ev >= 8)
+ decode(new_uuid, p);
+ if (ev >= 9)
+ decode(new_xinfo, p);
+ if (ev >= 10)
+ decode(new_hb_front_up, p);
+}
+
+/* for a description of osdmap incremental versions, and when they were
+ * introduced, please refer to
+ * doc/dev/osd_internals/osdmap_versions.txt
+ */
+void OSDMap::Incremental::decode(ceph::buffer::list::const_iterator& bl)
+{
+ using ceph::decode;
+ /**
+ * Older encodings of the Incremental had a single struct_v which
+ * covered the whole encoding, and was prior to our modern
+ * stuff which includes a compatv and a size. So if we see
+ * a struct_v < 7, we must rewind to the beginning and use our
+ * classic decoder.
+ */
+ size_t start_offset = bl.get_off();
+ size_t tail_offset = 0;
+ ceph::buffer::list crc_front, crc_tail;
+
+ DECODE_START_LEGACY_COMPAT_LEN(8, 7, 7, bl); // wrapper
+ if (struct_v < 7) {
+ bl.seek(start_offset);
+ decode_classic(bl);
+ encode_features = 0;
+ if (struct_v >= 6)
+ encode_features = CEPH_FEATURE_PGID64;
+ else
+ encode_features = 0;
+ return;
+ }
+ {
+ DECODE_START(8, bl); // client-usable data
+ decode(fsid, bl);
+ decode(epoch, bl);
+ decode(modified, bl);
+ decode(new_pool_max, bl);
+ decode(new_flags, bl);
+ decode(fullmap, bl);
+ decode(crush, bl);
+
+ decode(new_max_osd, bl);
+ decode(new_pools, bl);
+ decode(new_pool_names, bl);
+ decode(old_pools, bl);
+ decode(new_up_client, bl);
+ if (struct_v >= 5) {
+ decode(new_state, bl);
+ } else {
+ map<int32_t,uint8_t> ns;
+ decode(ns, bl);
+ for (auto q : ns) {
+ new_state[q.first] = q.second;
+ }
+ }
+ decode(new_weight, bl);
+ decode(new_pg_temp, bl);
+ decode(new_primary_temp, bl);
+ if (struct_v >= 2)
+ decode(new_primary_affinity, bl);
+ else
+ new_primary_affinity.clear();
+ if (struct_v >= 3) {
+ decode(new_erasure_code_profiles, bl);
+ decode(old_erasure_code_profiles, bl);
+ } else {
+ new_erasure_code_profiles.clear();
+ old_erasure_code_profiles.clear();
+ }
+ if (struct_v >= 4) {
+ decode(new_pg_upmap, bl);
+ decode(old_pg_upmap, bl);
+ decode(new_pg_upmap_items, bl);
+ decode(old_pg_upmap_items, bl);
+ }
+ if (struct_v >= 6) {
+ decode(new_removed_snaps, bl);
+ decode(new_purged_snaps, bl);
+ }
+ if (struct_v >= 8) {
+ decode(new_last_up_change, bl);
+ decode(new_last_in_change, bl);
+ }
+ DECODE_FINISH(bl); // client-usable data
+ }
+
+ {
+ DECODE_START(10, bl); // extended, osd-only data
+ decode(new_hb_back_up, bl);
+ decode(new_up_thru, bl);
+ decode(new_last_clean_interval, bl);
+ decode(new_lost, bl);
+ decode(new_blocklist, bl);
+ decode(old_blocklist, bl);
+ decode(new_up_cluster, bl);
+ decode(cluster_snapshot, bl);
+ decode(new_uuid, bl);
+ decode(new_xinfo, bl);
+ decode(new_hb_front_up, bl);
+ if (struct_v >= 2)
+ decode(encode_features, bl);
+ else
+ encode_features = CEPH_FEATURE_PGID64 | CEPH_FEATURE_OSDMAP_ENC;
+ if (struct_v >= 3) {
+ decode(new_nearfull_ratio, bl);
+ decode(new_full_ratio, bl);
+ } else {
+ new_nearfull_ratio = -1;
+ new_full_ratio = -1;
+ }
+ if (struct_v >= 4) {
+ decode(new_backfillfull_ratio, bl);
+ } else {
+ new_backfillfull_ratio = -1;
+ }
+ if (struct_v == 5) {
+ string r;
+ decode(r, bl);
+ if (r.length()) {
+ new_require_min_compat_client = ceph_release_from_name(r);
+ }
+ }
+ if (struct_v >= 6) {
+ decode(new_require_min_compat_client, bl);
+ decode(new_require_osd_release, bl);
+ } else {
+ if (new_flags >= 0 && (new_flags & CEPH_OSDMAP_REQUIRE_LUMINOUS)) {
+ // only for compat with post-kraken pre-luminous test clusters
+ new_require_osd_release = ceph_release_t::luminous;
+ new_flags &= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS);
+ } else if (new_flags >= 0 && (new_flags & CEPH_OSDMAP_REQUIRE_KRAKEN)) {
+ new_require_osd_release = ceph_release_t::kraken;
+ } else if (new_flags >= 0 && (new_flags & CEPH_OSDMAP_REQUIRE_JEWEL)) {
+ new_require_osd_release = ceph_release_t::jewel;
+ } else {
+ new_require_osd_release = ceph_release_t::unknown;
+ }
+ }
+ if (struct_v >= 8) {
+ decode(new_crush_node_flags, bl);
+ }
+ if (struct_v >= 9) {
+ decode(new_device_class_flags, bl);
+ }
+ if (struct_v >= 10) {
+ decode(change_stretch_mode, bl);
+ decode(new_stretch_bucket_count, bl);
+ decode(new_degraded_stretch_mode, bl);
+ decode(new_recovering_stretch_mode, bl);
+ decode(new_stretch_mode_bucket, bl);
+ decode(stretch_mode_enabled, bl);
+ }
+ if (struct_v >= 11) {
+ decode(new_range_blocklist, bl);
+ decode(old_range_blocklist, bl);
+ }
+ DECODE_FINISH(bl); // osd-only data
+ }
+
+ if (struct_v >= 8) {
+ have_crc = true;
+ crc_front.substr_of(bl.get_bl(), start_offset, bl.get_off() - start_offset);
+ decode(inc_crc, bl);
+ tail_offset = bl.get_off();
+ decode(full_crc, bl);
+ } else {
+ have_crc = false;
+ full_crc = 0;
+ inc_crc = 0;
+ }
+
+ DECODE_FINISH(bl); // wrapper
+
+ if (have_crc) {
+ // verify crc
+ uint32_t actual = crc_front.crc32c(-1);
+ if (tail_offset < bl.get_off()) {
+ ceph::buffer::list tail;
+ tail.substr_of(bl.get_bl(), tail_offset, bl.get_off() - tail_offset);
+ actual = tail.crc32c(actual);
+ }
+ if (inc_crc != actual) {
+ ostringstream ss;
+ ss << "bad crc, actual " << actual << " != expected " << inc_crc;
+ string s = ss.str();
+ throw ceph::buffer::malformed_input(s.c_str());
+ }
+ }
+}
+
+void OSDMap::Incremental::dump(Formatter *f) const
+{
+ f->dump_int("epoch", epoch);
+ f->dump_stream("fsid") << fsid;
+ f->dump_stream("modified") << modified;
+ f->dump_stream("new_last_up_change") << new_last_up_change;
+ f->dump_stream("new_last_in_change") << new_last_in_change;
+ f->dump_int("new_pool_max", new_pool_max);
+ f->dump_int("new_flags", new_flags);
+ f->dump_float("new_full_ratio", new_full_ratio);
+ f->dump_float("new_nearfull_ratio", new_nearfull_ratio);
+ f->dump_float("new_backfillfull_ratio", new_backfillfull_ratio);
+ f->dump_int("new_require_min_compat_client", to_integer<int>(new_require_min_compat_client));
+ f->dump_int("new_require_osd_release", to_integer<int>(new_require_osd_release));
+
+ if (fullmap.length()) {
+ f->open_object_section("full_map");
+ OSDMap full;
+ ceph::buffer::list fbl = fullmap; // kludge around constness.
+ auto p = fbl.cbegin();
+ full.decode(p);
+ full.dump(f);
+ f->close_section();
+ }
+ if (crush.length()) {
+ f->open_object_section("crush");
+ CrushWrapper c;
+ ceph::buffer::list tbl = crush; // kludge around constness.
+ auto p = tbl.cbegin();
+ c.decode(p);
+ c.dump(f);
+ f->close_section();
+ }
+
+ f->dump_int("new_max_osd", new_max_osd);
+
+ f->open_array_section("new_pools");
+
+ for (const auto &new_pool : new_pools) {
+ f->open_object_section("pool");
+ f->dump_int("pool", new_pool.first);
+ new_pool.second.dump(f);
+ f->close_section();
+ }
+ f->close_section();
+ f->open_array_section("new_pool_names");
+
+ for (const auto &new_pool_name : new_pool_names) {
+ f->open_object_section("pool_name");
+ f->dump_int("pool", new_pool_name.first);
+ f->dump_string("name", new_pool_name.second);
+ f->close_section();
+ }
+ f->close_section();
+ f->open_array_section("old_pools");
+
+ for (const auto &old_pool : old_pools)
+ f->dump_int("pool", old_pool);
+ f->close_section();
+
+ f->open_array_section("new_up_osds");
+
+ for (const auto &upclient : new_up_client) {
+ f->open_object_section("osd");
+ f->dump_int("osd", upclient.first);
+ f->dump_stream("public_addr") << upclient.second.legacy_addr();
+ f->dump_object("public_addrs", upclient.second);
+ if (auto p = new_up_cluster.find(upclient.first);
+ p != new_up_cluster.end()) {
+ f->dump_stream("cluster_addr") << p->second.legacy_addr();
+ f->dump_object("cluster_addrs", p->second);
+ }
+ if (auto p = new_hb_back_up.find(upclient.first);
+ p != new_hb_back_up.end()) {
+ f->dump_object("heartbeat_back_addrs", p->second);
+ }
+ if (auto p = new_hb_front_up.find(upclient.first);
+ p != new_hb_front_up.end()) {
+ f->dump_object("heartbeat_front_addrs", p->second);
+ }
+ f->close_section();
+ }
+ f->close_section();
+
+ f->open_array_section("new_weight");
+
+ for (const auto &weight : new_weight) {
+ f->open_object_section("osd");
+ f->dump_int("osd", weight.first);
+ f->dump_int("weight", weight.second);
+ f->close_section();
+ }
+ f->close_section();
+
+ f->open_array_section("osd_state_xor");
+ for (const auto &ns : new_state) {
+ f->open_object_section("osd");
+ f->dump_int("osd", ns.first);
+ set<string> st;
+ calc_state_set(new_state.find(ns.first)->second, st);
+ f->open_array_section("state_xor");
+ for (auto &state : st)
+ f->dump_string("state", state);
+ f->close_section();
+ f->close_section();
+ }
+ f->close_section();
+
+ f->open_array_section("new_pg_temp");
+
+ for (const auto &pg_temp : new_pg_temp) {
+ f->open_object_section("pg");
+ f->dump_stream("pgid") << pg_temp.first;
+ f->open_array_section("osds");
+
+ for (const auto &osd : pg_temp.second)
+ f->dump_int("osd", osd);
+ f->close_section();
+ f->close_section();
+ }
+ f->close_section();
+
+ f->open_array_section("primary_temp");
+
+ for (const auto &primary_temp : new_primary_temp) {
+ f->dump_stream("pgid") << primary_temp.first;
+ f->dump_int("osd", primary_temp.second);
+ }
+ f->close_section(); // primary_temp
+
+ f->open_array_section("new_pg_upmap");
+ for (auto& i : new_pg_upmap) {
+ f->open_object_section("mapping");
+ f->dump_stream("pgid") << i.first;
+ f->open_array_section("osds");
+ for (auto osd : i.second) {
+ f->dump_int("osd", osd);
+ }
+ f->close_section();
+ f->close_section();
+ }
+ f->close_section();
+ f->open_array_section("old_pg_upmap");
+ for (auto& i : old_pg_upmap) {
+ f->dump_stream("pgid") << i;
+ }
+ f->close_section();
+
+ f->open_array_section("new_pg_upmap_items");
+ for (auto& i : new_pg_upmap_items) {
+ f->open_object_section("mapping");
+ f->dump_stream("pgid") << i.first;
+ f->open_array_section("mappings");
+ for (auto& p : i.second) {
+ f->open_object_section("mapping");
+ f->dump_int("from", p.first);
+ f->dump_int("to", p.second);
+ f->close_section();
+ }
+ f->close_section();
+ f->close_section();
+ }
+ f->close_section();
+ f->open_array_section("old_pg_upmap_items");
+ for (auto& i : old_pg_upmap_items) {
+ f->dump_stream("pgid") << i;
+ }
+ f->close_section();
+
+ f->open_array_section("new_up_thru");
+
+ for (const auto &up_thru : new_up_thru) {
+ f->open_object_section("osd");
+ f->dump_int("osd", up_thru.first);
+ f->dump_int("up_thru", up_thru.second);
+ f->close_section();
+ }
+ f->close_section();
+
+ f->open_array_section("new_lost");
+
+ for (const auto &lost : new_lost) {
+ f->open_object_section("osd");
+ f->dump_int("osd", lost.first);
+ f->dump_int("epoch_lost", lost.second);
+ f->close_section();
+ }
+ f->close_section();
+
+ f->open_array_section("new_last_clean_interval");
+
+ for (const auto &last_clean_interval : new_last_clean_interval) {
+ f->open_object_section("osd");
+ f->dump_int("osd", last_clean_interval.first);
+ f->dump_int("first", last_clean_interval.second.first);
+ f->dump_int("last", last_clean_interval.second.second);
+ f->close_section();
+ }
+ f->close_section();
+
+ f->open_array_section("new_blocklist");
+ for (const auto &blist : new_blocklist) {
+ stringstream ss;
+ ss << blist.first;
+ f->dump_stream(ss.str().c_str()) << blist.second;
+ }
+ f->close_section();
+ f->open_array_section("old_blocklist");
+ for (const auto &blist : old_blocklist)
+ f->dump_stream("addr") << blist;
+ f->close_section();
+ f->open_array_section("new_range_blocklist");
+ for (const auto &blist : new_range_blocklist) {
+ stringstream ss;
+ ss << blist.first;
+ f->dump_stream(ss.str().c_str()) << blist.second;
+ }
+ f->close_section();
+ f->open_array_section("old_range_blocklist");
+ for (const auto &blist : old_range_blocklist)
+ f->dump_stream("addr") << blist;
+ f->close_section();
+
+ f->open_array_section("new_xinfo");
+ for (const auto &xinfo : new_xinfo) {
+ f->open_object_section("xinfo");
+ f->dump_int("osd", xinfo.first);
+ xinfo.second.dump(f);
+ f->close_section();
+ }
+ f->close_section();
+
+ if (cluster_snapshot.size())
+ f->dump_string("cluster_snapshot", cluster_snapshot);
+
+ f->open_array_section("new_uuid");
+ for (const auto &uuid : new_uuid) {
+ f->open_object_section("osd");
+ f->dump_int("osd", uuid.first);
+ f->dump_stream("uuid") << uuid.second;
+ f->close_section();
+ }
+ f->close_section();
+
+ OSDMap::dump_erasure_code_profiles(new_erasure_code_profiles, f);
+ f->open_array_section("old_erasure_code_profiles");
+ for (const auto &erasure_code_profile : old_erasure_code_profiles) {
+ f->dump_string("old", erasure_code_profile);
+ }
+ f->close_section();
+
+ f->open_array_section("new_removed_snaps");
+ for (auto& p : new_removed_snaps) {
+ f->open_object_section("pool");
+ f->dump_int("pool", p.first);
+ f->open_array_section("snaps");
+ for (auto q = p.second.begin(); q != p.second.end(); ++q) {
+ f->open_object_section("interval");
+ f->dump_unsigned("begin", q.get_start());
+ f->dump_unsigned("length", q.get_len());
+ f->close_section();
+ }
+ f->close_section();
+ f->close_section();
+ }
+ f->close_section();
+ f->open_array_section("new_purged_snaps");
+ for (auto& p : new_purged_snaps) {
+ f->open_object_section("pool");
+ f->dump_int("pool", p.first);
+ f->open_array_section("snaps");
+ for (auto q = p.second.begin(); q != p.second.end(); ++q) {
+ f->open_object_section("interval");
+ f->dump_unsigned("begin", q.get_start());
+ f->dump_unsigned("length", q.get_len());
+ f->close_section();
+ }
+ f->close_section();
+ f->close_section();
+ }
+ f->close_section();
+ f->open_array_section("new_crush_node_flags");
+ for (auto& i : new_crush_node_flags) {
+ f->open_object_section("node");
+ f->dump_int("id", i.first);
+ set<string> st;
+ calc_state_set(i.second, st);
+ for (auto& j : st) {
+ f->dump_string("flag", j);
+ }
+ f->close_section();
+ }
+ f->close_section();
+ f->open_array_section("new_device_class_flags");
+ for (auto& i : new_device_class_flags) {
+ f->open_object_section("device_class");
+ f->dump_int("id", i.first);
+ set<string> st;
+ calc_state_set(i.second, st);
+ for (auto& j : st) {
+ f->dump_string("flag", j);
+ }
+ f->close_section();
+ }
+ f->close_section();
+ f->open_object_section("stretch_mode");
+ {
+ f->dump_bool("change_stretch_mode", change_stretch_mode);
+ f->dump_bool("stretch_mode_enabled", stretch_mode_enabled);
+ f->dump_unsigned("new_stretch_bucket_count", new_stretch_bucket_count);
+ f->dump_unsigned("new_degraded_stretch_mode", new_degraded_stretch_mode);
+ f->dump_unsigned("new_recovering_stretch_mode", new_recovering_stretch_mode);
+ f->dump_int("new_stretch_mode_bucket", new_stretch_mode_bucket);
+ }
+ f->close_section();
+}
+
+void OSDMap::Incremental::generate_test_instances(list<Incremental*>& o)
+{
+ o.push_back(new Incremental);
+}
+
+// ----------------------------------
+// OSDMap
+
+void OSDMap::set_epoch(epoch_t e)
+{
+ epoch = e;
+ for (auto &pool : pools)
+ pool.second.last_change = e;
+}
+
+OSDMap::range_bits::range_bits() : ipv6(false) {
+ memset(&bits, 0, sizeof(bits));
+}
+
+OSDMap::range_bits::range_bits(const entity_addr_t& addr) : ipv6(false) {
+ memset(&bits, 0, sizeof(bits));
+ parse(addr);
+}
+
+void OSDMap::range_bits::get_ipv6_bytes(unsigned const char *addr,
+ uint64_t *upper, uint64_t *lower)
+{
+ *upper = ((uint64_t)(ntohl(*(uint32_t*)(addr)))) << 32 |
+ ((uint64_t)(ntohl(*(uint32_t*)(&addr[4]))));
+ *lower = ((uint64_t)(ntohl(*(uint32_t*)(&addr[8])))) << 32 |
+ ((uint64_t)(ntohl(*(uint32_t*)(&addr[12]))));
+}
+
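+ // The nonce field of a range-blocklist address is used here as the
+ // CIDR-style prefix length; parse() turns it into the bit masks that
+ // matches() compares against.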
+void OSDMap::range_bits::parse(const entity_addr_t& addr) {
+ // parse it into meaningful data
+ if (addr.is_ipv6()) {
+ get_ipv6_bytes(addr.in6_addr().sin6_addr.s6_addr,
+ &bits.ipv6.upper_64_bits, &bits.ipv6.lower_64_bits);
+ int32_t lower_shift = std::min(128-
+ static_cast<int32_t>(addr.get_nonce()), 64);
+ int32_t upper_shift = std::max(64- // i.e. (128 - addr.get_nonce()) - 64
+ static_cast<int32_t>(addr.get_nonce()), 0);
+
+ auto get_mask = [](int32_t shift) -> uint64_t {
+ if (shift >= 0 && shift < 64) {
+ return UINT64_MAX << shift;
+ }
+ return 0;
+ };
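+ // For example, a /48 prefix gives upper_shift 16 (mask over the top 48
+ // bits) and lower_shift 64 (mask 0, lower half ignored); a /96 prefix
+ // masks the whole upper half plus the top 32 bits of the lower half.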
+
+ bits.ipv6.lower_mask = get_mask(lower_shift);
+ bits.ipv6.upper_mask = get_mask(upper_shift);
+ ipv6 = true;
+ } else if (addr.is_ipv4()) {
+ bits.ipv4.ip_32_bits = ntohl(addr.in4_addr().sin_addr.s_addr);
+ if (addr.get_nonce() > 0) {
+ bits.ipv4.mask = UINT32_MAX << (32-addr.get_nonce());
+ } else {
+ bits.ipv4.mask = 0;
+ }
+ } else {
+ // neither IPv4 nor IPv6; leave the zeroed defaults
+ }
+}
+
+bool OSDMap::range_bits::matches(const entity_addr_t& addr) const {
+ if (addr.is_ipv4() && !ipv6) {
+ return ((ntohl(addr.in4_addr().sin_addr.s_addr) & bits.ipv4.mask) ==
+ (bits.ipv4.ip_32_bits & bits.ipv4.mask));
+ } else if (addr.is_ipv6() && ipv6) {
+ uint64_t upper_64, lower_64;
+ get_ipv6_bytes(addr.in6_addr().sin6_addr.s6_addr, &upper_64, &lower_64);
+ return (((upper_64 & bits.ipv6.upper_mask) ==
+ (bits.ipv6.upper_64_bits & bits.ipv6.upper_mask)) &&
+ ((lower_64 & bits.ipv6.lower_mask) ==
+ (bits.ipv6.lower_64_bits & bits.ipv6.lower_mask)));
+ }
+ return false;
+}
+
+bool OSDMap::is_blocklisted(const entity_addr_t& orig, CephContext *cct) const
+{
+ if (cct) ldout(cct, 25) << "is_blocklisted: " << orig << dendl;
+ if (blocklist.empty() && range_blocklist.empty()) {
+ if (cct) ldout(cct, 30) << "not blocklisted: " << orig << dendl;
+ return false;
+ }
+
+ // all blocklist entries are type ANY for nautilus+
+ // FIXME: avoid this copy!
+ entity_addr_t a = orig;
+ if (require_osd_release < ceph_release_t::nautilus) {
+ a.set_type(entity_addr_t::TYPE_LEGACY);
+ } else {
+ a.set_type(entity_addr_t::TYPE_ANY);
+ }
+
+ // this specific instance?
+ if (blocklist.count(a)) {
+ if (cct) ldout(cct, 20) << "blocklist contains " << a << dendl;
+ return true;
+ }
+
+ // is entire ip blocklisted?
+ if (a.is_ip()) {
+ a.set_port(0);
+ a.set_nonce(0);
+ if (blocklist.count(a)) {
+ if (cct) ldout(cct, 20) << "blocklist contains " << a << dendl;
+ return true;
+ }
+ }
+
+ // is it in a blocklisted range?
+ for (const auto& i : calculated_ranges) {
+ bool blocked = i.second.matches(a);
+ if (blocked) {
+ if (cct) ldout(cct, 20) << "range_blocklist contains " << a << dendl;
+ return true;
+ }
+ }
+
+ if (cct) ldout(cct, 25) << "not blocklisted: " << orig << dendl;
+ return false;
+}
+
+bool OSDMap::is_blocklisted(const entity_addrvec_t& av, CephContext *cct) const
+{
+ if (blocklist.empty() && range_blocklist.empty())
+ return false;
+
+ for (auto& a : av.v) {
+ if (is_blocklisted(a, cct)) {
+ return true;
+ }
+ }
+
+ return false;
+}
+
+void OSDMap::get_blocklist(list<pair<entity_addr_t,utime_t> > *bl,
+ std::list<std::pair<entity_addr_t,utime_t> > *rl) const
+{
+ std::copy(blocklist.begin(), blocklist.end(), std::back_inserter(*bl));
+ std::copy(range_blocklist.begin(), range_blocklist.end(),
+ std::back_inserter(*rl));
+}
+
+void OSDMap::get_blocklist(std::set<entity_addr_t> *bl,
+ std::set<entity_addr_t> *rl) const
+{
+ for (const auto &i : blocklist) {
+ bl->insert(i.first);
+ }
+ for (const auto &i : range_blocklist) {
+ rl->insert(i.first);
+ }
+}
+
+void OSDMap::set_max_osd(int m)
+{
+ max_osd = m;
+ osd_state.resize(max_osd, 0);
+ osd_weight.resize(max_osd, CEPH_OSD_OUT);
+ osd_info.resize(max_osd);
+ osd_xinfo.resize(max_osd);
+ osd_addrs->client_addrs.resize(max_osd);
+ osd_addrs->cluster_addrs.resize(max_osd);
+ osd_addrs->hb_back_addrs.resize(max_osd);
+ osd_addrs->hb_front_addrs.resize(max_osd);
+ osd_uuid->resize(max_osd);
+ if (osd_primary_affinity)
+ osd_primary_affinity->resize(max_osd, CEPH_OSD_DEFAULT_PRIMARY_AFFINITY);
+
+ calc_num_osds();
+}
+
+int OSDMap::calc_num_osds()
+{
+ num_osd = 0;
+ num_up_osd = 0;
+ num_in_osd = 0;
+ for (int i=0; i<max_osd; i++) {
+ if (osd_state[i] & CEPH_OSD_EXISTS) {
+ ++num_osd;
+ if (osd_state[i] & CEPH_OSD_UP) {
+ ++num_up_osd;
+ }
+ if (get_weight(i) != CEPH_OSD_OUT) {
+ ++num_in_osd;
+ }
+ }
+ }
+ return num_osd;
+}
+
+void OSDMap::get_full_pools(CephContext *cct,
+ set<int64_t> *full,
+ set<int64_t> *backfillfull,
+ set<int64_t> *nearfull) const
+{
+ ceph_assert(full);
+ ceph_assert(backfillfull);
+ ceph_assert(nearfull);
+ full->clear();
+ backfillfull->clear();
+ nearfull->clear();
+
+ vector<int> full_osds;
+ vector<int> backfillfull_osds;
+ vector<int> nearfull_osds;
+ for (int i = 0; i < max_osd; ++i) {
+ if (exists(i) && is_up(i) && is_in(i)) {
+ if (osd_state[i] & CEPH_OSD_FULL)
+ full_osds.push_back(i);
+ else if (osd_state[i] & CEPH_OSD_BACKFILLFULL)
+ backfillfull_osds.push_back(i);
+ else if (osd_state[i] & CEPH_OSD_NEARFULL)
+ nearfull_osds.push_back(i);
+ }
+ }
+
+ for (auto i: full_osds) {
+ get_pool_ids_by_osd(cct, i, full);
+ }
+ for (auto i: backfillfull_osds) {
+ get_pool_ids_by_osd(cct, i, backfillfull);
+ }
+ for (auto i: nearfull_osds) {
+ get_pool_ids_by_osd(cct, i, nearfull);
+ }
+}
+
+void OSDMap::get_full_osd_counts(set<int> *full, set<int> *backfill,
+ set<int> *nearfull) const
+{
+ full->clear();
+ backfill->clear();
+ nearfull->clear();
+ for (int i = 0; i < max_osd; ++i) {
+ if (exists(i) && is_up(i) && is_in(i)) {
+ if (osd_state[i] & CEPH_OSD_FULL)
+ full->emplace(i);
+ else if (osd_state[i] & CEPH_OSD_BACKFILLFULL)
+ backfill->emplace(i);
+ else if (osd_state[i] & CEPH_OSD_NEARFULL)
+ nearfull->emplace(i);
+ }
+ }
+}
+
+void OSDMap::get_all_osds(set<int32_t>& ls) const
+{
+ for (int i=0; i<max_osd; i++)
+ if (exists(i))
+ ls.insert(i);
+}
+
+void OSDMap::get_up_osds(set<int32_t>& ls) const
+{
+ for (int i = 0; i < max_osd; i++) {
+ if (is_up(i))
+ ls.insert(i);
+ }
+}
+
+void OSDMap::get_out_existing_osds(set<int32_t>& ls) const
+{
+ for (int i = 0; i < max_osd; i++) {
+ if (exists(i) && get_weight(i) == CEPH_OSD_OUT)
+ ls.insert(i);
+ }
+}
+
+void OSDMap::get_flag_set(set<string> *flagset) const
+{
+ for (unsigned i = 0; i < sizeof(flags) * 8; ++i) {
+ if (flags & (1<<i)) {
+ flagset->insert(get_flag_string(flags & (1<<i)));
+ }
+ }
+}
+
+void OSDMap::calc_state_set(int state, set<string>& st)
+{
+ unsigned t = state;
+ for (unsigned s = 1; t; s <<= 1) {
+ if (t & s) {
+ t &= ~s;
+ st.insert(ceph_osd_state_name(s));
+ }
+ }
+}
+
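+ // Scale the supplied relative weights so the heaviest OSD ends up at full
+ // in-weight (CEPH_OSD_IN) and the rest are proportional; the results are
+ // staged in inc.new_weight.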
+void OSDMap::adjust_osd_weights(const map<int,double>& weights, Incremental& inc) const
+{
+ float max = 0;
+ for (const auto &weight : weights) {
+ if (weight.second > max)
+ max = weight.second;
+ }
+
+ for (const auto &weight : weights) {
+ inc.new_weight[weight.first] = (unsigned)((weight.second / max) * CEPH_OSD_IN);
+ }
+}
+
+int OSDMap::identify_osd(const entity_addr_t& addr) const
+{
+ for (int i=0; i<max_osd; i++)
+ if (exists(i) && (get_addrs(i).contains(addr) ||
+ get_cluster_addrs(i).contains(addr)))
+ return i;
+ return -1;
+}
+
+int OSDMap::identify_osd(const uuid_d& u) const
+{
+ for (int i=0; i<max_osd; i++)
+ if (exists(i) && get_uuid(i) == u)
+ return i;
+ return -1;
+}
+
+int OSDMap::identify_osd_on_all_channels(const entity_addr_t& addr) const
+{
+ for (int i=0; i<max_osd; i++)
+ if (exists(i) && (get_addrs(i).contains(addr) ||
+ get_cluster_addrs(i).contains(addr) ||
+ get_hb_back_addrs(i).contains(addr) ||
+ get_hb_front_addrs(i).contains(addr)))
+ return i;
+ return -1;
+}
+
+int OSDMap::find_osd_on_ip(const entity_addr_t& ip) const
+{
+ for (int i=0; i<max_osd; i++)
+ if (exists(i) && (get_addrs(i).is_same_host(ip) ||
+ get_cluster_addrs(i).is_same_host(ip)))
+ return i;
+ return -1;
+}
+
+
+uint64_t OSDMap::get_features(int entity_type, uint64_t *pmask) const
+{
+ uint64_t features = 0; // things we actually have
+ uint64_t mask = 0; // things we could have
+
+ if (crush->has_nondefault_tunables())
+ features |= CEPH_FEATURE_CRUSH_TUNABLES;
+ if (crush->has_nondefault_tunables2())
+ features |= CEPH_FEATURE_CRUSH_TUNABLES2;
+ if (crush->has_nondefault_tunables3())
+ features |= CEPH_FEATURE_CRUSH_TUNABLES3;
+ if (crush->has_v4_buckets())
+ features |= CEPH_FEATURE_CRUSH_V4;
+ if (crush->has_nondefault_tunables5())
+ features |= CEPH_FEATURE_CRUSH_TUNABLES5;
+ if (crush->has_incompat_choose_args()) {
+ features |= CEPH_FEATUREMASK_CRUSH_CHOOSE_ARGS;
+ }
+ mask |= CEPH_FEATURES_CRUSH;
+
+ if (!pg_upmap.empty() || !pg_upmap_items.empty())
+ features |= CEPH_FEATUREMASK_OSDMAP_PG_UPMAP;
+ mask |= CEPH_FEATUREMASK_OSDMAP_PG_UPMAP;
+
+ for (auto &pool: pools) {
+ if (pool.second.has_flag(pg_pool_t::FLAG_HASHPSPOOL)) {
+ features |= CEPH_FEATURE_OSDHASHPSPOOL;
+ }
+ if (!pool.second.tiers.empty() ||
+ pool.second.is_tier()) {
+ features |= CEPH_FEATURE_OSD_CACHEPOOL;
+ }
+ int ruleid = crush->find_rule(pool.second.get_crush_rule(),
+ pool.second.get_type(),
+ pool.second.get_size());
+ if (ruleid >= 0) {
+ if (crush->is_v2_rule(ruleid))
+ features |= CEPH_FEATURE_CRUSH_V2;
+ if (crush->is_v3_rule(ruleid))
+ features |= CEPH_FEATURE_CRUSH_TUNABLES3;
+ if (crush->is_v5_rule(ruleid))
+ features |= CEPH_FEATURE_CRUSH_TUNABLES5;
+ }
+ }
+ mask |= CEPH_FEATURE_OSDHASHPSPOOL | CEPH_FEATURE_OSD_CACHEPOOL;
+
+ if (osd_primary_affinity) {
+ for (int i = 0; i < max_osd; ++i) {
+ if ((*osd_primary_affinity)[i] != CEPH_OSD_DEFAULT_PRIMARY_AFFINITY) {
+ features |= CEPH_FEATURE_OSD_PRIMARY_AFFINITY;
+ break;
+ }
+ }
+ }
+ mask |= CEPH_FEATURE_OSD_PRIMARY_AFFINITY;
+
+ if (entity_type == CEPH_ENTITY_TYPE_OSD) {
+ const uint64_t jewel_features = CEPH_FEATURE_SERVER_JEWEL;
+ if (require_osd_release >= ceph_release_t::jewel) {
+ features |= jewel_features;
+ }
+ mask |= jewel_features;
+
+ const uint64_t kraken_features = CEPH_FEATUREMASK_SERVER_KRAKEN
+ | CEPH_FEATURE_MSG_ADDR2;
+ if (require_osd_release >= ceph_release_t::kraken) {
+ features |= kraken_features;
+ }
+ mask |= kraken_features;
+
+ if (stretch_mode_enabled) {
+ features |= CEPH_FEATUREMASK_STRETCH_MODE;
+ mask |= CEPH_FEATUREMASK_STRETCH_MODE;
+ }
+ }
+
+ if (require_min_compat_client >= ceph_release_t::nautilus) {
+ // if min_compat_client is >= nautilus, require v2 cephx signatures
+ // from everyone
+ features |= CEPH_FEATUREMASK_CEPHX_V2;
+ } else if (require_osd_release >= ceph_release_t::nautilus &&
+ entity_type == CEPH_ENTITY_TYPE_OSD) {
+ // if osds are >= nautilus, at least require the signatures from them
+ features |= CEPH_FEATUREMASK_CEPHX_V2;
+ }
+ mask |= CEPH_FEATUREMASK_CEPHX_V2;
+
+ if (pmask)
+ *pmask = mask;
+ return features;
+}
+
+ceph_release_t OSDMap::get_min_compat_client() const
+{
+ uint64_t f = get_features(CEPH_ENTITY_TYPE_CLIENT, nullptr);
+
+ if (HAVE_FEATURE(f, OSDMAP_PG_UPMAP) || // v12.0.0-1733-g27d6f43
+ HAVE_FEATURE(f, CRUSH_CHOOSE_ARGS)) { // v12.0.1-2172-gef1ef28
+ return ceph_release_t::luminous; // v12.2.0
+ }
+ if (HAVE_FEATURE(f, CRUSH_TUNABLES5)) { // v10.0.0-612-g043a737
+ return ceph_release_t::jewel; // v10.2.0
+ }
+ if (HAVE_FEATURE(f, CRUSH_V4)) { // v0.91-678-g325fc56
+ return ceph_release_t::hammer; // v0.94.0
+ }
+ if (HAVE_FEATURE(f, OSD_PRIMARY_AFFINITY) || // v0.76-553-gf825624
+ HAVE_FEATURE(f, CRUSH_TUNABLES3) || // v0.76-395-ge20a55d
+ HAVE_FEATURE(f, OSD_CACHEPOOL)) { // v0.67-401-gb91c1c5
+ return ceph_release_t::firefly; // v0.80.0
+ }
+ if (HAVE_FEATURE(f, CRUSH_TUNABLES2) || // v0.54-684-g0cc47ff
+ HAVE_FEATURE(f, OSDHASHPSPOOL)) { // v0.57-398-g8cc2b0f
+ return ceph_release_t::dumpling; // v0.67.0
+ }
+ if (HAVE_FEATURE(f, CRUSH_TUNABLES)) { // v0.48argonaut-206-g6f381af
+ return ceph_release_t::argonaut; // v0.48argonaut-206-g6f381af
+ }
+ return ceph_release_t::argonaut; // v0.48argonaut-206-g6f381af
+}
+
+ceph_release_t OSDMap::get_require_min_compat_client() const
+{
+ return require_min_compat_client;
+}
+
+void OSDMap::_calc_up_osd_features()
+{
+ bool first = true;
+ cached_up_osd_features = 0;
+ for (int osd = 0; osd < max_osd; ++osd) {
+ if (!is_up(osd))
+ continue;
+ const osd_xinfo_t &xi = get_xinfo(osd);
+ if (xi.features == 0)
+ continue; // bogus xinfo, maybe #20751 or similar, skipping
+ if (first) {
+ cached_up_osd_features = xi.features;
+ first = false;
+ } else {
+ cached_up_osd_features &= xi.features;
+ }
+ }
+}
+
+uint64_t OSDMap::get_up_osd_features() const
+{
+ return cached_up_osd_features;
+}
+
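+ // Share identical sub-structures (address vectors, crush map, pg_temp,
+ // primary_temp, osd uuids) between an older and a newer map so that
+ // caching many OSDMap epochs does not duplicate unchanged data.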
+void OSDMap::dedup(const OSDMap *o, OSDMap *n)
+{
+ using ceph::encode;
+ if (o->epoch == n->epoch)
+ return;
+
+ int diff = 0;
+
+ // do addrs match?
+ if (o->max_osd != n->max_osd)
+ diff++;
+ for (int i = 0; i < o->max_osd && i < n->max_osd; i++) {
+ if ( n->osd_addrs->client_addrs[i] && o->osd_addrs->client_addrs[i] &&
+ *n->osd_addrs->client_addrs[i] == *o->osd_addrs->client_addrs[i])
+ n->osd_addrs->client_addrs[i] = o->osd_addrs->client_addrs[i];
+ else
+ diff++;
+ if ( n->osd_addrs->cluster_addrs[i] && o->osd_addrs->cluster_addrs[i] &&
+ *n->osd_addrs->cluster_addrs[i] == *o->osd_addrs->cluster_addrs[i])
+ n->osd_addrs->cluster_addrs[i] = o->osd_addrs->cluster_addrs[i];
+ else
+ diff++;
+ if ( n->osd_addrs->hb_back_addrs[i] && o->osd_addrs->hb_back_addrs[i] &&
+ *n->osd_addrs->hb_back_addrs[i] == *o->osd_addrs->hb_back_addrs[i])
+ n->osd_addrs->hb_back_addrs[i] = o->osd_addrs->hb_back_addrs[i];
+ else
+ diff++;
+ if ( n->osd_addrs->hb_front_addrs[i] && o->osd_addrs->hb_front_addrs[i] &&
+ *n->osd_addrs->hb_front_addrs[i] == *o->osd_addrs->hb_front_addrs[i])
+ n->osd_addrs->hb_front_addrs[i] = o->osd_addrs->hb_front_addrs[i];
+ else
+ diff++;
+ }
+ if (diff == 0) {
+ // zoinks, no differences at all!
+ n->osd_addrs = o->osd_addrs;
+ }
+
+ // does crush match?
+ ceph::buffer::list oc, nc;
+ encode(*o->crush, oc, CEPH_FEATURES_SUPPORTED_DEFAULT);
+ encode(*n->crush, nc, CEPH_FEATURES_SUPPORTED_DEFAULT);
+ if (oc.contents_equal(nc)) {
+ n->crush = o->crush;
+ }
+
+ // does pg_temp match?
+ if (*o->pg_temp == *n->pg_temp)
+ n->pg_temp = o->pg_temp;
+
+ // does primary_temp match?
+ if (o->primary_temp->size() == n->primary_temp->size()) {
+ if (*o->primary_temp == *n->primary_temp)
+ n->primary_temp = o->primary_temp;
+ }
+
+ // do uuids match?
+ if (o->osd_uuid->size() == n->osd_uuid->size() &&
+ *o->osd_uuid == *n->osd_uuid)
+ n->osd_uuid = o->osd_uuid;
+}
+
+void OSDMap::clean_temps(CephContext *cct,
+ const OSDMap& oldmap,
+ const OSDMap& nextmap,
+ Incremental *pending_inc)
+{
+ ldout(cct, 10) << __func__ << dendl;
+
+ for (auto pg : *nextmap.pg_temp) {
+ // if pool does not exist, remove any existing pg_temps associated with
+ // it. we don't care about pg_temps on the pending_inc either; if there
+ // are new_pg_temp entries on the pending, clear them out just as well.
+ if (!nextmap.have_pg_pool(pg.first.pool())) {
+ ldout(cct, 10) << __func__ << " removing pg_temp " << pg.first
+ << " for nonexistent pool " << pg.first.pool() << dendl;
+ pending_inc->new_pg_temp[pg.first].clear();
+ continue;
+ }
+ if (!nextmap.pg_exists(pg.first)) {
+ ldout(cct, 10) << __func__ << " removing pg_temp " << pg.first
+ << " for nonexistent pg " << dendl;
+ pending_inc->new_pg_temp[pg.first].clear();
+ continue;
+ }
+ // all osds down?
+ unsigned num_up = 0;
+ for (auto o : pg.second) {
+ if (!nextmap.is_down(o)) {
+ ++num_up;
+ break;
+ }
+ }
+ if (num_up == 0) {
+ ldout(cct, 10) << __func__ << " removing pg_temp " << pg.first
+ << " with all down osds" << pg.second << dendl;
+ pending_inc->new_pg_temp[pg.first].clear();
+ continue;
+ }
+ // redundant pg_temp?
+ vector<int> raw_up;
+ int primary;
+ nextmap.pg_to_raw_up(pg.first, &raw_up, &primary);
+ bool remove = false;
+ if (raw_up == pg.second) {
+ ldout(cct, 10) << __func__ << " removing pg_temp " << pg.first << " "
+ << pg.second << " that matches raw_up mapping" << dendl;
+ remove = true;
+ }
+ // oversized pg_temp?
+ if (pg.second.size() > nextmap.get_pg_pool(pg.first.pool())->get_size()) {
+ ldout(cct, 10) << __func__ << " removing pg_temp " << pg.first << " "
+ << pg.second << " exceeds pool size" << dendl;
+ remove = true;
+ }
+ if (remove) {
+ if (oldmap.pg_temp->count(pg.first))
+ pending_inc->new_pg_temp[pg.first].clear();
+ else
+ pending_inc->new_pg_temp.erase(pg.first);
+ }
+ }
+
+ for (auto &pg : *nextmap.primary_temp) {
+ // primary down?
+ if (nextmap.is_down(pg.second)) {
+ ldout(cct, 10) << __func__ << " removing primary_temp " << pg.first
+ << " to down " << pg.second << dendl;
+ pending_inc->new_primary_temp[pg.first] = -1;
+ continue;
+ }
+ // redundant primary_temp?
+ vector<int> real_up, templess_up;
+ int real_primary, templess_primary;
+ pg_t pgid = pg.first;
+ nextmap.pg_to_acting_osds(pgid, &real_up, &real_primary);
+ nextmap.pg_to_raw_up(pgid, &templess_up, &templess_primary);
+ if (real_primary == templess_primary){
+ ldout(cct, 10) << __func__ << " removing primary_temp "
+ << pgid << " -> " << real_primary
+ << " (unnecessary/redundant)" << dendl;
+ if (oldmap.primary_temp->count(pgid))
+ pending_inc->new_primary_temp[pgid] = -1;
+ else
+ pending_inc->new_primary_temp.erase(pgid);
+ }
+ }
+}
+
+void OSDMap::get_upmap_pgs(vector<pg_t> *upmap_pgs) const
+{
+ upmap_pgs->reserve(pg_upmap.size() + pg_upmap_items.size());
+ for (auto& p : pg_upmap)
+ upmap_pgs->push_back(p.first);
+ for (auto& p : pg_upmap_items)
+ upmap_pgs->push_back(p.first);
+}
+
+bool OSDMap::check_pg_upmaps(
+ CephContext *cct,
+ const vector<pg_t>& to_check,
+ vector<pg_t> *to_cancel,
+ map<pg_t, mempool::osdmap::vector<pair<int,int>>> *to_remap) const
+{
+ bool any_change = false;
+ map<int, map<int, float>> rule_weight_map;
+ for (auto& pg : to_check) {
+ const pg_pool_t *pi = get_pg_pool(pg.pool());
+ if (!pi || pg.ps() >= pi->get_pg_num_pending()) {
+ ldout(cct, 0) << __func__ << " pg " << pg << " is gone or merge source"
+ << dendl;
+ to_cancel->push_back(pg);
+ continue;
+ }
+ if (pi->is_pending_merge(pg, nullptr)) {
+ ldout(cct, 0) << __func__ << " pg " << pg << " is pending merge"
+ << dendl;
+ to_cancel->push_back(pg);
+ continue;
+ }
+ vector<int> raw, up;
+ pg_to_raw_upmap(pg, &raw, &up);
+ auto crush_rule = get_pg_pool_crush_rule(pg);
+ auto r = crush->verify_upmap(cct,
+ crush_rule,
+ get_pg_pool_size(pg),
+ up);
+ if (r < 0) {
+ ldout(cct, 0) << __func__ << " verify_upmap of pg " << pg
+ << " returning " << r
+ << dendl;
+ to_cancel->push_back(pg);
+ continue;
+ }
+ // below we check against crush-topology changing..
+ map<int, float> weight_map;
+ auto it = rule_weight_map.find(crush_rule);
+ if (it == rule_weight_map.end()) {
+ auto r = crush->get_rule_weight_osd_map(crush_rule, &weight_map);
+ if (r < 0) {
+ lderr(cct) << __func__ << " unable to get crush weight_map for "
+ << "crush_rule " << crush_rule
+ << dendl;
+ continue;
+ }
+ rule_weight_map[crush_rule] = weight_map;
+ } else {
+ weight_map = it->second;
+ }
+ ldout(cct, 10) << __func__ << " pg " << pg
+ << " weight_map " << weight_map
+ << dendl;
+ for (auto osd : up) {
+ auto it = weight_map.find(osd);
+ if (it == weight_map.end()) {
+ ldout(cct, 10) << __func__ << " pg " << pg << ": osd " << osd << " is gone or has "
+ << "been moved out of the specific crush-tree"
+ << dendl;
+ to_cancel->push_back(pg);
+ break;
+ }
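+ // combine the in/out reweight with the crush weight for this rule; a
+ // result of zero means the osd can no longer hold data here, so the
+ // upmap is stale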
+ auto adjusted_weight = get_weightf(it->first) * it->second;
+ if (adjusted_weight == 0) {
+ ldout(cct, 10) << __func__ << " pg " << pg << ": osd " << osd
+ << " is out/crush-out"
+ << dendl;
+ to_cancel->push_back(pg);
+ break;
+ }
+ }
+ if (!to_cancel->empty() && to_cancel->back() == pg)
+ continue;
+ // okay, upmap is valid
+ // continue to check if it is still necessary
+ auto i = pg_upmap.find(pg);
+ if (i != pg_upmap.end()) {
+ if (i->second == raw) {
+ ldout(cct, 10) << "removing redundant pg_upmap " << i->first << " "
+ << i->second << dendl;
+ to_cancel->push_back(pg);
+ continue;
+ }
+ if ((int)i->second.size() != get_pg_pool_size(pg)) {
+ ldout(cct, 10) << "removing pg_upmap " << i->first << " "
+ << i->second << " != pool size " << get_pg_pool_size(pg)
+ << dendl;
+ to_cancel->push_back(pg);
+ continue;
+ }
+ }
+ auto j = pg_upmap_items.find(pg);
+ if (j != pg_upmap_items.end()) {
+ mempool::osdmap::vector<pair<int,int>> newmap;
+ for (auto& p : j->second) {
+ if (std::find(raw.begin(), raw.end(), p.first) == raw.end()) {
+ // cancel mapping if source osd does not exist anymore
+ continue;
+ }
+ if (p.second != CRUSH_ITEM_NONE && p.second < max_osd &&
+ p.second >= 0 && osd_weight[p.second] == 0) {
+ // cancel mapping if target osd is out
+ continue;
+ }
+ newmap.push_back(p);
+ }
+ if (newmap.empty()) {
+ ldout(cct, 10) << " removing no-op pg_upmap_items "
+ << j->first << " " << j->second
+ << dendl;
+ to_cancel->push_back(pg);
+ } else if (newmap != j->second) {
+ ldout(cct, 10) << " simplifying partially no-op pg_upmap_items "
+ << j->first << " " << j->second
+ << " -> " << newmap
+ << dendl;
+ to_remap->insert({pg, newmap});
+ any_change = true;
+ }
+ }
+ }
+ any_change = any_change || !to_cancel->empty();
+ return any_change;
+}
+
+void OSDMap::clean_pg_upmaps(
+ CephContext *cct,
+ Incremental *pending_inc,
+ const vector<pg_t>& to_cancel,
+ const map<pg_t, mempool::osdmap::vector<pair<int,int>>>& to_remap) const
+{
+ for (auto &pg: to_cancel) {
+ auto i = pending_inc->new_pg_upmap.find(pg);
+ if (i != pending_inc->new_pg_upmap.end()) {
+ ldout(cct, 10) << __func__ << " cancel invalid pending "
+ << "pg_upmap entry "
+ << i->first << "->" << i->second
+ << dendl;
+ pending_inc->new_pg_upmap.erase(i);
+ }
+ auto j = pg_upmap.find(pg);
+ if (j != pg_upmap.end()) {
+ ldout(cct, 10) << __func__ << " cancel invalid pg_upmap entry "
+ << j->first << "->" << j->second
+ << dendl;
+ pending_inc->old_pg_upmap.insert(pg);
+ }
+ auto p = pending_inc->new_pg_upmap_items.find(pg);
+ if (p != pending_inc->new_pg_upmap_items.end()) {
+ ldout(cct, 10) << __func__ << " cancel invalid pending "
+ << "pg_upmap_items entry "
+ << p->first << "->" << p->second
+ << dendl;
+ pending_inc->new_pg_upmap_items.erase(p);
+ }
+ auto q = pg_upmap_items.find(pg);
+ if (q != pg_upmap_items.end()) {
+ ldout(cct, 10) << __func__ << " cancel invalid "
+ << "pg_upmap_items entry "
+ << q->first << "->" << q->second
+ << dendl;
+ pending_inc->old_pg_upmap_items.insert(pg);
+ }
+ }
+ for (auto& i : to_remap)
+ pending_inc->new_pg_upmap_items[i.first] = i.second;
+}
+
+bool OSDMap::clean_pg_upmaps(
+ CephContext *cct,
+ Incremental *pending_inc) const
+{
+ ldout(cct, 10) << __func__ << dendl;
+ vector<pg_t> to_check;
+ vector<pg_t> to_cancel;
+ map<pg_t, mempool::osdmap::vector<pair<int,int>>> to_remap;
+
+ get_upmap_pgs(&to_check);
+ auto any_change = check_pg_upmaps(cct, to_check, &to_cancel, &to_remap);
+ clean_pg_upmaps(cct, pending_inc, to_cancel, to_remap);
+ return any_change;
+}
+
+int OSDMap::apply_incremental(const Incremental &inc)
+{
+ new_blocklist_entries = false;
+ if (inc.epoch == 1)
+ fsid = inc.fsid;
+ else if (inc.fsid != fsid)
+ return -EINVAL;
+
+ ceph_assert(inc.epoch == epoch+1);
+
+ epoch++;
+ modified = inc.modified;
+
+ // full map?
+ if (inc.fullmap.length()) {
+ ceph::buffer::list bl(inc.fullmap);
+ decode(bl);
+ return 0;
+ }
+
+ // nope, incremental.
+ if (inc.new_flags >= 0) {
+ flags = inc.new_flags;
+ // the below is just to cover a newly-upgraded luminous mon
+ // cluster that has to set require_jewel_osds or
+ // require_kraken_osds before the osds can be upgraded to
+ // luminous.
+ if (flags & CEPH_OSDMAP_REQUIRE_KRAKEN) {
+ if (require_osd_release < ceph_release_t::kraken) {
+ require_osd_release = ceph_release_t::kraken;
+ }
+ } else if (flags & CEPH_OSDMAP_REQUIRE_JEWEL) {
+ if (require_osd_release < ceph_release_t::jewel) {
+ require_osd_release = ceph_release_t::jewel;
+ }
+ }
+ }
+
+ if (inc.new_max_osd >= 0)
+ set_max_osd(inc.new_max_osd);
+
+ if (inc.new_pool_max != -1)
+ pool_max = inc.new_pool_max;
+
+ for (const auto &pool : inc.new_pools) {
+ pools[pool.first] = pool.second;
+ pools[pool.first].last_change = epoch;
+ }
+
+ new_removed_snaps = inc.new_removed_snaps;
+ new_purged_snaps = inc.new_purged_snaps;
+ for (auto p = new_removed_snaps.begin();
+ p != new_removed_snaps.end();
+ ++p) {
+ removed_snaps_queue[p->first].union_of(p->second);
+ }
+ for (auto p = new_purged_snaps.begin();
+ p != new_purged_snaps.end();
+ ++p) {
+ auto q = removed_snaps_queue.find(p->first);
+ ceph_assert(q != removed_snaps_queue.end());
+ q->second.subtract(p->second);
+ if (q->second.empty()) {
+ removed_snaps_queue.erase(q);
+ }
+ }
+
+ if (inc.new_last_up_change != utime_t()) {
+ last_up_change = inc.new_last_up_change;
+ }
+ if (inc.new_last_in_change != utime_t()) {
+ last_in_change = inc.new_last_in_change;
+ }
+
+ for (const auto &pname : inc.new_pool_names) {
+ auto pool_name_entry = pool_name.find(pname.first);
+ if (pool_name_entry != pool_name.end()) {
+ name_pool.erase(pool_name_entry->second);
+ pool_name_entry->second = pname.second;
+ } else {
+ pool_name[pname.first] = pname.second;
+ }
+ name_pool[pname.second] = pname.first;
+ }
+
+ for (const auto &pool : inc.old_pools) {
+ pools.erase(pool);
+ name_pool.erase(pool_name[pool]);
+ pool_name.erase(pool);
+ }
+
+ for (const auto &weight : inc.new_weight) {
+ set_weight(weight.first, weight.second);
+
+ // if we are marking in, clear the AUTOOUT and NEW bits, and clear
+ // xinfo old_weight.
+ if (weight.second) {
+ osd_state[weight.first] &= ~(CEPH_OSD_AUTOOUT | CEPH_OSD_NEW);
+ osd_xinfo[weight.first].old_weight = 0;
+ }
+ }
+
+ for (const auto &primary_affinity : inc.new_primary_affinity) {
+ set_primary_affinity(primary_affinity.first, primary_affinity.second);
+ }
+
+ // erasure_code_profiles
+ for (const auto &profile : inc.old_erasure_code_profiles)
+ erasure_code_profiles.erase(profile);
+
+ for (const auto &profile : inc.new_erasure_code_profiles) {
+ set_erasure_code_profile(profile.first, profile.second);
+ }
+
+ // up/down
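+ // new_state[osd] holds the state bits to toggle (XOR); an empty value is
+ // shorthand for toggling CEPH_OSD_UP. Toggling EXISTS on an existing osd
+ // means it was destroyed, so all of its metadata is reset below.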
+ for (const auto &state : inc.new_state) {
+ const auto osd = state.first;
+ int s = state.second ? state.second : CEPH_OSD_UP;
+ if ((osd_state[osd] & CEPH_OSD_UP) &&
+ (s & CEPH_OSD_UP)) {
+ osd_info[osd].down_at = epoch;
+ osd_xinfo[osd].down_stamp = modified;
+ }
+ if ((osd_state[osd] & CEPH_OSD_EXISTS) &&
+ (s & CEPH_OSD_EXISTS)) {
+ // osd is destroyed; clear out anything interesting.
+ (*osd_uuid)[osd] = uuid_d();
+ osd_info[osd] = osd_info_t();
+ osd_xinfo[osd] = osd_xinfo_t();
+ set_primary_affinity(osd, CEPH_OSD_DEFAULT_PRIMARY_AFFINITY);
+ osd_addrs->client_addrs[osd].reset(new entity_addrvec_t());
+ osd_addrs->cluster_addrs[osd].reset(new entity_addrvec_t());
+ osd_addrs->hb_front_addrs[osd].reset(new entity_addrvec_t());
+ osd_addrs->hb_back_addrs[osd].reset(new entity_addrvec_t());
+ osd_state[osd] = 0;
+ } else {
+ osd_state[osd] ^= s;
+ }
+ }
+
+ for (const auto &client : inc.new_up_client) {
+ osd_state[client.first] |= CEPH_OSD_EXISTS | CEPH_OSD_UP;
+ osd_state[client.first] &= ~CEPH_OSD_STOP; // if any
+ osd_addrs->client_addrs[client.first].reset(
+ new entity_addrvec_t(client.second));
+ osd_addrs->hb_back_addrs[client.first].reset(
+ new entity_addrvec_t(inc.new_hb_back_up.find(client.first)->second));
+ osd_addrs->hb_front_addrs[client.first].reset(
+ new entity_addrvec_t(inc.new_hb_front_up.find(client.first)->second));
+
+ osd_info[client.first].up_from = epoch;
+ }
+
+ for (const auto &cluster : inc.new_up_cluster)
+ osd_addrs->cluster_addrs[cluster.first].reset(
+ new entity_addrvec_t(cluster.second));
+
+ // info
+ for (const auto &thru : inc.new_up_thru)
+ osd_info[thru.first].up_thru = thru.second;
+
+ for (const auto &interval : inc.new_last_clean_interval) {
+ osd_info[interval.first].last_clean_begin = interval.second.first;
+ osd_info[interval.first].last_clean_end = interval.second.second;
+ }
+
+ for (const auto &lost : inc.new_lost)
+ osd_info[lost.first].lost_at = lost.second;
+
+ // xinfo
+ for (const auto &xinfo : inc.new_xinfo)
+ osd_xinfo[xinfo.first] = xinfo.second;
+
+ // uuid
+ for (const auto &uuid : inc.new_uuid)
+ (*osd_uuid)[uuid.first] = uuid.second;
+
+ // pg rebuild
+ for (const auto &pg : inc.new_pg_temp) {
+ if (pg.second.empty())
+ pg_temp->erase(pg.first);
+ else
+ pg_temp->set(pg.first, pg.second);
+ }
+ if (!inc.new_pg_temp.empty()) {
+ // make sure pg_temp is efficiently stored
+ pg_temp->rebuild();
+ }
+
+ for (const auto &pg : inc.new_primary_temp) {
+ if (pg.second == -1)
+ primary_temp->erase(pg.first);
+ else
+ (*primary_temp)[pg.first] = pg.second;
+ }
+
+ for (auto& p : inc.new_pg_upmap) {
+ pg_upmap[p.first] = p.second;
+ }
+ for (auto& pg : inc.old_pg_upmap) {
+ pg_upmap.erase(pg);
+ }
+ for (auto& p : inc.new_pg_upmap_items) {
+ pg_upmap_items[p.first] = p.second;
+ }
+ for (auto& pg : inc.old_pg_upmap_items) {
+ pg_upmap_items.erase(pg);
+ }
+
+ // blocklist
+ if (!inc.new_blocklist.empty()) {
+ blocklist.insert(inc.new_blocklist.begin(),inc.new_blocklist.end());
+ new_blocklist_entries = true;
+ }
+ for (const auto &addr : inc.old_blocklist)
+ blocklist.erase(addr);
+
+ for (const auto& addr_p : inc.new_range_blocklist) {
+ range_blocklist.insert(addr_p);
+ calculated_ranges.emplace(addr_p.first, addr_p.first);
+ new_blocklist_entries = true;
+ }
+ for (const auto &addr : inc.old_range_blocklist) {
+ calculated_ranges.erase(addr);
+ range_blocklist.erase(addr);
+ }
+
+ for (auto& i : inc.new_crush_node_flags) {
+ if (i.second) {
+ crush_node_flags[i.first] = i.second;
+ } else {
+ crush_node_flags.erase(i.first);
+ }
+ }
+
+ for (auto& i : inc.new_device_class_flags) {
+ if (i.second) {
+ device_class_flags[i.first] = i.second;
+ } else {
+ device_class_flags.erase(i.first);
+ }
+ }
+
+ // cluster snapshot?
+ if (inc.cluster_snapshot.length()) {
+ cluster_snapshot = inc.cluster_snapshot;
+ cluster_snapshot_epoch = inc.epoch;
+ } else {
+ cluster_snapshot.clear();
+ cluster_snapshot_epoch = 0;
+ }
+
+ if (inc.new_nearfull_ratio >= 0) {
+ nearfull_ratio = inc.new_nearfull_ratio;
+ }
+ if (inc.new_backfillfull_ratio >= 0) {
+ backfillfull_ratio = inc.new_backfillfull_ratio;
+ }
+ if (inc.new_full_ratio >= 0) {
+ full_ratio = inc.new_full_ratio;
+ }
+ if (inc.new_require_min_compat_client > ceph_release_t::unknown) {
+ require_min_compat_client = inc.new_require_min_compat_client;
+ }
+ if (inc.new_require_osd_release >= ceph_release_t::unknown) {
+ require_osd_release = inc.new_require_osd_release;
+ if (require_osd_release >= ceph_release_t::luminous) {
+ flags &= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS);
+ flags |= CEPH_OSDMAP_RECOVERY_DELETES;
+ }
+ }
+
+ if (inc.new_require_osd_release >= ceph_release_t::unknown) {
+ require_osd_release = inc.new_require_osd_release;
+ if (require_osd_release >= ceph_release_t::nautilus) {
+ flags |= CEPH_OSDMAP_PGLOG_HARDLIMIT;
+ }
+ }
+ // do new crush map last (after up/down stuff)
+ if (inc.crush.length()) {
+ ceph::buffer::list bl(inc.crush);
+ auto blp = bl.cbegin();
+ crush.reset(new CrushWrapper);
+ crush->decode(blp);
+ if (require_osd_release >= ceph_release_t::luminous) {
+ // only increment if this is a luminous-encoded osdmap, lest
+ // the mon's crush_version diverge from what the osds or others
+ // are decoding and applying on their end. if we won't encode
+ // it in the canonical version, don't change it.
+ ++crush_version;
+ }
+ for (auto it = device_class_flags.begin();
+ it != device_class_flags.end();) {
+ const char* class_name = crush->get_class_name(it->first);
+ if (!class_name) // device class is gone
+ it = device_class_flags.erase(it);
+ else
+ it++;
+ }
+ }
+
+ if (inc.change_stretch_mode) {
+ stretch_mode_enabled = inc.stretch_mode_enabled;
+ stretch_bucket_count = inc.new_stretch_bucket_count;
+ degraded_stretch_mode = inc.new_degraded_stretch_mode;
+ recovering_stretch_mode = inc.new_recovering_stretch_mode;
+ stretch_mode_bucket = inc.new_stretch_mode_bucket;
+ }
+
+ calc_num_osds();
+ _calc_up_osd_features();
+ return 0;
+}
+
+// mapping
+int OSDMap::map_to_pg(
+ int64_t poolid,
+ const string& name,
+ const string& key,
+ const string& nspace,
+ pg_t *pg) const
+{
+ // calculate ps (placement seed)
+ const pg_pool_t *pool = get_pg_pool(poolid);
+ if (!pool)
+ return -ENOENT;
+ ps_t ps;
+ if (!key.empty())
+ ps = pool->hash_key(key, nspace);
+ else
+ ps = pool->hash_key(name, nspace);
+ *pg = pg_t(ps, poolid);
+ return 0;
+}
+
+int OSDMap::object_locator_to_pg(
+ const object_t& oid, const object_locator_t& loc, pg_t &pg) const
+{
+ if (loc.hash >= 0) {
+ if (!get_pg_pool(loc.get_pool())) {
+ return -ENOENT;
+ }
+ pg = pg_t(loc.hash, loc.get_pool());
+ return 0;
+ }
+ return map_to_pg(loc.get_pool(), oid.name, loc.key, loc.nspace, &pg);
+}
+
+ceph_object_layout OSDMap::make_object_layout(
+ object_t oid, int pg_pool, string nspace) const
+{
+ object_locator_t loc(pg_pool, nspace);
+
+ ceph_object_layout ol;
+ pg_t pgid = object_locator_to_pg(oid, loc);
+ ol.ol_pgid = pgid.get_old_pg().v;
+ ol.ol_stripe_unit = 0;
+ return ol;
+}
+
+void OSDMap::_remove_nonexistent_osds(const pg_pool_t& pool,
+ vector<int>& osds) const
+{
+ if (pool.can_shift_osds()) {
+ unsigned removed = 0;
+ for (unsigned i = 0; i < osds.size(); i++) {
+ if (!exists(osds[i])) {
+ removed++;
+ continue;
+ }
+ if (removed) {
+ osds[i - removed] = osds[i];
+ }
+ }
+ if (removed)
+ osds.resize(osds.size() - removed);
+ } else {
+ for (auto& osd : osds) {
+ if (!exists(osd))
+ osd = CRUSH_ITEM_NONE;
+ }
+ }
+}
+
+void OSDMap::_pg_to_raw_osds(
+ const pg_pool_t& pool, pg_t pg,
+ vector<int> *osds,
+ ps_t *ppps) const
+{
+ // map to osds[]
+ ps_t pps = pool.raw_pg_to_pps(pg); // placement ps
+ unsigned size = pool.get_size();
+
+ // what crush rule?
+ int ruleno = crush->find_rule(pool.get_crush_rule(), pool.get_type(), size);
+ if (ruleno >= 0)
+ crush->do_rule(ruleno, pps, *osds, size, osd_weight, pg.pool());
+
+ _remove_nonexistent_osds(pool, *osds);
+
+ if (ppps)
+ *ppps = pps;
+}
+
+int OSDMap::_pick_primary(const vector<int>& osds) const
+{
+ for (auto osd : osds) {
+ if (osd != CRUSH_ITEM_NONE) {
+ return osd;
+ }
+ }
+ return -1;
+}
+
+void OSDMap::_apply_upmap(const pg_pool_t& pi, pg_t raw_pg, vector<int> *raw) const
+{
+ pg_t pg = pi.raw_pg_to_pg(raw_pg);
+ auto p = pg_upmap.find(pg);
+ if (p != pg_upmap.end()) {
+ // make sure targets aren't marked out
+ for (auto osd : p->second) {
+ if (osd != CRUSH_ITEM_NONE && osd < max_osd && osd >= 0 &&
+ osd_weight[osd] == 0) {
+ // reject/ignore the explicit mapping
+ return;
+ }
+ }
+ *raw = vector<int>(p->second.begin(), p->second.end());
+ // continue to check and apply pg_upmap_items if any
+ }
+
+ auto q = pg_upmap_items.find(pg);
+ if (q != pg_upmap_items.end()) {
+ // NOTE: this approach does not allow a bidirectional swap,
+ // e.g., [[1,2],[2,1]] applied to [0,1,2] -> [0,2,1].
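+ // For illustration: items [[3,6]] applied to raw [1,3,5] yields [1,6,5],
+ // provided osd.6 is not already present and is not marked out.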
+ for (auto& r : q->second) {
+ // make sure the replacement value doesn't already appear
+ bool exists = false;
+ ssize_t pos = -1;
+ for (unsigned i = 0; i < raw->size(); ++i) {
+ int osd = (*raw)[i];
+ if (osd == r.second) {
+ exists = true;
+ break;
+ }
+ // ignore mapping if target is marked out (or invalid osd id)
+ if (osd == r.first &&
+ pos < 0 &&
+ !(r.second != CRUSH_ITEM_NONE && r.second < max_osd &&
+ r.second >= 0 && osd_weight[r.second] == 0)) {
+ pos = i;
+ }
+ }
+ if (!exists && pos >= 0) {
+ (*raw)[pos] = r.second;
+ }
+ }
+ }
+}
+
+// pg -> (up osd list)
+void OSDMap::_raw_to_up_osds(const pg_pool_t& pool, const vector<int>& raw,
+ vector<int> *up) const
+{
+ if (pool.can_shift_osds()) {
+ // shift left
+ up->clear();
+ up->reserve(raw.size());
+ for (unsigned i=0; i<raw.size(); i++) {
+ if (!exists(raw[i]) || is_down(raw[i]))
+ continue;
+ up->push_back(raw[i]);
+ }
+ } else {
+ // set down/dne devices to NONE
+ up->resize(raw.size());
+ for (int i = raw.size() - 1; i >= 0; --i) {
+ if (!exists(raw[i]) || is_down(raw[i])) {
+ (*up)[i] = CRUSH_ITEM_NONE;
+ } else {
+ (*up)[i] = raw[i];
+ }
+ }
+ }
+}
+
+void OSDMap::_apply_primary_affinity(ps_t seed,
+ const pg_pool_t& pool,
+ vector<int> *osds,
+ int *primary) const
+{
+ // do we have any non-default primary_affinity values for these osds?
+ if (!osd_primary_affinity)
+ return;
+
+ bool any = false;
+ for (const auto osd : *osds) {
+ if (osd != CRUSH_ITEM_NONE &&
+ (*osd_primary_affinity)[osd] != CEPH_OSD_DEFAULT_PRIMARY_AFFINITY) {
+ any = true;
+ break;
+ }
+ }
+ if (!any)
+ return;
+
+ // pick the primary. feed both the seed (for the pg) and the osd
+ // into the hash/rng so that a proportional fraction of an osd's pgs
+ // get rejected as primary.
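+ // e.g. an osd with affinity CEPH_OSD_MAX_PRIMARY_AFFINITY/2 is passed over
+ // as first choice for roughly half of the pgs in which it appears, while
+ // full affinity never enters the rejection branch at all.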
+ int pos = -1;
+ for (unsigned i = 0; i < osds->size(); ++i) {
+ int o = (*osds)[i];
+ if (o == CRUSH_ITEM_NONE)
+ continue;
+ unsigned a = (*osd_primary_affinity)[o];
+ if (a < CEPH_OSD_MAX_PRIMARY_AFFINITY &&
+ (crush_hash32_2(CRUSH_HASH_RJENKINS1,
+ seed, o) >> 16) >= a) {
+ // we chose not to use this primary. note it anyway as a
+ // fallback in case we don't pick anyone else, but keep looking.
+ if (pos < 0)
+ pos = i;
+ } else {
+ pos = i;
+ break;
+ }
+ }
+ if (pos < 0)
+ return;
+
+ *primary = (*osds)[pos];
+
+ if (pool.can_shift_osds() && pos > 0) {
+ // move the new primary to the front.
+ for (int i = pos; i > 0; --i) {
+ (*osds)[i] = (*osds)[i-1];
+ }
+ (*osds)[0] = *primary;
+ }
+}
+
+void OSDMap::_get_temp_osds(const pg_pool_t& pool, pg_t pg,
+ vector<int> *temp_pg, int *temp_primary) const
+{
+ pg = pool.raw_pg_to_pg(pg);
+ const auto p = pg_temp->find(pg);
+ temp_pg->clear();
+ if (p != pg_temp->end()) {
+ for (unsigned i=0; i<p->second.size(); i++) {
+ if (!exists(p->second[i]) || is_down(p->second[i])) {
+ if (pool.can_shift_osds()) {
+ continue;
+ } else {
+ temp_pg->push_back(CRUSH_ITEM_NONE);
+ }
+ } else {
+ temp_pg->push_back(p->second[i]);
+ }
+ }
+ }
+ const auto &pp = primary_temp->find(pg);
+ *temp_primary = -1;
+ if (pp != primary_temp->end()) {
+ *temp_primary = pp->second;
+ } else if (!temp_pg->empty()) { // apply pg_temp's primary
+ for (unsigned i = 0; i < temp_pg->size(); ++i) {
+ if ((*temp_pg)[i] != CRUSH_ITEM_NONE) {
+ *temp_primary = (*temp_pg)[i];
+ break;
+ }
+ }
+ }
+}
+
+void OSDMap::pg_to_raw_osds(pg_t pg, vector<int> *raw, int *primary) const
+{
+ const pg_pool_t *pool = get_pg_pool(pg.pool());
+ if (!pool) {
+ *primary = -1;
+ raw->clear();
+ return;
+ }
+ _pg_to_raw_osds(*pool, pg, raw, NULL);
+ *primary = _pick_primary(*raw);
+}
+
+void OSDMap::pg_to_raw_upmap(pg_t pg, vector<int>*raw,
+ vector<int> *raw_upmap) const
+{
+ auto pool = get_pg_pool(pg.pool());
+ if (!pool) {
+ raw_upmap->clear();
+ return;
+ }
+ _pg_to_raw_osds(*pool, pg, raw, NULL);
+ *raw_upmap = *raw;
+ _apply_upmap(*pool, pg, raw_upmap);
+}
+
+void OSDMap::pg_to_raw_up(pg_t pg, vector<int> *up, int *primary) const
+{
+ const pg_pool_t *pool = get_pg_pool(pg.pool());
+ if (!pool) {
+ *primary = -1;
+ up->clear();
+ return;
+ }
+ vector<int> raw;
+ ps_t pps;
+ _pg_to_raw_osds(*pool, pg, &raw, &pps);
+ _apply_upmap(*pool, pg, &raw);
+ _raw_to_up_osds(*pool, raw, up);
+ *primary = _pick_primary(raw);
+ _apply_primary_affinity(pps, *pool, up, primary);
+}
+
+void OSDMap::_pg_to_up_acting_osds(
+ const pg_t& pg, vector<int> *up, int *up_primary,
+ vector<int> *acting, int *acting_primary,
+ bool raw_pg_to_pg) const
+{
+ const pg_pool_t *pool = get_pg_pool(pg.pool());
+ if (!pool ||
+ (!raw_pg_to_pg && pg.ps() >= pool->get_pg_num())) {
+ if (up)
+ up->clear();
+ if (up_primary)
+ *up_primary = -1;
+ if (acting)
+ acting->clear();
+ if (acting_primary)
+ *acting_primary = -1;
+ return;
+ }
+ vector<int> raw;
+ vector<int> _up;
+ vector<int> _acting;
+ int _up_primary;
+ int _acting_primary;
+ ps_t pps;
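+ // acting comes from pg_temp/primary_temp when set; otherwise it falls
+ // back to the computed up set below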
+ _get_temp_osds(*pool, pg, &_acting, &_acting_primary);
+ if (_acting.empty() || up || up_primary) {
+ _pg_to_raw_osds(*pool, pg, &raw, &pps);
+ _apply_upmap(*pool, pg, &raw);
+ _raw_to_up_osds(*pool, raw, &_up);
+ _up_primary = _pick_primary(_up);
+ _apply_primary_affinity(pps, *pool, &_up, &_up_primary);
+ if (_acting.empty()) {
+ _acting = _up;
+ if (_acting_primary == -1) {
+ _acting_primary = _up_primary;
+ }
+ }
+
+ if (up)
+ up->swap(_up);
+ if (up_primary)
+ *up_primary = _up_primary;
+ }
+
+ if (acting)
+ acting->swap(_acting);
+ if (acting_primary)
+ *acting_primary = _acting_primary;
+}
+
+int OSDMap::calc_pg_role_broken(int osd, const vector<int>& acting, int nrep)
+{
+ // This implementation is broken for EC PGs since the osd may appear
+ // multiple times in the acting set. See
+ // https://tracker.ceph.com/issues/43213
+ if (!nrep)
+ nrep = acting.size();
+ for (int i=0; i<nrep; i++)
+ if (acting[i] == osd)
+ return i;
+ return -1;
+}
+
+int OSDMap::calc_pg_role(pg_shard_t who, const vector<int>& acting)
+{
+ int nrep = acting.size();
+ if (who.shard == shard_id_t::NO_SHARD) {
+ for (int i=0; i<nrep; i++) {
+ if (acting[i] == who.osd) {
+ return i;
+ }
+ }
+ } else {
+ if (who.shard < nrep && acting[who.shard] == who.osd) {
+ return who.shard;
+ }
+ }
+ return -1;
+}
+
+bool OSDMap::primary_changed_broken(
+ int oldprimary,
+ const vector<int> &oldacting,
+ int newprimary,
+ const vector<int> &newacting)
+{
+ if (oldacting.empty() && newacting.empty())
+ return false; // both still empty
+ if (oldacting.empty() ^ newacting.empty())
+ return true; // was empty, now not, or vice versa
+ if (oldprimary != newprimary)
+ return true; // primary changed
+ if (calc_pg_role_broken(oldprimary, oldacting) !=
+ calc_pg_role_broken(newprimary, newacting))
+ return true;
+ return false; // same primary (tho replicas may have changed)
+}
+
+uint64_t OSDMap::get_encoding_features() const
+{
+ uint64_t f = SIGNIFICANT_FEATURES;
+ if (require_osd_release < ceph_release_t::octopus) {
+ f &= ~CEPH_FEATURE_SERVER_OCTOPUS;
+ }
+ if (require_osd_release < ceph_release_t::nautilus) {
+ f &= ~CEPH_FEATURE_SERVER_NAUTILUS;
+ }
+ if (require_osd_release < ceph_release_t::mimic) {
+ f &= ~CEPH_FEATURE_SERVER_MIMIC;
+ }
+ if (require_osd_release < ceph_release_t::luminous) {
+ f &= ~(CEPH_FEATURE_SERVER_LUMINOUS |
+ CEPH_FEATURE_CRUSH_CHOOSE_ARGS);
+ }
+ if (require_osd_release < ceph_release_t::kraken) {
+ f &= ~(CEPH_FEATURE_SERVER_KRAKEN |
+ CEPH_FEATURE_MSG_ADDR2);
+ }
+ if (require_osd_release < ceph_release_t::jewel) {
+ f &= ~(CEPH_FEATURE_SERVER_JEWEL |
+ CEPH_FEATURE_NEW_OSDOP_ENCODING |
+ CEPH_FEATURE_CRUSH_TUNABLES5);
+ }
+ return f;
+}
+
+// serialize, unserialize
+void OSDMap::encode_client_old(ceph::buffer::list& bl) const
+{
+ using ceph::encode;
+ __u16 v = 5;
+ encode(v, bl);
+
+ // base
+ encode(fsid, bl);
+ encode(epoch, bl);
+ encode(created, bl);
+ encode(modified, bl);
+
+ // for encode(pools, bl);
+ __u32 n = pools.size();
+ encode(n, bl);
+
+ for (const auto &pool : pools) {
+ n = pool.first;
+ encode(n, bl);
+ encode(pool.second, bl, 0);
+ }
+ // for encode(pool_name, bl);
+ n = pool_name.size();
+ encode(n, bl);
+ for (const auto &pname : pool_name) {
+ n = pname.first;
+ encode(n, bl);
+ encode(pname.second, bl);
+ }
+ // for encode(pool_max, bl);
+ n = pool_max;
+ encode(n, bl);
+
+ encode(flags, bl);
+
+ encode(max_osd, bl);
+ {
+ uint32_t n = osd_state.size();
+ encode(n, bl);
+ for (auto s : osd_state) {
+ encode((uint8_t)s, bl);
+ }
+ }
+ encode(osd_weight, bl);
+ encode(osd_addrs->client_addrs, bl, 0);
+
+ // for encode(pg_temp, bl);
+ n = pg_temp->size();
+ encode(n, bl);
+ for (const auto& pg : *pg_temp) {
+ old_pg_t opg = pg.first.get_old_pg();
+ encode(opg, bl);
+ encode(pg.second, bl);
+ }
+
+ // crush
+ ceph::buffer::list cbl;
+ crush->encode(cbl, 0 /* legacy (no) features */);
+ encode(cbl, bl);
+}
+
+void OSDMap::encode_classic(ceph::buffer::list& bl, uint64_t features) const
+{
+ using ceph::encode;
+ if ((features & CEPH_FEATURE_PGID64) == 0) {
+ encode_client_old(bl);
+ return;
+ }
+
+ __u16 v = 6;
+ encode(v, bl);
+
+ // base
+ encode(fsid, bl);
+ encode(epoch, bl);
+ encode(created, bl);
+ encode(modified, bl);
+
+ encode(pools, bl, features);
+ encode(pool_name, bl);
+ encode(pool_max, bl);
+
+ encode(flags, bl);
+
+ encode(max_osd, bl);
+ {
+ uint32_t n = osd_state.size();
+ encode(n, bl);
+ for (auto s : osd_state) {
+ encode((uint8_t)s, bl);
+ }
+ }
+ encode(osd_weight, bl);
+ encode(osd_addrs->client_addrs, bl, features);
+
+ encode(*pg_temp, bl);
+
+ // crush
+ ceph::buffer::list cbl;
+ crush->encode(cbl, 0 /* legacy (no) features */);
+ encode(cbl, bl);
+
+ // extended
+ __u16 ev = 10;
+ encode(ev, bl);
+ encode(osd_addrs->hb_back_addrs, bl, features);
+ encode(osd_info, bl);
+ encode(blocklist, bl, features);
+ encode(osd_addrs->cluster_addrs, bl, features);
+ encode(cluster_snapshot_epoch, bl);
+ encode(cluster_snapshot, bl);
+ encode(*osd_uuid, bl);
+ encode(osd_xinfo, bl, features);
+ encode(osd_addrs->hb_front_addrs, bl, features);
+}
+
+/* for a description of osdmap versions, and when they were introduced, please
+ * refer to
+ * doc/dev/osd_internals/osdmap_versions.txt
+ */
+void OSDMap::encode(ceph::buffer::list& bl, uint64_t features) const
+{
+ using ceph::encode;
+ if ((features & CEPH_FEATURE_OSDMAP_ENC) == 0) {
+ encode_classic(bl, features);
+ return;
+ }
+
+ // only a select set of callers should *ever* be encoding new
+ // OSDMaps. others should be passing around the canonical encoded
+ // buffers from on high. select out those callers by passing in an
+ // "impossible" feature bit.
+ ceph_assert(features & CEPH_FEATURE_RESERVED);
+ features &= ~CEPH_FEATURE_RESERVED;
+
+ size_t start_offset = bl.length();
+ size_t tail_offset;
+ size_t crc_offset;
+ std::optional<ceph::buffer::list::contiguous_filler> crc_filler;
+
+ // meta-encoding: how we include client-used and osd-specific data
+ ENCODE_START(8, 7, bl);
+
+ {
+ // NOTE: any new encoding dependencies must be reflected by
+ // SIGNIFICANT_FEATURES
+ uint8_t v = 9;
+ if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
+ v = 3;
+ } else if (!HAVE_FEATURE(features, SERVER_MIMIC)) {
+ v = 6;
+ } else if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {
+ v = 7;
+ }
+ ENCODE_START(v, 1, bl); // client-usable data
+ // base
+ encode(fsid, bl);
+ encode(epoch, bl);
+ encode(created, bl);
+ encode(modified, bl);
+
+ encode(pools, bl, features);
+ encode(pool_name, bl);
+ encode(pool_max, bl);
+
+ if (v < 4) {
+ decltype(flags) f = flags;
+ if (require_osd_release >= ceph_release_t::luminous)
+ f |= CEPH_OSDMAP_REQUIRE_LUMINOUS | CEPH_OSDMAP_RECOVERY_DELETES;
+ else if (require_osd_release == ceph_release_t::kraken)
+ f |= CEPH_OSDMAP_REQUIRE_KRAKEN;
+ else if (require_osd_release == ceph_release_t::jewel)
+ f |= CEPH_OSDMAP_REQUIRE_JEWEL;
+ encode(f, bl);
+ } else {
+ encode(flags, bl);
+ }
+
+ encode(max_osd, bl);
+ if (v >= 5) {
+ encode(osd_state, bl);
+ } else {
+ uint32_t n = osd_state.size();
+ encode(n, bl);
+ for (auto s : osd_state) {
+ encode((uint8_t)s, bl);
+ }
+ }
+ encode(osd_weight, bl);
+ if (v >= 8) {
+ encode(osd_addrs->client_addrs, bl, features);
+ } else {
+ encode_addrvec_pvec_as_addr(osd_addrs->client_addrs, bl, features);
+ }
+
+ encode(*pg_temp, bl);
+ encode(*primary_temp, bl);
+ if (osd_primary_affinity) {
+ encode(*osd_primary_affinity, bl);
+ } else {
+ vector<__u32> v;
+ encode(v, bl);
+ }
+
+ // crush
+ ceph::buffer::list cbl;
+ crush->encode(cbl, features);
+ encode(cbl, bl);
+ encode(erasure_code_profiles, bl);
+
+ if (v >= 4) {
+ encode(pg_upmap, bl);
+ encode(pg_upmap_items, bl);
+ } else {
+ ceph_assert(pg_upmap.empty());
+ ceph_assert(pg_upmap_items.empty());
+ }
+ if (v >= 6) {
+ encode(crush_version, bl);
+ }
+ if (v >= 7) {
+ encode(new_removed_snaps, bl);
+ encode(new_purged_snaps, bl);
+ }
+ if (v >= 9) {
+ encode(last_up_change, bl);
+ encode(last_in_change, bl);
+ }
+ ENCODE_FINISH(bl); // client-usable data
+ }
+
+ {
+ // NOTE: any new encoding dependencies must be reflected by
+ // SIGNIFICANT_FEATURES
+ uint8_t target_v = 9; // when bumping this, be aware of range blocklist
+ if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
+ target_v = 1;
+ } else if (!HAVE_FEATURE(features, SERVER_MIMIC)) {
+ target_v = 5;
+ } else if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {
+ target_v = 6;
+ }
+ if (stretch_mode_enabled) {
+ target_v = std::max((uint8_t)10, target_v);
+ }
+ if (!range_blocklist.empty()) {
+ target_v = std::max((uint8_t)11, target_v);
+ }
+ ENCODE_START(target_v, 1, bl); // extended, osd-only data
+ if (target_v < 7) {
+ encode_addrvec_pvec_as_addr(osd_addrs->hb_back_addrs, bl, features);
+ } else {
+ encode(osd_addrs->hb_back_addrs, bl, features);
+ }
+ encode(osd_info, bl);
+ {
+ // put this in a sorted, ordered map<> so that we encode in a
+ // deterministic order.
+ map<entity_addr_t,utime_t> blocklist_map;
+ for (const auto &addr : blocklist)
+ blocklist_map.insert(make_pair(addr.first, addr.second));
+ encode(blocklist_map, bl, features);
+ }
+ if (target_v < 7) {
+ encode_addrvec_pvec_as_addr(osd_addrs->cluster_addrs, bl, features);
+ } else {
+ encode(osd_addrs->cluster_addrs, bl, features);
+ }
+ encode(cluster_snapshot_epoch, bl);
+ encode(cluster_snapshot, bl);
+ encode(*osd_uuid, bl);
+ encode(osd_xinfo, bl, features);
+ if (target_v < 7) {
+ encode_addrvec_pvec_as_addr(osd_addrs->hb_front_addrs, bl, features);
+ } else {
+ encode(osd_addrs->hb_front_addrs, bl, features);
+ }
+ if (target_v >= 2) {
+ encode(nearfull_ratio, bl);
+ encode(full_ratio, bl);
+ encode(backfillfull_ratio, bl);
+ }
+ // 4 was string-based new_require_min_compat_client
+ if (target_v >= 5) {
+ encode(require_min_compat_client, bl);
+ encode(require_osd_release, bl);
+ }
+ if (target_v >= 6) {
+ encode(removed_snaps_queue, bl);
+ }
+ if (target_v >= 8) {
+ encode(crush_node_flags, bl);
+ }
+ if (target_v >= 9) {
+ encode(device_class_flags, bl);
+ }
+ if (target_v >= 10) {
+ encode(stretch_mode_enabled, bl);
+ encode(stretch_bucket_count, bl);
+ encode(degraded_stretch_mode, bl);
+ encode(recovering_stretch_mode, bl);
+ encode(stretch_mode_bucket, bl);
+ }
+ if (target_v >= 11) {
+ ::encode(range_blocklist, bl, features);
+ }
+ ENCODE_FINISH(bl); // osd-only data
+ }
+
+ crc_offset = bl.length();
+ crc_filler = bl.append_hole(sizeof(uint32_t));
+ tail_offset = bl.length();
+
+ ENCODE_FINISH(bl); // meta-encoding wrapper
+
+ // fill in crc
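+ // the crc covers everything encoded so far except the 4-byte hole
+ // reserved above; the result is then copied back into that hole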
+ ceph::buffer::list front;
+ front.substr_of(bl, start_offset, crc_offset - start_offset);
+ crc = front.crc32c(-1);
+ if (tail_offset < bl.length()) {
+ ceph::buffer::list tail;
+ tail.substr_of(bl, tail_offset, bl.length() - tail_offset);
+ crc = tail.crc32c(crc);
+ }
+ ceph_le32 crc_le;
+ crc_le = crc;
+ crc_filler->copy_in(4, (char*)&crc_le);
+ crc_defined = true;
+}
+
+/* for a description of osdmap versions, and when they were introduced, please
+ * refer to
+ * doc/dev/osd_internals/osdmap_versions.txt
+ */
+void OSDMap::decode(ceph::buffer::list& bl)
+{
+ auto p = bl.cbegin();
+ decode(p);
+}
+
+void OSDMap::decode_classic(ceph::buffer::list::const_iterator& p)
+{
+ using ceph::decode;
+ __u32 n, t;
+ __u16 v;
+ decode(v, p);
+
+ // base
+ decode(fsid, p);
+ decode(epoch, p);
+ decode(created, p);
+ decode(modified, p);
+
+ if (v < 6) {
+ if (v < 4) {
+ int32_t max_pools = 0;
+ decode(max_pools, p);
+ pool_max = max_pools;
+ }
+ pools.clear();
+ decode(n, p);
+ while (n--) {
+ decode(t, p);
+ decode(pools[t], p);
+ }
+ if (v == 4) {
+ decode(n, p);
+ pool_max = n;
+ } else if (v == 5) {
+ pool_name.clear();
+ decode(n, p);
+ while (n--) {
+ decode(t, p);
+ decode(pool_name[t], p);
+ }
+ decode(n, p);
+ pool_max = n;
+ }
+ } else {
+ decode(pools, p);
+ decode(pool_name, p);
+ decode(pool_max, p);
+ }
+ // kludge around some old bug that zeroed out pool_max (#2307)
+ if (pools.size() && pool_max < pools.rbegin()->first) {
+ pool_max = pools.rbegin()->first;
+ }
+
+ decode(flags, p);
+
+ decode(max_osd, p);
+ {
+ vector<uint8_t> os;
+ decode(os, p);
+ osd_state.resize(os.size());
+ for (unsigned i = 0; i < os.size(); ++i) {
+ osd_state[i] = os[i];
+ }
+ }
+ decode(osd_weight, p);
+ decode(osd_addrs->client_addrs, p);
+ if (v <= 5) {
+ pg_temp->clear();
+ decode(n, p);
+ while (n--) {
+ old_pg_t opg;
+ ceph::decode_raw(opg, p);
+ mempool::osdmap::vector<int32_t> v;
+ decode(v, p);
+ pg_temp->set(pg_t(opg), v);
+ }
+ } else {
+ decode(*pg_temp, p);
+ }
+
+ // crush
+ ceph::buffer::list cbl;
+ decode(cbl, p);
+ auto cblp = cbl.cbegin();
+ crush->decode(cblp);
+
+ // extended
+ __u16 ev = 0;
+ if (v >= 5)
+ decode(ev, p);
+ decode(osd_addrs->hb_back_addrs, p);
+ decode(osd_info, p);
+ if (v < 5)
+ decode(pool_name, p);
+
+ decode(blocklist, p);
+ if (ev >= 6)
+ decode(osd_addrs->cluster_addrs, p);
+ else
+ osd_addrs->cluster_addrs.resize(osd_addrs->client_addrs.size());
+
+ if (ev >= 7) {
+ decode(cluster_snapshot_epoch, p);
+ decode(cluster_snapshot, p);
+ }
+
+ if (ev >= 8) {
+ decode(*osd_uuid, p);
+ } else {
+ osd_uuid->resize(max_osd);
+ }
+ if (ev >= 9)
+ decode(osd_xinfo, p);
+ else
+ osd_xinfo.resize(max_osd);
+
+ if (ev >= 10)
+ decode(osd_addrs->hb_front_addrs, p);
+ else
+ osd_addrs->hb_front_addrs.resize(osd_addrs->hb_back_addrs.size());
+
+ osd_primary_affinity.reset();
+
+ post_decode();
+}
+
+void OSDMap::decode(ceph::buffer::list::const_iterator& bl)
+{
+ using ceph::decode;
+ /**
+ * Older encodings of the OSDMap had a single struct_v which
+ * covered the whole encoding, and was prior to our modern
+ * stuff which includes a compatv and a size. So if we see
+ * a struct_v < 7, we must rewind to the beginning and use our
+ * classic decoder.
+ */
+ size_t start_offset = bl.get_off();
+ size_t tail_offset = 0;
+ ceph::buffer::list crc_front, crc_tail;
+
+ DECODE_START_LEGACY_COMPAT_LEN(8, 7, 7, bl); // wrapper
+ if (struct_v < 7) {
+ bl.seek(start_offset);
+ decode_classic(bl);
+ return;
+ }
+ /**
+ * Since we made it past that hurdle, we can use our normal paths.
+ */
+ {
+ DECODE_START(9, bl); // client-usable data
+ // base
+ decode(fsid, bl);
+ decode(epoch, bl);
+ decode(created, bl);
+ decode(modified, bl);
+
+ decode(pools, bl);
+ decode(pool_name, bl);
+ decode(pool_max, bl);
+
+ decode(flags, bl);
+
+ decode(max_osd, bl);
+ if (struct_v >= 5) {
+ decode(osd_state, bl);
+ } else {
+ vector<uint8_t> os;
+ decode(os, bl);
+ osd_state.resize(os.size());
+ for (unsigned i = 0; i < os.size(); ++i) {
+ osd_state[i] = os[i];
+ }
+ }
+ decode(osd_weight, bl);
+ decode(osd_addrs->client_addrs, bl);
+
+ decode(*pg_temp, bl);
+ decode(*primary_temp, bl);
+ // dates back to firefly. version increased from 2 to 3 still in firefly.
+ // do we really still need to keep this around? even for old clients?
+ if (struct_v >= 2) {
+ osd_primary_affinity.reset(new mempool::osdmap::vector<__u32>);
+ decode(*osd_primary_affinity, bl);
+ if (osd_primary_affinity->empty())
+ osd_primary_affinity.reset();
+ } else {
+ osd_primary_affinity.reset();
+ }
+
+ // crush
+ ceph::buffer::list cbl;
+ decode(cbl, bl);
+ auto cblp = cbl.cbegin();
+ crush->decode(cblp);
+ // added in firefly; version increased in luminous, so it affects
+    // giant, hammer, infernalis, jewel, and kraken. probably should be left
+ // alone until we require clients to be all luminous?
+ if (struct_v >= 3) {
+ decode(erasure_code_profiles, bl);
+ } else {
+ erasure_code_profiles.clear();
+ }
+ // version increased from 3 to 4 still in luminous, so same as above
+ // applies.
+ if (struct_v >= 4) {
+ decode(pg_upmap, bl);
+ decode(pg_upmap_items, bl);
+ } else {
+ pg_upmap.clear();
+ pg_upmap_items.clear();
+ }
+ // again, version increased from 5 to 6 still in luminous, so above
+ // applies.
+ if (struct_v >= 6) {
+ decode(crush_version, bl);
+ }
+ // version increase from 6 to 7 in mimic
+ if (struct_v >= 7) {
+ decode(new_removed_snaps, bl);
+ decode(new_purged_snaps, bl);
+ }
+ // version increase from 7 to 8, 8 to 9, in nautilus.
+ if (struct_v >= 9) {
+ decode(last_up_change, bl);
+ decode(last_in_change, bl);
+ }
+ DECODE_FINISH(bl); // client-usable data
+ }
+
+ {
+ DECODE_START(10, bl); // extended, osd-only data
+ decode(osd_addrs->hb_back_addrs, bl);
+ decode(osd_info, bl);
+ decode(blocklist, bl);
+ decode(osd_addrs->cluster_addrs, bl);
+ decode(cluster_snapshot_epoch, bl);
+ decode(cluster_snapshot, bl);
+ decode(*osd_uuid, bl);
+ decode(osd_xinfo, bl);
+ decode(osd_addrs->hb_front_addrs, bl);
+    // full/nearfull ratios were added in v2 of this (extended) section,
+    // backfillfull in v3
+ if (struct_v >= 2) {
+ decode(nearfull_ratio, bl);
+ decode(full_ratio, bl);
+ } else {
+ nearfull_ratio = 0;
+ full_ratio = 0;
+ }
+ if (struct_v >= 3) {
+ decode(backfillfull_ratio, bl);
+ } else {
+ backfillfull_ratio = 0;
+ }
+ if (struct_v == 4) {
+ string r;
+ decode(r, bl);
+ if (r.length())
+ require_min_compat_client = ceph_release_from_name(r.c_str());
+ }
+ if (struct_v >= 5) {
+ decode(require_min_compat_client, bl);
+ decode(require_osd_release, bl);
+ if (require_osd_release >= ceph_release_t::nautilus) {
+ flags |= CEPH_OSDMAP_PGLOG_HARDLIMIT;
+ }
+ if (require_osd_release >= ceph_release_t::luminous) {
+ flags &= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS);
+ flags |= CEPH_OSDMAP_RECOVERY_DELETES;
+ }
+ } else {
+ if (flags & CEPH_OSDMAP_REQUIRE_LUMINOUS) {
+ // only for compat with post-kraken pre-luminous test clusters
+ require_osd_release = ceph_release_t::luminous;
+ flags &= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS);
+ flags |= CEPH_OSDMAP_RECOVERY_DELETES;
+ } else if (flags & CEPH_OSDMAP_REQUIRE_KRAKEN) {
+ require_osd_release = ceph_release_t::kraken;
+ } else if (flags & CEPH_OSDMAP_REQUIRE_JEWEL) {
+ require_osd_release = ceph_release_t::jewel;
+ } else {
+ require_osd_release = ceph_release_t::unknown;
+ }
+ }
+ if (struct_v >= 6) {
+ decode(removed_snaps_queue, bl);
+ }
+ if (struct_v >= 8) {
+ decode(crush_node_flags, bl);
+ } else {
+ crush_node_flags.clear();
+ }
+ if (struct_v >= 9) {
+ decode(device_class_flags, bl);
+ } else {
+ device_class_flags.clear();
+ }
+ if (struct_v >= 10) {
+ decode(stretch_mode_enabled, bl);
+ decode(stretch_bucket_count, bl);
+ decode(degraded_stretch_mode, bl);
+ decode(recovering_stretch_mode, bl);
+ decode(stretch_mode_bucket, bl);
+ } else {
+ stretch_mode_enabled = false;
+ stretch_bucket_count = 0;
+ degraded_stretch_mode = 0;
+ recovering_stretch_mode = 0;
+ stretch_mode_bucket = 0;
+ }
+ if (struct_v >= 11) {
+ decode(range_blocklist, bl);
+ calculated_ranges.clear();
+ for (const auto& i : range_blocklist) {
+ calculated_ranges.emplace(i.first, i.first);
+ }
+ }
+ DECODE_FINISH(bl); // osd-only data
+ }
+
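+  // an encoded crc (struct_v >= 8) covers everything before it (front) and
+  // anything after it (tail); record both regions now and verify them once
+  // the wrapper section is finished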
+ if (struct_v >= 8) {
+ crc_front.substr_of(bl.get_bl(), start_offset, bl.get_off() - start_offset);
+ decode(crc, bl);
+ tail_offset = bl.get_off();
+ crc_defined = true;
+ } else {
+ crc_defined = false;
+ crc = 0;
+ }
+
+ DECODE_FINISH(bl); // wrapper
+
+ if (tail_offset) {
+ // verify crc
+ uint32_t actual = crc_front.crc32c(-1);
+ if (tail_offset < bl.get_off()) {
+ ceph::buffer::list tail;
+ tail.substr_of(bl.get_bl(), tail_offset, bl.get_off() - tail_offset);
+ actual = tail.crc32c(actual);
+ }
+ if (crc != actual) {
+ ostringstream ss;
+ ss << "bad crc, actual " << actual << " != expected " << crc;
+ string s = ss.str();
+ throw ceph::buffer::malformed_input(s.c_str());
+ }
+ }
+
+ post_decode();
+}
+
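+// rebuild derived state that is not encoded directly; called from both
+// decode() and decode_classic()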
+void OSDMap::post_decode()
+{
+ // index pool names
+ name_pool.clear();
+ for (const auto &pname : pool_name) {
+ name_pool[pname.second] = pname.first;
+ }
+
+ calc_num_osds();
+ _calc_up_osd_features();
+}
+
+void OSDMap::dump_erasure_code_profiles(
+ const mempool::osdmap::map<string,map<string,string>>& profiles,
+ Formatter *f)
+{
+ f->open_object_section("erasure_code_profiles");
+ for (const auto &profile : profiles) {
+ f->open_object_section(profile.first.c_str());
+ for (const auto &profm : profile.second) {
+ f->dump_string(profm.first.c_str(), profm.second);
+ }
+ f->close_section();
+ }
+ f->close_section();
+}
+
+void OSDMap::dump_osds(Formatter *f) const
+{
+ f->open_array_section("osds");
+ for (int i=0; i<get_max_osd(); i++) {
+ if (exists(i)) {
+ dump_osd(i, f);
+ }
+ }
+ f->close_section();
+}
+
+void OSDMap::dump_osd(int id, Formatter *f) const
+{
+ ceph_assert(f != nullptr);
+ if (!exists(id)) {
+ return;
+ }
+
+ f->open_object_section("osd_info");
+ f->dump_int("osd", id);
+ f->dump_stream("uuid") << get_uuid(id);
+ f->dump_int("up", is_up(id));
+ f->dump_int("in", is_in(id));
+ f->dump_float("weight", get_weightf(id));
+ f->dump_float("primary_affinity", get_primary_affinityf(id));
+ get_info(id).dump(f);
+ f->dump_object("public_addrs", get_addrs(id));
+ f->dump_object("cluster_addrs", get_cluster_addrs(id));
+ f->dump_object("heartbeat_back_addrs", get_hb_back_addrs(id));
+ f->dump_object("heartbeat_front_addrs", get_hb_front_addrs(id));
+ // compat
+ f->dump_stream("public_addr") << get_addrs(id).get_legacy_str();
+ f->dump_stream("cluster_addr") << get_cluster_addrs(id).get_legacy_str();
+ f->dump_stream("heartbeat_back_addr")
+ << get_hb_back_addrs(id).get_legacy_str();
+ f->dump_stream("heartbeat_front_addr")
+ << get_hb_front_addrs(id).get_legacy_str();
+
+ set<string> st;
+ get_state(id, st);
+ f->open_array_section("state");
+ for (const auto &state : st)
+ f->dump_string("state", state);
+ f->close_section();
+
+ f->close_section();
+}
+
+void OSDMap::dump(Formatter *f) const
+{
+ f->dump_int("epoch", get_epoch());
+ f->dump_stream("fsid") << get_fsid();
+ f->dump_stream("created") << get_created();
+ f->dump_stream("modified") << get_modified();
+ f->dump_stream("last_up_change") << last_up_change;
+ f->dump_stream("last_in_change") << last_in_change;
+ f->dump_string("flags", get_flag_string());
+ f->dump_unsigned("flags_num", flags);
+ f->open_array_section("flags_set");
+ set<string> flagset;
+ get_flag_set(&flagset);
+ for (auto p : flagset) {
+ f->dump_string("flag", p);
+ }
+ f->close_section();
+ f->dump_unsigned("crush_version", get_crush_version());
+ f->dump_float("full_ratio", full_ratio);
+ f->dump_float("backfillfull_ratio", backfillfull_ratio);
+ f->dump_float("nearfull_ratio", nearfull_ratio);
+ f->dump_string("cluster_snapshot", get_cluster_snapshot());
+ f->dump_int("pool_max", get_pool_max());
+ f->dump_int("max_osd", get_max_osd());
+ f->dump_string("require_min_compat_client",
+ to_string(require_min_compat_client));
+ f->dump_string("min_compat_client",
+ to_string(get_min_compat_client()));
+ f->dump_string("require_osd_release",
+ to_string(require_osd_release));
+
+ f->open_array_section("pools");
+ for (const auto &pool : pools) {
+ std::string name("<unknown>");
+ const auto &pni = pool_name.find(pool.first);
+ if (pni != pool_name.end())
+ name = pni->second;
+ f->open_object_section("pool");
+ f->dump_int("pool", pool.first);
+ f->dump_string("pool_name", name);
+ pool.second.dump(f);
+ f->close_section();
+ }
+ f->close_section();
+
+ dump_osds(f);
+
+ f->open_array_section("osd_xinfo");
+ for (int i=0; i<get_max_osd(); i++) {
+ if (exists(i)) {
+ f->open_object_section("xinfo");
+ f->dump_int("osd", i);
+ osd_xinfo[i].dump(f);
+ f->close_section();
+ }
+ }
+ f->close_section();
+
+ f->open_array_section("pg_upmap");
+ for (auto& p : pg_upmap) {
+ f->open_object_section("mapping");
+ f->dump_stream("pgid") << p.first;
+ f->open_array_section("osds");
+ for (auto q : p.second) {
+ f->dump_int("osd", q);
+ }
+ f->close_section();
+ f->close_section();
+ }
+ f->close_section();
+ f->open_array_section("pg_upmap_items");
+ for (auto& p : pg_upmap_items) {
+ f->open_object_section("mapping");
+ f->dump_stream("pgid") << p.first;
+ f->open_array_section("mappings");
+ for (auto& q : p.second) {
+ f->open_object_section("mapping");
+ f->dump_int("from", q.first);
+ f->dump_int("to", q.second);
+ f->close_section();
+ }
+ f->close_section();
+ f->close_section();
+ }
+ f->close_section();
+ f->open_array_section("pg_temp");
+ pg_temp->dump(f);
+ f->close_section();
+
+ f->open_array_section("primary_temp");
+ for (const auto &pg : *primary_temp) {
+ f->dump_stream("pgid") << pg.first;
+ f->dump_int("osd", pg.second);
+ }
+ f->close_section(); // primary_temp
+
+ f->open_object_section("blocklist");
+ for (const auto &addr : blocklist) {
+ stringstream ss;
+ ss << addr.first;
+ f->dump_stream(ss.str().c_str()) << addr.second;
+ }
+ f->close_section();
+ f->open_object_section("range_blocklist");
+ for (const auto &addr : range_blocklist) {
+ stringstream ss;
+ ss << addr.first;
+ f->dump_stream(ss.str().c_str()) << addr.second;
+ }
+ f->close_section();
+
+ dump_erasure_code_profiles(erasure_code_profiles, f);
+
+ f->open_array_section("removed_snaps_queue");
+ for (auto& p : removed_snaps_queue) {
+ f->open_object_section("pool");
+ f->dump_int("pool", p.first);
+ f->open_array_section("snaps");
+ for (auto q = p.second.begin(); q != p.second.end(); ++q) {
+ f->open_object_section("interval");
+ f->dump_unsigned("begin", q.get_start());
+ f->dump_unsigned("length", q.get_len());
+ f->close_section();
+ }
+ f->close_section();
+ f->close_section();
+ }
+ f->close_section();
+ f->open_array_section("new_removed_snaps");
+ for (auto& p : new_removed_snaps) {
+ f->open_object_section("pool");
+ f->dump_int("pool", p.first);
+ f->open_array_section("snaps");
+ for (auto q = p.second.begin(); q != p.second.end(); ++q) {
+ f->open_object_section("interval");
+ f->dump_unsigned("begin", q.get_start());
+ f->dump_unsigned("length", q.get_len());
+ f->close_section();
+ }
+ f->close_section();
+ f->close_section();
+ }
+ f->close_section();
+ f->open_array_section("new_purged_snaps");
+ for (auto& p : new_purged_snaps) {
+ f->open_object_section("pool");
+ f->dump_int("pool", p.first);
+ f->open_array_section("snaps");
+ for (auto q = p.second.begin(); q != p.second.end(); ++q) {
+ f->open_object_section("interval");
+ f->dump_unsigned("begin", q.get_start());
+ f->dump_unsigned("length", q.get_len());
+ f->close_section();
+ }
+ f->close_section();
+ f->close_section();
+ }
+ f->close_section();
+ f->open_object_section("crush_node_flags");
+ for (auto& i : crush_node_flags) {
+ string s = crush->item_exists(i.first) ? crush->get_item_name(i.first)
+ : stringify(i.first);
+ f->open_array_section(s.c_str());
+ set<string> st;
+ calc_state_set(i.second, st);
+ for (auto& j : st) {
+ f->dump_string("flag", j);
+ }
+ f->close_section();
+ }
+ f->close_section();
+ f->open_object_section("device_class_flags");
+ for (auto& i : device_class_flags) {
+ const char* class_name = crush->get_class_name(i.first);
+ string s = class_name ? class_name : stringify(i.first);
+ f->open_array_section(s.c_str());
+ set<string> st;
+ calc_state_set(i.second, st);
+ for (auto& j : st) {
+ f->dump_string("flag", j);
+ }
+ f->close_section();
+ }
+ f->close_section();
+ f->open_object_section("stretch_mode");
+ {
+ f->dump_bool("stretch_mode_enabled", stretch_mode_enabled);
+ f->dump_unsigned("stretch_bucket_count", stretch_bucket_count);
+ f->dump_unsigned("degraded_stretch_mode", degraded_stretch_mode);
+ f->dump_unsigned("recovering_stretch_mode", recovering_stretch_mode);
+ f->dump_int("stretch_mode_bucket", stretch_mode_bucket);
+ }
+ f->close_section();
+}
+
+void OSDMap::generate_test_instances(list<OSDMap*>& o)
+{
+ o.push_back(new OSDMap);
+
+ CephContext *cct = new CephContext(CODE_ENVIRONMENT_UTILITY);
+ o.push_back(new OSDMap);
+ uuid_d fsid;
+ o.back()->build_simple(cct, 1, fsid, 16);
+ o.back()->created = o.back()->modified = utime_t(1, 2); // fix timestamp
+ o.back()->blocklist[entity_addr_t()] = utime_t(5, 6);
+ cct->put();
+}
+
+string OSDMap::get_flag_string(unsigned f)
+{
+ string s;
+ if (f & CEPH_OSDMAP_PAUSERD)
+ s += ",pauserd";
+ if (f & CEPH_OSDMAP_PAUSEWR)
+ s += ",pausewr";
+ if (f & CEPH_OSDMAP_PAUSEREC)
+ s += ",pauserec";
+ if (f & CEPH_OSDMAP_NOUP)
+ s += ",noup";
+ if (f & CEPH_OSDMAP_NODOWN)
+ s += ",nodown";
+ if (f & CEPH_OSDMAP_NOOUT)
+ s += ",noout";
+ if (f & CEPH_OSDMAP_NOIN)
+ s += ",noin";
+ if (f & CEPH_OSDMAP_NOBACKFILL)
+ s += ",nobackfill";
+ if (f & CEPH_OSDMAP_NOREBALANCE)
+ s += ",norebalance";
+ if (f & CEPH_OSDMAP_NORECOVER)
+ s += ",norecover";
+ if (f & CEPH_OSDMAP_NOSCRUB)
+ s += ",noscrub";
+ if (f & CEPH_OSDMAP_NODEEP_SCRUB)
+ s += ",nodeep-scrub";
+ if (f & CEPH_OSDMAP_NOTIERAGENT)
+ s += ",notieragent";
+ if (f & CEPH_OSDMAP_NOSNAPTRIM)
+ s += ",nosnaptrim";
+ if (f & CEPH_OSDMAP_SORTBITWISE)
+ s += ",sortbitwise";
+ if (f & CEPH_OSDMAP_REQUIRE_JEWEL)
+ s += ",require_jewel_osds";
+ if (f & CEPH_OSDMAP_REQUIRE_KRAKEN)
+ s += ",require_kraken_osds";
+ if (f & CEPH_OSDMAP_REQUIRE_LUMINOUS)
+ s += ",require_luminous_osds";
+ if (f & CEPH_OSDMAP_RECOVERY_DELETES)
+ s += ",recovery_deletes";
+ if (f & CEPH_OSDMAP_PURGED_SNAPDIRS)
+ s += ",purged_snapdirs";
+ if (f & CEPH_OSDMAP_PGLOG_HARDLIMIT)
+ s += ",pglog_hardlimit";
+ if (s.length())
+ s.erase(0, 1);
+ return s;
+}
+
+string OSDMap::get_flag_string() const
+{
+ return get_flag_string(flags);
+}
+
+void OSDMap::print_pools(ostream& out) const
+{
+ for (const auto &pool : pools) {
+ std::string name("<unknown>");
+ const auto &pni = pool_name.find(pool.first);
+ if (pni != pool_name.end())
+ name = pni->second;
+ out << "pool " << pool.first
+ << " '" << name
+ << "' " << pool.second << "\n";
+
+ for (const auto &snap : pool.second.snaps)
+ out << "\tsnap " << snap.second.snapid << " '" << snap.second.name << "' " << snap.second.stamp << "\n";
+
+ if (!pool.second.removed_snaps.empty())
+ out << "\tremoved_snaps " << pool.second.removed_snaps << "\n";
+ auto p = removed_snaps_queue.find(pool.first);
+ if (p != removed_snaps_queue.end()) {
+ out << "\tremoved_snaps_queue " << p->second << "\n";
+ }
+ }
+ out << std::endl;
+}
+
+void OSDMap::print_osds(ostream& out) const
+{
+ for (int i=0; i<get_max_osd(); i++) {
+ if (exists(i)) {
+ print_osd(i, out);
+ }
+ }
+}
+
+void OSDMap::print_osd(int id, ostream& out) const
+{
+ if (!exists(id)) {
+ return;
+ }
+
+ out << "osd." << id;
+ out << (is_up(id) ? " up ":" down");
+ out << (is_in(id) ? " in ":" out");
+ out << " weight " << get_weightf(id);
+ if (get_primary_affinity(id) != CEPH_OSD_DEFAULT_PRIMARY_AFFINITY) {
+ out << " primary_affinity " << get_primary_affinityf(id);
+ }
+ const osd_info_t& info(get_info(id));
+ out << " " << info;
+ out << " " << get_addrs(id) << " " << get_cluster_addrs(id);
+ set<string> st;
+ get_state(id, st);
+ out << " " << st;
+ if (!get_uuid(id).is_zero()) {
+ out << " " << get_uuid(id);
+ }
+ out << "\n";
+}
+
+void OSDMap::print(ostream& out) const
+{
+ out << "epoch " << get_epoch() << "\n"
+ << "fsid " << get_fsid() << "\n"
+ << "created " << get_created() << "\n"
+ << "modified " << get_modified() << "\n";
+
+ out << "flags " << get_flag_string() << "\n";
+ out << "crush_version " << get_crush_version() << "\n";
+ out << "full_ratio " << full_ratio << "\n";
+ out << "backfillfull_ratio " << backfillfull_ratio << "\n";
+ out << "nearfull_ratio " << nearfull_ratio << "\n";
+ if (require_min_compat_client != ceph_release_t::unknown) {
+ out << "require_min_compat_client "
+ << require_min_compat_client << "\n";
+ }
+ out << "min_compat_client " << get_min_compat_client()
+ << "\n";
+ if (require_osd_release > ceph_release_t::unknown) {
+ out << "require_osd_release " << require_osd_release
+ << "\n";
+ }
+ out << "stretch_mode_enabled " << (stretch_mode_enabled ? "true" : "false") << "\n";
+ if (stretch_mode_enabled) {
+ out << "stretch_bucket_count " << stretch_bucket_count << "\n";
+ out << "degraded_stretch_mode " << degraded_stretch_mode << "\n";
+ out << "recovering_stretch_mode " << recovering_stretch_mode << "\n";
+ out << "stretch_mode_bucket " << stretch_mode_bucket << "\n";
+ }
+ if (get_cluster_snapshot().length())
+ out << "cluster_snapshot " << get_cluster_snapshot() << "\n";
+ out << "\n";
+
+ print_pools(out);
+
+ out << "max_osd " << get_max_osd() << "\n";
+ print_osds(out);
+ out << std::endl;
+
+ for (auto& p : pg_upmap) {
+ out << "pg_upmap " << p.first << " " << p.second << "\n";
+ }
+ for (auto& p : pg_upmap_items) {
+ out << "pg_upmap_items " << p.first << " " << p.second << "\n";
+ }
+
+ for (const auto& pg : *pg_temp)
+ out << "pg_temp " << pg.first << " " << pg.second << "\n";
+
+ for (const auto& pg : *primary_temp)
+ out << "primary_temp " << pg.first << " " << pg.second << "\n";
+
+ for (const auto &addr : blocklist)
+ out << "blocklist " << addr.first << " expires " << addr.second << "\n";
+ for (const auto &addr : range_blocklist)
+ out << "range blocklist " << addr.first << " expires " << addr.second << "\n";
+}
+
+class OSDTreePlainDumper : public CrushTreeDumper::Dumper<TextTable> {
+public:
+ typedef CrushTreeDumper::Dumper<TextTable> Parent;
+
+ OSDTreePlainDumper(const CrushWrapper *crush, const OSDMap *osdmap_,
+ unsigned f)
+ : Parent(crush, osdmap_->get_pool_names()), osdmap(osdmap_), filter(f) { }
+
+ bool should_dump_leaf(int i) const override {
+ if (!filter) {
+ return true; // normal case
+ }
+ if (((filter & OSDMap::DUMP_UP) && osdmap->is_up(i)) ||
+ ((filter & OSDMap::DUMP_DOWN) && osdmap->is_down(i)) ||
+ ((filter & OSDMap::DUMP_IN) && osdmap->is_in(i)) ||
+ ((filter & OSDMap::DUMP_OUT) && osdmap->is_out(i)) ||
+ ((filter & OSDMap::DUMP_DESTROYED) && osdmap->is_destroyed(i))) {
+ return true;
+ }
+ return false;
+ }
+
+ bool should_dump_empty_bucket() const override {
+ return !filter;
+ }
+
+ void init_table(TextTable *tbl) {
+ tbl->define_column("ID", TextTable::LEFT, TextTable::RIGHT);
+ tbl->define_column("CLASS", TextTable::LEFT, TextTable::RIGHT);
+ tbl->define_column("WEIGHT", TextTable::LEFT, TextTable::RIGHT);
+ tbl->define_column("TYPE NAME", TextTable::LEFT, TextTable::LEFT);
+ tbl->define_column("STATUS", TextTable::LEFT, TextTable::RIGHT);
+ tbl->define_column("REWEIGHT", TextTable::LEFT, TextTable::RIGHT);
+ tbl->define_column("PRI-AFF", TextTable::LEFT, TextTable::RIGHT);
+ }
+ void dump(TextTable *tbl, string& bucket) {
+ init_table(tbl);
+
+ if (!bucket.empty()) {
+ set_root(bucket);
+ Parent::dump(tbl);
+ } else {
+ Parent::dump(tbl);
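+      // append osds that exist in the map but were not reached by the
+      // crush tree dump (strays)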
+ for (int i = 0; i < osdmap->get_max_osd(); i++) {
+ if (osdmap->exists(i) && !is_touched(i) && should_dump_leaf(i)) {
+ dump_item(CrushTreeDumper::Item(i, 0, 0, 0), tbl);
+ }
+ }
+ }
+ }
+
+protected:
+ void dump_item(const CrushTreeDumper::Item &qi, TextTable *tbl) override {
+ const char *c = crush->get_item_class(qi.id);
+ if (!c)
+ c = "";
+ *tbl << qi.id
+ << c
+ << weightf_t(qi.weight);
+
+ ostringstream name;
+ for (int k = 0; k < qi.depth; k++)
+ name << " ";
+ if (qi.is_bucket()) {
+ name << crush->get_type_name(crush->get_bucket_type(qi.id)) << " "
+ << crush->get_item_name(qi.id);
+ } else {
+ name << "osd." << qi.id;
+ }
+ *tbl << name.str();
+
+ if (!qi.is_bucket()) {
+ if (!osdmap->exists(qi.id)) {
+ *tbl << "DNE"
+ << 0;
+ } else {
+ string s;
+ if (osdmap->is_up(qi.id)) {
+ s = "up";
+ } else if (osdmap->is_destroyed(qi.id)) {
+ s = "destroyed";
+ } else {
+ s = "down";
+ }
+ *tbl << s
+ << weightf_t(osdmap->get_weightf(qi.id))
+ << weightf_t(osdmap->get_primary_affinityf(qi.id));
+ }
+ }
+ *tbl << TextTable::endrow;
+ }
+
+private:
+ const OSDMap *osdmap;
+ const unsigned filter;
+};
+
+class OSDTreeFormattingDumper : public CrushTreeDumper::FormattingDumper {
+public:
+ typedef CrushTreeDumper::FormattingDumper Parent;
+
+ OSDTreeFormattingDumper(const CrushWrapper *crush, const OSDMap *osdmap_,
+ unsigned f)
+ : Parent(crush, osdmap_->get_pool_names()), osdmap(osdmap_), filter(f) { }
+
+ bool should_dump_leaf(int i) const override {
+ if (!filter) {
+ return true; // normal case
+ }
+ if (((filter & OSDMap::DUMP_UP) && osdmap->is_up(i)) ||
+ ((filter & OSDMap::DUMP_DOWN) && osdmap->is_down(i)) ||
+ ((filter & OSDMap::DUMP_IN) && osdmap->is_in(i)) ||
+ ((filter & OSDMap::DUMP_OUT) && osdmap->is_out(i)) ||
+ ((filter & OSDMap::DUMP_DESTROYED) && osdmap->is_destroyed(i))) {
+ return true;
+ }
+ return false;
+ }
+
+ bool should_dump_empty_bucket() const override {
+ return !filter;
+ }
+
+ void dump(Formatter *f, string& bucket) {
+ if (!bucket.empty()) {
+ set_root(bucket);
+ f->open_array_section("nodes");
+ Parent::dump(f);
+ f->close_section();
+ } else {
+ f->open_array_section("nodes");
+ Parent::dump(f);
+ f->close_section();
+ f->open_array_section("stray");
+ for (int i = 0; i < osdmap->get_max_osd(); i++) {
+ if (osdmap->exists(i) && !is_touched(i) && should_dump_leaf(i))
+ dump_item(CrushTreeDumper::Item(i, 0, 0, 0), f);
+ }
+ f->close_section();
+ }
+ }
+
+protected:
+ void dump_item_fields(const CrushTreeDumper::Item &qi, Formatter *f) override {
+ Parent::dump_item_fields(qi, f);
+ if (!qi.is_bucket())
+ {
+ string s;
+ if (osdmap->is_up(qi.id)) {
+ s = "up";
+ } else if (osdmap->is_destroyed(qi.id)) {
+ s = "destroyed";
+ } else {
+ s = "down";
+ }
+ f->dump_unsigned("exists", (int)osdmap->exists(qi.id));
+ f->dump_string("status", s);
+ f->dump_float("reweight", osdmap->get_weightf(qi.id));
+ f->dump_float("primary_affinity", osdmap->get_primary_affinityf(qi.id));
+ }
+ }
+
+private:
+ const OSDMap *osdmap;
+ const unsigned filter;
+};
+
+void OSDMap::print_tree(Formatter *f, ostream *out, unsigned filter, string bucket) const
+{
+ if (f) {
+ OSDTreeFormattingDumper(crush.get(), this, filter).dump(f, bucket);
+ } else {
+ ceph_assert(out);
+ TextTable tbl;
+ OSDTreePlainDumper(crush.get(), this, filter).dump(&tbl, bucket);
+ *out << tbl;
+ }
+}
+
+void OSDMap::print_summary(Formatter *f, ostream& out,
+ const string& prefix, bool extra) const
+{
+ if (f) {
+ f->dump_int("epoch", get_epoch());
+ f->dump_int("num_osds", get_num_osds());
+ f->dump_int("num_up_osds", get_num_up_osds());
+ f->dump_int("osd_up_since", last_up_change.to_msec() / 1000);
+ f->dump_int("num_in_osds", get_num_in_osds());
+ f->dump_int("osd_in_since", last_in_change.to_msec() / 1000);
+ f->dump_unsigned("num_remapped_pgs", get_num_pg_temp());
+ } else {
+ utime_t now = ceph_clock_now();
+ out << get_num_osds() << " osds: "
+ << get_num_up_osds() << " up";
+ if (last_up_change != utime_t()) {
+ out << " (since " << utimespan_str(now - last_up_change) << ")";
+ }
+ out << ", " << get_num_in_osds() << " in";
+ if (last_in_change != utime_t()) {
+ out << " (since " << utimespan_str(now - last_in_change) << ")";
+ }
+ if (extra)
+ out << "; epoch: e" << get_epoch();
+ if (get_num_pg_temp())
+ out << "; " << get_num_pg_temp() << " remapped pgs";
+ out << "\n";
+ uint64_t important_flags = flags & ~CEPH_OSDMAP_SEMIHIDDEN_FLAGS;
+ if (important_flags)
+ out << prefix << "flags " << get_flag_string(important_flags) << "\n";
+ }
+}
+
+void OSDMap::print_oneline_summary(ostream& out) const
+{
+ out << "e" << get_epoch() << ": "
+ << get_num_osds() << " total, "
+ << get_num_up_osds() << " up, "
+ << get_num_in_osds() << " in";
+}
+
+bool OSDMap::crush_rule_in_use(int rule_id) const
+{
+ for (const auto &pool : pools) {
+ if (pool.second.crush_rule == rule_id)
+ return true;
+ }
+ return false;
+}
+
+int OSDMap::validate_crush_rules(CrushWrapper *newcrush,
+ ostream *ss) const
+{
+ for (auto& i : pools) {
+ auto& pool = i.second;
+ int ruleno = pool.get_crush_rule();
+ if (!newcrush->rule_exists(ruleno)) {
+ *ss << "pool " << i.first << " references crush_rule " << ruleno
+ << " but it is not present";
+ return -EINVAL;
+ }
+ if (newcrush->get_rule_mask_ruleset(ruleno) != ruleno) {
+ *ss << "rule " << ruleno << " mask ruleset does not match rule id";
+ return -EINVAL;
+ }
+ if (newcrush->get_rule_mask_type(ruleno) != (int)pool.get_type()) {
+ *ss << "pool " << i.first << " type does not match rule " << ruleno;
+ return -EINVAL;
+ }
+ int poolsize = pool.get_size();
+ if (poolsize < newcrush->get_rule_mask_min_size(ruleno) ||
+ poolsize > newcrush->get_rule_mask_max_size(ruleno)) {
+ *ss << "pool " << i.first << " size " << poolsize << " does not"
+ << " fall within rule " << ruleno
+ << " min_size " << newcrush->get_rule_mask_min_size(ruleno)
+ << " and max_size " << newcrush->get_rule_mask_max_size(ruleno);
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
+
+int OSDMap::build_simple_optioned(CephContext *cct, epoch_t e, uuid_d &fsid,
+ int nosd, int pg_bits, int pgp_bits,
+ bool default_pool)
+{
+ ldout(cct, 10) << "build_simple on " << nosd
+ << " osds" << dendl;
+ epoch = e;
+ set_fsid(fsid);
+ created = modified = ceph_clock_now();
+
+ if (nosd >= 0) {
+ set_max_osd(nosd);
+ } else {
+ // count osds
+ int maxosd = 0;
+ const auto& conf = cct->_conf;
+ vector<string> sections;
+ conf.get_all_sections(sections);
+
+ for (auto &section : sections) {
+ if (section.find("osd.") != 0)
+ continue;
+
+ const char *begin = section.c_str() + 4;
+ char *end = (char*)begin;
+ int o = strtol(begin, &end, 10);
+ if (*end != '\0')
+ continue;
+
+ if (o > cct->_conf->mon_max_osd) {
+ lderr(cct) << "[osd." << o << "] in config has id > mon_max_osd " << cct->_conf->mon_max_osd << dendl;
+ return -ERANGE;
+ }
+
+ if (o > maxosd)
+ maxosd = o;
+ }
+
+ set_max_osd(maxosd + 1);
+ }
+
+
+ stringstream ss;
+ int r;
+ if (nosd >= 0)
+ r = build_simple_crush_map(cct, *crush, nosd, &ss);
+ else
+ r = build_simple_crush_map_from_conf(cct, *crush, &ss);
+ ceph_assert(r == 0);
+
+ int poolbase = get_max_osd() ? get_max_osd() : 1;
+
+ const int default_replicated_rule = crush->get_osd_pool_default_crush_replicated_ruleset(cct);
+ ceph_assert(default_replicated_rule >= 0);
+
+ if (default_pool) {
+ // pgp_num <= pg_num
+ if (pgp_bits > pg_bits)
+ pgp_bits = pg_bits;
+
+ vector<string> pool_names;
+ pool_names.push_back("rbd");
+ for (auto &plname : pool_names) {
+ int64_t pool = ++pool_max;
+ pools[pool].type = pg_pool_t::TYPE_REPLICATED;
+ pools[pool].flags = cct->_conf->osd_pool_default_flags;
+ if (cct->_conf->osd_pool_default_flag_hashpspool)
+ pools[pool].set_flag(pg_pool_t::FLAG_HASHPSPOOL);
+ if (cct->_conf->osd_pool_default_flag_nodelete)
+ pools[pool].set_flag(pg_pool_t::FLAG_NODELETE);
+ if (cct->_conf->osd_pool_default_flag_nopgchange)
+ pools[pool].set_flag(pg_pool_t::FLAG_NOPGCHANGE);
+ if (cct->_conf->osd_pool_default_flag_nosizechange)
+ pools[pool].set_flag(pg_pool_t::FLAG_NOSIZECHANGE);
+ if (cct->_conf->osd_pool_default_flag_bulk)
+ pools[pool].set_flag(pg_pool_t::FLAG_BULK);
+ pools[pool].size = cct->_conf.get_val<uint64_t>("osd_pool_default_size");
+ pools[pool].min_size = cct->_conf.get_osd_pool_default_min_size(
+ pools[pool].size);
+ pools[pool].crush_rule = default_replicated_rule;
+ pools[pool].object_hash = CEPH_STR_HASH_RJENKINS;
+ pools[pool].set_pg_num(poolbase << pg_bits);
+ pools[pool].set_pgp_num(poolbase << pgp_bits);
+ pools[pool].set_pg_num_target(poolbase << pg_bits);
+ pools[pool].set_pgp_num_target(poolbase << pgp_bits);
+ pools[pool].last_change = epoch;
+ pools[pool].application_metadata.insert(
+ {pg_pool_t::APPLICATION_NAME_RBD, {}});
+ if (auto m = pg_pool_t::get_pg_autoscale_mode_by_name(
+ cct->_conf.get_val<string>("osd_pool_default_pg_autoscale_mode"));
+ m != pg_pool_t::pg_autoscale_mode_t::UNKNOWN) {
+ pools[pool].pg_autoscale_mode = m;
+ } else {
+ pools[pool].pg_autoscale_mode = pg_pool_t::pg_autoscale_mode_t::OFF;
+ }
+ pool_name[pool] = plname;
+ name_pool[plname] = pool;
+ }
+ }
+
+ map<string,string> profile_map;
+ r = get_erasure_code_profile_default(cct, profile_map, &ss);
+ if (r < 0) {
+ lderr(cct) << ss.str() << dendl;
+ return r;
+ }
+ set_erasure_code_profile("default", profile_map);
+ return 0;
+}
+
+int OSDMap::get_erasure_code_profile_default(CephContext *cct,
+ map<string,string> &profile_map,
+ ostream *ss)
+{
+ int r = get_json_str_map(cct->_conf.get_val<string>("osd_pool_default_erasure_code_profile"),
+ *ss,
+ &profile_map);
+ return r;
+}
+
+int OSDMap::_build_crush_types(CrushWrapper& crush)
+{
+ crush.set_type_name(0, "osd");
+ crush.set_type_name(1, "host");
+ crush.set_type_name(2, "chassis");
+ crush.set_type_name(3, "rack");
+ crush.set_type_name(4, "row");
+ crush.set_type_name(5, "pdu");
+ crush.set_type_name(6, "pod");
+ crush.set_type_name(7, "room");
+ crush.set_type_name(8, "datacenter");
+ crush.set_type_name(9, "zone");
+ crush.set_type_name(10, "region");
+ crush.set_type_name(11, "root");
+ return 11;
+}
+
+int OSDMap::build_simple_crush_map(CephContext *cct, CrushWrapper& crush,
+ int nosd, ostream *ss)
+{
+ crush.create();
+
+ // root
+ int root_type = _build_crush_types(crush);
+ int rootid;
+ int r = crush.add_bucket(0, 0, CRUSH_HASH_DEFAULT,
+ root_type, 0, NULL, NULL, &rootid);
+ ceph_assert(r == 0);
+ crush.set_item_name(rootid, "default");
+
+ map<string,string> loc{
+ {"host", "localhost"},
+ {"rack", "localrack"},
+ {"root", "default"}
+ };
+ for (int o=0; o<nosd; o++) {
+ ldout(cct, 10) << " adding osd." << o << " at " << loc << dendl;
+ char name[32];
+ snprintf(name, sizeof(name), "osd.%d", o);
+ crush.insert_item(cct, o, 1.0, name, loc);
+ }
+
+ build_simple_crush_rules(cct, crush, "default", ss);
+
+ crush.finalize();
+
+ return 0;
+}
+
+int OSDMap::build_simple_crush_map_from_conf(CephContext *cct,
+ CrushWrapper& crush,
+ ostream *ss)
+{
+ const auto& conf = cct->_conf;
+
+ crush.create();
+
+ // root
+ int root_type = _build_crush_types(crush);
+ int rootid;
+ int r = crush.add_bucket(0, 0,
+ CRUSH_HASH_DEFAULT,
+ root_type, 0, NULL, NULL, &rootid);
+ ceph_assert(r == 0);
+ crush.set_item_name(rootid, "default");
+
+ // add osds
+ vector<string> sections;
+ conf.get_all_sections(sections);
+
+ for (auto &section : sections) {
+ if (section.find("osd.") != 0)
+ continue;
+
+ const char *begin = section.c_str() + 4;
+ char *end = (char*)begin;
+ int o = strtol(begin, &end, 10);
+ if (*end != '\0')
+ continue;
+
+ string host, rack, row, room, dc, pool;
+ vector<string> sectiontmp;
+ sectiontmp.push_back("osd");
+ sectiontmp.push_back(section);
+ conf.get_val_from_conf_file(sectiontmp, "host", host, false);
+ conf.get_val_from_conf_file(sectiontmp, "rack", rack, false);
+ conf.get_val_from_conf_file(sectiontmp, "row", row, false);
+ conf.get_val_from_conf_file(sectiontmp, "room", room, false);
+ conf.get_val_from_conf_file(sectiontmp, "datacenter", dc, false);
+ conf.get_val_from_conf_file(sectiontmp, "root", pool, false);
+
+ if (host.length() == 0)
+ host = "unknownhost";
+ if (rack.length() == 0)
+ rack = "unknownrack";
+
+ map<string,string> loc;
+ loc["host"] = host;
+ loc["rack"] = rack;
+ if (row.size())
+ loc["row"] = row;
+ if (room.size())
+ loc["room"] = room;
+ if (dc.size())
+ loc["datacenter"] = dc;
+ loc["root"] = "default";
+
+ ldout(cct, 5) << " adding osd." << o << " at " << loc << dendl;
+ crush.insert_item(cct, o, 1.0, section, loc);
+ }
+
+ build_simple_crush_rules(cct, crush, "default", ss);
+
+ crush.finalize();
+
+ return 0;
+}
+
+
+int OSDMap::build_simple_crush_rules(
+ CephContext *cct,
+ CrushWrapper& crush,
+ const string& root,
+ ostream *ss)
+{
+ int crush_rule = crush.get_osd_pool_default_crush_replicated_ruleset(cct);
+ string failure_domain =
+ crush.get_type_name(cct->_conf->osd_crush_chooseleaf_type);
+
+ int r;
+ r = crush.add_simple_rule_at(
+ "replicated_rule", root, failure_domain, "",
+ "firstn", pg_pool_t::TYPE_REPLICATED,
+ crush_rule, ss);
+ if (r < 0)
+ return r;
+ // do not add an erasure rule by default or else we will implicitly
+ // require the crush_v2 feature of clients
+ return 0;
+}
+
+int OSDMap::summarize_mapping_stats(
+ OSDMap *newmap,
+ const set<int64_t> *pools,
+ std::string *out,
+ Formatter *f) const
+{
+ set<int64_t> ls;
+ if (pools) {
+ ls = *pools;
+ } else {
+ for (auto &p : get_pools())
+ ls.insert(p.first);
+ }
+
+ unsigned total_pg = 0;
+ unsigned moved_pg = 0;
+ vector<unsigned> base_by_osd(get_max_osd(), 0);
+ vector<unsigned> new_by_osd(get_max_osd(), 0);
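+  // tally up-set PG replicas per osd under the current map and, if a new
+  // map was supplied, under the proposed map, counting how many mappings
+  // would move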
+ for (int64_t pool_id : ls) {
+ const pg_pool_t *pi = get_pg_pool(pool_id);
+ vector<int> up, up2;
+ int up_primary;
+ for (unsigned ps = 0; ps < pi->get_pg_num(); ++ps) {
+ pg_t pgid(ps, pool_id);
+ total_pg += pi->get_size();
+ pg_to_up_acting_osds(pgid, &up, &up_primary, nullptr, nullptr);
+ for (int osd : up) {
+ if (osd >= 0 && osd < get_max_osd())
+ ++base_by_osd[osd];
+ }
+ if (newmap) {
+ newmap->pg_to_up_acting_osds(pgid, &up2, &up_primary, nullptr, nullptr);
+ for (int osd : up2) {
+ if (osd >= 0 && osd < get_max_osd())
+ ++new_by_osd[osd];
+ }
+ if (pi->type == pg_pool_t::TYPE_ERASURE) {
+ for (unsigned i=0; i<up.size(); ++i) {
+ if (up[i] != up2[i]) {
+ ++moved_pg;
+ }
+ }
+ } else if (pi->type == pg_pool_t::TYPE_REPLICATED) {
+ for (int osd : up) {
+ if (std::find(up2.begin(), up2.end(), osd) == up2.end()) {
+ ++moved_pg;
+ }
+ }
+ } else {
+ ceph_abort_msg("unhandled pool type");
+ }
+ }
+ }
+ }
+
+ unsigned num_up_in = 0;
+ for (int osd = 0; osd < get_max_osd(); ++osd) {
+ if (is_up(osd) && is_in(osd))
+ ++num_up_in;
+ }
+ if (!num_up_in) {
+ return -EINVAL;
+ }
+
+ float avg_pg = (float)total_pg / (float)num_up_in;
+ float base_stddev = 0, new_stddev = 0;
+ int min = -1, max = -1;
+ unsigned min_base_pg = 0, max_base_pg = 0;
+ unsigned min_new_pg = 0, max_new_pg = 0;
+ for (int osd = 0; osd < get_max_osd(); ++osd) {
+ if (is_up(osd) && is_in(osd)) {
+ float base_diff = (float)base_by_osd[osd] - avg_pg;
+ base_stddev += base_diff * base_diff;
+ float new_diff = (float)new_by_osd[osd] - avg_pg;
+ new_stddev += new_diff * new_diff;
+ if (min < 0 || base_by_osd[osd] < min_base_pg) {
+ min = osd;
+ min_base_pg = base_by_osd[osd];
+ min_new_pg = new_by_osd[osd];
+ }
+ if (max < 0 || base_by_osd[osd] > max_base_pg) {
+ max = osd;
+ max_base_pg = base_by_osd[osd];
+ max_new_pg = new_by_osd[osd];
+ }
+ }
+ }
+ base_stddev = sqrt(base_stddev / num_up_in);
+ new_stddev = sqrt(new_stddev / num_up_in);
+
+ float edev = sqrt(avg_pg * (1.0 - (1.0 / (double)num_up_in)));
+
+ ostringstream ss;
+ if (f)
+ f->open_object_section("utilization");
+ if (newmap) {
+ if (f) {
+ f->dump_unsigned("moved_pgs", moved_pg);
+ f->dump_unsigned("total_pgs", total_pg);
+ } else {
+ float percent = 0;
+ if (total_pg)
+ percent = (float)moved_pg * 100.0 / (float)total_pg;
+ ss << "moved " << moved_pg << " / " << total_pg
+ << " (" << percent << "%)\n";
+ }
+ }
+ if (f) {
+ f->dump_float("avg_pgs", avg_pg);
+ f->dump_float("std_dev", base_stddev);
+ f->dump_float("expected_baseline_std_dev", edev);
+ if (newmap)
+ f->dump_float("new_std_dev", new_stddev);
+ } else {
+ ss << "avg " << avg_pg << "\n";
+ ss << "stddev " << base_stddev;
+ if (newmap)
+ ss << " -> " << new_stddev;
+ ss << " (expected baseline " << edev << ")\n";
+ }
+ if (min >= 0) {
+ if (f) {
+ f->dump_unsigned("min_osd", min);
+ f->dump_unsigned("min_osd_pgs", min_base_pg);
+ if (newmap)
+ f->dump_unsigned("new_min_osd_pgs", min_new_pg);
+ } else {
+ ss << "min osd." << min << " with " << min_base_pg;
+ if (newmap)
+ ss << " -> " << min_new_pg;
+ ss << " pgs (" << (float)min_base_pg / avg_pg;
+ if (newmap)
+ ss << " -> " << (float)min_new_pg / avg_pg;
+ ss << " * mean)\n";
+ }
+ }
+ if (max >= 0) {
+ if (f) {
+ f->dump_unsigned("max_osd", max);
+ f->dump_unsigned("max_osd_pgs", max_base_pg);
+ if (newmap)
+ f->dump_unsigned("new_max_osd_pgs", max_new_pg);
+ } else {
+ ss << "max osd." << max << " with " << max_base_pg;
+ if (newmap)
+ ss << " -> " << max_new_pg;
+ ss << " pgs (" << (float)max_base_pg / avg_pg;
+ if (newmap)
+ ss << " -> " << (float)max_new_pg / avg_pg;
+ ss << " * mean)\n";
+ }
+ }
+ if (f)
+ f->close_section();
+ if (out)
+ *out = ss.str();
+ return 0;
+}
+
+bool OSDMap::try_pg_upmap(
+ CephContext *cct,
+ pg_t pg, ///< pg to potentially remap
+ const set<int>& overfull, ///< osds we'd want to evacuate
+ const vector<int>& underfull, ///< osds to move to, in order of preference
+ const vector<int>& more_underfull, ///< more osds only slightly underfull
+ vector<int> *orig,
+ vector<int> *out) ///< resulting alternative mapping
+{
+ const pg_pool_t *pool = get_pg_pool(pg.pool());
+ if (!pool)
+ return false;
+ int rule = crush->find_rule(pool->get_crush_rule(), pool->get_type(),
+ pool->get_size());
+ if (rule < 0)
+ return false;
+
+ // make sure there is something there to remap
+ bool any = false;
+ for (auto osd : *orig) {
+ if (overfull.count(osd)) {
+ any = true;
+ break;
+ }
+ }
+ if (!any) {
+ return false;
+ }
+
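+  // ask CRUSH to recompute the mapping, steering the pg off the overfull
+  // osds and onto the underfull (then slightly underfull) ones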
+ int r = crush->try_remap_rule(
+ cct,
+ rule,
+ pool->get_size(),
+ overfull, underfull,
+ more_underfull,
+ *orig,
+ out);
+ if (r < 0)
+ return false;
+ if (*out == *orig)
+ return false;
+ return true;
+}
+
+int OSDMap::calc_pg_upmaps(
+ CephContext *cct,
+ uint32_t max_deviation,
+ int max,
+ const set<int64_t>& only_pools,
+ OSDMap::Incremental *pending_inc)
+{
+ ldout(cct, 10) << __func__ << " pools " << only_pools << dendl;
+ OSDMap tmp;
+ // Can't be less than 1 pg
+ if (max_deviation < 1)
+ max_deviation = 1;
+ tmp.deepish_copy_from(*this);
+ int num_changed = 0;
+ map<int,set<pg_t>> pgs_by_osd;
+ int total_pgs = 0;
+ float osd_weight_total = 0;
+ map<int,float> osd_weight;
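+  // first pass over the selected pools: record which PGs land on each osd
+  // and how much crush weight each osd contributes, so we can derive a
+  // per-osd PG target below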
+ for (auto& i : pools) {
+ if (!only_pools.empty() && !only_pools.count(i.first))
+ continue;
+ for (unsigned ps = 0; ps < i.second.get_pg_num(); ++ps) {
+ pg_t pg(ps, i.first);
+ vector<int> up;
+ tmp.pg_to_up_acting_osds(pg, &up, nullptr, nullptr, nullptr);
+ ldout(cct, 20) << __func__ << " " << pg << " up " << up << dendl;
+ for (auto osd : up) {
+ if (osd != CRUSH_ITEM_NONE)
+ pgs_by_osd[osd].insert(pg);
+ }
+ }
+ total_pgs += i.second.get_size() * i.second.get_pg_num();
+
+ map<int,float> pmap;
+ int ruleno = tmp.crush->find_rule(i.second.get_crush_rule(),
+ i.second.get_type(),
+ i.second.get_size());
+ tmp.crush->get_rule_weight_osd_map(ruleno, &pmap);
+ ldout(cct,20) << __func__ << " pool " << i.first
+ << " ruleno " << ruleno
+ << " weight-map " << pmap
+ << dendl;
+ for (auto p : pmap) {
+ auto adjusted_weight = tmp.get_weightf(p.first) * p.second;
+ if (adjusted_weight == 0) {
+ continue;
+ }
+ osd_weight[p.first] += adjusted_weight;
+ osd_weight_total += adjusted_weight;
+ }
+ }
+ for (auto& i : osd_weight) {
+ int pgs = 0;
+ auto p = pgs_by_osd.find(i.first);
+ if (p != pgs_by_osd.end())
+ pgs = p->second.size();
+ else
+ pgs_by_osd.emplace(i.first, set<pg_t>());
+ ldout(cct, 20) << " osd." << i.first << " weight " << i.second
+ << " pgs " << pgs << dendl;
+ }
+ if (osd_weight_total == 0) {
+ lderr(cct) << __func__ << " abort due to osd_weight_total == 0" << dendl;
+ return 0;
+ }
+ float pgs_per_weight = total_pgs / osd_weight_total;
+ ldout(cct, 10) << " osd_weight_total " << osd_weight_total << dendl;
+ ldout(cct, 10) << " pgs_per_weight " << pgs_per_weight << dendl;
+
+ if (max <= 0) {
+ lderr(cct) << __func__ << " abort due to max <= 0" << dendl;
+ return 0;
+ }
+ float stddev = 0;
+ map<int,float> osd_deviation; // osd, deviation(pgs)
+ multimap<float,int> deviation_osd; // deviation(pgs), osd
+ float cur_max_deviation = 0;
+ for (auto& i : pgs_by_osd) {
+ // make sure osd is still there (belongs to this crush-tree)
+ ceph_assert(osd_weight.count(i.first));
+ float target = osd_weight[i.first] * pgs_per_weight;
+ float deviation = (float)i.second.size() - target;
+ ldout(cct, 20) << " osd." << i.first
+ << "\tpgs " << i.second.size()
+ << "\ttarget " << target
+ << "\tdeviation " << deviation
+ << dendl;
+ osd_deviation[i.first] = deviation;
+ deviation_osd.insert(make_pair(deviation, i.first));
+ stddev += deviation * deviation;
+ if (fabsf(deviation) > cur_max_deviation)
+ cur_max_deviation = fabsf(deviation);
+ }
+  ldout(cct, 20) << " stddev " << stddev << " max_deviation " << cur_max_deviation << dendl;
+ if (cur_max_deviation <= max_deviation) {
+ ldout(cct, 10) << __func__ << " distribution is almost perfect"
+ << dendl;
+ return 0;
+ }
+ bool skip_overfull = false;
+ auto aggressive =
+ cct->_conf.get_val<bool>("osd_calc_pg_upmaps_aggressively");
+ auto local_fallback_retries =
+ cct->_conf.get_val<uint64_t>("osd_calc_pg_upmaps_local_fallback_retries");
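+  // iterate up to 'max' times; each pass applies at most one upmap change
+  // and keeps it only if the overall stddev improves, stopping early once
+  // the maximum deviation drops within bounds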
+ while (max--) {
+ ldout(cct, 30) << "Top of loop #" << max+1 << dendl;
+ // build overfull and underfull
+ set<int> overfull;
+ set<int> more_overfull;
+ bool using_more_overfull = false;
+ vector<int> underfull;
+ vector<int> more_underfull;
+ for (auto i = deviation_osd.rbegin(); i != deviation_osd.rend(); i++) {
+ ldout(cct, 30) << " check " << i->first << " <= " << max_deviation << dendl;
+ if (i->first <= 0)
+ break;
+ if (i->first > max_deviation) {
+ ldout(cct, 30) << " add overfull osd." << i->second << dendl;
+ overfull.insert(i->second);
+ } else {
+ more_overfull.insert(i->second);
+ }
+ }
+
+ for (auto i = deviation_osd.begin(); i != deviation_osd.end(); i++) {
+ ldout(cct, 30) << " check " << i->first << " >= " << -(int)max_deviation << dendl;
+ if (i->first >= 0)
+ break;
+ if (i->first < -(int)max_deviation) {
+ ldout(cct, 30) << " add underfull osd." << i->second << dendl;
+ underfull.push_back(i->second);
+ } else {
+ more_underfull.push_back(i->second);
+ }
+ }
+ if (underfull.empty() && overfull.empty()) {
+ ldout(cct, 20) << __func__ << " failed to build overfull and underfull" << dendl;
+ break;
+ }
+ if (overfull.empty() && !underfull.empty()) {
+ ldout(cct, 20) << __func__ << " Using more_overfull since we still have underfull" << dendl;
+ overfull = more_overfull;
+ using_more_overfull = true;
+ }
+
+ ldout(cct, 10) << " overfull " << overfull
+ << " underfull " << underfull
+ << dendl;
+ set<pg_t> to_skip;
+ uint64_t local_fallback_retried = 0;
+
+ retry:
+
+ set<pg_t> to_unmap;
+ map<pg_t, mempool::osdmap::vector<pair<int32_t,int32_t>>> to_upmap;
+ auto temp_pgs_by_osd = pgs_by_osd;
+ // always start with fullest, break if we find any changes to make
+ for (auto p = deviation_osd.rbegin(); p != deviation_osd.rend(); ++p) {
+ if (skip_overfull && !underfull.empty()) {
+ ldout(cct, 10) << " skipping overfull " << dendl;
+ break; // fall through to check underfull
+ }
+ int osd = p->second;
+ float deviation = p->first;
+ if (deviation < 0) {
+ ldout(cct, 10) << " hitting underfull osds now"
+ << " when trying to remap overfull osds"
+ << dendl;
+ break;
+ }
+ float target = osd_weight[osd] * pgs_per_weight;
+ ldout(cct, 10) << " Overfull search osd." << osd
+ << " target " << target
+ << " deviation " << deviation
+ << dendl;
+ ceph_assert(target > 0);
+ if (!using_more_overfull && deviation <= max_deviation) {
+ ldout(cct, 10) << " osd." << osd
+ << " target " << target
+ << " deviation " << deviation
+ << " < max deviation " << max_deviation
+ << dendl;
+ break;
+ }
+
+ vector<pg_t> pgs;
+ pgs.reserve(pgs_by_osd[osd].size());
+ for (auto& pg : pgs_by_osd[osd]) {
+ if (to_skip.count(pg))
+ continue;
+ pgs.push_back(pg);
+ }
+ if (aggressive) {
+ // shuffle PG list so they all get equal (in)attention
+ std::random_device rd;
+ std::default_random_engine rng{rd()};
+ std::shuffle(pgs.begin(), pgs.end(), rng);
+ }
+ // look for remaps we can un-remap
+ for (auto pg : pgs) {
+ auto p = tmp.pg_upmap_items.find(pg);
+ if (p == tmp.pg_upmap_items.end())
+ continue;
+ mempool::osdmap::vector<pair<int32_t,int32_t>> new_upmap_items;
+ for (auto q : p->second) {
+ if (q.second == osd) {
+ ldout(cct, 10) << " will try dropping existing"
+ << " remapping pair "
+ << q.first << " -> " << q.second
+ << " which remapped " << pg
+ << " into overfull osd." << osd
+ << dendl;
+ temp_pgs_by_osd[q.second].erase(pg);
+ temp_pgs_by_osd[q.first].insert(pg);
+ } else {
+ new_upmap_items.push_back(q);
+ }
+ }
+ if (new_upmap_items.empty()) {
+ // drop whole item
+ ldout(cct, 10) << " existing pg_upmap_items " << p->second
+ << " remapped " << pg << " into overfull osd." << osd
+ << ", will try cancelling it entirely"
+ << dendl;
+ to_unmap.insert(pg);
+ goto test_change;
+ } else if (new_upmap_items.size() != p->second.size()) {
+ // drop single remapping pair, updating
+ ceph_assert(new_upmap_items.size() < p->second.size());
+ ldout(cct, 10) << " existing pg_upmap_items " << p->second
+ << " remapped " << pg << " into overfull osd." << osd
+ << ", new_pg_upmap_items now " << new_upmap_items
+ << dendl;
+ to_upmap[pg] = new_upmap_items;
+ goto test_change;
+ }
+ }
+
+ // try upmap
+ for (auto pg : pgs) {
+ auto temp_it = tmp.pg_upmap.find(pg);
+ if (temp_it != tmp.pg_upmap.end()) {
+ // leave pg_upmap alone
+ // it must be specified by admin since balancer does not
+ // support pg_upmap yet
+ ldout(cct, 10) << " " << pg << " already has pg_upmap "
+ << temp_it->second << ", skipping"
+ << dendl;
+ continue;
+ }
+ auto pg_pool_size = tmp.get_pg_pool_size(pg);
+ mempool::osdmap::vector<pair<int32_t,int32_t>> new_upmap_items;
+ set<int> existing;
+ auto it = tmp.pg_upmap_items.find(pg);
+ if (it != tmp.pg_upmap_items.end() &&
+ it->second.size() >= (size_t)pg_pool_size) {
+ ldout(cct, 10) << " " << pg << " already has full-size pg_upmap_items "
+ << it->second << ", skipping"
+ << dendl;
+ continue;
+ } else if (it != tmp.pg_upmap_items.end()) {
+ ldout(cct, 10) << " " << pg << " already has pg_upmap_items "
+ << it->second
+ << dendl;
+ new_upmap_items = it->second;
+ // build existing too (for dedup)
+ for (auto i : it->second) {
+ existing.insert(i.first);
+ existing.insert(i.second);
+ }
+ // fall through
+ // to see if we can append more remapping pairs
+ }
+ ldout(cct, 10) << " trying " << pg << dendl;
+ vector<int> raw, orig, out;
+ tmp.pg_to_raw_upmap(pg, &raw, &orig); // including existing upmaps too
+ if (!try_pg_upmap(cct, pg, overfull, underfull, more_underfull, &orig, &out)) {
+ continue;
+ }
+ ldout(cct, 10) << " " << pg << " " << orig << " -> " << out << dendl;
+ if (orig.size() != out.size()) {
+ continue;
+ }
+ ceph_assert(orig != out);
+ int pos = -1;
+ float max_dev = 0;
+ for (unsigned i = 0; i < out.size(); ++i) {
+ if (orig[i] == out[i])
+ continue; // skip invalid remappings
+ if (existing.count(orig[i]) || existing.count(out[i]))
+ continue; // we want new remappings only!
+ if (osd_deviation[orig[i]] > max_dev) {
+ max_dev = osd_deviation[orig[i]];
+ pos = i;
+ ldout(cct, 30) << "Max osd." << orig[i] << " pos " << i << " dev " << osd_deviation[orig[i]] << dendl;
+ }
+ }
+ if (pos != -1) {
+ int i = pos;
+ ldout(cct, 10) << " will try adding new remapping pair "
+ << orig[i] << " -> " << out[i] << " for " << pg
+ << (orig[i] != osd ? " NOT selected osd" : "")
+ << dendl;
+ existing.insert(orig[i]);
+ existing.insert(out[i]);
+ temp_pgs_by_osd[orig[i]].erase(pg);
+ temp_pgs_by_osd[out[i]].insert(pg);
+ ceph_assert(new_upmap_items.size() < (size_t)pg_pool_size);
+ new_upmap_items.push_back(make_pair(orig[i], out[i]));
+ // append new remapping pairs slowly
+ // This way we can make sure that each tiny change will
+ // definitely make distribution of PGs converging to
+ // the perfect status.
+ to_upmap[pg] = new_upmap_items;
+ goto test_change;
+ }
+ }
+ }
+
+ ceph_assert(!(to_unmap.size() || to_upmap.size()));
+ ldout(cct, 10) << " failed to find any changes for overfull osds"
+ << dendl;
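+    // the overfull side produced nothing; walk the most underfull osds and
+    // try to cancel existing upmaps that moved PGs away from them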
+ for (auto& p : deviation_osd) {
+ if (std::find(underfull.begin(), underfull.end(), p.second) ==
+ underfull.end())
+ break;
+ int osd = p.second;
+ float deviation = p.first;
+ float target = osd_weight[osd] * pgs_per_weight;
+ ceph_assert(target > 0);
+ if (fabsf(deviation) < max_deviation) {
+ // respect max_deviation too
+ ldout(cct, 10) << " osd." << osd
+ << " target " << target
+ << " deviation " << deviation
+ << " -> absolute " << fabsf(deviation)
+ << " < max " << max_deviation
+ << dendl;
+ break;
+ }
+ // look for remaps we can un-remap
+ vector<pair<pg_t,
+ mempool::osdmap::vector<pair<int32_t,int32_t>>>> candidates;
+ candidates.reserve(tmp.pg_upmap_items.size());
+ for (auto& i : tmp.pg_upmap_items) {
+ if (to_skip.count(i.first))
+ continue;
+ if (!only_pools.empty() && !only_pools.count(i.first.pool()))
+ continue;
+ candidates.push_back(make_pair(i.first, i.second));
+ }
+ if (aggressive) {
+ // shuffle candidates so they all get equal (in)attention
+ std::random_device rd;
+ std::default_random_engine rng{rd()};
+ std::shuffle(candidates.begin(), candidates.end(), rng);
+ }
+ for (auto& i : candidates) {
+ auto pg = i.first;
+ mempool::osdmap::vector<pair<int32_t,int32_t>> new_upmap_items;
+ for (auto& j : i.second) {
+ if (j.first == osd) {
+ ldout(cct, 10) << " will try dropping existing"
+ << " remapping pair "
+ << j.first << " -> " << j.second
+ << " which remapped " << pg
+ << " out from underfull osd." << osd
+ << dendl;
+ temp_pgs_by_osd[j.second].erase(pg);
+ temp_pgs_by_osd[j.first].insert(pg);
+ } else {
+ new_upmap_items.push_back(j);
+ }
+ }
+ if (new_upmap_items.empty()) {
+ // drop whole item
+ ldout(cct, 10) << " existing pg_upmap_items " << i.second
+ << " remapped " << pg
+ << " out from underfull osd." << osd
+ << ", will try cancelling it entirely"
+ << dendl;
+ to_unmap.insert(pg);
+ goto test_change;
+ } else if (new_upmap_items.size() != i.second.size()) {
+ // drop single remapping pair, updating
+ ceph_assert(new_upmap_items.size() < i.second.size());
+ ldout(cct, 10) << " existing pg_upmap_items " << i.second
+ << " remapped " << pg
+ << " out from underfull osd." << osd
+ << ", new_pg_upmap_items now " << new_upmap_items
+ << dendl;
+ to_upmap[pg] = new_upmap_items;
+ goto test_change;
+ }
+ }
+ }
+
+ ceph_assert(!(to_unmap.size() || to_upmap.size()));
+ ldout(cct, 10) << " failed to find any changes for underfull osds"
+ << dendl;
+ if (!aggressive) {
+ ldout(cct, 10) << " break due to aggressive mode not enabled" << dendl;
+ break;
+ } else if (!skip_overfull) {
+ // safe to quit because below here we know
+ // we've done checking both overfull and underfull osds..
+ ldout(cct, 10) << " break due to not being able to find any"
+ << " further optimizations"
+ << dendl;
+ break;
+ }
+ // restart with fullest and do exhaustive searching
+ skip_overfull = false;
+ continue;
+
+ test_change:
+
+ // test change, apply if change is good
+ ceph_assert(to_unmap.size() || to_upmap.size());
+ float new_stddev = 0;
+ map<int,float> temp_osd_deviation;
+ multimap<float,int> temp_deviation_osd;
+ float cur_max_deviation = 0;
+ for (auto& i : temp_pgs_by_osd) {
+ // make sure osd is still there (belongs to this crush-tree)
+ ceph_assert(osd_weight.count(i.first));
+ float target = osd_weight[i.first] * pgs_per_weight;
+ float deviation = (float)i.second.size() - target;
+ ldout(cct, 20) << " osd." << i.first
+ << "\tpgs " << i.second.size()
+ << "\ttarget " << target
+ << "\tdeviation " << deviation
+ << dendl;
+ temp_osd_deviation[i.first] = deviation;
+ temp_deviation_osd.insert(make_pair(deviation, i.first));
+ new_stddev += deviation * deviation;
+ if (fabsf(deviation) > cur_max_deviation)
+ cur_max_deviation = fabsf(deviation);
+ }
+ ldout(cct, 10) << " stddev " << stddev << " -> " << new_stddev << dendl;
+ if (new_stddev >= stddev) {
+ if (!aggressive) {
+ ldout(cct, 10) << " break because stddev is not decreasing"
+ << " and aggressive mode is not enabled"
+ << dendl;
+ break;
+ }
+ local_fallback_retried++;
+ if (local_fallback_retried >= local_fallback_retries) {
+ // does not make progress
+ // flip *skip_overfull* so both overfull and underfull
+ // get equal (in)attention
+ skip_overfull = !skip_overfull;
+ ldout(cct, 10) << " hit local_fallback_retries "
+ << local_fallback_retries
+ << dendl;
+ continue;
+ }
+ for (auto& i : to_unmap)
+ to_skip.insert(i);
+ for (auto& i : to_upmap)
+ to_skip.insert(i.first);
+ ldout(cct, 20) << " local_fallback_retried " << local_fallback_retried
+ << " to_skip " << to_skip
+ << dendl;
+ goto retry;
+ }
+
+ // ready to go
+ ceph_assert(new_stddev < stddev);
+ stddev = new_stddev;
+ pgs_by_osd = temp_pgs_by_osd;
+ osd_deviation = temp_osd_deviation;
+ deviation_osd = temp_deviation_osd;
+ for (auto& i : to_unmap) {
+ ldout(cct, 10) << " unmap pg " << i << dendl;
+ ceph_assert(tmp.pg_upmap_items.count(i));
+ tmp.pg_upmap_items.erase(i);
+ pending_inc->old_pg_upmap_items.insert(i);
+ ++num_changed;
+ }
+ for (auto& i : to_upmap) {
+ ldout(cct, 10) << " upmap pg " << i.first
+ << " new pg_upmap_items " << i.second
+ << dendl;
+ tmp.pg_upmap_items[i.first] = i.second;
+ pending_inc->new_pg_upmap_items[i.first] = i.second;
+ ++num_changed;
+ }
+    ldout(cct, 20) << " stddev " << stddev << " max_deviation " << cur_max_deviation << dendl;
+ if (cur_max_deviation <= max_deviation) {
+ ldout(cct, 10) << __func__ << " Optimization plan is almost perfect"
+ << dendl;
+ break;
+ }
+ }
+ ldout(cct, 10) << " num_changed = " << num_changed << dendl;
+ return num_changed;
+}
+
+int OSDMap::get_osds_by_bucket_name(const string &name, set<int> *osds) const
+{
+ return crush->get_leaves(name, osds);
+}
+
+// get pools whose crush rules might reference the given osd
+void OSDMap::get_pool_ids_by_osd(CephContext *cct,
+ int osd,
+ set<int64_t> *pool_ids) const
+{
+ ceph_assert(pool_ids);
+ set<int> raw_rules;
+ int r = crush->get_rules_by_osd(osd, &raw_rules);
+ if (r < 0) {
+ lderr(cct) << __func__ << " get_rules_by_osd failed: " << cpp_strerror(r)
+ << dendl;
+ ceph_assert(r >= 0);
+ }
+ set<int> rules;
+ for (auto &i: raw_rules) {
+ // exclude any dead rule
+ if (crush_rule_in_use(i)) {
+ rules.insert(i);
+ }
+ }
+ for (auto &r: rules) {
+ get_pool_ids_by_rule(r, pool_ids);
+ }
+}
+
+template <typename F>
+class OSDUtilizationDumper : public CrushTreeDumper::Dumper<F> {
+public:
+ typedef CrushTreeDumper::Dumper<F> Parent;
+
+ OSDUtilizationDumper(const CrushWrapper *crush, const OSDMap *osdmap_,
+ const PGMap& pgmap_, bool tree_,
+ const string& filter) :
+ Parent(crush, osdmap_->get_pool_names()),
+ osdmap(osdmap_),
+ pgmap(pgmap_),
+ tree(tree_),
+ min_var(-1),
+ max_var(-1),
+ stddev(0),
+ sum(0) {
+ if (osdmap->crush->name_exists(filter)) {
+ // filter by crush node
+ auto item_id = osdmap->crush->get_item_id(filter);
+ allowed.insert(item_id);
+ osdmap->crush->get_all_children(item_id, &allowed);
+ } else if (osdmap->crush->class_exists(filter)) {
+ // filter by device class
+ class_id = osdmap->crush->get_class_id(filter);
+ } else if (auto pool_id = osdmap->lookup_pg_pool_name(filter);
+ pool_id >= 0) {
+ // filter by pool
+ auto crush_rule = osdmap->get_pool_crush_rule(pool_id);
+ set<int> roots;
+ osdmap->crush->find_takes_by_rule(crush_rule, &roots);
+ allowed = roots;
+ for (auto r : roots)
+ osdmap->crush->get_all_children(r, &allowed);
+ }
+ average_util = average_utilization();
+ }
+
+protected:
+
+ bool should_dump(int id) const {
+ if (!allowed.empty() && !allowed.count(id)) // filter by name
+ return false;
+ if (id >= 0 && class_id >= 0) {
+ auto item_class_id = osdmap->crush->get_item_class_id(id);
+ if (item_class_id < 0 || // not bound to a class yet
+ item_class_id != class_id) // or already bound to a different class
+ return false;
+ }
+ return true;
+ }
+
+ set<int> get_dumped_osds() {
+ if (allowed.empty() && class_id < 0) {
+ // old way, all
+ return {};
+ }
+ return dumped_osds;
+ }
+
+ void dump_stray(F *f) {
+ for (int i = 0; i < osdmap->get_max_osd(); i++) {
+ if (osdmap->exists(i) && !this->is_touched(i))
+ dump_item(CrushTreeDumper::Item(i, 0, 0, 0), f);
+ }
+ }
+
+ void dump_item(const CrushTreeDumper::Item &qi, F *f) override {
+ if (!tree && (qi.is_bucket() || dumped_osds.count(qi.id)))
+ return;
+ if (!should_dump(qi.id))
+ return;
+
+ if (!qi.is_bucket())
+ dumped_osds.insert(qi.id);
+ float reweight = qi.is_bucket() ? -1 : osdmap->get_weightf(qi.id);
+ int64_t kb = 0, kb_used = 0, kb_used_data = 0, kb_used_omap = 0,
+ kb_used_meta = 0, kb_avail = 0;
+ double util = 0;
+ if (get_bucket_utilization(qi.id, &kb, &kb_used, &kb_used_data,
+ &kb_used_omap, &kb_used_meta, &kb_avail))
+ if (kb_used && kb)
+ util = 100.0 * (double)kb_used / (double)kb;
+
+ double var = 1.0;
+ if (average_util)
+ var = util / average_util;
+
+ size_t num_pgs = qi.is_bucket() ? 0 : pgmap.get_num_pg_by_osd(qi.id);
+
+ dump_item(qi, reweight, kb, kb_used,
+ kb_used_data, kb_used_omap, kb_used_meta,
+ kb_avail, util, var, num_pgs, f);
+
+ if (!qi.is_bucket() && reweight > 0) {
+ if (min_var < 0 || var < min_var)
+ min_var = var;
+ if (max_var < 0 || var > max_var)
+ max_var = var;
+
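+ // accumulate the reweight-weighted squared deviation from the average
+ // utilization; dev() later reports sqrt(stddev / sum) as the spread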
+ double dev = util - average_util;
+ dev *= dev;
+ stddev += reweight * dev;
+ sum += reweight;
+ }
+ }
+
+ virtual void dump_item(const CrushTreeDumper::Item &qi,
+ float &reweight,
+ int64_t kb,
+ int64_t kb_used,
+ int64_t kb_used_data,
+ int64_t kb_used_omap,
+ int64_t kb_used_meta,
+ int64_t kb_avail,
+ double& util,
+ double& var,
+ const size_t num_pgs,
+ F *f) = 0;
+
+ double dev() {
+ return sum > 0 ? sqrt(stddev / sum) : 0;
+ }
+
+ double average_utilization() {
+ int64_t kb = 0, kb_used = 0;
+ for (int i = 0; i < osdmap->get_max_osd(); i++) {
+ if (!osdmap->exists(i) ||
+ osdmap->get_weight(i) == 0 ||
+ !should_dump(i))
+ continue;
+ int64_t kb_i, kb_used_i, kb_used_data_i, kb_used_omap_i, kb_used_meta_i,
+ kb_avail_i;
+ if (get_osd_utilization(i, &kb_i, &kb_used_i, &kb_used_data_i,
+ &kb_used_omap_i, &kb_used_meta_i, &kb_avail_i)) {
+ kb += kb_i;
+ kb_used += kb_used_i;
+ }
+ }
+ return kb > 0 ? 100.0 * (double)kb_used / (double)kb : 0;
+ }
+
+ bool get_osd_utilization(int id, int64_t* kb, int64_t* kb_used,
+ int64_t* kb_used_data,
+ int64_t* kb_used_omap,
+ int64_t* kb_used_meta,
+ int64_t* kb_avail) const {
+ const osd_stat_t *p = pgmap.get_osd_stat(id);
+ if (!p) return false;
+ *kb = p->statfs.kb();
+ *kb_used = p->statfs.kb_used_raw();
+ *kb_used_data = p->statfs.kb_used_data();
+ *kb_used_omap = p->statfs.kb_used_omap();
+ *kb_used_meta = p->statfs.kb_used_internal_metadata();
+ *kb_avail = p->statfs.kb_avail();
+
+ return true;
+ }
+
+ bool get_bucket_utilization(int id, int64_t* kb, int64_t* kb_used,
+ int64_t* kb_used_data,
+ int64_t* kb_used_omap,
+ int64_t* kb_used_meta,
+ int64_t* kb_avail) const {
+ if (id >= 0) {
+ if (osdmap->is_out(id) || !should_dump(id)) {
+ *kb = 0;
+ *kb_used = 0;
+ *kb_used_data = 0;
+ *kb_used_omap = 0;
+ *kb_used_meta = 0;
+ *kb_avail = 0;
+ return true;
+ }
+ return get_osd_utilization(id, kb, kb_used, kb_used_data,
+ kb_used_omap, kb_used_meta, kb_avail);
+ }
+
+ *kb = 0;
+ *kb_used = 0;
+ *kb_used_data = 0;
+ *kb_used_omap = 0;
+ *kb_used_meta = 0;
+ *kb_avail = 0;
+
+ for (int k = osdmap->crush->get_bucket_size(id) - 1; k >= 0; k--) {
+ int item = osdmap->crush->get_bucket_item(id, k);
+ int64_t kb_i = 0, kb_used_i = 0, kb_used_data_i = 0,
+ kb_used_omap_i = 0, kb_used_meta_i = 0, kb_avail_i = 0;
+ if (!get_bucket_utilization(item, &kb_i, &kb_used_i,
+ &kb_used_data_i, &kb_used_omap_i,
+ &kb_used_meta_i, &kb_avail_i))
+ return false;
+ *kb += kb_i;
+ *kb_used += kb_used_i;
+ *kb_used_data += kb_used_data_i;
+ *kb_used_omap += kb_used_omap_i;
+ *kb_used_meta += kb_used_meta_i;
+ *kb_avail += kb_avail_i;
+ }
+ return true;
+ }
+
+protected:
+ const OSDMap *osdmap;
+ const PGMap& pgmap;
+ bool tree;
+ double average_util;
+ double min_var;
+ double max_var;
+ double stddev;
+ double sum;
+ int class_id = -1;
+ set<int> allowed;
+ set<int> dumped_osds;
+};
+
+
+class OSDUtilizationPlainDumper : public OSDUtilizationDumper<TextTable> {
+public:
+ typedef OSDUtilizationDumper<TextTable> Parent;
+
+ OSDUtilizationPlainDumper(const CrushWrapper *crush, const OSDMap *osdmap,
+ const PGMap& pgmap, bool tree,
+ const string& filter) :
+ Parent(crush, osdmap, pgmap, tree, filter) {}
+
+ void dump(TextTable *tbl) {
+ tbl->define_column("ID", TextTable::LEFT, TextTable::RIGHT);
+ tbl->define_column("CLASS", TextTable::LEFT, TextTable::RIGHT);
+ tbl->define_column("WEIGHT", TextTable::LEFT, TextTable::RIGHT);
+ tbl->define_column("REWEIGHT", TextTable::LEFT, TextTable::RIGHT);
+ tbl->define_column("SIZE", TextTable::LEFT, TextTable::RIGHT);
+ tbl->define_column("RAW USE", TextTable::LEFT, TextTable::RIGHT);
+ tbl->define_column("DATA", TextTable::LEFT, TextTable::RIGHT);
+ tbl->define_column("OMAP", TextTable::LEFT, TextTable::RIGHT);
+ tbl->define_column("META", TextTable::LEFT, TextTable::RIGHT);
+ tbl->define_column("AVAIL", TextTable::LEFT, TextTable::RIGHT);
+ tbl->define_column("%USE", TextTable::LEFT, TextTable::RIGHT);
+ tbl->define_column("VAR", TextTable::LEFT, TextTable::RIGHT);
+ tbl->define_column("PGS", TextTable::LEFT, TextTable::RIGHT);
+ tbl->define_column("STATUS", TextTable::LEFT, TextTable::RIGHT);
+ if (tree)
+ tbl->define_column("TYPE NAME", TextTable::LEFT, TextTable::LEFT);
+
+ Parent::dump(tbl);
+
+ dump_stray(tbl);
+
+ auto sum = pgmap.get_osd_sum(get_dumped_osds());
+ *tbl << ""
+ << ""
+ << "" << "TOTAL"
+ << byte_u_t(sum.statfs.total)
+ << byte_u_t(sum.statfs.get_used_raw())
+ << byte_u_t(sum.statfs.allocated)
+ << byte_u_t(sum.statfs.omap_allocated)
+ << byte_u_t(sum.statfs.internal_metadata)
+ << byte_u_t(sum.statfs.available)
+ << lowprecision_t(average_util)
+ << ""
+ << TextTable::endrow;
+ }
+
+protected:
+ struct lowprecision_t {
+ float v;
+ explicit lowprecision_t(float _v) : v(_v) {}
+ };
+ friend std::ostream &operator<<(ostream& out, const lowprecision_t& v);
+
+ using OSDUtilizationDumper<TextTable>::dump_item;
+ void dump_item(const CrushTreeDumper::Item &qi,
+ float &reweight,
+ int64_t kb,
+ int64_t kb_used,
+ int64_t kb_used_data,
+ int64_t kb_used_omap,
+ int64_t kb_used_meta,
+ int64_t kb_avail,
+ double& util,
+ double& var,
+ const size_t num_pgs,
+ TextTable *tbl) override {
+ const char *c = crush->get_item_class(qi.id);
+ if (!c)
+ c = "";
+ *tbl << qi.id
+ << c
+ << weightf_t(qi.weight)
+ << weightf_t(reweight)
+ << byte_u_t(kb << 10)
+ << byte_u_t(kb_used << 10)
+ << byte_u_t(kb_used_data << 10)
+ << byte_u_t(kb_used_omap << 10)
+ << byte_u_t(kb_used_meta << 10)
+ << byte_u_t(kb_avail << 10)
+ << lowprecision_t(util)
+ << lowprecision_t(var);
+
+ if (qi.is_bucket()) {
+ *tbl << "-";
+ *tbl << "";
+ } else {
+ *tbl << num_pgs;
+ if (osdmap->is_up(qi.id)) {
+ *tbl << "up";
+ } else if (osdmap->is_destroyed(qi.id)) {
+ *tbl << "destroyed";
+ } else {
+ *tbl << "down";
+ }
+ }
+
+ if (tree) {
+ ostringstream name;
+ for (int k = 0; k < qi.depth; k++)
+ name << " ";
+ if (qi.is_bucket()) {
+ int type = crush->get_bucket_type(qi.id);
+ name << crush->get_type_name(type) << " "
+ << crush->get_item_name(qi.id);
+ } else {
+ name << "osd." << qi.id;
+ }
+ *tbl << name.str();
+ }
+
+ *tbl << TextTable::endrow;
+ }
+
+public:
+ string summary() {
+ ostringstream out;
+ out << "MIN/MAX VAR: " << lowprecision_t(min_var)
+ << "/" << lowprecision_t(max_var) << " "
+ << "STDDEV: " << lowprecision_t(dev());
+ return out.str();
+ }
+};
+
+ostream& operator<<(ostream& out,
+ const OSDUtilizationPlainDumper::lowprecision_t& v)
+{
+ if (v.v < -0.01) {
+ return out << "-";
+ } else if (v.v < 0.001) {
+ return out << "0";
+ } else {
+ std::streamsize p = out.precision();
+ return out << std::fixed << std::setprecision(2) << v.v << std::setprecision(p);
+ }
+}
+
+class OSDUtilizationFormatDumper : public OSDUtilizationDumper<Formatter> {
+public:
+ typedef OSDUtilizationDumper<Formatter> Parent;
+
+ OSDUtilizationFormatDumper(const CrushWrapper *crush, const OSDMap *osdmap,
+ const PGMap& pgmap, bool tree,
+ const string& filter) :
+ Parent(crush, osdmap, pgmap, tree, filter) {}
+
+ void dump(Formatter *f) {
+ f->open_array_section("nodes");
+ Parent::dump(f);
+ f->close_section();
+
+ f->open_array_section("stray");
+ dump_stray(f);
+ f->close_section();
+ }
+
+protected:
+ using OSDUtilizationDumper<Formatter>::dump_item;
+ void dump_item(const CrushTreeDumper::Item &qi,
+ float &reweight,
+ int64_t kb,
+ int64_t kb_used,
+ int64_t kb_used_data,
+ int64_t kb_used_omap,
+ int64_t kb_used_meta,
+ int64_t kb_avail,
+ double& util,
+ double& var,
+ const size_t num_pgs,
+ Formatter *f) override {
+ f->open_object_section("item");
+ CrushTreeDumper::dump_item_fields(crush, weight_set_names, qi, f);
+ f->dump_float("reweight", reweight);
+ f->dump_int("kb", kb);
+ f->dump_int("kb_used", kb_used);
+ f->dump_int("kb_used_data", kb_used_data);
+ f->dump_int("kb_used_omap", kb_used_omap);
+ f->dump_int("kb_used_meta", kb_used_meta);
+ f->dump_int("kb_avail", kb_avail);
+ f->dump_float("utilization", util);
+ f->dump_float("var", var);
+ f->dump_unsigned("pgs", num_pgs);
+ if (!qi.is_bucket()) {
+ if (osdmap->is_up(qi.id)) {
+ f->dump_string("status", "up");
+ } else if (osdmap->is_destroyed(qi.id)) {
+ f->dump_string("status", "destroyed");
+ } else {
+ f->dump_string("status", "down");
+ }
+ }
+ CrushTreeDumper::dump_bucket_children(crush, qi, f);
+ f->close_section();
+ }
+
+public:
+ void summary(Formatter *f) {
+ f->open_object_section("summary");
+ auto sum = pgmap.get_osd_sum(get_dumped_osds());
+ auto& s = sum.statfs;
+
+ f->dump_int("total_kb", s.kb());
+ f->dump_int("total_kb_used", s.kb_used_raw());
+ f->dump_int("total_kb_used_data", s.kb_used_data());
+ f->dump_int("total_kb_used_omap", s.kb_used_omap());
+ f->dump_int("total_kb_used_meta", s.kb_used_internal_metadata());
+ f->dump_int("total_kb_avail", s.kb_avail());
+ f->dump_float("average_utilization", average_util);
+ f->dump_float("min_var", min_var);
+ f->dump_float("max_var", max_var);
+ f->dump_float("dev", dev());
+ f->close_section();
+ }
+};
+
+void print_osd_utilization(const OSDMap& osdmap,
+ const PGMap& pgmap,
+ ostream& out,
+ Formatter *f,
+ bool tree,
+ const string& filter)
+{
+ const CrushWrapper *crush = osdmap.crush.get();
+ if (f) {
+ f->open_object_section("df");
+ OSDUtilizationFormatDumper d(crush, &osdmap, pgmap, tree, filter);
+ d.dump(f);
+ d.summary(f);
+ f->close_section();
+ f->flush(out);
+ } else {
+ OSDUtilizationPlainDumper d(crush, &osdmap, pgmap, tree, filter);
+ TextTable tbl;
+ d.dump(&tbl);
+ out << tbl << d.summary() << "\n";
+ }
+}
+
+void OSDMap::check_health(CephContext *cct,
+ health_check_map_t *checks) const
+{
+ int num_osds = get_num_osds();
+
+ // OSD_DOWN
+ // OSD_$subtree_DOWN
+ // OSD_ORPHAN
+ if (num_osds >= 0) {
+ int num_in_osds = 0;
+ int num_down_in_osds = 0;
+ set<int> osds;
+ set<int> down_in_osds;
+ set<int> up_in_osds;
+ set<int> subtree_up;
+ unordered_map<int, set<int> > subtree_type_down;
+ unordered_map<int, int> num_osds_subtree;
+ int max_type = crush->get_max_type_id();
+
+ for (int i = 0; i < get_max_osd(); i++) {
+ if (!exists(i)) {
+ if (crush->item_exists(i)) {
+ osds.insert(i);
+ }
+ continue;
+ }
+ if (is_out(i) || (osd_state[i] & CEPH_OSD_NEW))
+ continue;
+ ++num_in_osds;
+ if (down_in_osds.count(i) || up_in_osds.count(i))
+ continue;
+ if (!is_up(i)) {
+ down_in_osds.insert(i);
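+ // walk up the crush hierarchy from this down osd, recording each
+ // ancestor subtree that is entirely down in subtree_type_down and
+ // stopping at the first ancestor that still has something up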
+ int parent_id = 0;
+ int current = i;
+ for (int type = 0; type <= max_type; type++) {
+ if (!crush->get_type_name(type))
+ continue;
+ int r = crush->get_immediate_parent_id(current, &parent_id);
+ if (r == -ENOENT)
+ break;
+ // break early if this parent is already marked as up
+ if (subtree_up.count(parent_id))
+ break;
+ type = crush->get_bucket_type(parent_id);
+ if (!subtree_type_is_down(
+ cct, parent_id, type,
+ &down_in_osds, &up_in_osds, &subtree_up, &subtree_type_down))
+ break;
+ current = parent_id;
+ }
+ }
+ }
+
+ // calculate the number of down osds in each down subtree and
+ // store it in num_osds_subtree
+ for (int type = 1; type <= max_type; type++) {
+ if (!crush->get_type_name(type))
+ continue;
+ for (auto j = subtree_type_down[type].begin();
+ j != subtree_type_down[type].end();
+ ++j) {
+ list<int> children;
+ int num = 0;
+ int num_children = crush->get_children(*j, &children);
+ if (num_children == 0)
+ continue;
+ for (auto l = children.begin(); l != children.end(); ++l) {
+ if (*l >= 0) {
+ ++num;
+ } else if (num_osds_subtree[*l] > 0) {
+ num = num + num_osds_subtree[*l];
+ }
+ }
+ num_osds_subtree[*j] = num;
+ }
+ }
+ num_down_in_osds = down_in_osds.size();
+ ceph_assert(num_down_in_osds <= num_in_osds);
+ if (num_down_in_osds > 0) {
+ // summary of down subtree types and osds
+ for (int type = max_type; type > 0; type--) {
+ if (!crush->get_type_name(type))
+ continue;
+ if (subtree_type_down[type].size() > 0) {
+ ostringstream ss;
+ ss << subtree_type_down[type].size() << " "
+ << crush->get_type_name(type);
+ if (subtree_type_down[type].size() > 1) {
+ ss << "s";
+ }
+ int sum_down_osds = 0;
+ for (auto j = subtree_type_down[type].begin();
+ j != subtree_type_down[type].end();
+ ++j) {
+ sum_down_osds = sum_down_osds + num_osds_subtree[*j];
+ }
+ ss << " (" << sum_down_osds << " osds) down";
+ string err = string("OSD_") +
+ string(crush->get_type_name(type)) + "_DOWN";
+ boost::to_upper(err);
+ auto& d = checks->add(err, HEALTH_WARN, ss.str(),
+ subtree_type_down[type].size());
+ for (auto j = subtree_type_down[type].rbegin();
+ j != subtree_type_down[type].rend();
+ ++j) {
+ ostringstream ss;
+ ss << crush->get_type_name(type);
+ ss << " ";
+ ss << crush->get_item_name(*j);
+ // at the top level, do not print location
+ if (type != max_type) {
+ ss << " (";
+ ss << crush->get_full_location_ordered_string(*j);
+ ss << ")";
+ }
+ int num = num_osds_subtree[*j];
+ ss << " (" << num << " osds)";
+ ss << " is down";
+ d.detail.push_back(ss.str());
+ }
+ }
+ }
+ ostringstream ss;
+ ss << down_in_osds.size() << " osds down";
+ auto& d = checks->add("OSD_DOWN", HEALTH_WARN, ss.str(),
+ down_in_osds.size());
+ for (auto it = down_in_osds.begin(); it != down_in_osds.end(); ++it) {
+ ostringstream ss;
+ ss << "osd." << *it << " (";
+ ss << crush->get_full_location_ordered_string(*it);
+ ss << ") is down";
+ d.detail.push_back(ss.str());
+ }
+ }
+
+ if (!osds.empty()) {
+ ostringstream ss;
+ ss << osds.size() << " osds exist in the crush map but not in the osdmap";
+ auto& d = checks->add("OSD_ORPHAN", HEALTH_WARN, ss.str(),
+ osds.size());
+ for (auto osd : osds) {
+ ostringstream ss;
+ ss << "osd." << osd << " exists in crush map but not in osdmap";
+ d.detail.push_back(ss.str());
+ }
+ }
+ }
+
+ std::list<std::string> scrub_messages;
+ bool noscrub = false, nodeepscrub = false;
+ for (const auto &p : pools) {
+ if (p.second.flags & pg_pool_t::FLAG_NOSCRUB) {
+ ostringstream ss;
+ ss << "Pool " << get_pool_name(p.first) << " has noscrub flag";
+ scrub_messages.push_back(ss.str());
+ noscrub = true;
+ }
+ if (p.second.flags & pg_pool_t::FLAG_NODEEP_SCRUB) {
+ ostringstream ss;
+ ss << "Pool " << get_pool_name(p.first) << " has nodeep-scrub flag";
+ scrub_messages.push_back(ss.str());
+ nodeepscrub = true;
+ }
+ }
+ if (noscrub || nodeepscrub) {
+ string out = "";
+ out += noscrub ? string("noscrub") + (nodeepscrub ? ", " : "") : "";
+ out += nodeepscrub ? "nodeep-scrub" : "";
+ auto& d = checks->add("POOL_SCRUB_FLAGS", HEALTH_OK,
+ "Some pool(s) have the " + out + " flag(s) set", 0);
+ d.detail.splice(d.detail.end(), scrub_messages);
+ }
+
+ // OSD_OUT_OF_ORDER_FULL
+ {
+ // An osd could configure the failsafe ratio to something different,
+ // but for now assume it is the same here.
+ float fsr = cct->_conf->osd_failsafe_full_ratio;
+ if (fsr > 1.0) fsr /= 100;
+ float fr = get_full_ratio();
+ float br = get_backfillfull_ratio();
+ float nr = get_nearfull_ratio();
+
+ list<string> detail;
+ // These checks correspond to how OSDService::check_full_status() in an OSD
+ // handles the improper setting of these values.
+ if (br < nr) {
+ ostringstream ss;
+ ss << "backfillfull_ratio (" << br
+ << ") < nearfull_ratio (" << nr << "), increased";
+ detail.push_back(ss.str());
+ br = nr;
+ }
+ if (fr < br) {
+ ostringstream ss;
+ ss << "full_ratio (" << fr << ") < backfillfull_ratio (" << br
+ << "), increased";
+ detail.push_back(ss.str());
+ fr = br;
+ }
+ if (fsr < fr) {
+ ostringstream ss;
+ ss << "osd_failsafe_full_ratio (" << fsr << ") < full_ratio (" << fr
+ << "), increased";
+ detail.push_back(ss.str());
+ }
+ if (!detail.empty()) {
+ auto& d = checks->add("OSD_OUT_OF_ORDER_FULL", HEALTH_ERR,
+ "full ratio(s) out of order", 0);
+ d.detail.swap(detail);
+ }
+ }
+
+ // OSD_FULL
+ // OSD_NEARFULL
+ // OSD_BACKFILLFULL
+ // OSD_FAILSAFE_FULL
+ {
+ set<int> full, backfillfull, nearfull;
+ get_full_osd_counts(&full, &backfillfull, &nearfull);
+ if (full.size()) {
+ ostringstream ss;
+ ss << full.size() << " full osd(s)";
+ auto& d = checks->add("OSD_FULL", HEALTH_ERR, ss.str(), full.size());
+ for (auto& i: full) {
+ ostringstream ss;
+ ss << "osd." << i << " is full";
+ d.detail.push_back(ss.str());
+ }
+ }
+ if (backfillfull.size()) {
+ ostringstream ss;
+ ss << backfillfull.size() << " backfillfull osd(s)";
+ auto& d = checks->add("OSD_BACKFILLFULL", HEALTH_WARN, ss.str(),
+ backfillfull.size());
+ for (auto& i: backfillfull) {
+ ostringstream ss;
+ ss << "osd." << i << " is backfill full";
+ d.detail.push_back(ss.str());
+ }
+ }
+ if (nearfull.size()) {
+ ostringstream ss;
+ ss << nearfull.size() << " nearfull osd(s)";
+ auto& d = checks->add("OSD_NEARFULL", HEALTH_WARN, ss.str(), nearfull.size());
+ for (auto& i: nearfull) {
+ ostringstream ss;
+ ss << "osd." << i << " is near full";
+ d.detail.push_back(ss.str());
+ }
+ }
+ }
+
+ // OSDMAP_FLAGS
+ {
+ // warn about flags
+ uint64_t warn_flags =
+ CEPH_OSDMAP_PAUSERD |
+ CEPH_OSDMAP_PAUSEWR |
+ CEPH_OSDMAP_PAUSEREC |
+ CEPH_OSDMAP_NOUP |
+ CEPH_OSDMAP_NODOWN |
+ CEPH_OSDMAP_NOIN |
+ CEPH_OSDMAP_NOOUT |
+ CEPH_OSDMAP_NOBACKFILL |
+ CEPH_OSDMAP_NORECOVER |
+ CEPH_OSDMAP_NOSCRUB |
+ CEPH_OSDMAP_NODEEP_SCRUB |
+ CEPH_OSDMAP_NOTIERAGENT |
+ CEPH_OSDMAP_NOSNAPTRIM |
+ CEPH_OSDMAP_NOREBALANCE;
+ if (test_flag(warn_flags)) {
+ ostringstream ss;
+ string s = get_flag_string(get_flags() & warn_flags);
+ ss << s << " flag(s) set";
+ checks->add("OSDMAP_FLAGS", HEALTH_WARN, ss.str(),
+ s.size() /* kludgey but sufficient */);
+ }
+ }
+
+ // OSD_FLAGS
+ {
+ list<string> detail;
+ const unsigned flags =
+ CEPH_OSD_NOUP |
+ CEPH_OSD_NOIN |
+ CEPH_OSD_NODOWN |
+ CEPH_OSD_NOOUT;
+ for (int i = 0; i < max_osd; ++i) {
+ if (osd_state[i] & flags) {
+ ostringstream ss;
+ set<string> states;
+ OSDMap::calc_state_set(osd_state[i] & flags, states);
+ ss << "osd." << i << " has flags " << states;
+ detail.push_back(ss.str());
+ }
+ }
+ for (auto& i : crush_node_flags) {
+ if (i.second && crush->item_exists(i.first)) {
+ ostringstream ss;
+ set<string> states;
+ OSDMap::calc_state_set(i.second, states);
+ int t = i.first >= 0 ? 0 : crush->get_bucket_type(i.first);
+ const char *tn = crush->get_type_name(t);
+ ss << (tn ? tn : "node") << " "
+ << crush->get_item_name(i.first) << " has flags " << states;
+ detail.push_back(ss.str());
+ }
+ }
+ for (auto& i : device_class_flags) {
+ const char* class_name = crush->get_class_name(i.first);
+ if (i.second && class_name) {
+ ostringstream ss;
+ set<string> states;
+ OSDMap::calc_state_set(i.second, states);
+ ss << "device class '" << class_name << "' has flags " << states;
+ detail.push_back(ss.str());
+ }
+ }
+ if (!detail.empty()) {
+ ostringstream ss;
+ ss << detail.size() << " OSDs or CRUSH {nodes, device-classes} have {NOUP,NODOWN,NOIN,NOOUT} flags set";
+ auto& d = checks->add("OSD_FLAGS", HEALTH_WARN, ss.str(), detail.size());
+ d.detail.swap(detail);
+ }
+ }
+
+ // OLD_CRUSH_TUNABLES
+ if (cct->_conf->mon_warn_on_legacy_crush_tunables) {
+ string min = crush->get_min_required_version();
+ if (min < cct->_conf->mon_crush_min_required_version) {
+ ostringstream ss;
+ ss << "crush map has legacy tunables (require " << min
+ << ", min is " << cct->_conf->mon_crush_min_required_version << ")";
+ auto& d = checks->add("OLD_CRUSH_TUNABLES", HEALTH_WARN, ss.str(), 0);
+ d.detail.push_back("see http://docs.ceph.com/en/latest/rados/operations/crush-map/#tunables");
+ }
+ }
+
+ // OLD_CRUSH_STRAW_CALC_VERSION
+ if (cct->_conf->mon_warn_on_crush_straw_calc_version_zero) {
+ if (crush->get_straw_calc_version() == 0) {
+ ostringstream ss;
+ ss << "crush map has straw_calc_version=0";
+ auto& d = checks->add("OLD_CRUSH_STRAW_CALC_VERSION", HEALTH_WARN, ss.str(), 0);
+ d.detail.push_back(
+ "see http://docs.ceph.com/en/latest/rados/operations/crush-map/#tunables");
+ }
+ }
+
+ // CACHE_POOL_NO_HIT_SET
+ if (cct->_conf->mon_warn_on_cache_pools_without_hit_sets) {
+ list<string> detail;
+ for (auto p = pools.cbegin(); p != pools.cend(); ++p) {
+ const pg_pool_t& info = p->second;
+ if (info.cache_mode_requires_hit_set() &&
+ info.hit_set_params.get_type() == HitSet::TYPE_NONE) {
+ ostringstream ss;
+ ss << "pool '" << get_pool_name(p->first)
+ << "' with cache_mode " << info.get_cache_mode_name()
+ << " needs hit_set_type to be set but it is not";
+ detail.push_back(ss.str());
+ }
+ }
+ if (!detail.empty()) {
+ ostringstream ss;
+ ss << detail.size() << " cache pools are missing hit_sets";
+ auto& d = checks->add("CACHE_POOL_NO_HIT_SET", HEALTH_WARN, ss.str(),
+ detail.size());
+ d.detail.swap(detail);
+ }
+ }
+
+ // OSD_NO_SORTBITWISE
+ if (!test_flag(CEPH_OSDMAP_SORTBITWISE)) {
+ ostringstream ss;
+ ss << "'sortbitwise' flag is not set";
+ checks->add("OSD_NO_SORTBITWISE", HEALTH_WARN, ss.str(), 0);
+ }
+
+ // OSD_UPGRADE_FINISHED
+ if (auto require_release = pending_require_osd_release()) {
+ ostringstream ss;
+ ss << "all OSDs are running " << *require_release << " or later but"
+ << " require_osd_release < " << *require_release;
+ auto& d = checks->add("OSD_UPGRADE_FINISHED", HEALTH_WARN, ss.str(), 0);
+ d.detail.push_back(ss.str());
+ }
+
+ // POOL_NEARFULL/BACKFILLFULL/FULL
+ {
+ list<string> full_detail, backfillfull_detail, nearfull_detail;
+ for (auto it : get_pools()) {
+ const pg_pool_t &pool = it.second;
+ const string& pool_name = get_pool_name(it.first);
+ if (pool.has_flag(pg_pool_t::FLAG_FULL)) {
+ stringstream ss;
+ if (pool.has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
+ // may run out of space too,
+ // but we want EQUOTA taking precedence
+ ss << "pool '" << pool_name << "' is full (running out of quota)";
+ } else {
+ ss << "pool '" << pool_name << "' is full (no space)";
+ }
+ full_detail.push_back(ss.str());
+ } else if (pool.has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
+ stringstream ss;
+ ss << "pool '" << pool_name << "' is backfillfull";
+ backfillfull_detail.push_back(ss.str());
+ } else if (pool.has_flag(pg_pool_t::FLAG_NEARFULL)) {
+ stringstream ss;
+ ss << "pool '" << pool_name << "' is nearfull";
+ nearfull_detail.push_back(ss.str());
+ }
+ }
+ if (!full_detail.empty()) {
+ ostringstream ss;
+ ss << full_detail.size() << " pool(s) full";
+ auto& d = checks->add("POOL_FULL", HEALTH_WARN, ss.str(), full_detail.size());
+ d.detail.swap(full_detail);
+ }
+ if (!backfillfull_detail.empty()) {
+ ostringstream ss;
+ ss << backfillfull_detail.size() << " pool(s) backfillfull";
+ auto& d = checks->add("POOL_BACKFILLFULL", HEALTH_WARN, ss.str(),
+ backfillfull_detail.size());
+ d.detail.swap(backfillfull_detail);
+ }
+ if (!nearfull_detail.empty()) {
+ ostringstream ss;
+ ss << nearfull_detail.size() << " pool(s) nearfull";
+ auto& d = checks->add("POOL_NEARFULL", HEALTH_WARN, ss.str(),
+ nearfull_detail.size());
+ d.detail.swap(nearfull_detail);
+ }
+ }
+
+ // POOL_PG_NUM_NOT_POWER_OF_TWO
+ if (cct->_conf.get_val<bool>("mon_warn_on_pool_pg_num_not_power_of_two")) {
+ list<string> detail;
+ for (auto it : get_pools()) {
+ if (!isp2(it.second.get_pg_num_target())) {
+ ostringstream ss;
+ ss << "pool '" << get_pool_name(it.first)
+ << "' pg_num " << it.second.get_pg_num_target()
+ << " is not a power of two";
+ detail.push_back(ss.str());
+ }
+ }
+ if (!detail.empty()) {
+ ostringstream ss;
+ ss << detail.size() << " pool(s) have non-power-of-two pg_num";
+ auto& d = checks->add("POOL_PG_NUM_NOT_POWER_OF_TWO", HEALTH_WARN,
+ ss.str(), detail.size());
+ d.detail.swap(detail);
+ }
+ }
+
+ // POOL_NO_REDUNDANCY
+ if (cct->_conf.get_val<bool>("mon_warn_on_pool_no_redundancy"))
+ {
+ list<string> detail;
+ for (auto it : get_pools()) {
+ if (it.second.get_size() == 1) {
+ ostringstream ss;
+ ss << "pool '" << get_pool_name(it.first)
+ << "' has no replicas configured";
+ detail.push_back(ss.str());
+ }
+ }
+ if (!detail.empty()) {
+ ostringstream ss;
+ ss << detail.size() << " pool(s) have no replicas configured";
+ auto& d = checks->add("POOL_NO_REDUNDANCY", HEALTH_WARN,
+ ss.str(), detail.size());
+ d.detail.swap(detail);
+ }
+ }
+
+ // DEGRADED STRETCH MODE
+ if (cct->_conf.get_val<bool>("mon_warn_on_degraded_stretch_mode")) {
+ if (recovering_stretch_mode) {
+ stringstream ss;
+ ss << "We are recovering stretch mode buckets, only requiring "
+ << degraded_stretch_mode << " of " << stretch_bucket_count << " buckets to peer" ;
+ checks->add("RECOVERING_STRETCH_MODE", HEALTH_WARN,
+ ss.str(), 0);
+ } else if (degraded_stretch_mode) {
+ stringstream ss;
+ ss << "We are missing stretch mode buckets, only requiring "
+ << degraded_stretch_mode << " of " << stretch_bucket_count << " buckets to peer" ;
+ checks->add("DEGRADED_STRETCH_MODE", HEALTH_WARN,
+ ss.str(), 0);
+ }
+ }
+}
+
+int OSDMap::parse_osd_id_list(const vector<string>& ls, set<int> *out,
+ ostream *ss) const
+{
+ out->clear();
+ for (auto i = ls.begin(); i != ls.end(); ++i) {
+ if (i == ls.begin() &&
+ (*i == "any" || *i == "all" || *i == "*")) {
+ get_all_osds(*out);
+ break;
+ }
+ long osd = TOPNSPC::common::parse_osd_id(i->c_str(), ss);
+ if (osd < 0) {
+ *ss << "invalid osd id '" << *i << "'";
+ return -EINVAL;
+ }
+ out->insert(osd);
+ }
+ return 0;
+}
+
+void OSDMap::get_random_up_osds_by_subtree(int n, // whoami
+ string &subtree,
+ int limit, // how many
+ set<int> skip,
+ set<int> *want) const {
+ if (limit <= 0)
+ return;
+ int subtree_type = crush->get_type_id(subtree);
+ if (subtree_type < 1)
+ return;
+ vector<int> subtrees;
+ crush->get_subtree_of_type(subtree_type, &subtrees);
+ std::random_device rd;
+ std::default_random_engine rng{rd()};
+ std::shuffle(subtrees.begin(), subtrees.end(), rng);
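+ // pick at most 'limit' up osds, one per candidate subtree, skipping the
+ // subtree that contains 'n' (whoami) and any osds listed in 'skip'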
+ for (auto s : subtrees) {
+ if (limit <= 0)
+ break;
+ if (crush->subtree_contains(s, n))
+ continue;
+ vector<int> osds;
+ crush->get_children_of_type(s, 0, &osds);
+ if (osds.empty())
+ continue;
+ vector<int> up_osds;
+ for (auto o : osds) {
+ if (is_up(o) && !skip.count(o))
+ up_osds.push_back(o);
+ }
+ if (up_osds.empty())
+ continue;
+ auto it = up_osds.begin();
+ std::advance(it, (n % up_osds.size()));
+ want->insert(*it);
+ --limit;
+ }
+}
+
+float OSDMap::pool_raw_used_rate(int64_t poolid) const
+{
+ const pg_pool_t *pool = get_pg_pool(poolid);
+ assert(pool != nullptr);
+
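+ // The result is the raw-to-stored space multiplier for the pool: e.g. a
+ // replicated pool of size 3 yields 3.0, and an erasure-coded pool with
+ // k=4, m=2 yields (4+2)/4 = 1.5.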
+ switch (pool->get_type()) {
+ case pg_pool_t::TYPE_REPLICATED:
+ return pool->get_size();
+ case pg_pool_t::TYPE_ERASURE:
+ {
+ auto& ecp =
+ get_erasure_code_profile(pool->erasure_code_profile);
+ auto pm = ecp.find("m");
+ auto pk = ecp.find("k");
+ if (pm != ecp.end() && pk != ecp.end()) {
+ int k = atoi(pk->second.c_str());
+ int m = atoi(pm->second.c_str());
+ int mk = m + k;
+ ceph_assert(mk != 0);
+ ceph_assert(k != 0);
+ return (float)mk / k;
+ } else {
+ return 0.0;
+ }
+ }
+ break;
+ default:
+ ceph_abort_msg("unrecognized pool type");
+ }
+}
+
+unsigned OSDMap::get_osd_crush_node_flags(int osd) const
+{
+ unsigned flags = 0;
+ if (!crush_node_flags.empty()) {
+ // the map will contain type -> name
+ std::map<std::string,std::string> ploc = crush->get_full_location(osd);
+ for (auto& i : ploc) {
+ int id = crush->get_item_id(i.second);
+ auto p = crush_node_flags.find(id);
+ if (p != crush_node_flags.end()) {
+ flags |= p->second;
+ }
+ }
+ }
+ return flags;
+}
+
+unsigned OSDMap::get_crush_node_flags(int id) const
+{
+ unsigned flags = 0;
+ auto it = crush_node_flags.find(id);
+ if (it != crush_node_flags.end())
+ flags = it->second;
+ return flags;
+}
+
+unsigned OSDMap::get_device_class_flags(int id) const
+{
+ unsigned flags = 0;
+ auto it = device_class_flags.find(id);
+ if (it != device_class_flags.end())
+ flags = it->second;
+ return flags;
+}
+
+std::optional<std::string> OSDMap::pending_require_osd_release() const
+{
+ if (HAVE_FEATURE(get_up_osd_features(), SERVER_PACIFIC) &&
+ require_osd_release < ceph_release_t::pacific) {
+ return "pacific";
+ }
+ if (HAVE_FEATURE(get_up_osd_features(), SERVER_OCTOPUS) &&
+ require_osd_release < ceph_release_t::octopus) {
+ return "octopus";
+ }
+ if (HAVE_FEATURE(get_up_osd_features(), SERVER_NAUTILUS) &&
+ require_osd_release < ceph_release_t::nautilus) {
+ return "nautilus";
+ }
+
+ return std::nullopt;
+}
diff --git a/src/osd/OSDMap.h b/src/osd/OSDMap.h
new file mode 100644
index 000000000..83ab75e0d
--- /dev/null
+++ b/src/osd/OSDMap.h
@@ -0,0 +1,1600 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
+ *
+ * Author: Loic Dachary <loic@dachary.org>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef CEPH_OSDMAP_H
+#define CEPH_OSDMAP_H
+
+/*
+ * describe properties of the OSD cluster.
+ * disks, disk groups, total # osds,
+ *
+ */
+#include <vector>
+#include <list>
+#include <set>
+#include <map>
+#include <memory>
+
+#include <boost/smart_ptr/local_shared_ptr.hpp>
+#include "include/btree_map.h"
+#include "include/common_fwd.h"
+#include "include/types.h"
+#include "common/ceph_releases.h"
+#include "osd_types.h"
+
+//#include "include/ceph_features.h"
+#include "crush/CrushWrapper.h"
+
+// forward declaration
+class CrushWrapper;
+class health_check_map_t;
+
+/*
+ * we track up to two intervals during which the osd was alive and
+ * healthy. the most recent is [up_from,up_thru), where up_thru is
+ * the last epoch the osd is known to have _started_. i.e., a lower
+ * bound on the actual osd death. down_at (if it is > up_from) is an
+ * upper bound on the actual osd death.
+ *
+ * the second is the last_clean interval [begin,end). in that case,
+ * the last interval is the last epoch known to have been either
+ * _finished_, or during which the osd cleanly shut down. when
+ * possible, we push this forward to the epoch the osd was eventually
+ * marked down.
+ *
+ * the lost_at is used to allow build_prior to proceed without waiting
+ * for an osd to recover. In certain cases, progress may be blocked
+ * because an osd is down that may contain updates (i.e., a pg may have
+ * gone rw during an interval). If the osd can't be brought online, we
+ * can force things to proceed knowing that we _might_ be losing some
+ * acked writes. If the osd comes back to life later, that's fine too,
+ * but those writes will still be lost (the divergent objects will be
+ * thrown out).
+ */
+struct osd_info_t {
+ epoch_t last_clean_begin; // last interval that ended with a clean osd shutdown
+ epoch_t last_clean_end;
+ epoch_t up_from; // epoch osd marked up
+ epoch_t up_thru; // lower bound on actual osd death (if > up_from)
+ epoch_t down_at; // upper bound on actual osd death (if > up_from)
+ epoch_t lost_at; // last epoch we decided data was "lost"
+
+ osd_info_t() : last_clean_begin(0), last_clean_end(0),
+ up_from(0), up_thru(0), down_at(0), lost_at(0) {}
+
+ void dump(ceph::Formatter *f) const;
+ void encode(ceph::buffer::list& bl) const;
+ void decode(ceph::buffer::list::const_iterator& bl);
+ static void generate_test_instances(std::list<osd_info_t*>& o);
+};
+WRITE_CLASS_ENCODER(osd_info_t)
+
+std::ostream& operator<<(std::ostream& out, const osd_info_t& info);
+
+struct osd_xinfo_t {
+ utime_t down_stamp; ///< timestamp when we were last marked down
+ float laggy_probability; ///< encoded as __u32: 0 = definitely not laggy, 0xffffffff = definitely laggy
+ __u32 laggy_interval; ///< average interval between being marked laggy and recovering
+ uint64_t features; ///< features supported by this osd we should know about
+ __u32 old_weight; ///< weight prior to being auto marked out
+ utime_t last_purged_snaps_scrub; ///< last scrub of purged_snaps
+ epoch_t dead_epoch = 0; ///< last epoch we were confirmed dead (not just down)
+
+ osd_xinfo_t() : laggy_probability(0), laggy_interval(0),
+ features(0), old_weight(0) {}
+
+ void dump(ceph::Formatter *f) const;
+ void encode(ceph::buffer::list& bl, uint64_t features) const;
+ void decode(ceph::buffer::list::const_iterator& bl);
+ static void generate_test_instances(std::list<osd_xinfo_t*>& o);
+};
+WRITE_CLASS_ENCODER_FEATURES(osd_xinfo_t)
+
+std::ostream& operator<<(std::ostream& out, const osd_xinfo_t& xi);
+
+
+struct PGTempMap {
+#if 1
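+ // Compact representation: 'map' points each pg_t at a ceph_le32 osd count
+ // inside 'data', immediately followed by that many ceph_le32 osd ids.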
+ ceph::buffer::list data;
+ typedef btree::btree_map<pg_t,ceph_le32*> map_t;
+ map_t map;
+
+ void encode(ceph::buffer::list& bl) const {
+ using ceph::encode;
+ uint32_t n = map.size();
+ encode(n, bl);
+ for (auto &p : map) {
+ encode(p.first, bl);
+ bl.append((char*)p.second, (*p.second + 1) * sizeof(ceph_le32));
+ }
+ }
+ void decode(ceph::buffer::list::const_iterator& p) {
+ using ceph::decode;
+ data.clear();
+ map.clear();
+ uint32_t n;
+ decode(n, p);
+ if (!n)
+ return;
+ auto pstart = p;
+ size_t start_off = pstart.get_off();
+ std::vector<std::pair<pg_t,size_t>> offsets;
+ offsets.resize(n);
+ for (unsigned i=0; i<n; ++i) {
+ pg_t pgid;
+ decode(pgid, p);
+ offsets[i].first = pgid;
+ offsets[i].second = p.get_off() - start_off;
+ uint32_t vn;
+ decode(vn, p);
+ p += vn * sizeof(int32_t);
+ }
+ size_t len = p.get_off() - start_off;
+ pstart.copy(len, data);
+ if (data.get_num_buffers() > 1) {
+ data.rebuild();
+ }
+ //map.reserve(n);
+ char *start = data.c_str();
+ for (auto i : offsets) {
+ map.insert(map.end(), std::make_pair(i.first, (ceph_le32*)(start + i.second)));
+ }
+ }
+ void rebuild() {
+ ceph::buffer::list bl;
+ encode(bl);
+ auto p = std::cbegin(bl);
+ decode(p);
+ }
+ friend bool operator==(const PGTempMap& l, const PGTempMap& r) {
+ return
+ l.map.size() == r.map.size() &&
+ l.data.contents_equal(r.data);
+ }
+
+ class iterator {
+ map_t::const_iterator it;
+ map_t::const_iterator end;
+ std::pair<pg_t,std::vector<int32_t>> current;
+ void init_current() {
+ if (it != end) {
+ current.first = it->first;
+ ceph_assert(it->second);
+ current.second.resize(*it->second);
+ ceph_le32 *p = it->second + 1;
+ for (uint32_t n = 0; n < *it->second; ++n, ++p) {
+ current.second[n] = *p;
+ }
+ }
+ }
+ public:
+ iterator(map_t::const_iterator p,
+ map_t::const_iterator e)
+ : it(p), end(e) {
+ init_current();
+ }
+
+ const std::pair<pg_t,std::vector<int32_t>>& operator*() const {
+ return current;
+ }
+ const std::pair<pg_t,std::vector<int32_t>>* operator->() const {
+ return &current;
+ }
+ friend bool operator==(const iterator& l, const iterator& r) {
+ return l.it == r.it;
+ }
+ friend bool operator!=(const iterator& l, const iterator& r) {
+ return l.it != r.it;
+ }
+ iterator& operator++() {
+ ++it;
+ if (it != end)
+ init_current();
+ return *this;
+ }
+ iterator operator++(int) {
+ iterator r = *this;
+ ++it;
+ if (it != end)
+ init_current();
+ return r;
+ }
+ };
+ iterator begin() const {
+ return iterator(map.begin(), map.end());
+ }
+ iterator end() const {
+ return iterator(map.end(), map.end());
+ }
+ iterator find(pg_t pgid) const {
+ return iterator(map.find(pgid), map.end());
+ }
+ size_t size() const {
+ return map.size();
+ }
+ size_t count(pg_t pgid) const {
+ return map.count(pgid);
+ }
+ void erase(pg_t pgid) {
+ map.erase(pgid);
+ }
+ void clear() {
+ map.clear();
+ data.clear();
+ }
+ void set(pg_t pgid, const mempool::osdmap::vector<int32_t>& v) {
+ using ceph::encode;
+ size_t need = sizeof(ceph_le32) * (1 + v.size());
+ if (need < data.get_append_buffer_unused_tail_length()) {
+ ceph::buffer::ptr z(data.get_append_buffer_unused_tail_length());
+ z.zero();
+ data.append(z.c_str(), z.length());
+ }
+ encode(v, data);
+ map[pgid] = (ceph_le32*)(data.back().end_c_str()) - (1 + v.size());
+ }
+ mempool::osdmap::vector<int32_t> get(pg_t pgid) {
+ mempool::osdmap::vector<int32_t> v;
+ ceph_le32 *p = map[pgid];
+ size_t n = *p++;
+ v.resize(n);
+ for (size_t i = 0; i < n; ++i, ++p) {
+ v[i] = *p;
+ }
+ return v;
+ }
+#else
+ // trivial implementation
+ mempool::osdmap::map<pg_t,mempool::osdmap::vector<int32_t> > pg_temp;
+
+ void encode(ceph::buffer::list& bl) const {
+ encode(pg_temp, bl);
+ }
+ void decode(ceph::buffer::list::const_iterator& p) {
+ decode(pg_temp, p);
+ }
+ friend bool operator==(const PGTempMap& l, const PGTempMap& r) {
+ return
+ l.pg_temp.size() == r.pg_temp.size() &&
+ l.pg_temp == r.pg_temp;
+ }
+
+ class iterator {
+ mempool::osdmap::map<pg_t,mempool::osdmap::vector<int32_t> >::const_iterator it;
+ public:
+ iterator(mempool::osdmap::map<pg_t,
+ mempool::osdmap::vector<int32_t> >::const_iterator p)
+ : it(p) {}
+
+ std::pair<pg_t,const mempool::osdmap::vector<int32_t>&> operator*() const {
+ return *it;
+ }
+ const std::pair<const pg_t,mempool::osdmap::vector<int32_t>>* operator->() const {
+ return &*it;
+ }
+ friend bool operator==(const iterator& l, const iterator& r) {
+ return l.it == r.it;
+ }
+ friend bool operator!=(const iterator& l, const iterator& r) {
+ return l.it != r.it;
+ }
+ iterator& operator++() {
+ ++it;
+ return *this;
+ }
+ iterator operator++(int) {
+ iterator r = *this;
+ ++it;
+ return r;
+ }
+ };
+ iterator begin() const {
+ return iterator(pg_temp.cbegin());
+ }
+ iterator end() const {
+ return iterator(pg_temp.cend());
+ }
+ iterator find(pg_t pgid) const {
+ return iterator(pg_temp.find(pgid));
+ }
+ size_t size() const {
+ return pg_temp.size();
+ }
+ size_t count(pg_t pgid) const {
+ return pg_temp.count(pgid);
+ }
+ void erase(pg_t pgid) {
+ pg_temp.erase(pgid);
+ }
+ void clear() {
+ pg_temp.clear();
+ }
+ void set(pg_t pgid, const mempool::osdmap::vector<int32_t>& v) {
+ pg_temp[pgid] = v;
+ }
+ const mempool::osdmap::vector<int32_t>& get(pg_t pgid) {
+ return pg_temp.at(pgid);
+ }
+#endif
+ void dump(ceph::Formatter *f) const {
+ for (const auto &pg : *this) {
+ f->open_object_section("osds");
+ f->dump_stream("pgid") << pg.first;
+ f->open_array_section("osds");
+ for (const auto osd : pg.second)
+ f->dump_int("osd", osd);
+ f->close_section();
+ f->close_section();
+ }
+ }
+};
+WRITE_CLASS_ENCODER(PGTempMap)
+
+/** OSDMap
+ */
+class OSDMap {
+public:
+ MEMPOOL_CLASS_HELPERS();
+
+ class Incremental {
+ public:
+ MEMPOOL_CLASS_HELPERS();
+
+ /// feature bits we were encoded with. the subsequent OSDMap
+ /// encoding should match.
+ uint64_t encode_features;
+ uuid_d fsid;
+ epoch_t epoch; // new epoch; we are a diff from epoch-1 to epoch
+ utime_t modified;
+ int64_t new_pool_max; // incremented by the OSDMonitor on each pool create
+ int32_t new_flags;
+ ceph_release_t new_require_osd_release{0xff};
+ uint32_t new_stretch_bucket_count{0};
+ uint32_t new_degraded_stretch_mode{0};
+ uint32_t new_recovering_stretch_mode{0};
+ int32_t new_stretch_mode_bucket{0};
+ bool stretch_mode_enabled{false};
+ bool change_stretch_mode{false};
+
+ // full (rare)
+ ceph::buffer::list fullmap; // in lieu of below.
+ ceph::buffer::list crush;
+
+ // incremental
+ int32_t new_max_osd;
+ mempool::osdmap::map<int64_t,pg_pool_t> new_pools;
+ mempool::osdmap::map<int64_t,std::string> new_pool_names;
+ mempool::osdmap::set<int64_t> old_pools;
+ mempool::osdmap::map<std::string,std::map<std::string,std::string> > new_erasure_code_profiles;
+ mempool::osdmap::vector<std::string> old_erasure_code_profiles;
+ mempool::osdmap::map<int32_t,entity_addrvec_t> new_up_client;
+ mempool::osdmap::map<int32_t,entity_addrvec_t> new_up_cluster;
+ mempool::osdmap::map<int32_t,uint32_t> new_state; // XORed onto previous state.
+ mempool::osdmap::map<int32_t,uint32_t> new_weight;
+ mempool::osdmap::map<pg_t,mempool::osdmap::vector<int32_t> > new_pg_temp; // [] to remove
+ mempool::osdmap::map<pg_t, int32_t> new_primary_temp; // [-1] to remove
+ mempool::osdmap::map<int32_t,uint32_t> new_primary_affinity;
+ mempool::osdmap::map<int32_t,epoch_t> new_up_thru;
+ mempool::osdmap::map<int32_t,std::pair<epoch_t,epoch_t> > new_last_clean_interval;
+ mempool::osdmap::map<int32_t,epoch_t> new_lost;
+ mempool::osdmap::map<int32_t,uuid_d> new_uuid;
+ mempool::osdmap::map<int32_t,osd_xinfo_t> new_xinfo;
+
+ mempool::osdmap::map<entity_addr_t,utime_t> new_blocklist;
+ mempool::osdmap::vector<entity_addr_t> old_blocklist;
+ mempool::osdmap::map<entity_addr_t,utime_t> new_range_blocklist;
+ mempool::osdmap::vector<entity_addr_t> old_range_blocklist;
+ mempool::osdmap::map<int32_t, entity_addrvec_t> new_hb_back_up;
+ mempool::osdmap::map<int32_t, entity_addrvec_t> new_hb_front_up;
+
+ mempool::osdmap::map<pg_t,mempool::osdmap::vector<int32_t>> new_pg_upmap;
+ mempool::osdmap::map<pg_t,mempool::osdmap::vector<std::pair<int32_t,int32_t>>> new_pg_upmap_items;
+ mempool::osdmap::set<pg_t> old_pg_upmap, old_pg_upmap_items;
+ mempool::osdmap::map<int64_t, snap_interval_set_t> new_removed_snaps;
+ mempool::osdmap::map<int64_t, snap_interval_set_t> new_purged_snaps;
+
+ mempool::osdmap::map<int32_t,uint32_t> new_crush_node_flags;
+ mempool::osdmap::map<int32_t,uint32_t> new_device_class_flags;
+
+ std::string cluster_snapshot;
+
+ float new_nearfull_ratio = -1;
+ float new_backfillfull_ratio = -1;
+ float new_full_ratio = -1;
+
+ ceph_release_t new_require_min_compat_client{0xff};
+
+ utime_t new_last_up_change, new_last_in_change;
+
+ mutable bool have_crc; ///< crc values are defined
+ uint32_t full_crc; ///< crc of the resulting OSDMap
+ mutable uint32_t inc_crc; ///< crc of this incremental
+
+ int get_net_marked_out(const OSDMap *previous) const;
+ int get_net_marked_down(const OSDMap *previous) const;
+ int identify_osd(uuid_d u) const;
+
+ void encode_client_old(ceph::buffer::list& bl) const;
+ void encode_classic(ceph::buffer::list& bl, uint64_t features) const;
+ void encode(ceph::buffer::list& bl, uint64_t features=CEPH_FEATURES_ALL) const;
+ void decode_classic(ceph::buffer::list::const_iterator &p);
+ void decode(ceph::buffer::list::const_iterator &bl);
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<Incremental*>& o);
+
+ explicit Incremental(epoch_t e=0) :
+ encode_features(0),
+ epoch(e), new_pool_max(-1), new_flags(-1), new_max_osd(-1),
+ have_crc(false), full_crc(0), inc_crc(0) {
+ }
+ explicit Incremental(ceph::buffer::list &bl) {
+ auto p = std::cbegin(bl);
+ decode(p);
+ }
+ explicit Incremental(ceph::buffer::list::const_iterator &p) {
+ decode(p);
+ }
+
+ pg_pool_t *get_new_pool(int64_t pool, const pg_pool_t *orig) {
+ if (new_pools.count(pool) == 0)
+ new_pools[pool] = *orig;
+ return &new_pools[pool];
+ }
+ bool has_erasure_code_profile(const std::string &name) const {
+ auto i = new_erasure_code_profiles.find(name);
+ return i != new_erasure_code_profiles.end();
+ }
+ void set_erasure_code_profile(const std::string &name,
+ const std::map<std::string,std::string>& profile) {
+ new_erasure_code_profiles[name] = profile;
+ }
+ mempool::osdmap::map<std::string,std::map<std::string,std::string>> get_erasure_code_profiles() const {
+ return new_erasure_code_profiles;
+ }
+
+ /// propagate updated pools' (snap and other) metadata to any of their tiers
+ int propagate_base_properties_to_tiers(CephContext *cct, const OSDMap &base);
+
+ /// collect osds with a pending state change
+ size_t get_pending_state_osds(std::vector<int> *osds) {
+ ceph_assert(osds);
+ osds->clear();
+
+ for (auto &p : new_state) {
+ osds->push_back(p.first);
+ }
+
+ return osds->size();
+ }
+
+ bool pending_osd_has_state(int osd, unsigned state) {
+ return new_state.count(osd) && (new_state[osd] & state) != 0;
+ }
+
+ bool pending_osd_state_set(int osd, unsigned state) {
+ if (pending_osd_has_state(osd, state))
+ return false;
+ new_state[osd] |= state;
+ return true;
+ }
+
+ // cancel the specified pending osd state if there is any
+ // return true on success, false otherwise.
+ bool pending_osd_state_clear(int osd, unsigned state) {
+ if (!pending_osd_has_state(osd, state)) {
+ // never has been set or already has been cancelled.
+ return false;
+ }
+
+ new_state[osd] &= ~state;
+ if (!new_state[osd]) {
+ // all flags cleared
+ new_state.erase(osd);
+ }
+ return true;
+ }
+
+ bool in_new_removed_snaps(int64_t pool, snapid_t snap) const {
+ auto p = new_removed_snaps.find(pool);
+ if (p == new_removed_snaps.end()) {
+ return false;
+ }
+ return p->second.contains(snap);
+ }
+ };
+
+private:
+ uuid_d fsid;
+ epoch_t epoch; // what epoch of the osd cluster descriptor is this
+ utime_t created, modified; // epoch start time
+ int32_t pool_max; // the largest pool num, ever
+
+ uint32_t flags;
+
+ int num_osd; // not saved; see calc_num_osds
+ int num_up_osd; // not saved; see calc_num_osds
+ int num_in_osd; // not saved; see calc_num_osds
+
+ int32_t max_osd;
+ std::vector<uint32_t> osd_state;
+
+ mempool::osdmap::map<int32_t,uint32_t> crush_node_flags; // crush node -> CEPH_OSD_* flags
+ mempool::osdmap::map<int32_t,uint32_t> device_class_flags; // device class -> CEPH_OSD_* flags
+
+ utime_t last_up_change, last_in_change;
+
+ // These features affect OSDMap[::Incremental] encoding, or the
+ // encoding of some type embedded therein (CrushWrapper, something
+ // from osd_types, etc.).
+ static constexpr uint64_t SIGNIFICANT_FEATURES =
+ CEPH_FEATUREMASK_PGID64 |
+ CEPH_FEATUREMASK_PGPOOL3 |
+ CEPH_FEATUREMASK_OSDENC |
+ CEPH_FEATUREMASK_OSDMAP_ENC |
+ CEPH_FEATUREMASK_OSD_POOLRESEND |
+ CEPH_FEATUREMASK_NEW_OSDOP_ENCODING |
+ CEPH_FEATUREMASK_MSG_ADDR2 |
+ CEPH_FEATUREMASK_CRUSH_TUNABLES5 |
+ CEPH_FEATUREMASK_CRUSH_CHOOSE_ARGS |
+ CEPH_FEATUREMASK_SERVER_LUMINOUS |
+ CEPH_FEATUREMASK_SERVER_MIMIC |
+ CEPH_FEATUREMASK_SERVER_NAUTILUS |
+ CEPH_FEATUREMASK_SERVER_OCTOPUS;
+
+ struct addrs_s {
+ mempool::osdmap::vector<std::shared_ptr<entity_addrvec_t> > client_addrs;
+ mempool::osdmap::vector<std::shared_ptr<entity_addrvec_t> > cluster_addrs;
+ mempool::osdmap::vector<std::shared_ptr<entity_addrvec_t> > hb_back_addrs;
+ mempool::osdmap::vector<std::shared_ptr<entity_addrvec_t> > hb_front_addrs;
+ };
+ std::shared_ptr<addrs_s> osd_addrs;
+
+ entity_addrvec_t _blank_addrvec;
+
+ mempool::osdmap::vector<__u32> osd_weight; // 16.16 fixed point, 0x10000 = "in", 0 = "out"
+ mempool::osdmap::vector<osd_info_t> osd_info;
+ std::shared_ptr<PGTempMap> pg_temp; // temp pg mapping (e.g. while we rebuild)
+ std::shared_ptr< mempool::osdmap::map<pg_t,int32_t > > primary_temp; // temp primary mapping (e.g. while we rebuild)
+ std::shared_ptr< mempool::osdmap::vector<__u32> > osd_primary_affinity; ///< 16.16 fixed point, 0x10000 = baseline
+
+ // remap (post-CRUSH, pre-up)
+ mempool::osdmap::map<pg_t,mempool::osdmap::vector<int32_t>> pg_upmap; ///< remap pg
+ mempool::osdmap::map<pg_t,mempool::osdmap::vector<std::pair<int32_t,int32_t>>> pg_upmap_items; ///< remap osds in up set
+
+ mempool::osdmap::map<int64_t,pg_pool_t> pools;
+ mempool::osdmap::map<int64_t,std::string> pool_name;
+ mempool::osdmap::map<std::string, std::map<std::string,std::string>> erasure_code_profiles;
+ mempool::osdmap::map<std::string,int64_t, std::less<>> name_pool;
+
+ std::shared_ptr< mempool::osdmap::vector<uuid_d> > osd_uuid;
+ mempool::osdmap::vector<osd_xinfo_t> osd_xinfo;
+
+ class range_bits {
+ struct ip6 {
+ uint64_t upper_64_bits, lower_64_bits;
+ uint64_t upper_mask, lower_mask;
+ };
+ struct ip4 {
+ uint32_t ip_32_bits;
+ uint32_t mask;
+ };
+ union {
+ ip6 ipv6;
+ ip4 ipv4;
+ } bits;
+ bool ipv6;
+ static void get_ipv6_bytes(unsigned const char *addr,
+ uint64_t *upper, uint64_t *lower);
+ public:
+ range_bits();
+ range_bits(const entity_addr_t& addr);
+ void parse(const entity_addr_t& addr);
+ bool matches(const entity_addr_t& addr) const;
+ };
+ mempool::osdmap::unordered_map<entity_addr_t,utime_t> blocklist;
+ mempool::osdmap::map<entity_addr_t,utime_t> range_blocklist;
+ mempool::osdmap::map<entity_addr_t,range_bits> calculated_ranges;
+
+ /// queue of snaps to remove
+ mempool::osdmap::map<int64_t, snap_interval_set_t> removed_snaps_queue;
+
+ /// removed_snaps additions this epoch
+ mempool::osdmap::map<int64_t, snap_interval_set_t> new_removed_snaps;
+
+ /// removed_snaps removals this epoch
+ mempool::osdmap::map<int64_t, snap_interval_set_t> new_purged_snaps;
+
+ epoch_t cluster_snapshot_epoch;
+ std::string cluster_snapshot;
+ bool new_blocklist_entries;
+
+ float full_ratio = 0, backfillfull_ratio = 0, nearfull_ratio = 0;
+
+ /// min compat client we want to support
+ ceph_release_t require_min_compat_client{ceph_release_t::unknown};
+
+public:
+ /// require osds to run at least this release
+ ceph_release_t require_osd_release{ceph_release_t::unknown};
+
+private:
+ mutable uint64_t cached_up_osd_features;
+
+ mutable bool crc_defined;
+ mutable uint32_t crc;
+
+ void _calc_up_osd_features();
+
+ public:
+ bool have_crc() const { return crc_defined; }
+ uint32_t get_crc() const { return crc; }
+
+ std::shared_ptr<CrushWrapper> crush; // hierarchical map
+ bool stretch_mode_enabled; // we are in stretch mode, requiring multiple sites
+ uint32_t stretch_bucket_count; // number of sites we expect to be in
+ uint32_t degraded_stretch_mode; // 0 if not degraded; else count of up sites
+ uint32_t recovering_stretch_mode; // 0 if not recovering; else 1
+ int32_t stretch_mode_bucket; // the bucket type we're stretched across
+private:
+ uint32_t crush_version = 1;
+
+ friend class OSDMonitor;
+
+ public:
+ OSDMap() : epoch(0),
+ pool_max(0),
+ flags(0),
+ num_osd(0), num_up_osd(0), num_in_osd(0),
+ max_osd(0),
+ osd_addrs(std::make_shared<addrs_s>()),
+ pg_temp(std::make_shared<PGTempMap>()),
+ primary_temp(std::make_shared<mempool::osdmap::map<pg_t,int32_t>>()),
+ osd_uuid(std::make_shared<mempool::osdmap::vector<uuid_d>>()),
+ cluster_snapshot_epoch(0),
+ new_blocklist_entries(false),
+ cached_up_osd_features(0),
+ crc_defined(false), crc(0),
+ crush(std::make_shared<CrushWrapper>()),
+ stretch_mode_enabled(false), stretch_bucket_count(0),
+ degraded_stretch_mode(0), recovering_stretch_mode(0), stretch_mode_bucket(0) {
+ }
+
+private:
+ OSDMap(const OSDMap& other) = default;
+ OSDMap& operator=(const OSDMap& other) = default;
+public:
+
+ /// return feature mask subset that is relevant to OSDMap encoding
+ static uint64_t get_significant_features(uint64_t features) {
+ return SIGNIFICANT_FEATURES & features;
+ }
+
+ uint64_t get_encoding_features() const;
+
+ void deepish_copy_from(const OSDMap& o) {
+ *this = o;
+ primary_temp.reset(new mempool::osdmap::map<pg_t,int32_t>(*o.primary_temp));
+ pg_temp.reset(new PGTempMap(*o.pg_temp));
+ osd_uuid.reset(new mempool::osdmap::vector<uuid_d>(*o.osd_uuid));
+
+ if (o.osd_primary_affinity)
+ osd_primary_affinity.reset(new mempool::osdmap::vector<__u32>(*o.osd_primary_affinity));
+
+ // NOTE: this still references shared entity_addrvec_t's.
+ osd_addrs.reset(new addrs_s(*o.osd_addrs));
+
+ // NOTE: we do not copy crush. note that apply_incremental will
+ // allocate a new CrushWrapper, though.
+ }
+
+ // map info
+ const uuid_d& get_fsid() const { return fsid; }
+ void set_fsid(uuid_d& f) { fsid = f; }
+
+ epoch_t get_epoch() const { return epoch; }
+ void inc_epoch() { epoch++; }
+
+ void set_epoch(epoch_t e);
+
+ uint32_t get_crush_version() const {
+ return crush_version;
+ }
+
+ /* stamps etc */
+ const utime_t& get_created() const { return created; }
+ const utime_t& get_modified() const { return modified; }
+
+ bool is_blocklisted(const entity_addr_t& a, CephContext *cct=nullptr) const;
+ bool is_blocklisted(const entity_addrvec_t& a, CephContext *cct=nullptr) const;
+ void get_blocklist(std::list<std::pair<entity_addr_t,utime_t > > *bl,
+ std::list<std::pair<entity_addr_t,utime_t> > *rl) const;
+ void get_blocklist(std::set<entity_addr_t> *bl,
+ std::set<entity_addr_t> *rl) const;
+
+ std::string get_cluster_snapshot() const {
+ if (cluster_snapshot_epoch == epoch)
+ return cluster_snapshot;
+ return std::string();
+ }
+
+ float get_full_ratio() const {
+ return full_ratio;
+ }
+ float get_backfillfull_ratio() const {
+ return backfillfull_ratio;
+ }
+ float get_nearfull_ratio() const {
+ return nearfull_ratio;
+ }
+ void get_full_pools(CephContext *cct,
+ std::set<int64_t> *full,
+ std::set<int64_t> *backfillfull,
+ std::set<int64_t> *nearfull) const;
+ void get_full_osd_counts(std::set<int> *full, std::set<int> *backfill,
+ std::set<int> *nearfull) const;
+
+
+ /***** cluster state *****/
+ /* osds */
+ int get_max_osd() const { return max_osd; }
+ void set_max_osd(int m);
+
+ unsigned get_num_osds() const {
+ return num_osd;
+ }
+ unsigned get_num_up_osds() const {
+ return num_up_osd;
+ }
+ unsigned get_num_in_osds() const {
+ return num_in_osd;
+ }
+ /// recalculate cached values for get_num{,_up,_in}_osds
+ int calc_num_osds();
+
+ void get_all_osds(std::set<int32_t>& ls) const;
+ void get_up_osds(std::set<int32_t>& ls) const;
+ void get_out_existing_osds(std::set<int32_t>& ls) const;
+ unsigned get_num_pg_temp() const {
+ return pg_temp->size();
+ }
+
+ int get_flags() const { return flags; }
+ bool test_flag(int f) const { return flags & f; }
+ void set_flag(int f) { flags |= f; }
+ void clear_flag(int f) { flags &= ~f; }
+
+ void get_flag_set(std::set<std::string> *flagset) const;
+
+ static void calc_state_set(int state, std::set<std::string>& st);
+
+ int get_state(int o) const {
+ ceph_assert(o < max_osd);
+ return osd_state[o];
+ }
+ int get_state(int o, std::set<std::string>& st) const {
+ ceph_assert(o < max_osd);
+ unsigned t = osd_state[o];
+ calc_state_set(t, st);
+ return osd_state[o];
+ }
+ void set_state(int o, unsigned s) {
+ ceph_assert(o < max_osd);
+ osd_state[o] = s;
+ }
+ void set_weight(int o, unsigned w) {
+ ceph_assert(o < max_osd);
+ osd_weight[o] = w;
+ if (w)
+ osd_state[o] |= CEPH_OSD_EXISTS;
+ }
+ unsigned get_weight(int o) const {
+ ceph_assert(o < max_osd);
+ return osd_weight[o];
+ }
+ float get_weightf(int o) const {
+ return (float)get_weight(o) / (float)CEPH_OSD_IN;
+ }
+ void adjust_osd_weights(const std::map<int,double>& weights, Incremental& inc) const;
+
+ void set_primary_affinity(int o, int w) {
+ ceph_assert(o < max_osd);
+ if (!osd_primary_affinity)
+ osd_primary_affinity.reset(
+ new mempool::osdmap::vector<__u32>(
+ max_osd, CEPH_OSD_DEFAULT_PRIMARY_AFFINITY));
+ (*osd_primary_affinity)[o] = w;
+ }
+ unsigned get_primary_affinity(int o) const {
+ ceph_assert(o < max_osd);
+ if (!osd_primary_affinity)
+ return CEPH_OSD_DEFAULT_PRIMARY_AFFINITY;
+ return (*osd_primary_affinity)[o];
+ }
+ float get_primary_affinityf(int o) const {
+ return (float)get_primary_affinity(o) / (float)CEPH_OSD_MAX_PRIMARY_AFFINITY;
+ }
+
+ bool has_erasure_code_profile(const std::string &name) const {
+ auto i = erasure_code_profiles.find(name);
+ return i != erasure_code_profiles.end();
+ }
+ int get_erasure_code_profile_default(CephContext *cct,
+ std::map<std::string,std::string> &profile_map,
+ std::ostream *ss);
+ void set_erasure_code_profile(const std::string &name,
+ const std::map<std::string,std::string>& profile) {
+ erasure_code_profiles[name] = profile;
+ }
+ const std::map<std::string,std::string> &get_erasure_code_profile(
+ const std::string &name) const {
+ static std::map<std::string,std::string> empty;
+ auto i = erasure_code_profiles.find(name);
+ if (i == erasure_code_profiles.end())
+ return empty;
+ else
+ return i->second;
+ }
+ const mempool::osdmap::map<std::string,std::map<std::string,std::string>> &get_erasure_code_profiles() const {
+ return erasure_code_profiles;
+ }
+
+ bool exists(int osd) const {
+ //assert(osd >= 0);
+ return osd >= 0 && osd < max_osd && (osd_state[osd] & CEPH_OSD_EXISTS);
+ }
+
+ bool is_destroyed(int osd) const {
+ return exists(osd) && (osd_state[osd] & CEPH_OSD_DESTROYED);
+ }
+
+ bool is_up(int osd) const {
+ return exists(osd) && (osd_state[osd] & CEPH_OSD_UP);
+ }
+
+ bool has_been_up_since(int osd, epoch_t epoch) const {
+ return is_up(osd) && get_up_from(osd) <= epoch;
+ }
+
+ bool is_down(int osd) const {
+ return !is_up(osd);
+ }
+
+ bool is_stop(int osd) const {
+ return exists(osd) && is_down(osd) &&
+ (osd_state[osd] & CEPH_OSD_STOP);
+ }
+
+ bool is_out(int osd) const {
+ return !exists(osd) || get_weight(osd) == CEPH_OSD_OUT;
+ }
+
+ bool is_in(int osd) const {
+ return !is_out(osd);
+ }
+
+ bool is_dead(int osd) const {
+ if (!exists(osd)) {
+ return false; // unclear if they know they are removed from map
+ }
+ return get_xinfo(osd).dead_epoch > get_info(osd).up_from;
+ }
+
+ unsigned get_osd_crush_node_flags(int osd) const;
+ unsigned get_crush_node_flags(int id) const;
+ unsigned get_device_class_flags(int id) const;
+
+ bool is_noup_by_osd(int osd) const {
+ return exists(osd) && (osd_state[osd] & CEPH_OSD_NOUP);
+ }
+
+ bool is_nodown_by_osd(int osd) const {
+ return exists(osd) && (osd_state[osd] & CEPH_OSD_NODOWN);
+ }
+
+ bool is_noin_by_osd(int osd) const {
+ return exists(osd) && (osd_state[osd] & CEPH_OSD_NOIN);
+ }
+
+ bool is_noout_by_osd(int osd) const {
+ return exists(osd) && (osd_state[osd] & CEPH_OSD_NOOUT);
+ }
+
+ bool is_noup(int osd) const {
+ if (test_flag(CEPH_OSDMAP_NOUP)) // global?
+ return true;
+ if (is_noup_by_osd(osd)) // by osd?
+ return true;
+ if (get_osd_crush_node_flags(osd) & CEPH_OSD_NOUP) // by crush-node?
+ return true;
+ if (auto class_id = crush->get_item_class_id(osd); class_id >= 0 &&
+ get_device_class_flags(class_id) & CEPH_OSD_NOUP) // by device-class?
+ return true;
+ return false;
+ }
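+ // is_nodown/is_noin/is_noout below follow the same cascade as is_noup above:
+ // cluster-wide flag, then per-osd state, then crush-node flags, then
+ // device-class flags; any level being set makes the check return true.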
+
+ bool is_nodown(int osd) const {
+ if (test_flag(CEPH_OSDMAP_NODOWN))
+ return true;
+ if (is_nodown_by_osd(osd))
+ return true;
+ if (get_osd_crush_node_flags(osd) & CEPH_OSD_NODOWN)
+ return true;
+ if (auto class_id = crush->get_item_class_id(osd); class_id >= 0 &&
+ get_device_class_flags(class_id) & CEPH_OSD_NODOWN)
+ return true;
+ return false;
+ }
+
+ bool is_noin(int osd) const {
+ if (test_flag(CEPH_OSDMAP_NOIN))
+ return true;
+ if (is_noin_by_osd(osd))
+ return true;
+ if (get_osd_crush_node_flags(osd) & CEPH_OSD_NOIN)
+ return true;
+ if (auto class_id = crush->get_item_class_id(osd); class_id >= 0 &&
+ get_device_class_flags(class_id) & CEPH_OSD_NOIN)
+ return true;
+ return false;
+ }
+
+ bool is_noout(int osd) const {
+ if (test_flag(CEPH_OSDMAP_NOOUT))
+ return true;
+ if (is_noout_by_osd(osd))
+ return true;
+ if (get_osd_crush_node_flags(osd) & CEPH_OSD_NOOUT)
+ return true;
+ if (auto class_id = crush->get_item_class_id(osd); class_id >= 0 &&
+ get_device_class_flags(class_id) & CEPH_OSD_NOOUT)
+ return true;
+ return false;
+ }
+
+ /**
+ * check if an entire crush subtree is down
+ */
+ bool subtree_is_down(int id, std::set<int> *down_cache) const;
+ bool containing_subtree_is_down(CephContext *cct, int osd, int subtree_type, std::set<int> *down_cache) const;
+
+ bool subtree_type_is_down(CephContext *cct, int id, int subtree_type, std::set<int> *down_in_osds, std::set<int> *up_in_osds,
+ std::set<int> *subtree_up, std::unordered_map<int, std::set<int> > *subtree_type_down) const;
+
+ int identify_osd(const entity_addr_t& addr) const;
+ int identify_osd(const uuid_d& u) const;
+ int identify_osd_on_all_channels(const entity_addr_t& addr) const;
+
+ bool have_addr(const entity_addr_t& addr) const {
+ return identify_osd(addr) >= 0;
+ }
+ int find_osd_on_ip(const entity_addr_t& ip) const;
+
+ const entity_addrvec_t& get_addrs(int osd) const {
+ ceph_assert(exists(osd));
+ return osd_addrs->client_addrs[osd] ?
+ *osd_addrs->client_addrs[osd] : _blank_addrvec;
+ }
+ const entity_addrvec_t& get_most_recent_addrs(int osd) const {
+ return get_addrs(osd);
+ }
+ const entity_addrvec_t &get_cluster_addrs(int osd) const {
+ ceph_assert(exists(osd));
+ return osd_addrs->cluster_addrs[osd] ?
+ *osd_addrs->cluster_addrs[osd] : _blank_addrvec;
+ }
+ const entity_addrvec_t &get_hb_back_addrs(int osd) const {
+ ceph_assert(exists(osd));
+ return osd_addrs->hb_back_addrs[osd] ?
+ *osd_addrs->hb_back_addrs[osd] : _blank_addrvec;
+ }
+ const entity_addrvec_t &get_hb_front_addrs(int osd) const {
+ ceph_assert(exists(osd));
+ return osd_addrs->hb_front_addrs[osd] ?
+ *osd_addrs->hb_front_addrs[osd] : _blank_addrvec;
+ }
+
+ const uuid_d& get_uuid(int osd) const {
+ ceph_assert(exists(osd));
+ return (*osd_uuid)[osd];
+ }
+
+ const epoch_t& get_up_from(int osd) const {
+ ceph_assert(exists(osd));
+ return osd_info[osd].up_from;
+ }
+ const epoch_t& get_up_thru(int osd) const {
+ ceph_assert(exists(osd));
+ return osd_info[osd].up_thru;
+ }
+ const epoch_t& get_down_at(int osd) const {
+ ceph_assert(exists(osd));
+ return osd_info[osd].down_at;
+ }
+ const osd_info_t& get_info(int osd) const {
+ ceph_assert(osd < max_osd);
+ return osd_info[osd];
+ }
+
+ const osd_xinfo_t& get_xinfo(int osd) const {
+ ceph_assert(osd < max_osd);
+ return osd_xinfo[osd];
+ }
+
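+ // Circular scans over [0, max_osd): return the next/previous osd id that is
+ // currently up, or -1 if no other osd is up.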
+ int get_next_up_osd_after(int n) const {
+ if (get_max_osd() == 0)
+ return -1;
+ for (int i = n + 1; i != n; ++i) {
+ if (i >= get_max_osd())
+ i = 0;
+ if (i == n)
+ break;
+ if (is_up(i))
+ return i;
+ }
+ return -1;
+ }
+
+ int get_previous_up_osd_before(int n) const {
+ if (get_max_osd() == 0)
+ return -1;
+ for (int i = n - 1; i != n; --i) {
+ if (i < 0)
+ i = get_max_osd() - 1;
+ if (i == n)
+ break;
+ if (is_up(i))
+ return i;
+ }
+ return -1;
+ }
+
+
+ void get_random_up_osds_by_subtree(int n, // whoami
+ std::string &subtree,
+ int limit, // how many
+ std::set<int> skip,
+ std::set<int> *want) const;
+
+ /**
+ * get feature bits required by the current structure
+ *
+ * @param entity_type [in] what entity type we are asking about
+ * @param mask [out] set of all possible map-related features we could set
+ * @return feature bits used by this map
+ */
+ uint64_t get_features(int entity_type, uint64_t *mask) const;
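+ // Illustrative call sketch (not part of this header):
+ //   uint64_t mask = 0;
+ //   uint64_t feat = osdmap.get_features(CEPH_ENTITY_TYPE_CLIENT, &mask);
+ //   // feat holds the bits this map actually uses; mask, the bits it could use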
+
+ /**
+ * get oldest *client* version (firefly, hammer, etc.) that can connect given
+ * the feature bits required (according to get_features()).
+ */
+ ceph_release_t get_min_compat_client() const;
+
+ /**
+ * gets the required minimum *client* version that can connect to the cluster.
+ */
+ ceph_release_t get_require_min_compat_client() const;
+
+ /**
+ * get intersection of features supported by up osds
+ */
+ uint64_t get_up_osd_features() const;
+
+ void get_upmap_pgs(std::vector<pg_t> *upmap_pgs) const;
+ bool check_pg_upmaps(
+ CephContext *cct,
+ const std::vector<pg_t>& to_check,
+ std::vector<pg_t> *to_cancel,
+ std::map<pg_t, mempool::osdmap::vector<std::pair<int,int>>> *to_remap) const;
+ void clean_pg_upmaps(
+ CephContext *cct,
+ Incremental *pending_inc,
+ const std::vector<pg_t>& to_cancel,
+ const std::map<pg_t, mempool::osdmap::vector<std::pair<int,int>>>& to_remap) const;
+ bool clean_pg_upmaps(CephContext *cct, Incremental *pending_inc) const;
+
+ int apply_incremental(const Incremental &inc);
+
+ /// try to re-use/reference addrs in oldmap from newmap
+ static void dedup(const OSDMap *oldmap, OSDMap *newmap);
+
+ static void clean_temps(CephContext *cct,
+ const OSDMap& oldmap,
+ const OSDMap& nextmap,
+ Incremental *pending_inc);
+
+ // serialize, unserialize
+private:
+ void encode_client_old(ceph::buffer::list& bl) const;
+ void encode_classic(ceph::buffer::list& bl, uint64_t features) const;
+ void decode_classic(ceph::buffer::list::const_iterator& p);
+ void post_decode();
+public:
+ void encode(ceph::buffer::list& bl, uint64_t features=CEPH_FEATURES_ALL) const;
+ void decode(ceph::buffer::list& bl);
+ void decode(ceph::buffer::list::const_iterator& bl);
+
+
+ /**** mapping facilities ****/
+ int map_to_pg(
+ int64_t pool,
+ const std::string& name,
+ const std::string& key,
+ const std::string& nspace,
+ pg_t *pg) const;
+ int object_locator_to_pg(const object_t& oid, const object_locator_t& loc,
+ pg_t &pg) const;
+ pg_t object_locator_to_pg(const object_t& oid,
+ const object_locator_t& loc) const {
+ pg_t pg;
+ int ret = object_locator_to_pg(oid, loc, pg);
+ ceph_assert(ret == 0);
+ return pg;
+ }
+
+
+ static object_locator_t file_to_object_locator(const file_layout_t& layout) {
+ return object_locator_t(layout.pool_id, layout.pool_ns);
+ }
+
+ ceph_object_layout file_to_object_layout(object_t oid,
+ file_layout_t& layout) const {
+ return make_object_layout(oid, layout.pool_id, layout.pool_ns);
+ }
+
+ ceph_object_layout make_object_layout(object_t oid, int pg_pool,
+ std::string nspace) const;
+
+ int get_pg_num(int pg_pool) const
+ {
+ const pg_pool_t *pool = get_pg_pool(pg_pool);
+ ceph_assert(NULL != pool);
+ return pool->get_pg_num();
+ }
+
+ bool pg_exists(pg_t pgid) const {
+ const pg_pool_t *p = get_pg_pool(pgid.pool());
+ return p && pgid.ps() < p->get_pg_num();
+ }
+
+ int get_pg_pool_min_size(pg_t pgid) const {
+ if (!pg_exists(pgid)) {
+ return -ENOENT;
+ }
+ const pg_pool_t *p = get_pg_pool(pgid.pool());
+ ceph_assert(p);
+ return p->get_min_size();
+ }
+
+ int get_pg_pool_size(pg_t pgid) const {
+ if (!pg_exists(pgid)) {
+ return -ENOENT;
+ }
+ const pg_pool_t *p = get_pg_pool(pgid.pool());
+ ceph_assert(p);
+ return p->get_size();
+ }
+
+ int get_pg_pool_crush_rule(pg_t pgid) const {
+ if (!pg_exists(pgid)) {
+ return -ENOENT;
+ }
+ const pg_pool_t *p = get_pg_pool(pgid.pool());
+ ceph_assert(p);
+ return p->get_crush_rule();
+ }
+
+private:
+ /// pg -> (raw osd list)
+ void _pg_to_raw_osds(
+ const pg_pool_t& pool, pg_t pg,
+ std::vector<int> *osds,
+ ps_t *ppps) const;
+ int _pick_primary(const std::vector<int>& osds) const;
+ void _remove_nonexistent_osds(const pg_pool_t& pool, std::vector<int>& osds) const;
+
+ void _apply_primary_affinity(ps_t seed, const pg_pool_t& pool,
+ std::vector<int> *osds, int *primary) const;
+
+ /// apply pg_upmap[_items] mappings
+ void _apply_upmap(const pg_pool_t& pi, pg_t pg, std::vector<int> *raw) const;
+
+ /// pg -> (up osd list)
+ void _raw_to_up_osds(const pg_pool_t& pool, const std::vector<int>& raw,
+ std::vector<int> *up) const;
+
+
+ /**
+ * Get the pg and primary temp, if they are specified.
+ * @param temp_pg [out] Will be empty or contain the temp PG mapping on return
+ * @param temp_primary [out] Will be the value in primary_temp, or a value derived
+ * from the pg_temp (if specified), or -1 if you should use the calculated (up_)primary.
+ */
+ void _get_temp_osds(const pg_pool_t& pool, pg_t pg,
+ std::vector<int> *temp_pg, int *temp_primary) const;
+
+ /**
+ * map to up and acting. Fills in whatever fields are non-NULL.
+ */
+ void _pg_to_up_acting_osds(const pg_t& pg, std::vector<int> *up, int *up_primary,
+ std::vector<int> *acting, int *acting_primary,
+ bool raw_pg_to_pg = true) const;
+
+public:
+ /***
+ * This is suitable only for looking at raw CRUSH outputs. It skips
+ * applying the temp and up checks and should not be used
+ * by anybody for data mapping purposes.
+ * raw and primary must be non-NULL
+ */
+ void pg_to_raw_osds(pg_t pg, std::vector<int> *raw, int *primary) const;
+ void pg_to_raw_upmap(pg_t pg, std::vector<int> *raw,
+ std::vector<int> *raw_upmap) const;
+ /// map a pg to its acting set. @return acting set size
+ void pg_to_acting_osds(const pg_t& pg, std::vector<int> *acting,
+ int *acting_primary) const {
+ _pg_to_up_acting_osds(pg, NULL, NULL, acting, acting_primary);
+ }
+ void pg_to_acting_osds(pg_t pg, std::vector<int>& acting) const {
+ return pg_to_acting_osds(pg, &acting, NULL);
+ }
+ /**
+ * This does not apply temp overrides and should not be used
+ * by anybody for data mapping purposes. Specify both pointers.
+ */
+ void pg_to_raw_up(pg_t pg, std::vector<int> *up, int *primary) const;
+ /**
+ * map a pg to its acting set as well as its up set. You must use
+ * the acting set for data mapping purposes, but some users will
+ * also find the up set useful for things like deciding what to
+ * set as pg_temp.
+ * Each of these pointers must be non-NULL.
+ */
+ void pg_to_up_acting_osds(pg_t pg, std::vector<int> *up, int *up_primary,
+ std::vector<int> *acting, int *acting_primary) const {
+ _pg_to_up_acting_osds(pg, up, up_primary, acting, acting_primary);
+ }
+ void pg_to_up_acting_osds(pg_t pg, std::vector<int>& up, std::vector<int>& acting) const {
+ int up_primary, acting_primary;
+ pg_to_up_acting_osds(pg, &up, &up_primary, &acting, &acting_primary);
+ }
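+ // Illustrative call sketch (not part of this header):
+ //   std::vector<int> up, acting;
+ //   int up_primary, acting_primary;
+ //   osdmap.pg_to_up_acting_osds(pgid, &up, &up_primary, &acting, &acting_primary);
+ //   // use 'acting' for data mapping; 'up' is useful e.g. when choosing pg_temp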
+ bool pg_is_ec(pg_t pg) const {
+ auto i = pools.find(pg.pool());
+ ceph_assert(i != pools.end());
+ return i->second.is_erasure();
+ }
+ bool get_primary_shard(const pg_t& pgid, spg_t *out) const {
+ auto i = get_pools().find(pgid.pool());
+ if (i == get_pools().end()) {
+ return false;
+ }
+ if (!i->second.is_erasure()) {
+ *out = spg_t(pgid);
+ return true;
+ }
+ int primary;
+ std::vector<int> acting;
+ pg_to_acting_osds(pgid, &acting, &primary);
+ for (uint8_t i = 0; i < acting.size(); ++i) {
+ if (acting[i] == primary) {
+ *out = spg_t(pgid, shard_id_t(i));
+ return true;
+ }
+ }
+ return false;
+ }
+ bool get_primary_shard(const pg_t& pgid, int *primary, spg_t *out) const {
+ auto i = get_pools().find(pgid.pool());
+ if (i == get_pools().end()) {
+ return false;
+ }
+ std::vector<int> acting;
+ pg_to_acting_osds(pgid, &acting, primary);
+ if (i->second.is_erasure()) {
+ for (uint8_t i = 0; i < acting.size(); ++i) {
+ if (acting[i] == *primary) {
+ *out = spg_t(pgid, shard_id_t(i));
+ return true;
+ }
+ }
+ } else {
+ *out = spg_t(pgid);
+ return true;
+ }
+ return false;
+ }
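+ // In both overloads above, a replicated pool yields spg_t(pgid) (no shard),
+ // while an erasure-coded pool encodes the primary's position in the acting
+ // set as the shard id.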
+
+ bool in_removed_snaps_queue(int64_t pool, snapid_t snap) const {
+ auto p = removed_snaps_queue.find(pool);
+ if (p == removed_snaps_queue.end()) {
+ return false;
+ }
+ return p->second.contains(snap);
+ }
+
+ const mempool::osdmap::map<int64_t,snap_interval_set_t>&
+ get_removed_snaps_queue() const {
+ return removed_snaps_queue;
+ }
+ const mempool::osdmap::map<int64_t,snap_interval_set_t>&
+ get_new_removed_snaps() const {
+ return new_removed_snaps;
+ }
+ const mempool::osdmap::map<int64_t,snap_interval_set_t>&
+ get_new_purged_snaps() const {
+ return new_purged_snaps;
+ }
+
+ int64_t lookup_pg_pool_name(std::string_view name) const {
+ auto p = name_pool.find(name);
+ if (p == name_pool.end())
+ return -ENOENT;
+ return p->second;
+ }
+
+ int64_t get_pool_max() const {
+ return pool_max;
+ }
+ const mempool::osdmap::map<int64_t,pg_pool_t>& get_pools() const {
+ return pools;
+ }
+ mempool::osdmap::map<int64_t,pg_pool_t>& get_pools() {
+ return pools;
+ }
+ void get_pool_ids_by_rule(int rule_id, std::set<int64_t> *pool_ids) const {
+ ceph_assert(pool_ids);
+ for (auto &p: pools) {
+ if (p.second.get_crush_rule() == rule_id) {
+ pool_ids->insert(p.first);
+ }
+ }
+ }
+ void get_pool_ids_by_osd(CephContext *cct,
+ int osd,
+ std::set<int64_t> *pool_ids) const;
+ const std::string& get_pool_name(int64_t p) const {
+ auto i = pool_name.find(p);
+ ceph_assert(i != pool_name.end());
+ return i->second;
+ }
+ const mempool::osdmap::map<int64_t,std::string>& get_pool_names() const {
+ return pool_name;
+ }
+ bool have_pg_pool(int64_t p) const {
+ return pools.count(p);
+ }
+ const pg_pool_t* get_pg_pool(int64_t p) const {
+ auto i = pools.find(p);
+ if (i != pools.end())
+ return &i->second;
+ return NULL;
+ }
+ unsigned get_pg_size(pg_t pg) const {
+ auto p = pools.find(pg.pool());
+ ceph_assert(p != pools.end());
+ return p->second.get_size();
+ }
+ int get_pg_type(pg_t pg) const {
+ auto p = pools.find(pg.pool());
+ ceph_assert(p != pools.end());
+ return p->second.get_type();
+ }
+ int get_pool_crush_rule(int64_t pool_id) const {
+ auto pool = get_pg_pool(pool_id);
+ if (!pool)
+ return -ENOENT;
+ return pool->get_crush_rule();
+ }
+
+
+ pg_t raw_pg_to_pg(pg_t pg) const {
+ auto p = pools.find(pg.pool());
+ ceph_assert(p != pools.end());
+ return p->second.raw_pg_to_pg(pg);
+ }
+
+ // pg -> acting primary osd
+ int get_pg_acting_primary(pg_t pg) const {
+ int primary = -1;
+ _pg_to_up_acting_osds(pg, nullptr, nullptr, nullptr, &primary);
+ return primary;
+ }
+
+ /*
+ * check whether an spg_t maps to a particular osd
+ */
+ bool is_up_acting_osd_shard(spg_t pg, int osd) const {
+ std::vector<int> up, acting;
+ _pg_to_up_acting_osds(pg.pgid, &up, NULL, &acting, NULL, false);
+ if (calc_pg_role(pg_shard_t(osd, pg.shard), acting) >= 0 ||
+ calc_pg_role(pg_shard_t(osd, pg.shard), up) >= 0) {
+ return true;
+ }
+ return false;
+ }
+
+
+ static int calc_pg_role_broken(int osd, const std::vector<int>& acting, int nrep=0);
+ static int calc_pg_role(pg_shard_t who, const std::vector<int>& acting);
+ static bool primary_changed_broken(
+ int oldprimary,
+ const std::vector<int> &oldacting,
+ int newprimary,
+ const std::vector<int> &newacting);
+
+ /* rank is -1 (stray), 0 (primary), 1,2,3,... (replica) */
+ int get_pg_acting_role(spg_t pg, int osd) const {
+ std::vector<int> group;
+ pg_to_acting_osds(pg.pgid, group);
+ return calc_pg_role(pg_shard_t(osd, pg.shard), group);
+ }
+
+ bool try_pg_upmap(
+ CephContext *cct,
+ pg_t pg, ///< pg to potentially remap
+ const std::set<int>& overfull, ///< osds we'd want to evacuate
+ const std::vector<int>& underfull, ///< osds to move to, in order of preference
+ const std::vector<int>& more_underfull, ///< less full osds to move to, in order of preference
+ std::vector<int> *orig,
+ std::vector<int> *out); ///< resulting alternative mapping
+
+ int calc_pg_upmaps(
+ CephContext *cct,
+ uint32_t max_deviation, ///< max deviation from target (value >= 1)
+ int max_iterations, ///< max iterations to run
+ const std::set<int64_t>& pools, ///< [optional] restrict to pool
+ Incremental *pending_inc
+ );
+
+ int get_osds_by_bucket_name(const std::string &name, std::set<int> *osds) const;
+
+ bool have_pg_upmaps(pg_t pg) const {
+ return pg_upmap.count(pg) ||
+ pg_upmap_items.count(pg);
+ }
+
+ bool check_full(const std::set<pg_shard_t> &missing_on) const {
+ for (auto shard : missing_on) {
+ if (get_state(shard.osd) & CEPH_OSD_FULL)
+ return true;
+ }
+ return false;
+ }
+
+ /*
+ * handy helpers to build simple maps...
+ */
+ /**
+ * Build an OSD map suitable for basic usage. If **num_osd** is >= 0
+ * it will be initialized with the specified number of OSDs in a
+ * single host. If **num_osd** is < 0 the layout of the OSD map will
+ * be built by reading the content of the configuration file.
+ *
+ * @param cct [in] in core ceph context
+ * @param e [in] initial epoch
+ * @param fsid [in] id of the cluster
+ * @param num_osd [in] number of OSDs if >= 0 or read from conf if < 0
+ * @return **0** on success, negative errno on error.
+ */
+private:
+ int build_simple_optioned(CephContext *cct, epoch_t e, uuid_d &fsid,
+ int num_osd, int pg_bits, int pgp_bits,
+ bool default_pool);
+public:
+ int build_simple(CephContext *cct, epoch_t e, uuid_d &fsid,
+ int num_osd) {
+ return build_simple_optioned(cct, e, fsid, num_osd, 0, 0, false);
+ }
+ int build_simple_with_pool(CephContext *cct, epoch_t e, uuid_d &fsid,
+ int num_osd, int pg_bits, int pgp_bits) {
+ return build_simple_optioned(cct, e, fsid, num_osd,
+ pg_bits, pgp_bits, true);
+ }
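+ // Illustrative sketch (not part of this header); uuid_d::generate_random()
+ // is assumed to be the usual helper for making a fresh fsid:
+ //   OSDMap m;
+ //   uuid_d fsid;
+ //   fsid.generate_random();
+ //   int r = m.build_simple(cct, 1 /* epoch */, fsid, 4 /* osds */);
+ //   // r == 0 on success, negative errno otherwise (see comment above)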
+ static int _build_crush_types(CrushWrapper& crush);
+ static int build_simple_crush_map(CephContext *cct, CrushWrapper& crush,
+ int num_osd, std::ostream *ss);
+ static int build_simple_crush_map_from_conf(CephContext *cct,
+ CrushWrapper& crush,
+ std::ostream *ss);
+ static int build_simple_crush_rules(
+ CephContext *cct, CrushWrapper& crush,
+ const std::string& root,
+ std::ostream *ss);
+
+ bool crush_rule_in_use(int rule_id) const;
+
+ int validate_crush_rules(CrushWrapper *crush, std::ostream *ss) const;
+
+ void clear_temp() {
+ pg_temp->clear();
+ primary_temp->clear();
+ }
+
+private:
+ void print_osd_line(int cur, std::ostream *out, ceph::Formatter *f) const;
+public:
+ void print(std::ostream& out) const;
+ void print_osd(int id, std::ostream& out) const;
+ void print_osds(std::ostream& out) const;
+ void print_pools(std::ostream& out) const;
+ void print_summary(ceph::Formatter *f, std::ostream& out,
+ const std::string& prefix, bool extra=false) const;
+ void print_oneline_summary(std::ostream& out) const;
+
+ enum {
+ DUMP_IN = 1, // only 'in' osds
+ DUMP_OUT = 2, // only 'out' osds
+ DUMP_UP = 4, // only 'up' osds
+ DUMP_DOWN = 8, // only 'down' osds
+ DUMP_DESTROYED = 16, // only 'destroyed' osds
+ };
+ void print_tree(ceph::Formatter *f, std::ostream *out,
+ unsigned dump_flags=0, std::string bucket="") const;
+
+ int summarize_mapping_stats(
+ OSDMap *newmap,
+ const std::set<int64_t> *pools,
+ std::string *out,
+ ceph::Formatter *f) const;
+
+ std::string get_flag_string() const;
+ static std::string get_flag_string(unsigned flags);
+ static void dump_erasure_code_profiles(
+ const mempool::osdmap::map<std::string,std::map<std::string,std::string> > &profiles,
+ ceph::Formatter *f);
+ void dump(ceph::Formatter *f) const;
+ void dump_osd(int id, ceph::Formatter *f) const;
+ void dump_osds(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<OSDMap*>& o);
+ bool check_new_blocklist_entries() const { return new_blocklist_entries; }
+
+ void check_health(CephContext *cct, health_check_map_t *checks) const;
+
+ int parse_osd_id_list(const std::vector<std::string>& ls,
+ std::set<int> *out,
+ std::ostream *ss) const;
+
+ float pool_raw_used_rate(int64_t poolid) const;
+ std::optional<std::string> pending_require_osd_release() const;
+
+};
+WRITE_CLASS_ENCODER_FEATURES(OSDMap)
+WRITE_CLASS_ENCODER_FEATURES(OSDMap::Incremental)
+
+#ifdef WITH_SEASTAR
+using OSDMapRef = boost::local_shared_ptr<const OSDMap>;
+#else
+using OSDMapRef = std::shared_ptr<const OSDMap>;
+#endif
+
+
+inline std::ostream& operator<<(std::ostream& out, const OSDMap& m) {
+ m.print_oneline_summary(out);
+ return out;
+}
+
+class PGMap;
+
+void print_osd_utilization(const OSDMap& osdmap,
+ const PGMap& pgmap,
+ std::ostream& out,
+ ceph::Formatter *f,
+ bool tree,
+ const std::string& filter);
+
+#endif
diff --git a/src/osd/OSDMapMapping.cc b/src/osd/OSDMapMapping.cc
new file mode 100644
index 000000000..9cd1fbf58
--- /dev/null
+++ b/src/osd/OSDMapMapping.cc
@@ -0,0 +1,207 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "OSDMapMapping.h"
+#include "OSDMap.h"
+
+#define dout_subsys ceph_subsys_mon
+
+#include "common/debug.h"
+
+using std::vector;
+
+MEMPOOL_DEFINE_OBJECT_FACTORY(OSDMapMapping, osdmapmapping,
+ osdmap_mapping);
+
+// ensure that we have a PoolMapping for each pool and that
+// the dimensions (pg_num and size) match up.
+void OSDMapMapping::_init_mappings(const OSDMap& osdmap)
+{
+ num_pgs = 0;
+ auto q = pools.begin();
+ for (auto& p : osdmap.get_pools()) {
+ num_pgs += p.second.get_pg_num();
+ // drop unneeded pools
+ while (q != pools.end() && q->first < p.first) {
+ q = pools.erase(q);
+ }
+ if (q != pools.end() && q->first == p.first) {
+ if (q->second.pg_num != p.second.get_pg_num() ||
+ q->second.size != p.second.get_size()) {
+ // pg_num changed
+ q = pools.erase(q);
+ } else {
+ // keep it
+ ++q;
+ continue;
+ }
+ }
+ pools.emplace(p.first, PoolMapping(p.second.get_size(),
+ p.second.get_pg_num(),
+ p.second.is_erasure()));
+ }
+ pools.erase(q, pools.end());
+ ceph_assert(pools.size() == osdmap.get_pools().size());
+}
+
+void OSDMapMapping::update(const OSDMap& osdmap)
+{
+ _start(osdmap);
+ for (auto& p : osdmap.get_pools()) {
+ _update_range(osdmap, p.first, 0, p.second.get_pg_num());
+ }
+ _finish(osdmap);
+ //_dump(); // for debugging
+}
+
+void OSDMapMapping::update(const OSDMap& osdmap, pg_t pgid)
+{
+ _update_range(osdmap, pgid.pool(), pgid.ps(), pgid.ps() + 1);
+}
+
+void OSDMapMapping::_build_rmap(const OSDMap& osdmap)
+{
+ acting_rmap.resize(osdmap.get_max_osd());
+ //up_rmap.resize(osdmap.get_max_osd());
+ for (auto& v : acting_rmap) {
+ v.resize(0);
+ }
+ //for (auto& v : up_rmap) {
+ // v.resize(0);
+ //}
+ for (auto& p : pools) {
+ pg_t pgid(0, p.first);
+ for (unsigned ps = 0; ps < p.second.pg_num; ++ps) {
+ pgid.set_ps(ps);
+ int32_t *row = &p.second.table[p.second.row_size() * ps];
+ for (int i = 0; i < row[2]; ++i) {
+ if (row[4 + i] != CRUSH_ITEM_NONE) {
+ acting_rmap[row[4 + i]].push_back(pgid);
+ }
+ }
+ //for (int i = 0; i < row[3]; ++i) {
+ //up_rmap[row[4 + p.second.size + i]].push_back(pgid);
+ //}
+ }
+ }
+}
+
+void OSDMapMapping::_finish(const OSDMap& osdmap)
+{
+ _build_rmap(osdmap);
+ epoch = osdmap.get_epoch();
+}
+
+void OSDMapMapping::_dump()
+{
+ for (auto& p : pools) {
+ std::cout << "pool " << p.first << std::endl;
+ for (unsigned i = 0; i < p.second.table.size(); ++i) {
+ std::cout << " " << p.second.table[i];
+ if (i % p.second.row_size() == p.second.row_size() - 1)
+ std::cout << std::endl;
+ }
+ }
+}
+
+void OSDMapMapping::_update_range(
+ const OSDMap& osdmap,
+ int64_t pool,
+ unsigned pg_begin,
+ unsigned pg_end)
+{
+ auto i = pools.find(pool);
+ ceph_assert(i != pools.end());
+ ceph_assert(pg_begin <= pg_end);
+ ceph_assert(pg_end <= i->second.pg_num);
+ for (unsigned ps = pg_begin; ps < pg_end; ++ps) {
+ std::vector<int> up, acting;
+ int up_primary, acting_primary;
+ osdmap.pg_to_up_acting_osds(
+ pg_t(ps, pool),
+ &up, &up_primary, &acting, &acting_primary);
+ i->second.set(ps, std::move(up), up_primary,
+ std::move(acting), acting_primary);
+ }
+}
+
+// ---------------------------
+
+void ParallelPGMapper::Job::finish_one()
+{
+ Context *fin = nullptr;
+ {
+ std::lock_guard l(lock);
+ if (--shards == 0) {
+ if (!aborted) {
+ finish = ceph_clock_now();
+ complete();
+ }
+ cond.notify_all();
+ fin = onfinish;
+ onfinish = nullptr;
+ }
+ }
+ if (fin) {
+ fin->complete(0);
+ }
+}
+
+void ParallelPGMapper::WQ::_process(Item *i, ThreadPool::TPHandle &h)
+{
+ ldout(m->cct, 20) << __func__ << " " << i->job << " pool " << i->pool
+ << " [" << i->begin << "," << i->end << ")"
+ << " pgs " << i->pgs
+ << dendl;
+ if (!i->pgs.empty())
+ i->job->process(i->pgs);
+ else
+ i->job->process(i->pool, i->begin, i->end);
+ i->job->finish_one();
+ delete i;
+}
+
+void ParallelPGMapper::queue(
+ Job *job,
+ unsigned pgs_per_item,
+ const vector<pg_t>& input_pgs)
+{
+ bool any = false;
+ if (!input_pgs.empty()) {
+ unsigned i = 0;
+ vector<pg_t> item_pgs;
+ item_pgs.reserve(pgs_per_item);
+ for (auto& pg : input_pgs) {
+ if (i < pgs_per_item) {
+ ++i;
+ item_pgs.push_back(pg);
+ }
+ if (i >= pgs_per_item) {
+ job->start_one();
+ wq.queue(new Item(job, item_pgs));
+ i = 0;
+ item_pgs.clear();
+ any = true;
+ }
+ }
+ if (!item_pgs.empty()) {
+ job->start_one();
+ wq.queue(new Item(job, item_pgs));
+ any = true;
+ }
+ ceph_assert(any);
+ return;
+ }
+ // no input pgs, load all from map
+ for (auto& p : job->osdmap->get_pools()) {
+ for (unsigned ps = 0; ps < p.second.get_pg_num(); ps += pgs_per_item) {
+ unsigned ps_end = std::min(ps + pgs_per_item, p.second.get_pg_num());
+ job->start_one();
+ wq.queue(new Item(job, p.first, ps, ps_end));
+ ldout(cct, 20) << __func__ << " " << job << " " << p.first << " [" << ps
+ << "," << ps_end << ")" << dendl;
+ any = true;
+ }
+ }
+ ceph_assert(any);
+}
diff --git a/src/osd/OSDMapMapping.h b/src/osd/OSDMapMapping.h
new file mode 100644
index 000000000..3274d02e4
--- /dev/null
+++ b/src/osd/OSDMapMapping.h
@@ -0,0 +1,352 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+
+#ifndef CEPH_OSDMAPMAPPING_H
+#define CEPH_OSDMAPMAPPING_H
+
+#include <vector>
+#include <map>
+
+#include "osd/osd_types.h"
+#include "common/WorkQueue.h"
+#include "common/Cond.h"
+
+class OSDMap;
+
+/// work queue to perform work on batches of pgids on multiple CPUs
+class ParallelPGMapper {
+public:
+ struct Job {
+ utime_t start, finish;
+ unsigned shards = 0;
+ const OSDMap *osdmap;
+ bool aborted = false;
+ Context *onfinish = nullptr;
+
+ ceph::mutex lock = ceph::make_mutex("ParallelPGMapper::Job::lock");
+ ceph::condition_variable cond;
+
+ Job(const OSDMap *om) : start(ceph_clock_now()), osdmap(om) {}
+ virtual ~Job() {
+ ceph_assert(shards == 0);
+ }
+
+ // child must implement either form of process
+ virtual void process(const std::vector<pg_t>& pgs) = 0;
+ virtual void process(int64_t poolid, unsigned ps_begin, unsigned ps_end) = 0;
+ virtual void complete() = 0;
+
+ void set_finish_event(Context *fin) {
+ lock.lock();
+ if (shards == 0) {
+ // already done.
+ lock.unlock();
+ fin->complete(0);
+ } else {
+ // set finisher
+ onfinish = fin;
+ lock.unlock();
+ }
+ }
+ bool is_done() {
+ std::lock_guard l(lock);
+ return shards == 0;
+ }
+ utime_t get_duration() {
+ return finish - start;
+ }
+ void wait() {
+ std::unique_lock l(lock);
+ cond.wait(l, [this] { return shards == 0; });
+ }
+ bool wait_for(double duration) {
+ utime_t until = start;
+ until += duration;
+ std::unique_lock l(lock);
+ while (shards > 0) {
+ if (ceph_clock_now() >= until) {
+ return false;
+ }
+ cond.wait(l);
+ }
+ return true;
+ }
+ void abort() {
+ Context *fin = nullptr;
+ {
+ std::unique_lock l(lock);
+ aborted = true;
+ fin = onfinish;
+ onfinish = nullptr;
+ cond.wait(l, [this] { return shards == 0; });
+ }
+ if (fin) {
+ fin->complete(-ECANCELED);
+ }
+ }
+
+ void start_one() {
+ std::lock_guard l(lock);
+ ++shards;
+ }
+ void finish_one();
+ };
+
+protected:
+ CephContext *cct;
+
+ struct Item {
+ Job *job;
+ int64_t pool;
+ unsigned begin, end;
+ std::vector<pg_t> pgs;
+
+ Item(Job *j, std::vector<pg_t> pgs) : job(j), pgs(pgs) {}
+ Item(Job *j, int64_t p, unsigned b, unsigned e)
+ : job(j),
+ pool(p),
+ begin(b),
+ end(e) {}
+ };
+ std::deque<Item*> q;
+
+ struct WQ : public ThreadPool::WorkQueue<Item> {
+ ParallelPGMapper *m;
+
+ WQ(ParallelPGMapper *m_, ThreadPool *tp)
+ : ThreadPool::WorkQueue<Item>(
+ "ParallelPGMapper::WQ",
+ ceph::make_timespan(m_->cct->_conf->threadpool_default_timeout),
+ ceph::timespan::zero(),
+ tp),
+ m(m_) {}
+
+ bool _enqueue(Item *i) override {
+ m->q.push_back(i);
+ return true;
+ }
+ void _dequeue(Item *i) override {
+ ceph_abort();
+ }
+ Item *_dequeue() override {
+ while (!m->q.empty()) {
+ Item *i = m->q.front();
+ m->q.pop_front();
+ if (i->job->aborted) {
+ i->job->finish_one();
+ delete i;
+ } else {
+ return i;
+ }
+ }
+ return nullptr;
+ }
+
+ void _process(Item *i, ThreadPool::TPHandle &h) override;
+
+ void _clear() override {
+ ceph_assert(_empty());
+ }
+
+ bool _empty() override {
+ return m->q.empty();
+ }
+ } wq;
+
+public:
+ ParallelPGMapper(CephContext *cct, ThreadPool *tp)
+ : cct(cct),
+ wq(this, tp) {}
+
+ void queue(
+ Job *job,
+ unsigned pgs_per_item,
+ const std::vector<pg_t>& input_pgs);
+
+ void drain() {
+ wq.drain();
+ }
+};
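+// Illustrative driver sketch (not part of this header); MyJob is a
+// hypothetical ParallelPGMapper::Job subclass implementing process()/complete():
+//   ParallelPGMapper mapper(cct, &tp);     // tp: an existing ThreadPool
+//   MyJob job(&osdmap);
+//   mapper.queue(&job, 128, {});           // 128 pgs per work item, whole map
+//   job.wait();                            // blocks until every shard finished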
+
+
+/// a precalculated mapping of every PG for a given OSDMap
+class OSDMapMapping {
+public:
+ MEMPOOL_CLASS_HELPERS();
+private:
+
+ struct PoolMapping {
+ MEMPOOL_CLASS_HELPERS();
+
+ unsigned size = 0;
+ unsigned pg_num = 0;
+ bool erasure = false;
+ mempool::osdmap_mapping::vector<int32_t> table;
+
+ size_t row_size() const {
+ return
+ 1 + // acting_primary
+ 1 + // up_primary
+ 1 + // num acting
+ 1 + // num up
+ size + // acting
+ size; // up
+ }
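+ // Row layout, as implied by row_size() and used by get()/set() below,
+ // e.g. for size == 3:
+ //   [ acting_primary, up_primary, num_acting, num_up,
+ //     acting[0], acting[1], acting[2], up[0], up[1], up[2] ]
+ // acting entries start at index 4; up entries at index 4 + size.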
+
+ PoolMapping(int s, int p, bool e)
+ : size(s),
+ pg_num(p),
+ erasure(e),
+ table(pg_num * row_size()) {
+ }
+
+ void get(size_t ps,
+ std::vector<int> *up,
+ int *up_primary,
+ std::vector<int> *acting,
+ int *acting_primary) const {
+ const int32_t *row = &table[row_size() * ps];
+ if (acting_primary) {
+ *acting_primary = row[0];
+ }
+ if (up_primary) {
+ *up_primary = row[1];
+ }
+ if (acting) {
+ acting->resize(row[2]);
+ for (int i = 0; i < row[2]; ++i) {
+ (*acting)[i] = row[4 + i];
+ }
+ }
+ if (up) {
+ up->resize(row[3]);
+ for (int i = 0; i < row[3]; ++i) {
+ (*up)[i] = row[4 + size + i];
+ }
+ }
+ }
+
+ void set(size_t ps,
+ const std::vector<int>& up,
+ int up_primary,
+ const std::vector<int>& acting,
+ int acting_primary) {
+ int32_t *row = &table[row_size() * ps];
+ row[0] = acting_primary;
+ row[1] = up_primary;
+ // these should always be <= the pool size, but just in case, avoid
+ // blowing out the array. Note that our mapping is not completely
+ // accurate in this case--this is just to avoid crashing.
+ row[2] = std::min<int32_t>(acting.size(), size);
+ row[3] = std::min<int32_t>(up.size(), size);
+ for (int i = 0; i < row[2]; ++i) {
+ row[4 + i] = acting[i];
+ }
+ for (int i = 0; i < row[3]; ++i) {
+ row[4 + size + i] = up[i];
+ }
+ }
+ };
+
+ mempool::osdmap_mapping::map<int64_t,PoolMapping> pools;
+ mempool::osdmap_mapping::vector<
+ mempool::osdmap_mapping::vector<pg_t>> acting_rmap; // osd -> pg
+ //unused: mempool::osdmap_mapping::vector<std::vector<pg_t>> up_rmap; // osd -> pg
+ epoch_t epoch = 0;
+ uint64_t num_pgs = 0;
+
+ void _init_mappings(const OSDMap& osdmap);
+ void _update_range(
+ const OSDMap& map,
+ int64_t pool,
+ unsigned pg_begin, unsigned pg_end);
+
+ void _build_rmap(const OSDMap& osdmap);
+
+ void _start(const OSDMap& osdmap) {
+ _init_mappings(osdmap);
+ }
+ void _finish(const OSDMap& osdmap);
+
+ void _dump();
+
+ friend class ParallelPGMapper;
+
+ struct MappingJob : public ParallelPGMapper::Job {
+ OSDMapMapping *mapping;
+ MappingJob(const OSDMap *osdmap, OSDMapMapping *m)
+ : Job(osdmap), mapping(m) {
+ mapping->_start(*osdmap);
+ }
+ void process(const std::vector<pg_t>& pgs) override {}
+ void process(int64_t pool, unsigned ps_begin, unsigned ps_end) override {
+ mapping->_update_range(*osdmap, pool, ps_begin, ps_end);
+ }
+ void complete() override {
+ mapping->_finish(*osdmap);
+ }
+ };
+
+public:
+ void get(pg_t pgid,
+ std::vector<int> *up,
+ int *up_primary,
+ std::vector<int> *acting,
+ int *acting_primary) const {
+ auto p = pools.find(pgid.pool());
+ ceph_assert(p != pools.end());
+ ceph_assert(pgid.ps() < p->second.pg_num);
+ p->second.get(pgid.ps(), up, up_primary, acting, acting_primary);
+ }
+
+ bool get_primary_and_shard(pg_t pgid,
+ int *acting_primary,
+ spg_t *spgid) {
+ auto p = pools.find(pgid.pool());
+ ceph_assert(p != pools.end());
+ ceph_assert(pgid.ps() < p->second.pg_num);
+ std::vector<int> acting;
+ p->second.get(pgid.ps(), nullptr, nullptr, &acting, acting_primary);
+ if (p->second.erasure) {
+ for (uint8_t i = 0; i < acting.size(); ++i) {
+ if (acting[i] == *acting_primary) {
+ *spgid = spg_t(pgid, shard_id_t(i));
+ return true;
+ }
+ }
+ return false;
+ } else {
+ *spgid = spg_t(pgid);
+ return true;
+ }
+ }
+
+ const mempool::osdmap_mapping::vector<pg_t>& get_osd_acting_pgs(unsigned osd) {
+ ceph_assert(osd < acting_rmap.size());
+ return acting_rmap[osd];
+ }
+
+ void update(const OSDMap& map);
+ void update(const OSDMap& map, pg_t pgid);
+
+ std::unique_ptr<MappingJob> start_update(
+ const OSDMap& map,
+ ParallelPGMapper& mapper,
+ unsigned pgs_per_item) {
+ std::unique_ptr<MappingJob> job(new MappingJob(&map, this));
+ mapper.queue(job.get(), pgs_per_item, {});
+ return job;
+ }
+
+ epoch_t get_epoch() const {
+ return epoch;
+ }
+
+ uint64_t get_num_pgs() const {
+ return num_pgs;
+ }
+};
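+// Illustrative lookup sketch (not part of this header):
+//   OSDMapMapping mapping;
+//   mapping.update(osdmap);       // precompute up/acting for every pg
+//   std::vector<int> up, acting;
+//   int up_primary, acting_primary;
+//   mapping.get(pgid, &up, &up_primary, &acting, &acting_primary);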
+
+
+#endif
diff --git a/src/osd/ObjectVersioner.h b/src/osd/ObjectVersioner.h
new file mode 100644
index 000000000..f7d756330
--- /dev/null
+++ b/src/osd/ObjectVersioner.h
@@ -0,0 +1,35 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_OSD_OBJECTVERSIONER_H
+#define CEPH_OSD_OBJECTVERSIONER_H
+
+class ObjectVersioner {
+ public:
+ pobject_t oid;
+
+ void get_versions(std::list<version_t>& ls);
+ version_t head(); // newest
+ version_t committed(); // last committed
+ version_t tail(); // oldest
+
+ /*
+ * prepare a new version, starting with "raw" transaction t.
+ */
+ void prepare(ObjectStore::Transaction& t, version_t v);
+ void rollback_to(version_t v);
+ void commit_to(version_t v);
+};
+
+#endif
diff --git a/src/osd/OpRequest.cc b/src/osd/OpRequest.cc
new file mode 100644
index 000000000..0eb92c23a
--- /dev/null
+++ b/src/osd/OpRequest.cc
@@ -0,0 +1,170 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+
+#include "OpRequest.h"
+#include "common/Formatter.h"
+#include <iostream>
+#include <vector>
+#include "common/debug.h"
+#include "common/config.h"
+#include "msg/Message.h"
+#include "messages/MOSDOp.h"
+#include "messages/MOSDRepOp.h"
+#include "messages/MOSDRepOpReply.h"
+#include "include/ceph_assert.h"
+#include "osd/osd_types.h"
+
+#ifdef WITH_LTTNG
+#define TRACEPOINT_DEFINE
+#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
+#include "tracing/oprequest.h"
+#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
+#undef TRACEPOINT_DEFINE
+#else
+#define tracepoint(...)
+#endif
+
+using std::ostream;
+using std::set;
+using std::string;
+using std::stringstream;
+
+using ceph::Formatter;
+
+OpRequest::OpRequest(Message* req, OpTracker* tracker)
+ : TrackedOp(tracker, req->get_throttle_stamp()),
+ request(req),
+ hit_flag_points(0),
+ latest_flag_point(0),
+ hitset_inserted(false) {
+ if (req->get_priority() < tracker->cct->_conf->osd_client_op_priority) {
+ // don't warn as quickly for low priority ops
+ warn_interval_multiplier = tracker->cct->_conf->osd_recovery_op_warn_multiple;
+ }
+ if (req->get_type() == CEPH_MSG_OSD_OP) {
+ reqid = static_cast<MOSDOp*>(req)->get_reqid();
+ } else if (req->get_type() == MSG_OSD_REPOP) {
+ reqid = static_cast<MOSDRepOp*>(req)->reqid;
+ } else if (req->get_type() == MSG_OSD_REPOPREPLY) {
+ reqid = static_cast<MOSDRepOpReply*>(req)->reqid;
+ }
+ req_src_inst = req->get_source_inst();
+}
+
+void OpRequest::_dump(Formatter *f) const
+{
+ Message *m = request;
+ f->dump_string("flag_point", state_string());
+ if (m->get_orig_source().is_client()) {
+ f->open_object_section("client_info");
+ stringstream client_name, client_addr;
+ client_name << req_src_inst.name;
+ client_addr << req_src_inst.addr;
+ f->dump_string("client", client_name.str());
+ f->dump_string("client_addr", client_addr.str());
+ f->dump_unsigned("tid", m->get_tid());
+ f->close_section(); // client_info
+ }
+
+ {
+ f->open_array_section("events");
+ std::lock_guard l(lock);
+
+ for (auto i = events.begin(); i != events.end(); ++i) {
+ f->open_object_section("event");
+ f->dump_string("event", i->str);
+ f->dump_stream("time") << i->stamp;
+
+ auto i_next = i + 1;
+
+ if (i_next < events.end()) {
+ f->dump_float("duration", i_next->stamp - i->stamp);
+ } else {
+ f->dump_float("duration", events.rbegin()->stamp - get_initiated());
+ }
+
+ f->close_section();
+ }
+ f->close_section();
+ }
+}
+
+void OpRequest::_dump_op_descriptor_unlocked(ostream& stream) const
+{
+ get_req()->print(stream);
+}
+
+void OpRequest::_unregistered() {
+ request->clear_data();
+ request->clear_payload();
+ request->release_message_throttle();
+ request->set_connection(nullptr);
+}
+
+int OpRequest::maybe_init_op_info(const OSDMap &osdmap) {
+ if (op_info.get_flags())
+ return 0;
+
+ auto m = get_req<MOSDOp>();
+
+#ifdef WITH_LTTNG
+ auto old_rmw_flags = op_info.get_flags();
+#endif
+ auto ret = op_info.set_from_op(m, osdmap);
+ tracepoint(oprequest, set_rmw_flags, reqid.name._type,
+ reqid.name._num, reqid.tid, reqid.inc,
+ op_info.get_flags(), old_rmw_flags, op_info.get_flags());
+ return ret;
+}
+
+void OpRequest::mark_flag_point(uint8_t flag, const char *s) {
+#ifdef WITH_LTTNG
+ uint8_t old_flags = hit_flag_points;
+#endif
+ mark_event(s);
+ hit_flag_points |= flag;
+ latest_flag_point = flag;
+ tracepoint(oprequest, mark_flag_point, reqid.name._type,
+ reqid.name._num, reqid.tid, reqid.inc, op_info.get_flags(),
+ flag, s, old_flags, hit_flag_points);
+}
+
+void OpRequest::mark_flag_point_string(uint8_t flag, const string& s) {
+#ifdef WITH_LTTNG
+ uint8_t old_flags = hit_flag_points;
+#endif
+ mark_event(s);
+ hit_flag_points |= flag;
+ latest_flag_point = flag;
+ tracepoint(oprequest, mark_flag_point, reqid.name._type,
+ reqid.name._num, reqid.tid, reqid.inc, op_info.get_flags(),
+ flag, s.c_str(), old_flags, hit_flag_points);
+}
+
+bool OpRequest::filter_out(const set<string>& filters)
+{
+ set<entity_addr_t> addrs;
+ for (auto it = filters.begin(); it != filters.end(); it++) {
+ entity_addr_t addr;
+ if (addr.parse((*it).c_str())) {
+ addrs.insert(addr);
+ }
+ }
+ if (addrs.empty())
+ return true;
+
+ entity_addr_t cmp_addr = req_src_inst.addr;
+ if (addrs.count(cmp_addr)) {
+ return true;
+ }
+ cmp_addr.set_nonce(0);
+ if (addrs.count(cmp_addr)) {
+ return true;
+ }
+ cmp_addr.set_port(0);
+ if (addrs.count(cmp_addr)) {
+ return true;
+ }
+
+ return false;
+}
+
diff --git a/src/osd/OpRequest.h b/src/osd/OpRequest.h
new file mode 100644
index 000000000..daa0e1993
--- /dev/null
+++ b/src/osd/OpRequest.h
@@ -0,0 +1,200 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2012 New Dream Network/Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+#ifndef OPREQUEST_H_
+#define OPREQUEST_H_
+
+#include "osd/osd_op_util.h"
+#include "osd/osd_types.h"
+#include "common/TrackedOp.h"
+#ifdef HAVE_JAEGER
+#include "common/tracer.h"
+#endif
+
+/**
+ * The OpRequest takes in a Message* and takes over a single reference
+ * to it, which it puts() when destroyed.
+ */
+struct OpRequest : public TrackedOp {
+ friend class OpTracker;
+
+private:
+ OpInfo op_info;
+
+public:
+ int maybe_init_op_info(const OSDMap &osdmap);
+
+ auto get_flags() const { return op_info.get_flags(); }
+ bool op_info_needs_init() const { return op_info.get_flags() == 0; }
+ bool check_rmw(int flag) const { return op_info.check_rmw(flag); }
+ bool may_read() const { return op_info.may_read(); }
+ bool may_write() const { return op_info.may_write(); }
+ bool may_cache() const { return op_info.may_cache(); }
+ bool rwordered_forced() const { return op_info.rwordered_forced(); }
+ bool rwordered() const { return op_info.rwordered(); }
+ bool includes_pg_op() const { return op_info.includes_pg_op(); }
+ bool need_read_cap() const { return op_info.need_read_cap(); }
+ bool need_write_cap() const { return op_info.need_write_cap(); }
+ bool need_promote() const { return op_info.need_promote(); }
+ bool need_skip_handle_cache() const { return op_info.need_skip_handle_cache(); }
+ bool need_skip_promote() const { return op_info.need_skip_promote(); }
+ bool allows_returnvec() const { return op_info.allows_returnvec(); }
+
+ std::vector<OpInfo::ClassInfo> classes() const {
+ return op_info.get_classes();
+ }
+
+ void _dump(ceph::Formatter *f) const override;
+
+ bool has_feature(uint64_t f) const {
+ return request->get_connection()->has_feature(f);
+ }
+
+private:
+ Message *request; /// the logical request we are tracking
+ osd_reqid_t reqid;
+ entity_inst_t req_src_inst;
+ uint8_t hit_flag_points;
+ uint8_t latest_flag_point;
+ utime_t dequeued_time;
+ static const uint8_t flag_queued_for_pg=1 << 0;
+ static const uint8_t flag_reached_pg = 1 << 1;
+ static const uint8_t flag_delayed = 1 << 2;
+ static const uint8_t flag_started = 1 << 3;
+ static const uint8_t flag_sub_op_sent = 1 << 4;
+ static const uint8_t flag_commit_sent = 1 << 5;
+
+ OpRequest(Message *req, OpTracker *tracker);
+
+protected:
+ void _dump_op_descriptor_unlocked(std::ostream& stream) const override;
+ void _unregistered() override;
+ bool filter_out(const std::set<std::string>& filters) override;
+
+public:
+ ~OpRequest() override {
+ request->put();
+ }
+
+ bool check_send_map = true; ///< true until we check if sender needs a map
+ epoch_t sent_epoch = 0; ///< client's map epoch
+ epoch_t min_epoch = 0; ///< min epoch needed to handle this msg
+
+ bool hitset_inserted;
+#ifdef HAVE_JAEGER
+ jspan osd_parent_span = nullptr;
+ void set_osd_parent_span(jspan& span) {
+ if(osd_parent_span){
+ jaeger_tracing::finish_span(osd_parent_span);
+ }
+ osd_parent_span = move(span);
+ }
+#else
+ void set_osd_parent_span(...) {}
+#endif
+ template<class T>
+ const T* get_req() const { return static_cast<const T*>(request); }
+
+ const Message *get_req() const { return request; }
+ Message *get_nonconst_req() { return request; }
+
+ entity_name_t get_source() {
+ if (request) {
+ return request->get_source();
+ } else {
+ return entity_name_t();
+ }
+ }
+ uint8_t state_flag() const {
+ return latest_flag_point;
+ }
+
+ std::string_view state_string() const override {
+ switch(latest_flag_point) {
+ case flag_queued_for_pg: return "queued for pg";
+ case flag_reached_pg: return "reached pg";
+ case flag_delayed: return "delayed";
+ case flag_started: return "started";
+ case flag_sub_op_sent: return "waiting for sub ops";
+ case flag_commit_sent: return "commit sent; apply or cleanup";
+ default: break;
+ }
+ return "no flag points reached";
+ }
+
+ static std::string get_state_string(uint8_t flag) {
+ std::string flag_point;
+
+ switch(flag) {
+ case flag_queued_for_pg:
+ flag_point = "queued for pg";
+ break;
+ case flag_reached_pg:
+ flag_point = "reached pg";
+ break;
+ case flag_delayed:
+ flag_point = "delayed";
+ break;
+ case flag_started:
+ flag_point = "started";
+ break;
+ case flag_sub_op_sent:
+ flag_point = "waiting for sub ops";
+ break;
+ case flag_commit_sent:
+ flag_point = "commit sent; apply or cleanup";
+ break;
+ }
+ return flag_point;
+ }
+
+ void mark_queued_for_pg() {
+ mark_flag_point(flag_queued_for_pg, "queued_for_pg");
+ }
+ void mark_reached_pg() {
+ mark_flag_point(flag_reached_pg, "reached_pg");
+ }
+ void mark_delayed(const std::string& s) {
+ mark_flag_point_string(flag_delayed, s);
+ }
+ void mark_started() {
+ mark_flag_point(flag_started, "started");
+ }
+ void mark_sub_op_sent(const std::string& s) {
+ mark_flag_point_string(flag_sub_op_sent, s);
+ }
+ void mark_commit_sent() {
+ mark_flag_point(flag_commit_sent, "commit_sent");
+ }
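+ // Illustrative lifecycle sketch (not part of this header): a client op
+ // typically hits these flag points in roughly this order on its way
+ // through the OSD:
+ //   op->mark_queued_for_pg();
+ //   op->mark_reached_pg();
+ //   op->mark_started();
+ //   op->mark_commit_sent();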
+
+ utime_t get_dequeued_time() const {
+ return dequeued_time;
+ }
+ void set_dequeued_time(utime_t deq_time) {
+ dequeued_time = deq_time;
+ }
+
+ osd_reqid_t get_reqid() const {
+ return reqid;
+ }
+
+ typedef boost::intrusive_ptr<OpRequest> Ref;
+
+private:
+ void mark_flag_point(uint8_t flag, const char *s);
+ void mark_flag_point_string(uint8_t flag, const std::string& s);
+};
+
+typedef OpRequest::Ref OpRequestRef;
+
+#endif /* OPREQUEST_H_ */
diff --git a/src/osd/PG.cc b/src/osd/PG.cc
new file mode 100644
index 000000000..5b10f1466
--- /dev/null
+++ b/src/osd/PG.cc
@@ -0,0 +1,2753 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "PG.h"
+#include "messages/MOSDRepScrub.h"
+
+#include "common/errno.h"
+#include "common/ceph_releases.h"
+#include "common/config.h"
+#include "OSD.h"
+#include "OpRequest.h"
+#include "ScrubStore.h"
+#include "pg_scrubber.h"
+#include "Session.h"
+#include "osd/scheduler/OpSchedulerItem.h"
+
+#include "common/Timer.h"
+#include "common/perf_counters.h"
+
+#include "messages/MOSDOp.h"
+#include "messages/MOSDPGNotify.h"
+#include "messages/MOSDPGInfo.h"
+#include "messages/MOSDPGScan.h"
+#include "messages/MOSDPGBackfill.h"
+#include "messages/MOSDPGBackfillRemove.h"
+#include "messages/MBackfillReserve.h"
+#include "messages/MRecoveryReserve.h"
+#include "messages/MOSDPGPush.h"
+#include "messages/MOSDPGPushReply.h"
+#include "messages/MOSDPGPull.h"
+#include "messages/MOSDECSubOpWrite.h"
+#include "messages/MOSDECSubOpWriteReply.h"
+#include "messages/MOSDECSubOpRead.h"
+#include "messages/MOSDECSubOpReadReply.h"
+#include "messages/MOSDPGUpdateLogMissing.h"
+#include "messages/MOSDPGUpdateLogMissingReply.h"
+#include "messages/MOSDBackoff.h"
+#include "messages/MOSDScrubReserve.h"
+#include "messages/MOSDRepOp.h"
+#include "messages/MOSDRepOpReply.h"
+#include "messages/MOSDRepScrubMap.h"
+#include "messages/MOSDPGRecoveryDelete.h"
+#include "messages/MOSDPGRecoveryDeleteReply.h"
+
+#include "common/BackTrace.h"
+#include "common/EventTrace.h"
+
+#ifdef WITH_LTTNG
+#define TRACEPOINT_DEFINE
+#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
+#include "tracing/pg.h"
+#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
+#undef TRACEPOINT_DEFINE
+#else
+#define tracepoint(...)
+#endif
+
+#include <sstream>
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_osd
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, this)
+
+using std::list;
+using std::map;
+using std::ostringstream;
+using std::pair;
+using std::set;
+using std::string;
+using std::stringstream;
+using std::unique_ptr;
+using std::vector;
+
+using ceph::bufferlist;
+using ceph::bufferptr;
+using ceph::decode;
+using ceph::encode;
+using ceph::Formatter;
+
+using namespace ceph::osd::scheduler;
+
+template <class T>
+static ostream& _prefix(std::ostream *_dout, T *t)
+{
+ return t->gen_prefix(*_dout);
+}
+
+void PG::get(const char* tag)
+{
+ int after = ++ref;
+ lgeneric_subdout(cct, refs, 5) << "PG::get " << this << " "
+ << "tag " << (tag ? tag : "(none") << " "
+ << (after - 1) << " -> " << after << dendl;
+#ifdef PG_DEBUG_REFS
+ std::lock_guard l(_ref_id_lock);
+ _tag_counts[tag]++;
+#endif
+}
+
+void PG::put(const char* tag)
+{
+#ifdef PG_DEBUG_REFS
+ {
+ std::lock_guard l(_ref_id_lock);
+ auto tag_counts_entry = _tag_counts.find(tag);
+ ceph_assert(tag_counts_entry != _tag_counts.end());
+ --tag_counts_entry->second;
+ if (tag_counts_entry->second == 0) {
+ _tag_counts.erase(tag_counts_entry);
+ }
+ }
+#endif
+ auto local_cct = cct;
+ int after = --ref;
+ lgeneric_subdout(local_cct, refs, 5) << "PG::put " << this << " "
+ << "tag " << (tag ? tag : "(none") << " "
+ << (after + 1) << " -> " << after
+ << dendl;
+ if (after == 0)
+ delete this;
+}
+
+#ifdef PG_DEBUG_REFS
+uint64_t PG::get_with_id()
+{
+ ref++;
+ std::lock_guard l(_ref_id_lock);
+ uint64_t id = ++_ref_id;
+ BackTrace bt(0);
+ stringstream ss;
+ bt.print(ss);
+ lgeneric_subdout(cct, refs, 5) << "PG::get " << this << " " << info.pgid
+ << " got id " << id << " "
+ << (ref - 1) << " -> " << ref
+ << dendl;
+ ceph_assert(!_live_ids.count(id));
+ _live_ids.insert(make_pair(id, ss.str()));
+ return id;
+}
+
+void PG::put_with_id(uint64_t id)
+{
+ int newref = --ref;
+ lgeneric_subdout(cct, refs, 5) << "PG::put " << this << " " << info.pgid
+ << " put id " << id << " "
+ << (newref + 1) << " -> " << newref
+ << dendl;
+ {
+ std::lock_guard l(_ref_id_lock);
+ ceph_assert(_live_ids.count(id));
+ _live_ids.erase(id);
+ }
+ if (newref)
+ delete this;
+}
+
+void PG::dump_live_ids()
+{
+ std::lock_guard l(_ref_id_lock);
+ dout(0) << "\t" << __func__ << ": " << info.pgid << " live ids:" << dendl;
+ for (map<uint64_t, string>::iterator i = _live_ids.begin();
+ i != _live_ids.end();
+ ++i) {
+ dout(0) << "\t\tid: " << *i << dendl;
+ }
+ dout(0) << "\t" << __func__ << ": " << info.pgid << " live tags:" << dendl;
+ for (map<string, uint64_t>::iterator i = _tag_counts.begin();
+ i != _tag_counts.end();
+ ++i) {
+ dout(0) << "\t\tid: " << *i << dendl;
+ }
+}
+#endif
+
+PG::PG(OSDService *o, OSDMapRef curmap,
+ const PGPool &_pool, spg_t p) :
+ pg_whoami(o->whoami, p.shard),
+ pg_id(p),
+ coll(p),
+ osd(o),
+ cct(o->cct),
+ osdriver(osd->store, coll_t(), OSD::make_snapmapper_oid()),
+ snap_mapper(
+ cct,
+ &osdriver,
+ p.ps(),
+ p.get_split_bits(_pool.info.get_pg_num()),
+ _pool.id,
+ p.shard),
+ trace_endpoint("0.0.0.0", 0, "PG"),
+ info_struct_v(0),
+ pgmeta_oid(p.make_pgmeta_oid()),
+ stat_queue_item(this),
+ recovery_queued(false),
+ recovery_ops_active(0),
+ backfill_reserving(false),
+ pg_stats_publish_valid(false),
+ finish_sync_event(NULL),
+ scrub_after_recovery(false),
+ active_pushes(0),
+ recovery_state(
+ o->cct,
+ pg_whoami,
+ p,
+ _pool,
+ curmap,
+ this,
+ this),
+ pool(recovery_state.get_pool()),
+ info(recovery_state.get_info())
+{
+#ifdef PG_DEBUG_REFS
+ osd->add_pgid(p, this);
+#endif
+#ifdef WITH_BLKIN
+ std::stringstream ss;
+ ss << "PG " << info.pgid;
+ trace_endpoint.copy_name(ss.str());
+#endif
+}
+
+PG::~PG()
+{
+#ifdef PG_DEBUG_REFS
+ osd->remove_pgid(info.pgid, this);
+#endif
+}
+
+void PG::lock(bool no_lockdep) const
+{
+#ifdef CEPH_DEBUG_MUTEX
+ _lock.lock(no_lockdep);
+#else
+ _lock.lock();
+ locked_by = std::this_thread::get_id();
+#endif
+ // if we have unrecorded dirty state with the lock dropped, there is a bug
+ ceph_assert(!recovery_state.debug_has_dirty_state());
+
+ dout(30) << "lock" << dendl;
+}
+
+bool PG::is_locked() const
+{
+ return ceph_mutex_is_locked(_lock);
+}
+
+void PG::unlock() const
+{
+ //generic_dout(0) << this << " " << info.pgid << " unlock" << dendl;
+ ceph_assert(!recovery_state.debug_has_dirty_state());
+#ifndef CEPH_DEBUG_MUTEX
+ locked_by = {};
+#endif
+ _lock.unlock();
+}
+
+std::ostream& PG::gen_prefix(std::ostream& out) const
+{
+ OSDMapRef mapref = recovery_state.get_osdmap();
+#ifdef CEPH_DEBUG_MUTEX
+ if (_lock.is_locked_by_me()) {
+#else
+ if (locked_by == std::this_thread::get_id()) {
+#endif
+ out << "osd." << osd->whoami
+ << " pg_epoch: " << (mapref ? mapref->get_epoch():0)
+ << " " << *this << " ";
+ } else {
+ out << "osd." << osd->whoami
+ << " pg_epoch: " << (mapref ? mapref->get_epoch():0)
+ << " pg[" << pg_id.pgid << "(unlocked)] ";
+ }
+ return out;
+}
+
+PerfCounters &PG::get_peering_perf() {
+ return *(osd->recoverystate_perf);
+}
+
+PerfCounters &PG::get_perf_logger() {
+ return *(osd->logger);
+}
+
+void PG::log_state_enter(const char *state) {
+ osd->pg_recovery_stats.log_enter(state);
+}
+
+void PG::log_state_exit(
+ const char *state_name, utime_t enter_time,
+ uint64_t events, utime_t event_dur) {
+ osd->pg_recovery_stats.log_exit(
+ state_name, ceph_clock_now() - enter_time, events, event_dur);
+}
+
+/********* PG **********/
+
+void PG::remove_snap_mapped_object(
+ ObjectStore::Transaction &t, const hobject_t &soid)
+{
+ t.remove(
+ coll,
+ ghobject_t(soid, ghobject_t::NO_GEN, pg_whoami.shard));
+ clear_object_snap_mapping(&t, soid);
+}
+
+void PG::clear_object_snap_mapping(
+ ObjectStore::Transaction *t, const hobject_t &soid)
+{
+ OSDriver::OSTransaction _t(osdriver.get_transaction(t));
+ if (soid.snap < CEPH_MAXSNAP) {
+ int r = snap_mapper.remove_oid(
+ soid,
+ &_t);
+ if (!(r == 0 || r == -ENOENT)) {
+ derr << __func__ << ": remove_oid returned " << cpp_strerror(r) << dendl;
+ ceph_abort();
+ }
+ }
+}
+
+void PG::update_object_snap_mapping(
+ ObjectStore::Transaction *t, const hobject_t &soid, const set<snapid_t> &snaps)
+{
+ OSDriver::OSTransaction _t(osdriver.get_transaction(t));
+ ceph_assert(soid.snap < CEPH_MAXSNAP);
+ int r = snap_mapper.remove_oid(
+ soid,
+ &_t);
+ if (!(r == 0 || r == -ENOENT)) {
+ derr << __func__ << ": remove_oid returned " << cpp_strerror(r) << dendl;
+ ceph_abort();
+ }
+ snap_mapper.add_oid(
+ soid,
+ snaps,
+ &_t);
+}
+
+/******* PG ***********/
+void PG::clear_primary_state()
+{
+ dout(20) << __func__ << dendl;
+
+ projected_log = PGLog::IndexedLog();
+
+ snap_trimq.clear();
+ snap_trimq_repeat.clear();
+ finish_sync_event = 0; // so that _finish_recovery doesn't go off in another thread
+ release_pg_backoffs();
+
+ if (m_scrubber) {
+ m_scrubber->discard_replica_reservations();
+ }
+ scrub_after_recovery = false;
+
+ agent_clear();
+}
+
+
+bool PG::op_has_sufficient_caps(OpRequestRef& op)
+{
+ // only check MOSDOp
+ if (op->get_req()->get_type() != CEPH_MSG_OSD_OP)
+ return true;
+
+ auto req = op->get_req<MOSDOp>();
+ auto priv = req->get_connection()->get_priv();
+ auto session = static_cast<Session*>(priv.get());
+ if (!session) {
+ dout(0) << "op_has_sufficient_caps: no session for op " << *req << dendl;
+ return false;
+ }
+ OSDCap& caps = session->caps;
+ priv.reset();
+
+ const string &key = req->get_hobj().get_key().empty() ?
+ req->get_oid().name :
+ req->get_hobj().get_key();
+
+ bool cap = caps.is_capable(pool.name, req->get_hobj().nspace,
+ pool.info.application_metadata,
+ key,
+ op->need_read_cap(),
+ op->need_write_cap(),
+ op->classes(),
+ session->get_peer_socket_addr());
+
+ dout(20) << "op_has_sufficient_caps "
+ << "session=" << session
+ << " pool=" << pool.id << " (" << pool.name
+ << " " << req->get_hobj().nspace
+ << ")"
+ << " pool_app_metadata=" << pool.info.application_metadata
+ << " need_read_cap=" << op->need_read_cap()
+ << " need_write_cap=" << op->need_write_cap()
+ << " classes=" << op->classes()
+ << " -> " << (cap ? "yes" : "NO")
+ << dendl;
+ return cap;
+}
+
+void PG::queue_recovery()
+{
+ if (!is_primary() || !is_peered()) {
+ dout(10) << "queue_recovery -- not primary or not peered " << dendl;
+ ceph_assert(!recovery_queued);
+ } else if (recovery_queued) {
+ dout(10) << "queue_recovery -- already queued" << dendl;
+ } else {
+ dout(10) << "queue_recovery -- queuing" << dendl;
+ recovery_queued = true;
+ osd->queue_for_recovery(this);
+ }
+}
+
+void PG::queue_scrub_after_repair()
+{
+ dout(10) << __func__ << dendl;
+ ceph_assert(ceph_mutex_is_locked(_lock));
+
+ m_planned_scrub.must_deep_scrub = true;
+ m_planned_scrub.check_repair = true;
+ m_planned_scrub.must_scrub = true;
+
+ if (is_scrub_queued_or_active()) {
+ dout(10) << __func__ << ": scrubbing already ("
+ << (is_scrubbing() ? "active)" : "queued)") << dendl;
+ return;
+ }
+
+ m_scrubber->set_op_parameters(m_planned_scrub);
+ dout(15) << __func__ << ": queueing" << dendl;
+
+ m_scrubber->set_queued_or_active();
+ osd->queue_scrub_after_repair(this, Scrub::scrub_prio_t::high_priority);
+}
+
+unsigned PG::get_scrub_priority()
+{
+ // a higher value -> a higher priority
+ int64_t pool_scrub_priority =
+ pool.info.opts.value_or(pool_opts_t::SCRUB_PRIORITY, (int64_t)0);
+ return pool_scrub_priority > 0 ? pool_scrub_priority : cct->_conf->osd_scrub_priority;
+}
+
+Context *PG::finish_recovery()
+{
+ dout(10) << "finish_recovery" << dendl;
+ ceph_assert(info.last_complete == info.last_update);
+
+ clear_recovery_state();
+
+ /*
+ * sync all this before purging strays. but don't block!
+ */
+ finish_sync_event = new C_PG_FinishRecovery(this);
+ return finish_sync_event;
+}
+
+void PG::_finish_recovery(Context* c)
+{
+ dout(15) << __func__ << " finish_sync_event? " << finish_sync_event << " clean? "
+ << is_clean() << dendl;
+
+ std::scoped_lock locker{*this};
+ if (recovery_state.is_deleting() || !is_clean()) {
+ dout(10) << __func__ << " raced with delete or repair" << dendl;
+ return;
+ }
+ // When recovery is initiated by a repair, that flag is left on
+ state_clear(PG_STATE_REPAIR);
+ if (c == finish_sync_event) {
+ dout(15) << __func__ << " scrub_after_recovery? " << scrub_after_recovery << dendl;
+ finish_sync_event = 0;
+ recovery_state.purge_strays();
+
+ publish_stats_to_osd();
+
+ if (scrub_after_recovery) {
+ dout(10) << "_finish_recovery requeueing for scrub" << dendl;
+ scrub_after_recovery = false;
+ queue_scrub_after_repair();
+ }
+ } else {
+ dout(10) << "_finish_recovery -- stale" << dendl;
+ }
+}
+
+void PG::start_recovery_op(const hobject_t& soid)
+{
+ dout(10) << "start_recovery_op " << soid
+#ifdef DEBUG_RECOVERY_OIDS
+ << " (" << recovering_oids << ")"
+#endif
+ << dendl;
+ ceph_assert(recovery_ops_active >= 0);
+ recovery_ops_active++;
+#ifdef DEBUG_RECOVERY_OIDS
+ recovering_oids.insert(soid);
+#endif
+ osd->start_recovery_op(this, soid);
+}
+
+void PG::finish_recovery_op(const hobject_t& soid, bool dequeue)
+{
+ dout(10) << "finish_recovery_op " << soid
+#ifdef DEBUG_RECOVERY_OIDS
+ << " (" << recovering_oids << ")"
+#endif
+ << dendl;
+ ceph_assert(recovery_ops_active > 0);
+ recovery_ops_active--;
+#ifdef DEBUG_RECOVERY_OIDS
+ ceph_assert(recovering_oids.count(soid));
+ recovering_oids.erase(recovering_oids.find(soid));
+#endif
+ osd->finish_recovery_op(this, soid, dequeue);
+
+ if (!dequeue) {
+ queue_recovery();
+ }
+}
+
+void PG::split_into(pg_t child_pgid, PG *child, unsigned split_bits)
+{
+ recovery_state.split_into(child_pgid, &child->recovery_state, split_bits);
+
+ child->update_snap_mapper_bits(split_bits);
+
+ child->snap_trimq = snap_trimq;
+ child->snap_trimq_repeat = snap_trimq_repeat;
+
+ _split_into(child_pgid, child, split_bits);
+
+ // release all backoffs for simplicity
+ release_backoffs(hobject_t(), hobject_t::get_max());
+}
+
+void PG::start_split_stats(const set<spg_t>& childpgs, vector<object_stat_sum_t> *out)
+{
+ recovery_state.start_split_stats(childpgs, out);
+}
+
+void PG::finish_split_stats(const object_stat_sum_t& stats, ObjectStore::Transaction &t)
+{
+ recovery_state.finish_split_stats(stats, t);
+}
+
+void PG::merge_from(map<spg_t,PGRef>& sources, PeeringCtx &rctx,
+ unsigned split_bits,
+ const pg_merge_meta_t& last_pg_merge_meta)
+{
+ dout(10) << __func__ << " from " << sources << " split_bits " << split_bits
+ << dendl;
+ map<spg_t, PeeringState*> source_ps;
+ for (auto &&source : sources) {
+ source_ps.emplace(source.first, &source.second->recovery_state);
+ }
+ recovery_state.merge_from(source_ps, rctx, split_bits, last_pg_merge_meta);
+
+ for (auto& i : sources) {
+ auto& source = i.second;
+ // wipe out source's pgmeta
+ rctx.transaction.remove(source->coll, source->pgmeta_oid);
+
+ // merge (and destroy source collection)
+ rctx.transaction.merge_collection(source->coll, coll, split_bits);
+ }
+
+ // merge_collection does this, but maybe all of our sources were missing.
+ rctx.transaction.collection_set_bits(coll, split_bits);
+
+ snap_mapper.update_bits(split_bits);
+}
+
+void PG::add_backoff(const ceph::ref_t<Session>& s, const hobject_t& begin, const hobject_t& end)
+{
+ auto con = s->con;
+ if (!con) // OSD::ms_handle_reset clears s->con without a lock
+ return;
+ auto b = s->have_backoff(info.pgid, begin);
+ if (b) {
+ derr << __func__ << " already have backoff for " << s << " begin " << begin
+ << " " << *b << dendl;
+ ceph_abort();
+ }
+ std::lock_guard l(backoff_lock);
+ b = ceph::make_ref<Backoff>(info.pgid, this, s, ++s->backoff_seq, begin, end);
+ backoffs[begin].insert(b);
+ s->add_backoff(b);
+ dout(10) << __func__ << " session " << s << " added " << *b << dendl;
+ con->send_message(
+ new MOSDBackoff(
+ info.pgid,
+ get_osdmap_epoch(),
+ CEPH_OSD_BACKOFF_OP_BLOCK,
+ b->id,
+ begin,
+ end));
+}
+
+void PG::release_backoffs(const hobject_t& begin, const hobject_t& end)
+{
+ dout(10) << __func__ << " [" << begin << "," << end << ")" << dendl;
+ vector<ceph::ref_t<Backoff>> bv;
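+  // collect the matching backoffs while holding backoff_lock, then notify the
+  // clients and detach the backoffs below, outside the lock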
+ {
+ std::lock_guard l(backoff_lock);
+ auto p = backoffs.lower_bound(begin);
+ while (p != backoffs.end()) {
+ int r = cmp(p->first, end);
+ dout(20) << __func__ << " ? " << r << " " << p->first
+ << " " << p->second << dendl;
+ // note: must still examine begin=end=p->first case
+ if (r > 0 || (r == 0 && begin < end)) {
+ break;
+ }
+ dout(20) << __func__ << " checking " << p->first
+ << " " << p->second << dendl;
+ auto q = p->second.begin();
+ while (q != p->second.end()) {
+ dout(20) << __func__ << " checking " << *q << dendl;
+ int r = cmp((*q)->begin, begin);
+ if (r == 0 || (r > 0 && (*q)->end < end)) {
+ bv.push_back(*q);
+ q = p->second.erase(q);
+ } else {
+ ++q;
+ }
+ }
+ if (p->second.empty()) {
+ p = backoffs.erase(p);
+ } else {
+ ++p;
+ }
+ }
+ }
+ for (auto b : bv) {
+ std::lock_guard l(b->lock);
+ dout(10) << __func__ << " " << *b << dendl;
+ if (b->session) {
+ ceph_assert(b->pg == this);
+ ConnectionRef con = b->session->con;
+ if (con) { // OSD::ms_handle_reset clears s->con without a lock
+ con->send_message(
+ new MOSDBackoff(
+ info.pgid,
+ get_osdmap_epoch(),
+ CEPH_OSD_BACKOFF_OP_UNBLOCK,
+ b->id,
+ b->begin,
+ b->end));
+ }
+ if (b->is_new()) {
+ b->state = Backoff::STATE_DELETING;
+ } else {
+ b->session->rm_backoff(b);
+ b->session.reset();
+ }
+ b->pg.reset();
+ }
+ }
+}
+
+void PG::clear_backoffs()
+{
+ dout(10) << __func__ << " " << dendl;
+ map<hobject_t,set<ceph::ref_t<Backoff>>> ls;
+ {
+ std::lock_guard l(backoff_lock);
+ ls.swap(backoffs);
+ }
+ for (auto& p : ls) {
+ for (auto& b : p.second) {
+ std::lock_guard l(b->lock);
+ dout(10) << __func__ << " " << *b << dendl;
+ if (b->session) {
+ ceph_assert(b->pg == this);
+ if (b->is_new()) {
+ b->state = Backoff::STATE_DELETING;
+ } else {
+ b->session->rm_backoff(b);
+ b->session.reset();
+ }
+ b->pg.reset();
+ }
+ }
+ }
+}
+
+// called by Session::clear_backoffs()
+void PG::rm_backoff(const ceph::ref_t<Backoff>& b)
+{
+ dout(10) << __func__ << " " << *b << dendl;
+ std::lock_guard l(backoff_lock);
+ ceph_assert(ceph_mutex_is_locked_by_me(b->lock));
+ ceph_assert(b->pg == this);
+ auto p = backoffs.find(b->begin);
+ // may race with release_backoffs()
+ if (p != backoffs.end()) {
+ auto q = p->second.find(b);
+ if (q != p->second.end()) {
+ p->second.erase(q);
+ if (p->second.empty()) {
+ backoffs.erase(p);
+ }
+ }
+ }
+}
+
+void PG::clear_recovery_state()
+{
+ dout(10) << "clear_recovery_state" << dendl;
+
+ finish_sync_event = 0;
+
+ hobject_t soid;
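+  // drain any recovery ops still accounted as active so the OSD's in-flight
+  // recovery-op accounting is released before the rest of the state is cleared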
+ while (recovery_ops_active > 0) {
+#ifdef DEBUG_RECOVERY_OIDS
+ soid = *recovering_oids.begin();
+#endif
+ finish_recovery_op(soid, true);
+ }
+
+ backfill_info.clear();
+ peer_backfill_info.clear();
+ waiting_on_backfill.clear();
+ _clear_recovery_state(); // pg impl specific hook
+}
+
+void PG::cancel_recovery()
+{
+ dout(10) << "cancel_recovery" << dendl;
+ clear_recovery_state();
+}
+
+void PG::set_probe_targets(const set<pg_shard_t> &probe_set)
+{
+ std::lock_guard l(heartbeat_peer_lock);
+ probe_targets.clear();
+ for (set<pg_shard_t>::iterator i = probe_set.begin();
+ i != probe_set.end();
+ ++i) {
+ probe_targets.insert(i->osd);
+ }
+}
+
+void PG::send_cluster_message(
+ int target, MessageRef m,
+ epoch_t epoch, bool share_map_update=false)
+{
+ ConnectionRef con = osd->get_con_osd_cluster(
+ target, get_osdmap_epoch());
+ if (!con) {
+ return;
+ }
+
+ if (share_map_update) {
+ osd->maybe_share_map(con.get(), get_osdmap());
+ }
+ osd->send_message_osd_cluster(m, con.get());
+}
+
+void PG::clear_probe_targets()
+{
+ std::lock_guard l(heartbeat_peer_lock);
+ probe_targets.clear();
+}
+
+void PG::update_heartbeat_peers(set<int> new_peers)
+{
+ bool need_update = false;
+ heartbeat_peer_lock.lock();
+ if (new_peers == heartbeat_peers) {
+ dout(10) << "update_heartbeat_peers " << heartbeat_peers << " unchanged" << dendl;
+ } else {
+ dout(10) << "update_heartbeat_peers " << heartbeat_peers << " -> " << new_peers << dendl;
+ heartbeat_peers.swap(new_peers);
+ need_update = true;
+ }
+ heartbeat_peer_lock.unlock();
+
+ if (need_update)
+ osd->need_heartbeat_peer_update();
+}
+
+
+bool PG::check_in_progress_op(
+ const osd_reqid_t &r,
+ eversion_t *version,
+ version_t *user_version,
+ int *return_code,
+ vector<pg_log_op_return_item_t> *op_returns
+ ) const
+{
+ return (
+ projected_log.get_request(r, version, user_version, return_code,
+ op_returns) ||
+ recovery_state.get_pg_log().get_log().get_request(
+ r, version, user_version, return_code, op_returns));
+}
+
+void PG::publish_stats_to_osd()
+{
+ if (!is_primary())
+ return;
+
+ std::lock_guard l{pg_stats_publish_lock};
+ auto stats = recovery_state.prepare_stats_for_publish(
+ pg_stats_publish_valid,
+ pg_stats_publish,
+ unstable_stats);
+ if (stats) {
+ pg_stats_publish = stats.value();
+ pg_stats_publish_valid = true;
+ }
+}
+
+unsigned PG::get_target_pg_log_entries() const
+{
+ return osd->get_target_pg_log_entries();
+}
+
+void PG::clear_publish_stats()
+{
+ dout(15) << "clear_stats" << dendl;
+ std::lock_guard l{pg_stats_publish_lock};
+ pg_stats_publish_valid = false;
+}
+
+/**
+ * initialize a newly instantiated pg
+ *
+ * Initialize PG state, as when a PG is initially created, or when it
+ * is first instantiated on the current node.
+ *
+ * @param role our role/rank
+ * @param newup up set
+ * @param newacting acting set
+ * @param history pg history
+ * @param pi past_intervals
+ * @param backfill true if info should be marked as backfill
+ * @param t transaction to write out our new state in
+ */
+void PG::init(
+ int role,
+ const vector<int>& newup, int new_up_primary,
+ const vector<int>& newacting, int new_acting_primary,
+ const pg_history_t& history,
+ const PastIntervals& pi,
+ bool backfill,
+ ObjectStore::Transaction &t)
+{
+ recovery_state.init(
+ role, newup, new_up_primary, newacting,
+ new_acting_primary, history, pi, backfill, t);
+}
+
+void PG::shutdown()
+{
+ ch->flush();
+ std::scoped_lock l{*this};
+ recovery_state.shutdown();
+ on_shutdown();
+}
+
+#pragma GCC diagnostic ignored "-Wpragmas"
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+
+void PG::upgrade(ObjectStore *store)
+{
+ dout(0) << __func__ << " " << info_struct_v << " -> " << pg_latest_struct_v
+ << dendl;
+ ceph_assert(info_struct_v <= 10);
+ ObjectStore::Transaction t;
+
+ // <do upgrade steps here>
+
+ // finished upgrade!
+ ceph_assert(info_struct_v == 10);
+
+ // update infover_key
+ if (info_struct_v < pg_latest_struct_v) {
+ map<string,bufferlist> v;
+ __u8 ver = pg_latest_struct_v;
+ encode(ver, v[string(infover_key)]);
+ t.omap_setkeys(coll, pgmeta_oid, v);
+ }
+
+ recovery_state.force_write_state(t);
+
+ ObjectStore::CollectionHandle ch = store->open_collection(coll);
+ int r = store->queue_transaction(ch, std::move(t));
+ if (r != 0) {
+ derr << __func__ << ": queue_transaction returned "
+ << cpp_strerror(r) << dendl;
+ ceph_abort();
+ }
+ ceph_assert(r == 0);
+
+ C_SaferCond waiter;
+ if (!ch->flush_commit(&waiter)) {
+ waiter.wait();
+ }
+}
+
+#pragma GCC diagnostic pop
+#pragma GCC diagnostic warning "-Wpragmas"
+
+void PG::prepare_write(
+ pg_info_t &info,
+ pg_info_t &last_written_info,
+ PastIntervals &past_intervals,
+ PGLog &pglog,
+ bool dirty_info,
+ bool dirty_big_info,
+ bool need_write_epoch,
+ ObjectStore::Transaction &t)
+{
+ info.stats.stats.add(unstable_stats);
+ unstable_stats.clear();
+ map<string,bufferlist> km;
+ string key_to_remove;
+ if (dirty_big_info || dirty_info) {
+ int ret = prepare_info_keymap(
+ cct,
+ &km,
+ &key_to_remove,
+ get_osdmap_epoch(),
+ info,
+ last_written_info,
+ past_intervals,
+ dirty_big_info,
+ need_write_epoch,
+ cct->_conf->osd_fast_info,
+ osd->logger,
+ this);
+ ceph_assert(ret == 0);
+ }
+ pglog.write_log_and_missing(
+ t, &km, coll, pgmeta_oid, pool.info.require_rollback());
+ if (!km.empty())
+ t.omap_setkeys(coll, pgmeta_oid, km);
+ if (!key_to_remove.empty())
+ t.omap_rmkey(coll, pgmeta_oid, key_to_remove);
+}
+
+#pragma GCC diagnostic ignored "-Wpragmas"
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+
+bool PG::_has_removal_flag(ObjectStore *store,
+ spg_t pgid)
+{
+ coll_t coll(pgid);
+ ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
+
+ // first try new way
+ set<string> keys;
+ keys.insert("_remove");
+ map<string,bufferlist> values;
+ auto ch = store->open_collection(coll);
+ ceph_assert(ch);
+ if (store->omap_get_values(ch, pgmeta_oid, keys, &values) == 0 &&
+ values.size() == 1)
+ return true;
+
+ return false;
+}
+
+int PG::peek_map_epoch(ObjectStore *store,
+ spg_t pgid,
+ epoch_t *pepoch)
+{
+ coll_t coll(pgid);
+ ghobject_t legacy_infos_oid(OSD::make_infos_oid());
+ ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
+ epoch_t cur_epoch = 0;
+
+ // validate collection name
+ ceph_assert(coll.is_pg());
+
+ // try for v8
+ set<string> keys;
+ keys.insert(string(infover_key));
+ keys.insert(string(epoch_key));
+ map<string,bufferlist> values;
+ auto ch = store->open_collection(coll);
+ ceph_assert(ch);
+ int r = store->omap_get_values(ch, pgmeta_oid, keys, &values);
+ if (r == 0) {
+ ceph_assert(values.size() == 2);
+
+ // sanity check version
+ auto bp = values[string(infover_key)].cbegin();
+ __u8 struct_v = 0;
+ decode(struct_v, bp);
+ ceph_assert(struct_v >= 8);
+
+ // get epoch
+ bp = values[string(epoch_key)].begin();
+ decode(cur_epoch, bp);
+ } else {
+ // probably bug 10617; see OSD::load_pgs()
+ return -1;
+ }
+
+ *pepoch = cur_epoch;
+ return 0;
+}
+
+#pragma GCC diagnostic pop
+#pragma GCC diagnostic warning "-Wpragmas"
+
+bool PG::check_log_for_corruption(ObjectStore *store)
+{
+ /// TODO: this method needs to work with the omap log
+ return true;
+}
+
+//! Get the name we're going to save our corrupt pg log as
+std::string PG::get_corrupt_pg_log_name() const
+{
+ const int MAX_BUF = 512;
+ char buf[MAX_BUF];
+ struct tm tm_buf;
+ time_t my_time(time(NULL));
+ const struct tm *t = localtime_r(&my_time, &tm_buf);
+ int ret = strftime(buf, sizeof(buf), "corrupt_log_%Y-%m-%d_%k:%M_", t);
+ if (ret == 0) {
+ dout(0) << "strftime failed" << dendl;
+ return "corrupt_log_unknown_time";
+ }
+ string out(buf);
+ out += stringify(info.pgid);
+ return out;
+}
+
+int PG::read_info(
+ ObjectStore *store, spg_t pgid, const coll_t &coll,
+ pg_info_t &info, PastIntervals &past_intervals,
+ __u8 &struct_v)
+{
+ set<string> keys;
+ keys.insert(string(infover_key));
+ keys.insert(string(info_key));
+ keys.insert(string(biginfo_key));
+ keys.insert(string(fastinfo_key));
+ ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
+ map<string,bufferlist> values;
+ auto ch = store->open_collection(coll);
+ ceph_assert(ch);
+ int r = store->omap_get_values(ch, pgmeta_oid, keys, &values);
+ ceph_assert(r == 0);
+ ceph_assert(values.size() == 3 ||
+ values.size() == 4);
+
+ auto p = values[string(infover_key)].cbegin();
+ decode(struct_v, p);
+ ceph_assert(struct_v >= 10);
+
+ p = values[string(info_key)].begin();
+ decode(info, p);
+
+ p = values[string(biginfo_key)].begin();
+ decode(past_intervals, p);
+ decode(info.purged_snaps, p);
+
+ p = values[string(fastinfo_key)].begin();
+ if (!p.end()) {
+ pg_fast_info_t fast;
+ decode(fast, p);
+ fast.try_apply_to(&info);
+ }
+ return 0;
+}
+
+void PG::read_state(ObjectStore *store)
+{
+ PastIntervals past_intervals_from_disk;
+ pg_info_t info_from_disk;
+ int r = read_info(
+ store,
+ pg_id,
+ coll,
+ info_from_disk,
+ past_intervals_from_disk,
+ info_struct_v);
+ ceph_assert(r >= 0);
+
+ if (info_struct_v < pg_compat_struct_v) {
+ derr << "PG needs upgrade, but on-disk data is too old; upgrade to"
+ << " an older version first." << dendl;
+ ceph_abort_msg("PG too old to upgrade");
+ }
+
+ recovery_state.init_from_disk_state(
+ std::move(info_from_disk),
+ std::move(past_intervals_from_disk),
+ [this, store] (PGLog &pglog) {
+ ostringstream oss;
+ pglog.read_log_and_missing(
+ store,
+ ch,
+ pgmeta_oid,
+ info,
+ oss,
+ cct->_conf->osd_ignore_stale_divergent_priors,
+ cct->_conf->osd_debug_verify_missing_on_start);
+
+ if (oss.tellp())
+ osd->clog->error() << oss.str();
+ return 0;
+ });
+
+ if (info_struct_v < pg_latest_struct_v) {
+ upgrade(store);
+ }
+
+ // initialize current mapping
+ {
+ int primary, up_primary;
+ vector<int> acting, up;
+ get_osdmap()->pg_to_up_acting_osds(
+ pg_id.pgid, &up, &up_primary, &acting, &primary);
+ recovery_state.init_primary_up_acting(
+ up,
+ acting,
+ up_primary,
+ primary);
+ recovery_state.set_role(OSDMap::calc_pg_role(pg_whoami, acting));
+ }
+
+ // init pool options
+ store->set_collection_opts(ch, pool.info.opts);
+
+ PeeringCtx rctx(ceph_release_t::unknown);
+ handle_initialize(rctx);
+ // note: we don't activate here because we know the OSD will advance maps
+ // during boot.
+ write_if_dirty(rctx.transaction);
+ store->queue_transaction(ch, std::move(rctx.transaction));
+}
+
+void PG::update_snap_map(
+ const vector<pg_log_entry_t> &log_entries,
+ ObjectStore::Transaction &t)
+{
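+  // keep the SnapMapper consistent with the log entries being applied:
+  // deletes drop the mapping, clones/promotes add it, modifies update it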
+ for (auto i = log_entries.cbegin(); i != log_entries.cend(); ++i) {
+ OSDriver::OSTransaction _t(osdriver.get_transaction(&t));
+ if (i->soid.snap < CEPH_MAXSNAP) {
+ if (i->is_delete()) {
+ int r = snap_mapper.remove_oid(
+ i->soid,
+ &_t);
+ if (r)
+ derr << __func__ << " remove_oid " << i->soid << " failed with " << r << dendl;
+ // On removal tolerate missing key corruption
+ ceph_assert(r == 0 || r == -ENOENT);
+ } else if (i->is_update()) {
+ ceph_assert(i->snaps.length() > 0);
+ vector<snapid_t> snaps;
+ bufferlist snapbl = i->snaps;
+ auto p = snapbl.cbegin();
+ try {
+ decode(snaps, p);
+ } catch (...) {
+ derr << __func__ << " decode snaps failure on " << *i << dendl;
+ snaps.clear();
+ }
+ set<snapid_t> _snaps(snaps.begin(), snaps.end());
+
+ if (i->is_clone() || i->is_promote()) {
+ snap_mapper.add_oid(
+ i->soid,
+ _snaps,
+ &_t);
+ } else if (i->is_modify()) {
+ int r = snap_mapper.update_snaps(
+ i->soid,
+ _snaps,
+ 0,
+ &_t);
+ ceph_assert(r == 0);
+ } else {
+ ceph_assert(i->is_clean());
+ }
+ }
+ }
+ }
+}
+
+/**
+ * filter trimming|trimmed snaps out of snapcontext
+ */
+void PG::filter_snapc(vector<snapid_t> &snaps)
+{
+ // nothing needs to trim, we can return immediately
+ if (snap_trimq.empty() && info.purged_snaps.empty())
+ return;
+
+ bool filtering = false;
+ vector<snapid_t> newsnaps;
+ for (vector<snapid_t>::iterator p = snaps.begin();
+ p != snaps.end();
+ ++p) {
+ if (snap_trimq.contains(*p) || info.purged_snaps.contains(*p)) {
+ if (!filtering) {
+ // start building a new vector with what we've seen so far
+ dout(10) << "filter_snapc filtering " << snaps << dendl;
+ newsnaps.insert(newsnaps.begin(), snaps.begin(), p);
+ filtering = true;
+ }
+ dout(20) << "filter_snapc removing trimq|purged snap " << *p << dendl;
+ } else {
+ if (filtering)
+ newsnaps.push_back(*p); // continue building new vector
+ }
+ }
+ if (filtering) {
+ snaps.swap(newsnaps);
+ dout(10) << "filter_snapc result " << snaps << dendl;
+ }
+}
+
+void PG::requeue_object_waiters(map<hobject_t, list<OpRequestRef>>& m)
+{
+ for (auto it = m.begin(); it != m.end(); ++it)
+ requeue_ops(it->second);
+ m.clear();
+}
+
+void PG::requeue_op(OpRequestRef op)
+{
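+  // if other ops from this client are already waiting for a newer map, keep
+  // ordering by requeueing in front of them; otherwise hand the op back to
+  // the op scheduler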
+ auto p = waiting_for_map.find(op->get_source());
+ if (p != waiting_for_map.end()) {
+ dout(20) << __func__ << " " << op << " (waiting_for_map " << p->first << ")"
+ << dendl;
+ p->second.push_front(op);
+ } else {
+ dout(20) << __func__ << " " << op << dendl;
+ osd->enqueue_front(
+ OpSchedulerItem(
+ unique_ptr<OpSchedulerItem::OpQueueable>(new PGOpItem(info.pgid, op)),
+ op->get_req()->get_cost(),
+ op->get_req()->get_priority(),
+ op->get_req()->get_recv_stamp(),
+ op->get_req()->get_source().num(),
+ get_osdmap_epoch()));
+ }
+}
+
+void PG::requeue_ops(list<OpRequestRef> &ls)
+{
+ for (list<OpRequestRef>::reverse_iterator i = ls.rbegin();
+ i != ls.rend();
+ ++i) {
+ requeue_op(*i);
+ }
+ ls.clear();
+}
+
+void PG::requeue_map_waiters()
+{
+ epoch_t epoch = get_osdmap_epoch();
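+  // walk the per-source queues: any queue whose front op no longer requires a
+  // newer map than ours is re-queued (in reverse, to preserve order) and erased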
+ auto p = waiting_for_map.begin();
+ while (p != waiting_for_map.end()) {
+ if (epoch < p->second.front()->min_epoch) {
+ dout(20) << __func__ << " " << p->first << " front op "
+ << p->second.front() << " must still wait, doing nothing"
+ << dendl;
+ ++p;
+ } else {
+ dout(20) << __func__ << " " << p->first << " " << p->second << dendl;
+ for (auto q = p->second.rbegin(); q != p->second.rend(); ++q) {
+ auto req = *q;
+ osd->enqueue_front(OpSchedulerItem(
+ unique_ptr<OpSchedulerItem::OpQueueable>(new PGOpItem(info.pgid, req)),
+ req->get_req()->get_cost(),
+ req->get_req()->get_priority(),
+ req->get_req()->get_recv_stamp(),
+ req->get_req()->get_source().num(),
+ epoch));
+ }
+ p = waiting_for_map.erase(p);
+ }
+ }
+}
+
+bool PG::get_must_scrub() const
+{
+ dout(20) << __func__ << " must_scrub? " << (m_planned_scrub.must_scrub ? "true" : "false") << dendl;
+ return m_planned_scrub.must_scrub;
+}
+
+unsigned int PG::scrub_requeue_priority(Scrub::scrub_prio_t with_priority) const
+{
+ return m_scrubber->scrub_requeue_priority(with_priority);
+}
+
+unsigned int PG::scrub_requeue_priority(Scrub::scrub_prio_t with_priority, unsigned int suggested_priority) const
+{
+ return m_scrubber->scrub_requeue_priority(with_priority, suggested_priority);
+}
+
+// ==========================================================================================
+// SCRUB
+
+/*
+ * implementation note:
+ * PG::sched_scrub() is called only once per specific scrub session.
+ * That call commits us to whatever choices are made (deep/shallow, etc.).
+ * Unless we fail to start scrubbing, the 'planned scrub' flag set is 'frozen' into
+ * PgScrubber's m_flags, then cleared.
+ */
+bool PG::sched_scrub()
+{
+ dout(15) << __func__ << " pg(" << info.pgid
+ << (is_active() ? ") <active>" : ") <not-active>")
+ << (is_clean() ? " <clean>" : " <not-clean>") << dendl;
+ ceph_assert(ceph_mutex_is_locked(_lock));
+
+ if (!is_primary() || !is_active() || !is_clean()) {
+ return false;
+ }
+
+ if (is_scrub_queued_or_active()) {
+ return false;
+ }
+
+ // analyse the combination of the requested scrub flags, the osd/pool configuration
+ // and the PG status to determine whether we should scrub now, and what type of scrub
+ // should that be.
+ auto updated_flags = verify_scrub_mode();
+ if (!updated_flags) {
+ // the stars do not align for starting a scrub for this PG at this time
+ // (due to configuration or priority issues)
+ // The reason was already reported by the callee.
+ dout(10) << __func__ << ": failed to initiate a scrub" << dendl;
+ return false;
+ }
+
+ // try to reserve the local OSD resources. If failing: no harm. We will
+ // be retried by the OSD later on.
+ if (!m_scrubber->reserve_local()) {
+ dout(10) << __func__ << ": failed to reserve locally" << dendl;
+ return false;
+ }
+
+ // can commit to the updated flags now, as nothing will stop the scrub
+ m_planned_scrub = *updated_flags;
+
+ // An interrupted recovery repair could leave this set.
+ state_clear(PG_STATE_REPAIR);
+
+ // Pass control to the scrubber. It is the scrubber that handles the replicas'
+ // resources reservations.
+ m_scrubber->set_op_parameters(m_planned_scrub);
+
+ dout(10) << __func__ << ": queueing" << dendl;
+ m_scrubber->set_queued_or_active();
+ osd->queue_for_scrub(this, Scrub::scrub_prio_t::low_priority);
+ return true;
+}
+
+double PG::next_deepscrub_interval() const
+{
+ double deep_scrub_interval =
+ pool.info.opts.value_or(pool_opts_t::DEEP_SCRUB_INTERVAL, 0.0);
+ if (deep_scrub_interval <= 0.0)
+ deep_scrub_interval = cct->_conf->osd_deep_scrub_interval;
+ return info.history.last_deep_scrub_stamp + deep_scrub_interval;
+}
+
+bool PG::is_time_for_deep(bool allow_deep_scrub,
+ bool allow_scrub,
+ bool has_deep_errors,
+ const requested_scrub_t& planned) const
+{
+ dout(10) << __func__ << ": need_auto?" << planned.need_auto << " allow_deep_scrub? "
+ << allow_deep_scrub << dendl;
+
+ if (!allow_deep_scrub)
+ return false;
+
+ if (planned.need_auto) {
+ dout(10) << __func__ << ": need repair after scrub errors" << dendl;
+ return true;
+ }
+
+ if (ceph_clock_now() >= next_deepscrub_interval()) {
+ dout(20) << __func__ << ": now (" << ceph_clock_now() << ") >= time for deep ("
+ << next_deepscrub_interval() << ")" << dendl;
+ return true;
+ }
+
+ if (has_deep_errors) {
+ osd->clog->info() << "osd." << osd->whoami << " pg " << info.pgid
+ << " Deep scrub errors, upgrading scrub to deep-scrub";
+ return true;
+ }
+
+  // we only flip the coin if 'allow_scrub' is asserted. Otherwise, as this function is
+  // called often, we would end up deep-scrubbing most of the time.
+ if (allow_scrub) {
+ bool deep_coin_flip =
+ (rand() % 100) < cct->_conf->osd_deep_scrub_randomize_ratio * 100;
+
+ dout(15) << __func__ << ": time_for_deep=" << planned.time_for_deep
+ << " deep_coin_flip=" << deep_coin_flip << dendl;
+
+ if (deep_coin_flip)
+ return true;
+ }
+
+ return false;
+}
+
+bool PG::verify_periodic_scrub_mode(bool allow_deep_scrub,
+ bool try_to_auto_repair,
+ bool allow_regular_scrub,
+ bool has_deep_errors,
+ requested_scrub_t& planned) const
+
+{
+ ceph_assert(!planned.must_deep_scrub && !planned.must_repair);
+
+ if (!allow_deep_scrub && has_deep_errors) {
+ osd->clog->error()
+ << "osd." << osd->whoami << " pg " << info.pgid
+ << " Regular scrub skipped due to deep-scrub errors and nodeep-scrub set";
+ return false;
+ }
+
+ if (allow_deep_scrub) {
+ // Initial entry and scheduled scrubs without nodeep_scrub set get here
+
+ planned.time_for_deep =
+ is_time_for_deep(allow_deep_scrub, allow_regular_scrub, has_deep_errors, planned);
+
+ if (try_to_auto_repair) {
+ if (planned.time_for_deep) {
+ dout(20) << __func__ << ": auto repair with deep scrubbing" << dendl;
+ planned.auto_repair = true;
+ } else if (allow_regular_scrub) {
+ dout(20) << __func__ << ": auto repair with scrubbing, rescrub if errors found"
+ << dendl;
+ planned.deep_scrub_on_error = true;
+ }
+ }
+ }
+
+ dout(20) << __func__ << " updated flags: " << planned
+ << " allow_regular_scrub: " << allow_regular_scrub << dendl;
+
+ // NOSCRUB so skip regular scrubs
+ if (!allow_regular_scrub && !planned.time_for_deep) {
+ return false;
+ }
+
+ return true;
+}
+
+std::optional<requested_scrub_t> PG::verify_scrub_mode() const
+{
+ dout(10) << __func__ << " processing pg " << info.pgid << dendl;
+
+ bool allow_deep_scrub = !(get_osdmap()->test_flag(CEPH_OSDMAP_NODEEP_SCRUB) ||
+ pool.info.has_flag(pg_pool_t::FLAG_NODEEP_SCRUB));
+ bool allow_regular_scrub = !(get_osdmap()->test_flag(CEPH_OSDMAP_NOSCRUB) ||
+ pool.info.has_flag(pg_pool_t::FLAG_NOSCRUB));
+ bool has_deep_errors = (info.stats.stats.sum.num_deep_scrub_errors > 0);
+ bool try_to_auto_repair =
+ (cct->_conf->osd_scrub_auto_repair && get_pgbackend()->auto_repair_supported());
+
+ auto upd_flags = m_planned_scrub;
+
+ upd_flags.time_for_deep = false;
+ // Clear these in case user issues the scrub/repair command during
+ // the scheduling of the scrub/repair (e.g. request reservation)
+ upd_flags.deep_scrub_on_error = false;
+ upd_flags.auto_repair = false;
+
+ if (upd_flags.must_scrub && !upd_flags.must_deep_scrub && has_deep_errors) {
+ osd->clog->error() << "osd." << osd->whoami << " pg " << info.pgid
+ << " Regular scrub request, deep-scrub details will be lost";
+ }
+
+ if (!upd_flags.must_scrub) {
+ // All periodic scrub handling goes here because must_scrub is
+ // always set for must_deep_scrub and must_repair.
+
+ bool can_start_periodic =
+ verify_periodic_scrub_mode(allow_deep_scrub, try_to_auto_repair,
+ allow_regular_scrub, has_deep_errors, upd_flags);
+ if (!can_start_periodic) {
+ return std::nullopt;
+ }
+ }
+
+ // scrubbing while recovering?
+
+ bool prevented_by_recovery =
+ osd->is_recovery_active() && !cct->_conf->osd_scrub_during_recovery &&
+ (!cct->_conf->osd_repair_during_recovery || !upd_flags.must_repair);
+
+ if (prevented_by_recovery) {
+ dout(20) << __func__ << ": scrubbing prevented during recovery" << dendl;
+ return std::nullopt;
+ }
+
+ upd_flags.need_auto = false;
+ return upd_flags;
+}
+
+void PG::reg_next_scrub()
+{
+ m_scrubber->reg_next_scrub(m_planned_scrub);
+}
+
+void PG::on_info_history_change()
+{
+ dout(20) << __func__ << dendl;
+ if (m_scrubber) {
+ m_scrubber->unreg_next_scrub();
+ m_scrubber->reg_next_scrub(m_planned_scrub);
+ }
+}
+
+void PG::scrub_requested(scrub_level_t scrub_level, scrub_type_t scrub_type)
+{
+ if (m_scrubber) {
+ m_scrubber->scrub_requested(scrub_level, scrub_type, m_planned_scrub);
+ }
+}
+
+void PG::clear_ready_to_merge() {
+ osd->clear_ready_to_merge(this);
+}
+
+void PG::queue_want_pg_temp(const vector<int> &wanted) {
+ osd->queue_want_pg_temp(get_pgid().pgid, wanted);
+}
+
+void PG::clear_want_pg_temp() {
+ osd->remove_want_pg_temp(get_pgid().pgid);
+}
+
+void PG::on_role_change() {
+ requeue_ops(waiting_for_peered);
+ plpg_on_role_change();
+}
+
+void PG::on_new_interval()
+{
+ dout(20) << __func__ << ": scrub flags on new interval: " << m_planned_scrub
+ << dendl;
+ projected_last_update = eversion_t();
+ cancel_recovery();
+}
+
+epoch_t PG::oldest_stored_osdmap() {
+ return osd->get_superblock().oldest_map;
+}
+
+OstreamTemp PG::get_clog_info() {
+ return osd->clog->info();
+}
+
+OstreamTemp PG::get_clog_debug() {
+ return osd->clog->debug();
+}
+
+OstreamTemp PG::get_clog_error() {
+ return osd->clog->error();
+}
+
+void PG::schedule_event_after(
+ PGPeeringEventRef event,
+ float delay) {
+ std::lock_guard lock(osd->recovery_request_lock);
+ osd->recovery_request_timer.add_event_after(
+ delay,
+ new QueuePeeringEvt(
+ this,
+ std::move(event)));
+}
+
+void PG::request_local_background_io_reservation(
+ unsigned priority,
+ PGPeeringEventURef on_grant,
+ PGPeeringEventURef on_preempt) {
+ osd->local_reserver.request_reservation(
+ pg_id,
+ on_grant ? new QueuePeeringEvt(
+ this, std::move(on_grant)) : nullptr,
+ priority,
+ on_preempt ? new QueuePeeringEvt(
+ this, std::move(on_preempt)) : nullptr);
+}
+
+void PG::update_local_background_io_priority(
+ unsigned priority) {
+ osd->local_reserver.update_priority(
+ pg_id,
+ priority);
+}
+
+void PG::cancel_local_background_io_reservation() {
+ osd->local_reserver.cancel_reservation(
+ pg_id);
+}
+
+void PG::request_remote_recovery_reservation(
+ unsigned priority,
+ PGPeeringEventURef on_grant,
+ PGPeeringEventURef on_preempt) {
+ osd->remote_reserver.request_reservation(
+ pg_id,
+ on_grant ? new QueuePeeringEvt(
+ this, std::move(on_grant)) : nullptr,
+ priority,
+ on_preempt ? new QueuePeeringEvt(
+ this, std::move(on_preempt)) : nullptr);
+}
+
+void PG::cancel_remote_recovery_reservation() {
+ osd->remote_reserver.cancel_reservation(
+ pg_id);
+}
+
+void PG::schedule_event_on_commit(
+ ObjectStore::Transaction &t,
+ PGPeeringEventRef on_commit)
+{
+ t.register_on_commit(new QueuePeeringEvt(this, on_commit));
+}
+
+void PG::on_activate(interval_set<snapid_t> snaps)
+{
+ ceph_assert(!m_scrubber->are_callbacks_pending());
+ ceph_assert(callbacks_for_degraded_object.empty());
+ snap_trimq = snaps;
+ release_pg_backoffs();
+ projected_last_update = info.last_update;
+}
+
+void PG::on_active_exit()
+{
+ backfill_reserving = false;
+ agent_stop();
+}
+
+void PG::on_active_advmap(const OSDMapRef &osdmap)
+{
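+  // fold any snaps newly removed in this map into snap_trimq; an overlap with
+  // what we already track is logged (and asserted on when the debug option is set)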
+ const auto& new_removed_snaps = osdmap->get_new_removed_snaps();
+ auto i = new_removed_snaps.find(get_pgid().pool());
+ if (i != new_removed_snaps.end()) {
+ bool bad = false;
+ for (auto j : i->second) {
+ if (snap_trimq.intersects(j.first, j.second)) {
+ decltype(snap_trimq) added, overlap;
+ added.insert(j.first, j.second);
+ overlap.intersection_of(snap_trimq, added);
+ derr << __func__ << " removed_snaps already contains "
+ << overlap << dendl;
+ bad = true;
+ snap_trimq.union_of(added);
+ } else {
+ snap_trimq.insert(j.first, j.second);
+ }
+ }
+ dout(10) << __func__ << " new removed_snaps " << i->second
+ << ", snap_trimq now " << snap_trimq << dendl;
+ ceph_assert(!bad || !cct->_conf->osd_debug_verify_cached_snaps);
+ }
+
+ const auto& new_purged_snaps = osdmap->get_new_purged_snaps();
+ auto j = new_purged_snaps.find(get_pgid().pgid.pool());
+ if (j != new_purged_snaps.end()) {
+ bool bad = false;
+ for (auto k : j->second) {
+ if (!recovery_state.get_info().purged_snaps.contains(k.first, k.second)) {
+ interval_set<snapid_t> rm, overlap;
+ rm.insert(k.first, k.second);
+ overlap.intersection_of(recovery_state.get_info().purged_snaps, rm);
+ derr << __func__ << " purged_snaps does not contain "
+ << rm << ", only " << overlap << dendl;
+ recovery_state.adjust_purged_snaps(
+ [&overlap](auto &purged_snaps) {
+ purged_snaps.subtract(overlap);
+ });
+ // This can currently happen in the normal (if unlikely) course of
+ // events. Because adding snaps to purged_snaps does not increase
+ // the pg version or add a pg log entry, we don't reliably propagate
+ // purged_snaps additions to other OSDs.
+ // One example:
+ // - purge S
+ // - primary and replicas update purged_snaps
+ // - no object updates
+ // - pg mapping changes, new primary on different node
+ // - new primary pg version == eversion_t(), so info is not
+ // propagated.
+ //bad = true;
+ } else {
+ recovery_state.adjust_purged_snaps(
+ [&k](auto &purged_snaps) {
+ purged_snaps.erase(k.first, k.second);
+ });
+ }
+ }
+ dout(10) << __func__ << " new purged_snaps " << j->second
+ << ", now " << recovery_state.get_info().purged_snaps << dendl;
+ ceph_assert(!bad || !cct->_conf->osd_debug_verify_cached_snaps);
+ }
+}
+
+void PG::queue_snap_retrim(snapid_t snap)
+{
+ if (!is_active() ||
+ !is_primary()) {
+ dout(10) << __func__ << " snap " << snap << " - not active and primary"
+ << dendl;
+ return;
+ }
+ if (!snap_trimq.contains(snap)) {
+ snap_trimq.insert(snap);
+ snap_trimq_repeat.insert(snap);
+ dout(20) << __func__ << " snap " << snap
+ << ", trimq now " << snap_trimq
+ << ", repeat " << snap_trimq_repeat << dendl;
+ kick_snap_trim();
+ } else {
+ dout(20) << __func__ << " snap " << snap
+ << " already in trimq " << snap_trimq << dendl;
+ }
+}
+
+void PG::on_active_actmap()
+{
+ if (cct->_conf->osd_check_for_log_corruption)
+ check_log_for_corruption(osd->store);
+
+
+ if (recovery_state.is_active()) {
+ dout(10) << "Active: kicking snap trim" << dendl;
+ kick_snap_trim();
+ }
+
+ if (recovery_state.is_peered() &&
+ !recovery_state.is_clean() &&
+ !recovery_state.get_osdmap()->test_flag(CEPH_OSDMAP_NOBACKFILL) &&
+ (!recovery_state.get_osdmap()->test_flag(CEPH_OSDMAP_NOREBALANCE) ||
+ recovery_state.is_degraded())) {
+ queue_recovery();
+ }
+}
+
+void PG::on_backfill_reserved()
+{
+ backfill_reserving = false;
+ queue_recovery();
+}
+
+void PG::on_backfill_canceled()
+{
+ if (!waiting_on_backfill.empty()) {
+ waiting_on_backfill.clear();
+ finish_recovery_op(hobject_t::get_max());
+ }
+}
+
+void PG::on_recovery_reserved()
+{
+ queue_recovery();
+}
+
+void PG::set_not_ready_to_merge_target(pg_t pgid, pg_t src)
+{
+ osd->set_not_ready_to_merge_target(pgid, src);
+}
+
+void PG::set_not_ready_to_merge_source(pg_t pgid)
+{
+ osd->set_not_ready_to_merge_source(pgid);
+}
+
+void PG::set_ready_to_merge_target(eversion_t lu, epoch_t les, epoch_t lec)
+{
+ osd->set_ready_to_merge_target(this, lu, les, lec);
+}
+
+void PG::set_ready_to_merge_source(eversion_t lu)
+{
+ osd->set_ready_to_merge_source(this, lu);
+}
+
+void PG::send_pg_created(pg_t pgid)
+{
+ osd->send_pg_created(pgid);
+}
+
+ceph::signedspan PG::get_mnow()
+{
+ return osd->get_mnow();
+}
+
+HeartbeatStampsRef PG::get_hb_stamps(int peer)
+{
+ return osd->get_hb_stamps(peer);
+}
+
+void PG::schedule_renew_lease(epoch_t lpr, ceph::timespan delay)
+{
+ auto spgid = info.pgid;
+ auto o = osd;
+ osd->mono_timer.add_event(
+ delay,
+ [o, lpr, spgid]() {
+ o->queue_renew_lease(lpr, spgid);
+ });
+}
+
+void PG::queue_check_readable(epoch_t lpr, ceph::timespan delay)
+{
+ osd->queue_check_readable(info.pgid, lpr, delay);
+}
+
+void PG::rebuild_missing_set_with_deletes(PGLog &pglog)
+{
+ pglog.rebuild_missing_set_with_deletes(
+ osd->store,
+ ch,
+ recovery_state.get_info());
+}
+
+void PG::on_activate_committed()
+{
+ if (!is_primary()) {
+ // waiters
+ if (recovery_state.needs_flush() == 0) {
+ requeue_ops(waiting_for_peered);
+ } else if (!waiting_for_peered.empty()) {
+ dout(10) << __func__ << " flushes in progress, moving "
+ << waiting_for_peered.size() << " items to waiting_for_flush"
+ << dendl;
+ ceph_assert(waiting_for_flush.empty());
+ waiting_for_flush.swap(waiting_for_peered);
+ }
+ }
+}
+
+// Compute pending backfill data
+static int64_t pending_backfill(CephContext *cct, int64_t bf_bytes, int64_t local_bytes)
+{
+ lgeneric_dout(cct, 20) << __func__ << " Adjust local usage "
+ << (local_bytes >> 10) << "KiB"
+ << " primary usage " << (bf_bytes >> 10)
+ << "KiB" << dendl;
+
+ return std::max((int64_t)0, bf_bytes - local_bytes);
+}
+
+
+// We can zero the value of primary num_bytes with just an atomic store.
+// However, setting it above zero reserves space for backfill and requires
+// the OSDService::stat_lock, which protects all OSD usage.
+bool PG::try_reserve_recovery_space(
+ int64_t primary_bytes, int64_t local_bytes) {
+  // Use tentative_backfill_full() to make sure enough
+  // space is available to handle the target bytes from the primary.
+
+ // TODO: If we passed num_objects from primary we could account for
+ // an estimate of the metadata overhead.
+
+ // TODO: If we had compressed_allocated and compressed_original from primary
+ // we could compute compression ratio and adjust accordingly.
+
+  // XXX: There is no way to get omap overhead, and this would only apply
+  // to whatever (possibly different) partition is storing the database.
+
+ // update_osd_stat() from heartbeat will do this on a new
+ // statfs using ps->primary_bytes.
+ uint64_t pending_adjustment = 0;
+ if (primary_bytes) {
+    // For an erasure-coded pool, overestimate by a full stripe per object
+    // because we don't know how each object rounds to the nearest stripe
+ if (pool.info.is_erasure()) {
+ primary_bytes /= (int)get_pgbackend()->get_ec_data_chunk_count();
+ primary_bytes += get_pgbackend()->get_ec_stripe_chunk_size() *
+ info.stats.stats.sum.num_objects;
+ local_bytes /= (int)get_pgbackend()->get_ec_data_chunk_count();
+ local_bytes += get_pgbackend()->get_ec_stripe_chunk_size() *
+ info.stats.stats.sum.num_objects;
+ }
+ pending_adjustment = pending_backfill(
+ cct,
+ primary_bytes,
+ local_bytes);
+ dout(10) << __func__ << " primary_bytes " << (primary_bytes >> 10)
+ << "KiB"
+ << " local " << (local_bytes >> 10) << "KiB"
+ << " pending_adjustments " << (pending_adjustment >> 10) << "KiB"
+ << dendl;
+ }
+
+  // This lock protects not only the OSDService stats but also the setting of
+  // the pg primary_bytes. That's why we don't unlock immediately.
+ std::lock_guard l{osd->stat_lock};
+ osd_stat_t cur_stat = osd->osd_stat;
+ if (cct->_conf->osd_debug_reject_backfill_probability > 0 &&
+ (rand()%1000 < (cct->_conf->osd_debug_reject_backfill_probability*1000.0))) {
+ dout(10) << "backfill reservation rejected: failure injection"
+ << dendl;
+ return false;
+ } else if (!cct->_conf->osd_debug_skip_full_check_in_backfill_reservation &&
+ osd->tentative_backfill_full(this, pending_adjustment, cur_stat)) {
+ dout(10) << "backfill reservation rejected: backfill full"
+ << dendl;
+ return false;
+ } else {
+    // Don't reserve space if we skipped the reservation check. This is used
+    // to test the other backfill-full check, and also covers the case where
+    // a corruption of num_bytes requires ignoring that value and trying the
+    // backfill anyway.
+ if (primary_bytes &&
+ !cct->_conf->osd_debug_skip_full_check_in_backfill_reservation) {
+ primary_num_bytes.store(primary_bytes);
+ local_num_bytes.store(local_bytes);
+ } else {
+ unreserve_recovery_space();
+ }
+ return true;
+ }
+}
+
+void PG::unreserve_recovery_space() {
+ primary_num_bytes.store(0);
+ local_num_bytes.store(0);
+}
+
+void PG::_scan_rollback_obs(const vector<ghobject_t> &rollback_obs)
+{
+ ObjectStore::Transaction t;
+ eversion_t trimmed_to = recovery_state.get_last_rollback_info_trimmed_to_applied();
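+  // rollback objects with a generation older than trimmed_to can no longer be
+  // referenced by the log, so queue them for removal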
+ for (vector<ghobject_t>::const_iterator i = rollback_obs.begin();
+ i != rollback_obs.end();
+ ++i) {
+ if (i->generation < trimmed_to.version) {
+ dout(10) << __func__ << "osd." << osd->whoami
+ << " pg " << info.pgid
+ << " found obsolete rollback obj "
+ << *i << " generation < trimmed_to "
+ << trimmed_to
+ << "...repaired" << dendl;
+ t.remove(coll, *i);
+ }
+ }
+ if (!t.empty()) {
+ derr << __func__ << ": queueing trans to clean up obsolete rollback objs"
+ << dendl;
+ osd->store->queue_transaction(ch, std::move(t), NULL);
+ }
+}
+
+
+void PG::_repair_oinfo_oid(ScrubMap &smap)
+{
+ for (map<hobject_t, ScrubMap::object>::reverse_iterator i = smap.objects.rbegin();
+ i != smap.objects.rend();
+ ++i) {
+ const hobject_t &hoid = i->first;
+ ScrubMap::object &o = i->second;
+
+ bufferlist bl;
+ if (o.attrs.find(OI_ATTR) == o.attrs.end()) {
+ continue;
+ }
+ bl.push_back(o.attrs[OI_ATTR]);
+ object_info_t oi;
+ try {
+ oi.decode(bl);
+ } catch(...) {
+ continue;
+ }
+ if (oi.soid != hoid) {
+ ObjectStore::Transaction t;
+ OSDriver::OSTransaction _t(osdriver.get_transaction(&t));
+ osd->clog->error() << "osd." << osd->whoami
+ << " found object info error on pg "
+ << info.pgid
+ << " oid " << hoid << " oid in object info: "
+ << oi.soid
+ << "...repaired";
+ // Fix object info
+ oi.soid = hoid;
+ bl.clear();
+ encode(oi, bl, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
+
+ bufferptr bp(bl.c_str(), bl.length());
+ o.attrs[OI_ATTR] = bp;
+
+ t.setattr(coll, ghobject_t(hoid), OI_ATTR, bl);
+ int r = osd->store->queue_transaction(ch, std::move(t));
+ if (r != 0) {
+ derr << __func__ << ": queue_transaction got " << cpp_strerror(r)
+ << dendl;
+ }
+ }
+ }
+}
+
+void PG::repair_object(
+ const hobject_t &soid,
+ const list<pair<ScrubMap::object, pg_shard_t> > &ok_peers,
+ const set<pg_shard_t> &bad_peers)
+{
+ set<pg_shard_t> ok_shards;
+ for (auto &&peer: ok_peers) ok_shards.insert(peer.second);
+
+ dout(10) << "repair_object " << soid
+ << " bad_peers osd.{" << bad_peers << "},"
+ << " ok_peers osd.{" << ok_shards << "}" << dendl;
+
+ const ScrubMap::object &po = ok_peers.back().first;
+ eversion_t v;
+ object_info_t oi;
+ try {
+ bufferlist bv;
+ if (po.attrs.count(OI_ATTR)) {
+ bv.push_back(po.attrs.find(OI_ATTR)->second);
+ }
+ auto bliter = bv.cbegin();
+ decode(oi, bliter);
+ } catch (...) {
+ dout(0) << __func__ << ": Need version of replica, bad object_info_t: "
+ << soid << dendl;
+ ceph_abort();
+ }
+
+ if (bad_peers.count(get_primary())) {
+ // We should only be scrubbing if the PG is clean.
+ ceph_assert(waiting_for_unreadable_object.empty());
+ dout(10) << __func__ << ": primary = " << get_primary() << dendl;
+ }
+
+ /* No need to pass ok_peers, they must not be missing the object, so
+ * force_object_missing will add them to missing_loc anyway */
+ recovery_state.force_object_missing(bad_peers, soid, oi.version);
+}
+
+void PG::forward_scrub_event(ScrubAPI fn, epoch_t epoch_queued, std::string_view desc)
+{
+ dout(20) << __func__ << ": " << desc << " queued at: " << epoch_queued << dendl;
+ ceph_assert(m_scrubber);
+ if (is_active()) {
+ ((*m_scrubber).*fn)(epoch_queued);
+ } else {
+ // pg might be in the process of being deleted
+ dout(5) << __func__ << " refusing to forward. " << (is_clean() ? "(clean) " : "(not clean) ") <<
+ (is_active() ? "(active) " : "(not active) ") << dendl;
+ }
+}
+
+void PG::forward_scrub_event(ScrubSafeAPI fn,
+ epoch_t epoch_queued,
+ Scrub::act_token_t act_token,
+ std::string_view desc)
+{
+ dout(20) << __func__ << ": " << desc << " queued: " << epoch_queued
+ << " token: " << act_token << dendl;
+ ceph_assert(m_scrubber);
+ if (is_active()) {
+ ((*m_scrubber).*fn)(epoch_queued, act_token);
+ } else {
+ // pg might be in the process of being deleted
+ dout(5) << __func__ << " refusing to forward. "
+ << (is_clean() ? "(clean) " : "(not clean) ")
+ << (is_active() ? "(active) " : "(not active) ") << dendl;
+ }
+}
+
+void PG::replica_scrub(OpRequestRef op, ThreadPool::TPHandle& handle)
+{
+ dout(10) << __func__ << " (op)" << dendl;
+ ceph_assert(m_scrubber);
+ m_scrubber->replica_scrub_op(op);
+}
+
+void PG::replica_scrub(epoch_t epoch_queued,
+ Scrub::act_token_t act_token,
+ [[maybe_unused]] ThreadPool::TPHandle& handle)
+{
+ dout(10) << __func__ << " queued at: " << epoch_queued
+ << (is_primary() ? " (primary)" : " (replica)") << dendl;
+ forward_scrub_event(&ScrubPgIF::send_start_replica, epoch_queued, act_token,
+ "StartReplica/nw");
+}
+
+bool PG::ops_blocked_by_scrub() const
+{
+ return !waiting_for_scrub.empty();
+}
+
+Scrub::scrub_prio_t PG::is_scrub_blocking_ops() const
+{
+ return waiting_for_scrub.empty() ? Scrub::scrub_prio_t::low_priority
+ : Scrub::scrub_prio_t::high_priority;
+}
+
+bool PG::old_peering_msg(epoch_t reply_epoch, epoch_t query_epoch)
+{
+ if (auto last_reset = get_last_peering_reset();
+ last_reset > reply_epoch || last_reset > query_epoch) {
+ dout(10) << "old_peering_msg reply_epoch " << reply_epoch << " query_epoch "
+ << query_epoch << " last_peering_reset " << last_reset << dendl;
+ return true;
+ }
+ return false;
+}
+
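+// RAII helper: when the last reference goes away (i.e. once the registered
+// on_applied/on_commit callbacks have run), mark the interval flush as
+// complete unless the PG has been reset since the recorded epoch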
+struct FlushState {
+ PGRef pg;
+ epoch_t epoch;
+ FlushState(PG *pg, epoch_t epoch) : pg(pg), epoch(epoch) {}
+ ~FlushState() {
+ std::scoped_lock l{*pg};
+ if (!pg->pg_has_reset_since(epoch)) {
+ pg->recovery_state.complete_flush();
+ }
+ }
+};
+typedef std::shared_ptr<FlushState> FlushStateRef;
+
+void PG::start_flush_on_transaction(ObjectStore::Transaction &t)
+{
+ // flush in progress ops
+ FlushStateRef flush_trigger (std::make_shared<FlushState>(
+ this, get_osdmap_epoch()));
+ t.register_on_applied(new ContainerContext<FlushStateRef>(flush_trigger));
+ t.register_on_commit(new ContainerContext<FlushStateRef>(flush_trigger));
+}
+
+bool PG::try_flush_or_schedule_async()
+{
+ Context *c = new QueuePeeringEvt(
+ this, get_osdmap_epoch(), PeeringState::IntervalFlush());
+ if (!ch->flush_commit(c)) {
+ return false;
+ } else {
+ delete c;
+ return true;
+ }
+}
+
+ostream& operator<<(ostream& out, const PG& pg)
+{
+ out << pg.recovery_state;
+
+ // listing all scrub-related flags - both current and "planned next scrub"
+ if (pg.is_scrubbing()) {
+ out << *pg.m_scrubber;
+ }
+ out << pg.m_planned_scrub;
+
+ if (pg.recovery_ops_active)
+ out << " rops=" << pg.recovery_ops_active;
+
+ //out << " (" << pg.pg_log.get_tail() << "," << pg.pg_log.get_head() << "]";
+ if (pg.recovery_state.have_missing()) {
+ out << " m=" << pg.recovery_state.get_num_missing();
+ if (pg.is_primary()) {
+ uint64_t unfound = pg.recovery_state.get_num_unfound();
+ if (unfound)
+ out << " u=" << unfound;
+ }
+ }
+ if (!pg.is_clean()) {
+ out << " mbc=" << pg.recovery_state.get_missing_by_count();
+ }
+ if (!pg.snap_trimq.empty()) {
+ out << " trimq=";
+ // only show a count if the set is large
+ if (pg.snap_trimq.num_intervals() > 16) {
+ out << pg.snap_trimq.size();
+ if (!pg.snap_trimq_repeat.empty()) {
+ out << "(" << pg.snap_trimq_repeat.size() << ")";
+ }
+ } else {
+ out << pg.snap_trimq;
+ if (!pg.snap_trimq_repeat.empty()) {
+ out << "(" << pg.snap_trimq_repeat << ")";
+ }
+ }
+ }
+ if (!pg.recovery_state.get_info().purged_snaps.empty()) {
+ out << " ps="; // snap trim queue / purged snaps
+ if (pg.recovery_state.get_info().purged_snaps.num_intervals() > 16) {
+ out << pg.recovery_state.get_info().purged_snaps.size();
+ } else {
+ out << pg.recovery_state.get_info().purged_snaps;
+ }
+ }
+
+ out << "]";
+ return out;
+}
+
+bool PG::can_discard_op(OpRequestRef& op)
+{
+ auto m = op->get_req<MOSDOp>();
+ if (cct->_conf->osd_discard_disconnected_ops && OSD::op_is_discardable(m)) {
+ dout(20) << " discard " << *m << dendl;
+ return true;
+ }
+
+ if (m->get_map_epoch() < info.history.same_primary_since) {
+ dout(7) << " changed after " << m->get_map_epoch()
+ << ", dropping " << *m << dendl;
+ return true;
+ }
+
+ if ((m->get_flags() & (CEPH_OSD_FLAG_BALANCE_READS |
+ CEPH_OSD_FLAG_LOCALIZE_READS)) &&
+ !is_primary() &&
+ m->get_map_epoch() < info.history.same_interval_since) {
+ // Note: the Objecter will resend on interval change without the primary
+ // changing if it actually sent to a replica. If the primary hasn't
+ // changed since the send epoch, we got it, and we're primary, it won't
+ // have resent even if the interval did change as it sent it to the primary
+ // (us).
+ return true;
+ }
+
+
+ if (m->get_connection()->has_feature(CEPH_FEATURE_RESEND_ON_SPLIT)) {
+ // >= luminous client
+ if (m->get_connection()->has_feature(CEPH_FEATURE_SERVER_NAUTILUS)) {
+ // >= nautilus client
+ if (m->get_map_epoch() < pool.info.get_last_force_op_resend()) {
+ dout(7) << __func__ << " sent before last_force_op_resend "
+ << pool.info.last_force_op_resend
+ << ", dropping" << *m << dendl;
+ return true;
+ }
+ } else {
+ // == < nautilus client (luminous or mimic)
+ if (m->get_map_epoch() < pool.info.get_last_force_op_resend_prenautilus()) {
+ dout(7) << __func__ << " sent before last_force_op_resend_prenautilus "
+ << pool.info.last_force_op_resend_prenautilus
+ << ", dropping" << *m << dendl;
+ return true;
+ }
+ }
+ if (m->get_map_epoch() < info.history.last_epoch_split) {
+ dout(7) << __func__ << " pg split in "
+ << info.history.last_epoch_split << ", dropping" << dendl;
+ return true;
+ }
+ } else if (m->get_connection()->has_feature(CEPH_FEATURE_OSD_POOLRESEND)) {
+ // < luminous client
+ if (m->get_map_epoch() < pool.info.get_last_force_op_resend_preluminous()) {
+ dout(7) << __func__ << " sent before last_force_op_resend_preluminous "
+ << pool.info.last_force_op_resend_preluminous
+ << ", dropping" << *m << dendl;
+ return true;
+ }
+ }
+
+ return false;
+}
+
+template<typename T, int MSGTYPE>
+bool PG::can_discard_replica_op(OpRequestRef& op)
+{
+ auto m = op->get_req<T>();
+ ceph_assert(m->get_type() == MSGTYPE);
+
+ int from = m->get_source().num();
+
+ // if a repop is replied after a replica goes down in a new osdmap, and
+ // before the pg advances to this new osdmap, the repop replies before this
+ // repop can be discarded by that replica OSD, because the primary resets the
+ // connection to it when handling the new osdmap marking it down, and also
+  // resets the messenger session when the replica reconnects. To avoid the
+ // out-of-order replies, the messages from that replica should be discarded.
+ OSDMapRef next_map = osd->get_next_osdmap();
+ if (next_map->is_down(from)) {
+ dout(20) << " " << __func__ << " dead for nextmap is down " << from << dendl;
+ return true;
+ }
+ /* Mostly, this overlaps with the old_peering_msg
+ * condition. An important exception is pushes
+ * sent by replicas not in the acting set, since
+ * if such a replica goes down it does not cause
+ * a new interval. */
+ if (next_map->get_down_at(from) >= m->map_epoch) {
+ dout(20) << " " << __func__ << " dead for 'get_down_at' " << from << dendl;
+ return true;
+ }
+
+ // same pg?
+ // if pg changes _at all_, we reset and repeer!
+ if (old_peering_msg(m->map_epoch, m->map_epoch)) {
+ dout(10) << "can_discard_replica_op pg changed " << info.history
+ << " after " << m->map_epoch
+ << ", dropping" << dendl;
+ return true;
+ }
+ return false;
+}
+
+bool PG::can_discard_scan(OpRequestRef op)
+{
+ auto m = op->get_req<MOSDPGScan>();
+ ceph_assert(m->get_type() == MSG_OSD_PG_SCAN);
+
+ if (old_peering_msg(m->map_epoch, m->query_epoch)) {
+ dout(10) << " got old scan, ignoring" << dendl;
+ return true;
+ }
+ return false;
+}
+
+bool PG::can_discard_backfill(OpRequestRef op)
+{
+ auto m = op->get_req<MOSDPGBackfill>();
+ ceph_assert(m->get_type() == MSG_OSD_PG_BACKFILL);
+
+ if (old_peering_msg(m->map_epoch, m->query_epoch)) {
+ dout(10) << " got old backfill, ignoring" << dendl;
+ return true;
+ }
+
+ return false;
+
+}
+
+bool PG::can_discard_request(OpRequestRef& op)
+{
+ switch (op->get_req()->get_type()) {
+ case CEPH_MSG_OSD_OP:
+ return can_discard_op(op);
+ case CEPH_MSG_OSD_BACKOFF:
+ return false; // never discard
+ case MSG_OSD_REPOP:
+ return can_discard_replica_op<MOSDRepOp, MSG_OSD_REPOP>(op);
+ case MSG_OSD_PG_PUSH:
+ return can_discard_replica_op<MOSDPGPush, MSG_OSD_PG_PUSH>(op);
+ case MSG_OSD_PG_PULL:
+ return can_discard_replica_op<MOSDPGPull, MSG_OSD_PG_PULL>(op);
+ case MSG_OSD_PG_PUSH_REPLY:
+ return can_discard_replica_op<MOSDPGPushReply, MSG_OSD_PG_PUSH_REPLY>(op);
+ case MSG_OSD_REPOPREPLY:
+ return can_discard_replica_op<MOSDRepOpReply, MSG_OSD_REPOPREPLY>(op);
+ case MSG_OSD_PG_RECOVERY_DELETE:
+ return can_discard_replica_op<MOSDPGRecoveryDelete, MSG_OSD_PG_RECOVERY_DELETE>(op);
+
+ case MSG_OSD_PG_RECOVERY_DELETE_REPLY:
+ return can_discard_replica_op<MOSDPGRecoveryDeleteReply, MSG_OSD_PG_RECOVERY_DELETE_REPLY>(op);
+
+ case MSG_OSD_EC_WRITE:
+ return can_discard_replica_op<MOSDECSubOpWrite, MSG_OSD_EC_WRITE>(op);
+ case MSG_OSD_EC_WRITE_REPLY:
+ return can_discard_replica_op<MOSDECSubOpWriteReply, MSG_OSD_EC_WRITE_REPLY>(op);
+ case MSG_OSD_EC_READ:
+ return can_discard_replica_op<MOSDECSubOpRead, MSG_OSD_EC_READ>(op);
+ case MSG_OSD_EC_READ_REPLY:
+ return can_discard_replica_op<MOSDECSubOpReadReply, MSG_OSD_EC_READ_REPLY>(op);
+ case MSG_OSD_REP_SCRUB:
+ return can_discard_replica_op<MOSDRepScrub, MSG_OSD_REP_SCRUB>(op);
+ case MSG_OSD_SCRUB_RESERVE:
+ return can_discard_replica_op<MOSDScrubReserve, MSG_OSD_SCRUB_RESERVE>(op);
+ case MSG_OSD_REP_SCRUBMAP:
+ return can_discard_replica_op<MOSDRepScrubMap, MSG_OSD_REP_SCRUBMAP>(op);
+ case MSG_OSD_PG_UPDATE_LOG_MISSING:
+ return can_discard_replica_op<
+ MOSDPGUpdateLogMissing, MSG_OSD_PG_UPDATE_LOG_MISSING>(op);
+ case MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY:
+ return can_discard_replica_op<
+ MOSDPGUpdateLogMissingReply, MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY>(op);
+
+ case MSG_OSD_PG_SCAN:
+ return can_discard_scan(op);
+ case MSG_OSD_PG_BACKFILL:
+ return can_discard_backfill(op);
+ case MSG_OSD_PG_BACKFILL_REMOVE:
+ return can_discard_replica_op<MOSDPGBackfillRemove,
+ MSG_OSD_PG_BACKFILL_REMOVE>(op);
+ }
+ return true;
+}
+
+void PG::do_peering_event(PGPeeringEventRef evt, PeeringCtx &rctx)
+{
+ dout(10) << __func__ << ": " << evt->get_desc() << dendl;
+ ceph_assert(have_same_or_newer_map(evt->get_epoch_sent()));
+ if (old_peering_evt(evt)) {
+ dout(10) << "discard old " << evt->get_desc() << dendl;
+ } else {
+ recovery_state.handle_event(evt, &rctx);
+ }
+ // write_if_dirty regardless of path above to ensure we capture any work
+ // done by OSD::advance_pg().
+ write_if_dirty(rctx.transaction);
+}
+
+void PG::queue_peering_event(PGPeeringEventRef evt)
+{
+ if (old_peering_evt(evt))
+ return;
+ osd->osd->enqueue_peering_evt(info.pgid, evt);
+}
+
+void PG::queue_null(epoch_t msg_epoch,
+ epoch_t query_epoch)
+{
+ dout(10) << "null" << dendl;
+ queue_peering_event(
+ PGPeeringEventRef(std::make_shared<PGPeeringEvent>(msg_epoch, query_epoch,
+ NullEvt())));
+}
+
+void PG::find_unfound(epoch_t queued, PeeringCtx &rctx)
+{
+ /*
+ * if we couldn't start any recovery ops and things are still
+ * unfound, see if we can discover more missing object locations.
+ * It may be that our initial locations were bad and we errored
+ * out while trying to pull.
+ */
+ if (!recovery_state.discover_all_missing(rctx)) {
+ string action;
+ if (state_test(PG_STATE_BACKFILLING)) {
+ auto evt = PGPeeringEventRef(
+ new PGPeeringEvent(
+ queued,
+ queued,
+ PeeringState::UnfoundBackfill()));
+ queue_peering_event(evt);
+ action = "in backfill";
+ } else if (state_test(PG_STATE_RECOVERING)) {
+ auto evt = PGPeeringEventRef(
+ new PGPeeringEvent(
+ queued,
+ queued,
+ PeeringState::UnfoundRecovery()));
+ queue_peering_event(evt);
+ action = "in recovery";
+ } else {
+ action = "already out of recovery/backfill";
+ }
+ dout(10) << __func__ << ": no luck, giving up on this pg for now (" << action << ")" << dendl;
+ } else {
+ dout(10) << __func__ << ": no luck, giving up on this pg for now (queue_recovery)" << dendl;
+ queue_recovery();
+ }
+}
+
+void PG::handle_advance_map(
+ OSDMapRef osdmap, OSDMapRef lastmap,
+ vector<int>& newup, int up_primary,
+ vector<int>& newacting, int acting_primary,
+ PeeringCtx &rctx)
+{
+ dout(10) << __func__ << ": " << osdmap->get_epoch() << dendl;
+ osd_shard->update_pg_epoch(pg_slot, osdmap->get_epoch());
+ recovery_state.advance_map(
+ osdmap,
+ lastmap,
+ newup,
+ up_primary,
+ newacting,
+ acting_primary,
+ rctx);
+}
+
+void PG::handle_activate_map(PeeringCtx &rctx)
+{
+ dout(10) << __func__ << ": " << get_osdmap()->get_epoch()
+ << dendl;
+ recovery_state.activate_map(rctx);
+
+ requeue_map_waiters();
+}
+
+void PG::handle_initialize(PeeringCtx &rctx)
+{
+ dout(10) << __func__ << dendl;
+ PeeringState::Initialize evt;
+ recovery_state.handle_event(evt, &rctx);
+}
+
+
+void PG::handle_query_state(Formatter *f)
+{
+ dout(10) << "handle_query_state" << dendl;
+ PeeringState::QueryState q(f);
+ recovery_state.handle_event(q, 0);
+
+  // This code has moved to after the close of the recovery_state array in the
+  // formatted output, as scrub is not really a recovery state.
+ if (is_primary() && is_active() && m_scrubber && m_scrubber->is_scrub_active()) {
+ m_scrubber->handle_query_state(f);
+ }
+}
+
+void PG::init_collection_pool_opts()
+{
+ auto r = osd->store->set_collection_opts(ch, pool.info.opts);
+ if (r < 0 && r != -EOPNOTSUPP) {
+ derr << __func__ << " set_collection_opts returns error:" << r << dendl;
+ }
+}
+
+void PG::on_pool_change()
+{
+ init_collection_pool_opts();
+ plpg_on_pool_change();
+}
+
+void PG::C_DeleteMore::complete(int r) {
+ ceph_assert(r == 0);
+ pg->lock();
+ if (!pg->pg_has_reset_since(epoch)) {
+ pg->osd->queue_for_pg_delete(pg->get_pgid(), epoch);
+ }
+ pg->unlock();
+ delete this;
+}
+
+std::pair<ghobject_t, bool> PG::do_delete_work(
+ ObjectStore::Transaction &t,
+ ghobject_t _next)
+{
+ dout(10) << __func__ << dendl;
+
+ {
+ float osd_delete_sleep = osd->osd->get_osd_delete_sleep();
+ if (osd_delete_sleep > 0 && delete_needs_sleep) {
+ epoch_t e = get_osdmap()->get_epoch();
+ PGRef pgref(this);
+ auto delete_requeue_callback = new LambdaContext([this, pgref, e](int r) {
+ dout(20) << "do_delete_work() [cb] wake up at "
+ << ceph_clock_now()
+ << ", re-queuing delete" << dendl;
+ std::scoped_lock locker{*this};
+ delete_needs_sleep = false;
+ if (!pg_has_reset_since(e)) {
+ osd->queue_for_pg_delete(get_pgid(), e);
+ }
+ });
+
+ auto delete_schedule_time = ceph::real_clock::now();
+ delete_schedule_time += ceph::make_timespan(osd_delete_sleep);
+ std::lock_guard l{osd->sleep_lock};
+ osd->sleep_timer.add_event_at(delete_schedule_time,
+ delete_requeue_callback);
+ dout(20) << __func__ << " Delete scheduled at " << delete_schedule_time << dendl;
+ return std::make_pair(_next, true);
+ }
+ }
+
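+  // arm the sleep for the next pass; the requeue callback above clears the
+  // flag once the configured osd_delete_sleep interval has elapsed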
+ delete_needs_sleep = true;
+
+ ghobject_t next;
+
+ vector<ghobject_t> olist;
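+  // cap the number of objects removed per transaction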
+ int max = std::min(osd->store->get_ideal_list_max(),
+ (int)cct->_conf->osd_target_transaction_size);
+
+ osd->store->collection_list(
+ ch,
+ _next,
+ ghobject_t::get_max(),
+ max,
+ &olist,
+ &next);
+ dout(20) << __func__ << " " << olist << dendl;
+
+ // make sure we've removed everything
+ // by one more listing from the beginning
+ if (_next != ghobject_t() && olist.empty()) {
+ next = ghobject_t();
+ osd->store->collection_list(
+ ch,
+ next,
+ ghobject_t::get_max(),
+ max,
+ &olist,
+ &next);
+ if (!olist.empty()) {
+ for (auto& oid : olist) {
+ if (oid == pgmeta_oid) {
+ dout(20) << __func__ << " removing pgmeta object " << oid << dendl;
+ } else {
+          dout(0) << __func__ << " additional unexpected onode "
+                  << oid << " has appeared since PG removal started" << dendl;
+ }
+ }
+ }
+ }
+
+ OSDriver::OSTransaction _t(osdriver.get_transaction(&t));
+ int64_t num = 0;
+ for (auto& oid : olist) {
+ if (oid == pgmeta_oid) {
+ continue;
+ }
+ if (oid.is_pgmeta()) {
+ osd->clog->warn() << info.pgid << " found stray pgmeta-like " << oid
+ << " during PG removal";
+ }
+ int r = snap_mapper.remove_oid(oid.hobj, &_t);
+ if (r != 0 && r != -ENOENT) {
+ ceph_abort();
+ }
+ t.remove(coll, oid);
+ ++num;
+ }
+ bool running = true;
+ if (num) {
+ dout(20) << __func__ << " deleting " << num << " objects" << dendl;
+ Context *fin = new C_DeleteMore(this, get_osdmap_epoch());
+ t.register_on_commit(fin);
+ } else {
+ if (cct->_conf->osd_inject_failure_on_pg_removal) {
+ _exit(1);
+ }
+
+ // final flush here to ensure completions drop refs. Of particular concern
+ // are the SnapMapper ContainerContexts.
+ {
+ PGRef pgref(this);
+ PGLog::clear_info_log(info.pgid, &t);
+ t.remove_collection(coll);
+ t.register_on_commit(new ContainerContext<PGRef>(pgref));
+ t.register_on_applied(new ContainerContext<PGRef>(pgref));
+ osd->store->queue_transaction(ch, std::move(t));
+ }
+ ch->flush();
+
+ if (!osd->try_finish_pg_delete(this, pool.info.get_pg_num())) {
+ dout(1) << __func__ << " raced with merge, reinstantiating" << dendl;
+ ch = osd->store->create_new_collection(coll);
+ create_pg_collection(t,
+ info.pgid,
+ info.pgid.get_split_bits(pool.info.get_pg_num()));
+ init_pg_ondisk(t, info.pgid, &pool.info);
+ recovery_state.reset_last_persisted();
+ } else {
+ recovery_state.set_delete_complete();
+
+ // cancel reserver here, since the PG is about to get deleted and the
+ // exit() methods don't run when that happens.
+ osd->local_reserver.cancel_reservation(info.pgid);
+
+ running = false;
+ }
+ }
+ return {next, running};
+}
+
+int PG::pg_stat_adjust(osd_stat_t *ns)
+{
+ osd_stat_t &new_stat = *ns;
+ if (is_primary()) {
+ return 0;
+ }
+ // Adjust the kb_used by adding pending backfill data
+ uint64_t reserved_num_bytes = get_reserved_num_bytes();
+
+  // For now we don't consider projected space gains here.
+  // A possible improvement is an optional two-pass backfill that frees up
+  // space in a first pass; this could be triggered when at nearfull
+  // or close to backfillfull.
+ if (reserved_num_bytes > 0) {
+    // TODO: Handle compression by adjusting by the PG's average
+    // compression percentage.
+ dout(20) << __func__ << " reserved_num_bytes " << (reserved_num_bytes >> 10) << "KiB"
+ << " Before kb_used " << new_stat.statfs.kb_used() << "KiB" << dendl;
+ if (new_stat.statfs.available > reserved_num_bytes)
+ new_stat.statfs.available -= reserved_num_bytes;
+ else
+ new_stat.statfs.available = 0;
+ dout(20) << __func__ << " After kb_used " << new_stat.statfs.kb_used() << "KiB" << dendl;
+ return 1;
+ }
+ return 0;
+}
+
+void PG::dump_pgstate_history(Formatter *f)
+{
+ std::scoped_lock l{*this};
+ recovery_state.dump_history(f);
+}
+
+void PG::dump_missing(Formatter *f)
+{
+ for (auto& i : recovery_state.get_pg_log().get_missing().get_items()) {
+ f->open_object_section("object");
+ f->dump_object("oid", i.first);
+ f->dump_object("missing_info", i.second);
+ if (recovery_state.get_missing_loc().needs_recovery(i.first)) {
+ f->dump_bool(
+ "unfound",
+ recovery_state.get_missing_loc().is_unfound(i.first));
+ f->open_array_section("locations");
+ for (auto l : recovery_state.get_missing_loc().get_locations(i.first)) {
+ f->dump_object("shard", l);
+ }
+ f->close_section();
+ }
+ f->close_section();
+ }
+}
+
+void PG::get_pg_stats(std::function<void(const pg_stat_t&, epoch_t lec)> f)
+{
+ std::lock_guard l{pg_stats_publish_lock};
+ if (pg_stats_publish_valid) {
+ f(pg_stats_publish, pg_stats_publish.get_effective_last_epoch_clean());
+ }
+}
+
+void PG::with_heartbeat_peers(std::function<void(int)> f)
+{
+ std::lock_guard l{heartbeat_peer_lock};
+ for (auto p : heartbeat_peers) {
+ f(p);
+ }
+ for (auto p : probe_targets) {
+ f(p);
+ }
+}
+
+uint64_t PG::get_min_alloc_size() const {
+ return osd->store->get_min_alloc_size();
+}
diff --git a/src/osd/PG.h b/src/osd/PG.h
new file mode 100644
index 000000000..61adae120
--- /dev/null
+++ b/src/osd/PG.h
@@ -0,0 +1,1341 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_PG_H
+#define CEPH_PG_H
+
+#include <boost/scoped_ptr.hpp>
+#include <boost/container/flat_set.hpp>
+#include "include/mempool.h"
+
+// re-include our assert to clobber boost's
+#include "include/ceph_assert.h"
+#include "include/common_fwd.h"
+
+#include "include/types.h"
+#include "include/stringify.h"
+#include "osd_types.h"
+#include "include/xlist.h"
+#include "SnapMapper.h"
+#include "Session.h"
+#include "common/Timer.h"
+
+#include "PGLog.h"
+#include "OSDMap.h"
+#include "messages/MOSDPGLog.h"
+#include "include/str_list.h"
+#include "PGBackend.h"
+#include "PGPeeringEvent.h"
+#include "PeeringState.h"
+#include "recovery_types.h"
+#include "MissingLoc.h"
+#include "scrubber_common.h"
+
+#include "mgr/OSDPerfMetricTypes.h"
+
+#include <atomic>
+#include <list>
+#include <memory>
+#include <string>
+#include <tuple>
+
+//#define DEBUG_RECOVERY_OIDS // track std::set of recovering oids explicitly, to find counting bugs
+//#define PG_DEBUG_REFS // track provenance of pg refs, helpful for finding leaks
+
+class OSD;
+class OSDService;
+class OSDShard;
+class OSDShardPGSlot;
+class MOSDPGScan;
+class MOSDPGBackfill;
+class MOSDPGInfo;
+
+class PG;
+struct OpRequest;
+typedef OpRequest::Ref OpRequestRef;
+class MOSDPGLog;
+class DynamicPerfStats;
+class PgScrubber;
+
+namespace Scrub {
+ class Store;
+ class ReplicaReservations;
+ class LocalReservation;
+ class ReservedByRemotePrimary;
+}
+
+#ifdef PG_DEBUG_REFS
+#include "common/tracked_int_ptr.hpp"
+ uint64_t get_with_id(PG *pg);
+ void put_with_id(PG *pg, uint64_t id);
+ typedef TrackedIntPtr<PG> PGRef;
+#else
+ typedef boost::intrusive_ptr<PG> PGRef;
+#endif
+
+class PGRecoveryStats {
+ struct per_state_info {
+ uint64_t enter, exit; // enter/exit counts
+ uint64_t events;
+ utime_t event_time; // time spent processing events
+ utime_t total_time; // total time in state
+ utime_t min_time, max_time;
+
+ // cppcheck-suppress unreachableCode
+ per_state_info() : enter(0), exit(0), events(0) {}
+ };
+ std::map<const char *,per_state_info> info;
+ ceph::mutex lock = ceph::make_mutex("PGRecoverStats::lock");
+
+ public:
+ PGRecoveryStats() = default;
+
+ void reset() {
+ std::lock_guard l(lock);
+ info.clear();
+ }
+ void dump(ostream& out) {
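+    // one tab-separated line per state:
+    // enter exit events event_time total_time min_time max_time state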
+ std::lock_guard l(lock);
+ for (std::map<const char *,per_state_info>::iterator p = info.begin(); p != info.end(); ++p) {
+ per_state_info& i = p->second;
+ out << i.enter << "\t" << i.exit << "\t"
+ << i.events << "\t" << i.event_time << "\t"
+ << i.total_time << "\t"
+ << i.min_time << "\t" << i.max_time << "\t"
+ << p->first << "\n";
+ }
+ }
+
+ void dump_formatted(ceph::Formatter *f) {
+ std::lock_guard l(lock);
+ f->open_array_section("pg_recovery_stats");
+ for (std::map<const char *,per_state_info>::iterator p = info.begin();
+ p != info.end(); ++p) {
+ per_state_info& i = p->second;
+ f->open_object_section("recovery_state");
+ f->dump_int("enter", i.enter);
+ f->dump_int("exit", i.exit);
+ f->dump_int("events", i.events);
+ f->dump_stream("event_time") << i.event_time;
+ f->dump_stream("total_time") << i.total_time;
+ f->dump_stream("min_time") << i.min_time;
+ f->dump_stream("max_time") << i.max_time;
+ std::vector<std::string> states;
+ get_str_vec(p->first, "/", states);
+ f->open_array_section("nested_states");
+ for (std::vector<std::string>::iterator st = states.begin();
+ st != states.end(); ++st) {
+ f->dump_string("state", *st);
+ }
+ f->close_section();
+ f->close_section();
+ }
+ f->close_section();
+ }
+
+ void log_enter(const char *s) {
+ std::lock_guard l(lock);
+ info[s].enter++;
+ }
+ void log_exit(const char *s, utime_t dur, uint64_t events, utime_t event_dur) {
+ std::lock_guard l(lock);
+ per_state_info &i = info[s];
+ i.exit++;
+ i.total_time += dur;
+ if (dur > i.max_time)
+ i.max_time = dur;
+ if (dur < i.min_time || i.min_time == utime_t())
+ i.min_time = dur;
+ i.events += events;
+ i.event_time += event_dur;
+ }
+};
+
+/** PG - Replica Placement Group
+ *
+ */
+
+class PG : public DoutPrefixProvider, public PeeringState::PeeringListener {
+ friend struct NamedState;
+ friend class PeeringState;
+ friend class PgScrubber;
+ friend class PrimaryLogScrub;
+ friend class Scrub::ReplicaReservations;
+
+public:
+ const pg_shard_t pg_whoami;
+ const spg_t pg_id;
+
+ std::unique_ptr<ScrubPgIF> m_scrubber;
+
+ /// flags detailing scheduling/operation characteristics of the next scrub
+ requested_scrub_t m_planned_scrub;
+ /// scrubbing state for both Primary & replicas
+ bool is_scrub_active() const { return m_scrubber->is_scrub_active(); }
+
+  /// set when the scrub request is queued, and reset only after scrubbing
+  /// has fully cleaned up.
+ bool is_scrub_queued_or_active() const { return m_scrubber->is_queued_or_active(); }
+
+public:
+ // -- members --
+ const coll_t coll;
+
+ ObjectStore::CollectionHandle ch;
+
+ // -- methods --
+ std::ostream& gen_prefix(std::ostream& out) const override;
+ CephContext *get_cct() const override {
+ return cct;
+ }
+ unsigned get_subsys() const override {
+ return ceph_subsys_osd;
+ }
+
+ const char* const get_current_state() const {
+ return recovery_state.get_current_state();
+ }
+
+ const OSDMapRef& get_osdmap() const {
+ ceph_assert(is_locked());
+ return recovery_state.get_osdmap();
+ }
+
+ epoch_t get_osdmap_epoch() const override final {
+ return recovery_state.get_osdmap()->get_epoch();
+ }
+
+ PerfCounters &get_peering_perf() override;
+ PerfCounters &get_perf_logger() override;
+ void log_state_enter(const char *state) override;
+ void log_state_exit(
+ const char *state_name, utime_t enter_time,
+ uint64_t events, utime_t event_dur) override;
+
+ void lock(bool no_lockdep = false) const;
+ void unlock() const;
+ bool is_locked() const;
+
+ const spg_t& get_pgid() const {
+ return pg_id;
+ }
+
+ const PGPool& get_pool() const {
+ return pool;
+ }
+ uint64_t get_last_user_version() const {
+ return info.last_user_version;
+ }
+ const pg_history_t& get_history() const {
+ return info.history;
+ }
+ bool get_need_up_thru() const {
+ return recovery_state.get_need_up_thru();
+ }
+ epoch_t get_same_interval_since() const {
+ return info.history.same_interval_since;
+ }
+
+ static void set_last_scrub_stamp(
+ utime_t t, pg_history_t &history, pg_stat_t &stats) {
+ stats.last_scrub_stamp = t;
+ history.last_scrub_stamp = t;
+ }
+
+ void set_last_scrub_stamp(utime_t t) {
+ recovery_state.update_stats(
+ [=](auto &history, auto &stats) {
+ set_last_scrub_stamp(t, history, stats);
+ return true;
+ });
+ }
+
+ static void set_last_deep_scrub_stamp(
+ utime_t t, pg_history_t &history, pg_stat_t &stats) {
+ stats.last_deep_scrub_stamp = t;
+ history.last_deep_scrub_stamp = t;
+ }
+
+ void set_last_deep_scrub_stamp(utime_t t) {
+ recovery_state.update_stats(
+ [=](auto &history, auto &stats) {
+ set_last_deep_scrub_stamp(t, history, stats);
+ return true;
+ });
+ }
+
+ bool is_deleting() const {
+ return recovery_state.is_deleting();
+ }
+ bool is_deleted() const {
+ return recovery_state.is_deleted();
+ }
+ bool is_nonprimary() const {
+ return recovery_state.is_nonprimary();
+ }
+ bool is_primary() const {
+ return recovery_state.is_primary();
+ }
+ bool pg_has_reset_since(epoch_t e) {
+ ceph_assert(is_locked());
+ return recovery_state.pg_has_reset_since(e);
+ }
+
+ bool is_ec_pg() const {
+ return recovery_state.is_ec_pg();
+ }
+ int get_role() const {
+ return recovery_state.get_role();
+ }
+ const std::vector<int> get_acting() const {
+ return recovery_state.get_acting();
+ }
+ const std::set<pg_shard_t> &get_actingset() const {
+ return recovery_state.get_actingset();
+ }
+ int get_acting_primary() const {
+ return recovery_state.get_acting_primary();
+ }
+ pg_shard_t get_primary() const {
+ return recovery_state.get_primary();
+ }
+ const std::vector<int> get_up() const {
+ return recovery_state.get_up();
+ }
+ int get_up_primary() const {
+ return recovery_state.get_up_primary();
+ }
+ const PastIntervals& get_past_intervals() const {
+ return recovery_state.get_past_intervals();
+ }
+ bool is_acting_recovery_backfill(pg_shard_t osd) const {
+ return recovery_state.is_acting_recovery_backfill(osd);
+ }
+ const std::set<pg_shard_t> &get_acting_recovery_backfill() const {
+ return recovery_state.get_acting_recovery_backfill();
+ }
+ bool is_acting(pg_shard_t osd) const {
+ return recovery_state.is_acting(osd);
+ }
+ bool is_up(pg_shard_t osd) const {
+ return recovery_state.is_up(osd);
+ }
+ static bool has_shard(bool ec, const std::vector<int>& v, pg_shard_t osd) {
+ return PeeringState::has_shard(ec, v, osd);
+ }
+
+ /// initialize created PG
+ void init(
+ int role,
+ const std::vector<int>& up,
+ int up_primary,
+ const std::vector<int>& acting,
+ int acting_primary,
+ const pg_history_t& history,
+ const PastIntervals& pim,
+ bool backfill,
+ ObjectStore::Transaction &t);
+
+ /// read existing pg state off disk
+ void read_state(ObjectStore *store);
+ static int peek_map_epoch(ObjectStore *store, spg_t pgid, epoch_t *pepoch);
+
+ static int get_latest_struct_v() {
+ return pg_latest_struct_v;
+ }
+ static int get_compat_struct_v() {
+ return pg_compat_struct_v;
+ }
+ static int read_info(
+ ObjectStore *store, spg_t pgid, const coll_t &coll,
+ pg_info_t &info, PastIntervals &past_intervals,
+ __u8 &);
+ static bool _has_removal_flag(ObjectStore *store, spg_t pgid);
+
+ void rm_backoff(const ceph::ref_t<Backoff>& b);
+
+ void update_snap_mapper_bits(uint32_t bits) {
+ snap_mapper.update_bits(bits);
+ }
+ void start_split_stats(const std::set<spg_t>& childpgs, std::vector<object_stat_sum_t> *v);
+ virtual void split_colls(
+ spg_t child,
+ int split_bits,
+ int seed,
+ const pg_pool_t *pool,
+ ObjectStore::Transaction &t) = 0;
+ void split_into(pg_t child_pgid, PG *child, unsigned split_bits);
+ void merge_from(std::map<spg_t,PGRef>& sources, PeeringCtx &rctx,
+ unsigned split_bits,
+ const pg_merge_meta_t& last_pg_merge_meta);
+ void finish_split_stats(const object_stat_sum_t& stats,
+ ObjectStore::Transaction &t);
+
+ void scrub(epoch_t queued, ThreadPool::TPHandle& handle)
+ {
+ // a new scrub
+ forward_scrub_event(&ScrubPgIF::initiate_regular_scrub, queued, "StartScrub"sv);
+ }
+
+ /**
+ * a special version of PG::scrub(), which:
+   * - is initiated after repair, and
+   * - is not required to allocate local/remote OSD scrub resources
+   *   (note: the second point is no longer true)
+ */
+ void recovery_scrub(epoch_t queued, ThreadPool::TPHandle& handle)
+ {
+ // a new scrub
+ forward_scrub_event(&ScrubPgIF::initiate_scrub_after_repair, queued,
+ "AfterRepairScrub"sv);
+ }
+
+ void replica_scrub(epoch_t queued,
+ Scrub::act_token_t act_token,
+ ThreadPool::TPHandle& handle);
+
+ void replica_scrub_resched(epoch_t queued,
+ Scrub::act_token_t act_token,
+ ThreadPool::TPHandle& handle)
+ {
+ forward_scrub_event(&ScrubPgIF::send_sched_replica, queued, act_token,
+ "SchedReplica");
+ }
+
+ void scrub_send_resources_granted(epoch_t queued, ThreadPool::TPHandle& handle)
+ {
+ forward_scrub_event(&ScrubPgIF::send_remotes_reserved, queued, "RemotesReserved"sv);
+ }
+
+ void scrub_send_resources_denied(epoch_t queued, ThreadPool::TPHandle& handle)
+ {
+ forward_scrub_event(&ScrubPgIF::send_reservation_failure, queued,
+ "ReservationFailure"sv);
+ }
+
+ void scrub_send_scrub_resched(epoch_t queued, ThreadPool::TPHandle& handle)
+ {
+ forward_scrub_event(&ScrubPgIF::send_scrub_resched, queued, "InternalSchedScrub");
+ }
+
+ void scrub_send_pushes_update(epoch_t queued, ThreadPool::TPHandle& handle)
+ {
+ forward_scrub_event(&ScrubPgIF::active_pushes_notification, queued,
+ "ActivePushesUpd"sv);
+ }
+
+ void scrub_send_applied_update(epoch_t queued, ThreadPool::TPHandle& handle)
+ {
+ forward_scrub_event(&ScrubPgIF::update_applied_notification, queued,
+ "UpdatesApplied"sv);
+ }
+
+ void scrub_send_unblocking(epoch_t queued, ThreadPool::TPHandle& handle)
+ {
+ forward_scrub_event(&ScrubPgIF::send_scrub_unblock, queued, "Unblocked"sv);
+ }
+
+ void scrub_send_digest_update(epoch_t queued, ThreadPool::TPHandle& handle)
+ {
+ forward_scrub_event(&ScrubPgIF::digest_update_notification, queued, "DigestUpdate"sv);
+ }
+
+ void scrub_send_local_map_ready(epoch_t queued, ThreadPool::TPHandle& handle)
+ {
+ forward_scrub_event(&ScrubPgIF::send_local_map_done, queued, "IntLocalMapDone"sv);
+ }
+
+ void scrub_send_replmaps_ready(epoch_t queued, ThreadPool::TPHandle& handle)
+ {
+ forward_scrub_event(&ScrubPgIF::send_replica_maps_ready, queued, "GotReplicas"sv);
+ }
+
+ void scrub_send_replica_pushes(epoch_t queued, ThreadPool::TPHandle& handle)
+ {
+ forward_scrub_event(&ScrubPgIF::send_replica_pushes_upd, queued,
+ "ReplicaPushesUpd"sv);
+ }
+
+ void scrub_send_maps_compared(epoch_t queued, ThreadPool::TPHandle& handle)
+ {
+ forward_scrub_event(&ScrubPgIF::send_maps_compared, queued, "MapsCompared"sv);
+ }
+
+ void scrub_send_get_next_chunk(epoch_t queued, ThreadPool::TPHandle& handle)
+ {
+ forward_scrub_event(&ScrubPgIF::send_get_next_chunk, queued, "NextChunk"sv);
+ }
+
+ void scrub_send_scrub_is_finished(epoch_t queued, ThreadPool::TPHandle& handle)
+ {
+ forward_scrub_event(&ScrubPgIF::send_scrub_is_finished, queued, "ScrubFinished"sv);
+ }
+
+ void scrub_send_chunk_free(epoch_t queued, ThreadPool::TPHandle& handle)
+ {
+ forward_scrub_event(&ScrubPgIF::send_chunk_free, queued, "SelectedChunkFree"sv);
+ }
+
+ void scrub_send_chunk_busy(epoch_t queued, ThreadPool::TPHandle& handle)
+ {
+ forward_scrub_event(&ScrubPgIF::send_chunk_busy, queued, "ChunkIsBusy"sv);
+ }
+
+ void reg_next_scrub();
+
+ void queue_want_pg_temp(const std::vector<int> &wanted) override;
+ void clear_want_pg_temp() override;
+
+ void on_new_interval() override;
+
+ void on_role_change() override;
+ virtual void plpg_on_role_change() = 0;
+
+ void init_collection_pool_opts();
+ void on_pool_change() override;
+ virtual void plpg_on_pool_change() = 0;
+
+ void on_info_history_change() override;
+
+ void scrub_requested(scrub_level_t scrub_level, scrub_type_t scrub_type) override;
+
+ uint64_t get_snap_trimq_size() const override {
+ return snap_trimq.size();
+ }
+ unsigned get_target_pg_log_entries() const override;
+
+ void clear_publish_stats() override;
+ void clear_primary_state() override;
+
+ epoch_t oldest_stored_osdmap() override;
+ OstreamTemp get_clog_error() override;
+ OstreamTemp get_clog_info() override;
+ OstreamTemp get_clog_debug() override;
+
+ void schedule_event_after(
+ PGPeeringEventRef event,
+ float delay) override;
+ void request_local_background_io_reservation(
+ unsigned priority,
+ PGPeeringEventURef on_grant,
+ PGPeeringEventURef on_preempt) override;
+ void update_local_background_io_priority(
+ unsigned priority) override;
+ void cancel_local_background_io_reservation() override;
+
+ void request_remote_recovery_reservation(
+ unsigned priority,
+ PGPeeringEventURef on_grant,
+ PGPeeringEventURef on_preempt) override;
+ void cancel_remote_recovery_reservation() override;
+
+ void schedule_event_on_commit(
+ ObjectStore::Transaction &t,
+ PGPeeringEventRef on_commit) override;
+
+ void on_active_exit() override;
+
+ Context *on_clean() override {
+ if (is_active()) {
+ kick_snap_trim();
+ }
+ requeue_ops(waiting_for_clean_to_primary_repair);
+ return finish_recovery();
+ }
+
+ void on_activate(interval_set<snapid_t> snaps) override;
+
+ void on_activate_committed() override;
+
+ void on_active_actmap() override;
+ void on_active_advmap(const OSDMapRef &osdmap) override;
+
+ void queue_snap_retrim(snapid_t snap);
+
+ void on_backfill_reserved() override;
+ void on_backfill_canceled() override;
+ void on_recovery_reserved() override;
+
+ bool is_forced_recovery_or_backfill() const {
+ return recovery_state.is_forced_recovery_or_backfill();
+ }
+
+ PGLog::LogEntryHandlerRef get_log_handler(
+ ObjectStore::Transaction &t) override {
+ return std::make_unique<PG::PGLogEntryHandler>(this, &t);
+ }
+
+ std::pair<ghobject_t, bool> do_delete_work(ObjectStore::Transaction &t,
+ ghobject_t _next) override;
+
+ void clear_ready_to_merge() override;
+ void set_not_ready_to_merge_target(pg_t pgid, pg_t src) override;
+ void set_not_ready_to_merge_source(pg_t pgid) override;
+ void set_ready_to_merge_target(eversion_t lu, epoch_t les, epoch_t lec) override;
+ void set_ready_to_merge_source(eversion_t lu) override;
+
+ void send_pg_created(pg_t pgid) override;
+
+ ceph::signedspan get_mnow() override;
+ HeartbeatStampsRef get_hb_stamps(int peer) override;
+ void schedule_renew_lease(epoch_t lpr, ceph::timespan delay) override;
+ void queue_check_readable(epoch_t lpr, ceph::timespan delay) override;
+
+ void rebuild_missing_set_with_deletes(PGLog &pglog) override;
+
+ void queue_peering_event(PGPeeringEventRef evt);
+ void do_peering_event(PGPeeringEventRef evt, PeeringCtx &rcx);
+ void queue_null(epoch_t msg_epoch, epoch_t query_epoch);
+ void queue_flushed(epoch_t started_at);
+ void handle_advance_map(
+ OSDMapRef osdmap, OSDMapRef lastmap,
+ std::vector<int>& newup, int up_primary,
+ std::vector<int>& newacting, int acting_primary,
+ PeeringCtx &rctx);
+ void handle_activate_map(PeeringCtx &rctx);
+ void handle_initialize(PeeringCtx &rxcx);
+ void handle_query_state(ceph::Formatter *f);
+
+ /**
+ * @param ops_begun returns how many recovery ops the function started
+ * @returns true if any useful work was accomplished; false otherwise
+ */
+ virtual bool start_recovery_ops(
+ uint64_t max,
+ ThreadPool::TPHandle &handle,
+ uint64_t *ops_begun) = 0;
+
+ // more work after the above, but with a PeeringCtx
+ void find_unfound(epoch_t queued, PeeringCtx &rctx);
+
+ virtual void get_watchers(std::list<obj_watch_item_t> *ls) = 0;
+
+ void dump_pgstate_history(ceph::Formatter *f);
+ void dump_missing(ceph::Formatter *f);
+
+ void get_pg_stats(std::function<void(const pg_stat_t&, epoch_t lec)> f);
+ void with_heartbeat_peers(std::function<void(int)> f);
+
+ void shutdown();
+ virtual void on_shutdown() = 0;
+
+ bool get_must_scrub() const;
+ bool sched_scrub();
+
+ unsigned int scrub_requeue_priority(Scrub::scrub_prio_t with_priority, unsigned int suggested_priority) const;
+ /// the version that refers to flags_.priority
+ unsigned int scrub_requeue_priority(Scrub::scrub_prio_t with_priority) const;
+private:
+ // auxiliaries used by sched_scrub():
+ double next_deepscrub_interval() const;
+
+ /// should we perform deep scrub?
+ bool is_time_for_deep(bool allow_deep_scrub,
+ bool allow_scrub,
+ bool has_deep_errors,
+ const requested_scrub_t& planned) const;
+
+ /**
+ * Verify the various 'next scrub' flags in m_planned_scrub against configuration
+ * and scrub-related timestamps.
+ *
+   * @returns an updated copy of m_planned_scrub (or nothing if no scrubbing)
+ */
+ std::optional<requested_scrub_t> verify_scrub_mode() const;
+
+ bool verify_periodic_scrub_mode(bool allow_deep_scrub,
+ bool try_to_auto_repair,
+ bool allow_regular_scrub,
+ bool has_deep_errors,
+ requested_scrub_t& planned) const;
+
+ using ScrubAPI = void (ScrubPgIF::*)(epoch_t epoch_queued);
+ void forward_scrub_event(ScrubAPI fn, epoch_t epoch_queued, std::string_view desc);
+ // and for events that carry a meaningful 'activation token'
+ using ScrubSafeAPI = void (ScrubPgIF::*)(epoch_t epoch_queued,
+ Scrub::act_token_t act_token);
+ void forward_scrub_event(ScrubSafeAPI fn,
+ epoch_t epoch_queued,
+ Scrub::act_token_t act_token,
+ std::string_view desc);
+
+public:
+ virtual void do_request(
+ OpRequestRef& op,
+ ThreadPool::TPHandle &handle
+ ) = 0;
+ virtual void clear_cache() = 0;
+ virtual int get_cache_obj_count() = 0;
+
+ virtual void snap_trimmer(epoch_t epoch_queued) = 0;
+ virtual void do_command(
+ const std::string_view& prefix,
+ const cmdmap_t& cmdmap,
+ const ceph::buffer::list& idata,
+ std::function<void(int,const std::string&,ceph::buffer::list&)> on_finish) = 0;
+
+ virtual bool agent_work(int max) = 0;
+ virtual bool agent_work(int max, int agent_flush_quota) = 0;
+ virtual void agent_stop() = 0;
+ virtual void agent_delay() = 0;
+ virtual void agent_clear() = 0;
+ virtual void agent_choose_mode_restart() = 0;
+
+ struct C_DeleteMore : public Context {
+ PGRef pg;
+ epoch_t epoch;
+ C_DeleteMore(PG *p, epoch_t e) : pg(p), epoch(e) {}
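+    // complete() below does all the work; finish() should never be reached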
+ void finish(int r) override {
+ ceph_abort();
+ }
+ void complete(int r) override;
+ };
+
+ void _delete_some(ObjectStore::Transaction *t);
+
+ virtual void set_dynamic_perf_stats_queries(
+ const std::list<OSDPerfMetricQuery> &queries) {
+ }
+ virtual void get_dynamic_perf_stats(DynamicPerfStats *stats) {
+ }
+
+ uint64_t get_min_alloc_size() const;
+
+ // reference counting
+#ifdef PG_DEBUG_REFS
+ uint64_t get_with_id();
+ void put_with_id(uint64_t);
+ void dump_live_ids();
+#endif
+ void get(const char* tag);
+ void put(const char* tag);
+ int get_num_ref() {
+ return ref;
+ }
+
+ // ctor
+ PG(OSDService *o, OSDMapRef curmap,
+ const PGPool &pool, spg_t p);
+ ~PG() override;
+
+ // prevent copying
+ explicit PG(const PG& rhs) = delete;
+ PG& operator=(const PG& rhs) = delete;
+
+protected:
+ // -------------
+ // protected
+ OSDService *osd;
+public:
+ OSDShard *osd_shard = nullptr;
+ OSDShardPGSlot *pg_slot = nullptr;
+protected:
+ CephContext *cct;
+
+ // locking and reference counting.
+ // I destroy myself when the reference count hits zero.
+ // lock() should be called before doing anything.
+ // get() should be called on pointer copy (to another thread, etc.).
+ // put() should be called on destruction of some previously copied pointer.
+ // unlock() when done with the current pointer (_most common_).
+ mutable ceph::mutex _lock = ceph::make_mutex("PG::_lock");
+#ifndef CEPH_DEBUG_MUTEX
+ mutable std::thread::id locked_by;
+#endif
+ std::atomic<unsigned int> ref{0};
+
+#ifdef PG_DEBUG_REFS
+ ceph::mutex _ref_id_lock = ceph::make_mutex("PG::_ref_id_lock");
+ std::map<uint64_t, std::string> _live_ids;
+ std::map<std::string, uint64_t> _tag_counts;
+ uint64_t _ref_id = 0;
+
+ friend uint64_t get_with_id(PG *pg) { return pg->get_with_id(); }
+ friend void put_with_id(PG *pg, uint64_t id) { return pg->put_with_id(id); }
+#endif
+
+private:
+ friend void intrusive_ptr_add_ref(PG *pg) {
+ pg->get("intptr");
+ }
+ friend void intrusive_ptr_release(PG *pg) {
+ pg->put("intptr");
+ }
+
+
+ // =====================
+
+protected:
+ OSDriver osdriver;
+ SnapMapper snap_mapper;
+
+ virtual PGBackend *get_pgbackend() = 0;
+ virtual const PGBackend* get_pgbackend() const = 0;
+
+protected:
+ void requeue_map_waiters();
+
+protected:
+
+ ZTracer::Endpoint trace_endpoint;
+
+
+protected:
+ __u8 info_struct_v = 0;
+ void upgrade(ObjectStore *store);
+
+protected:
+ ghobject_t pgmeta_oid;
+
+ // ------------------
+ interval_set<snapid_t> snap_trimq;
+ std::set<snapid_t> snap_trimq_repeat;
+
+ /* You should not use these items without taking their respective queue locks
+ * (if they have one) */
+ xlist<PG*>::item stat_queue_item;
+ bool recovery_queued;
+
+ int recovery_ops_active;
+ std::set<pg_shard_t> waiting_on_backfill;
+#ifdef DEBUG_RECOVERY_OIDS
+ multiset<hobject_t> recovering_oids;
+#endif
+
+public:
+ bool dne() { return info.dne(); }
+
+ void send_cluster_message(
+ int osd, MessageRef m, epoch_t epoch, bool share_map_update) override;
+
+protected:
+ epoch_t get_last_peering_reset() const {
+ return recovery_state.get_last_peering_reset();
+ }
+
+ /* heartbeat peers */
+ void set_probe_targets(const std::set<pg_shard_t> &probe_set) override;
+ void clear_probe_targets() override;
+
+ ceph::mutex heartbeat_peer_lock =
+ ceph::make_mutex("PG::heartbeat_peer_lock");
+ std::set<int> heartbeat_peers;
+ std::set<int> probe_targets;
+
+protected:
+ BackfillInterval backfill_info;
+ std::map<pg_shard_t, BackfillInterval> peer_backfill_info;
+ bool backfill_reserving;
+
+ // The primary's num_bytes and local num_bytes for this pg, only valid
+ // during backfill for non-primary shards.
+ // Both of these are adjusted for EC to reflect the on-disk bytes
+ std::atomic<int64_t> primary_num_bytes = 0;
+ std::atomic<int64_t> local_num_bytes = 0;
+
+public:
+ // Space reserved for backfill is primary_num_bytes - local_num_bytes
+ // Don't care that difference itself isn't atomic
+ uint64_t get_reserved_num_bytes() {
+ int64_t primary = primary_num_bytes.load();
+ int64_t local = local_num_bytes.load();
+ if (primary > local)
+ return primary - local;
+ else
+ return 0;
+ }
+
+ bool is_remote_backfilling() {
+ return primary_num_bytes.load() > 0;
+ }
+
+ bool try_reserve_recovery_space(int64_t primary, int64_t local) override;
+ void unreserve_recovery_space() override;
+
+  // If num_bytes is inconsistent and local_num_bytes goes negative,
+  // that's ok, because it would then be ignored.
+
+ // The value of num_bytes could be negative,
+ // but we don't let local_num_bytes go negative.
+ void add_local_num_bytes(int64_t num_bytes) {
+ if (num_bytes) {
+ int64_t prev_bytes = local_num_bytes.load();
+ int64_t new_bytes;
+ do {
+ new_bytes = prev_bytes + num_bytes;
+ if (new_bytes < 0)
+ new_bytes = 0;
+ } while(!local_num_bytes.compare_exchange_weak(prev_bytes, new_bytes));
+ }
+ }
+ void sub_local_num_bytes(int64_t num_bytes) {
+ ceph_assert(num_bytes >= 0);
+ if (num_bytes) {
+ int64_t prev_bytes = local_num_bytes.load();
+ int64_t new_bytes;
+ do {
+ new_bytes = prev_bytes - num_bytes;
+ if (new_bytes < 0)
+ new_bytes = 0;
+ } while(!local_num_bytes.compare_exchange_weak(prev_bytes, new_bytes));
+ }
+ }
+ // The value of num_bytes could be negative,
+ // but we don't let info.stats.stats.sum.num_bytes go negative.
+ void add_num_bytes(int64_t num_bytes) {
+ ceph_assert(ceph_mutex_is_locked_by_me(_lock));
+ if (num_bytes) {
+ recovery_state.update_stats(
+ [num_bytes](auto &history, auto &stats) {
+ stats.stats.sum.num_bytes += num_bytes;
+ if (stats.stats.sum.num_bytes < 0) {
+ stats.stats.sum.num_bytes = 0;
+ }
+ return false;
+ });
+ }
+ }
+ void sub_num_bytes(int64_t num_bytes) {
+ ceph_assert(ceph_mutex_is_locked_by_me(_lock));
+ ceph_assert(num_bytes >= 0);
+ if (num_bytes) {
+ recovery_state.update_stats(
+ [num_bytes](auto &history, auto &stats) {
+ stats.stats.sum.num_bytes -= num_bytes;
+ if (stats.stats.sum.num_bytes < 0) {
+ stats.stats.sum.num_bytes = 0;
+ }
+ return false;
+ });
+ }
+ }
+
+ // Only used in testing so not worried about needing the PG lock here
+ int64_t get_stats_num_bytes() {
+ std::lock_guard l{_lock};
+    int64_t num_bytes = info.stats.stats.sum.num_bytes;
+ if (pool.info.is_erasure()) {
+ num_bytes /= (int)get_pgbackend()->get_ec_data_chunk_count();
+ // Round up each object by a stripe
+ num_bytes += get_pgbackend()->get_ec_stripe_chunk_size() * info.stats.stats.sum.num_objects;
+ }
+ int64_t lnb = local_num_bytes.load();
+ if (lnb && lnb != num_bytes) {
+ lgeneric_dout(cct, 0) << this << " " << info.pgid << " num_bytes mismatch "
+ << lnb << " vs stats "
+ << info.stats.stats.sum.num_bytes << " / chunk "
+ << get_pgbackend()->get_ec_data_chunk_count()
+ << dendl;
+ }
+ return num_bytes;
+ }
+
+protected:
+
+ /*
+ * blocked request wait hierarchy
+ *
+ * In order to preserve request ordering we need to be careful about the
+ * order in which blocked requests get requeued. Generally speaking, we
+ * push the requests back up to the op_wq in reverse order (most recent
+ * request first) so that they come back out again in the original order.
+ * However, because there are multiple wait queues, we need to requeue
+ * waitlists in order. Generally speaking, we requeue the wait lists
+ * that are checked first.
+ *
+ * Here are the various wait lists, in the order they are used during
+ * request processing, with notes:
+ *
+ * - waiting_for_map
+ * - may start or stop blocking at any time (depending on client epoch)
+ * - waiting_for_peered
+ * - !is_peered()
+ * - only starts blocking on interval change; never restarts
+ * - waiting_for_flush
+ * - flushes_in_progress
+ * - waiting for final flush during activate
+ * - waiting_for_active
+ * - !is_active()
+ * - only starts blocking on interval change; never restarts
+ * - waiting_for_readable
+ * - now > readable_until
+ * - unblocks when we get fresh(er) osd_pings
+ * - waiting_for_scrub
+ * - starts and stops blocking for varying intervals during scrub
+ * - waiting_for_unreadable_object
+ * - never restarts once object is readable (* except for EIO?)
+ * - waiting_for_degraded_object
+ * - never restarts once object is writeable (* except for EIO?)
+ * - waiting_for_blocked_object
+ * - starts and stops based on proxied op activity
+ * - obc rwlocks
+ * - starts and stops based on read/write activity
+ *
+ * Notes:
+ *
+   * 1. During an interval change, we requeue *everything* in the above order.
+ *
+ * 2. When an obc rwlock is released, we check for a scrub block and requeue
+ * the op there if it applies. We ignore the unreadable/degraded/blocked
+ * queues because we assume they cannot apply at that time (this is
+ * probably mostly true).
+ *
+ * 3. The requeue_ops helper will push ops onto the waiting_for_map std::list if
+ * it is non-empty.
+ *
+ * These three behaviors are generally sufficient to maintain ordering, with
+ * the possible exception of cases where we make an object degraded or
+ * unreadable that was previously okay, e.g. when scrub or op processing
+ * encounter an unexpected error. FIXME.
+ */
+
+ // ops with newer maps than our (or blocked behind them)
+ // track these by client, since inter-request ordering doesn't otherwise
+ // matter.
+ std::unordered_map<entity_name_t,std::list<OpRequestRef>> waiting_for_map;
+
+ // ops waiting on peered
+ std::list<OpRequestRef> waiting_for_peered;
+
+  /// ops waiting on readable
+ std::list<OpRequestRef> waiting_for_readable;
+
+ // ops waiting on active (require peered as well)
+ std::list<OpRequestRef> waiting_for_active;
+ std::list<OpRequestRef> waiting_for_flush;
+ std::list<OpRequestRef> waiting_for_scrub;
+
+ std::list<OpRequestRef> waiting_for_cache_not_full;
+ std::list<OpRequestRef> waiting_for_clean_to_primary_repair;
+ std::map<hobject_t, std::list<OpRequestRef>> waiting_for_unreadable_object,
+ waiting_for_degraded_object,
+ waiting_for_blocked_object;
+
+ std::set<hobject_t> objects_blocked_on_cache_full;
+ std::map<hobject_t,snapid_t> objects_blocked_on_degraded_snap;
+ std::map<hobject_t,ObjectContextRef> objects_blocked_on_snap_promotion;
+
+ // Callbacks should assume pg (and nothing else) is locked
+ std::map<hobject_t, std::list<Context*>> callbacks_for_degraded_object;
+
+ std::map<eversion_t,
+ std::list<
+ std::tuple<OpRequestRef, version_t, int,
+ std::vector<pg_log_op_return_item_t>>>> waiting_for_ondisk;
+
+ void requeue_object_waiters(std::map<hobject_t, std::list<OpRequestRef>>& m);
+ void requeue_op(OpRequestRef op);
+ void requeue_ops(std::list<OpRequestRef> &l);
+
+ // stats that persist lazily
+ object_stat_collection_t unstable_stats;
+
+ // publish stats
+ ceph::mutex pg_stats_publish_lock =
+ ceph::make_mutex("PG::pg_stats_publish_lock");
+ bool pg_stats_publish_valid;
+ pg_stat_t pg_stats_publish;
+
+ friend class TestOpsSocketHook;
+ void publish_stats_to_osd() override;
+
+ bool needs_recovery() const {
+ return recovery_state.needs_recovery();
+ }
+ bool needs_backfill() const {
+ return recovery_state.needs_backfill();
+ }
+
+ bool all_unfound_are_queried_or_lost(const OSDMapRef osdmap) const;
+
+ struct PGLogEntryHandler : public PGLog::LogEntryHandler {
+ PG *pg;
+ ObjectStore::Transaction *t;
+ PGLogEntryHandler(PG *pg, ObjectStore::Transaction *t) : pg(pg), t(t) {}
+
+ // LogEntryHandler
+ void remove(const hobject_t &hoid) override {
+ pg->get_pgbackend()->remove(hoid, t);
+ }
+ void try_stash(const hobject_t &hoid, version_t v) override {
+ pg->get_pgbackend()->try_stash(hoid, v, t);
+ }
+ void rollback(const pg_log_entry_t &entry) override {
+ ceph_assert(entry.can_rollback());
+ pg->get_pgbackend()->rollback(entry, t);
+ }
+ void rollforward(const pg_log_entry_t &entry) override {
+ pg->get_pgbackend()->rollforward(entry, t);
+ }
+ void trim(const pg_log_entry_t &entry) override {
+ pg->get_pgbackend()->trim(entry, t);
+ }
+ };
+
+ void update_object_snap_mapping(
+ ObjectStore::Transaction *t, const hobject_t &soid,
+ const std::set<snapid_t> &snaps);
+ void clear_object_snap_mapping(
+ ObjectStore::Transaction *t, const hobject_t &soid);
+ void remove_snap_mapped_object(
+ ObjectStore::Transaction& t, const hobject_t& soid);
+
+ bool have_unfound() const {
+ return recovery_state.have_unfound();
+ }
+ uint64_t get_num_unfound() const {
+ return recovery_state.get_num_unfound();
+ }
+
+ virtual void check_local() = 0;
+
+ void purge_strays();
+
+ void update_heartbeat_peers(std::set<int> peers) override;
+
+ Context *finish_sync_event;
+
+ Context *finish_recovery();
+ void _finish_recovery(Context *c);
+ struct C_PG_FinishRecovery : public Context {
+ PGRef pg;
+ explicit C_PG_FinishRecovery(PG *p) : pg(p) {}
+ void finish(int r) override {
+ pg->_finish_recovery(this);
+ }
+ };
+ void cancel_recovery();
+ void clear_recovery_state();
+ virtual void _clear_recovery_state() = 0;
+ void start_recovery_op(const hobject_t& soid);
+ void finish_recovery_op(const hobject_t& soid, bool dequeue=false);
+
+ virtual void _split_into(pg_t child_pgid, PG *child, unsigned split_bits) = 0;
+
+ friend class C_OSD_RepModify_Commit;
+ friend struct C_DeleteMore;
+
+ // -- backoff --
+ ceph::mutex backoff_lock = // orders inside Backoff::lock
+ ceph::make_mutex("PG::backoff_lock");
+ std::map<hobject_t,std::set<ceph::ref_t<Backoff>>> backoffs;
+
+ void add_backoff(const ceph::ref_t<Session>& s, const hobject_t& begin, const hobject_t& end);
+ void release_backoffs(const hobject_t& begin, const hobject_t& end);
+ void release_backoffs(const hobject_t& o) {
+ release_backoffs(o, o);
+ }
+ void clear_backoffs();
+
+ void add_pg_backoff(const ceph::ref_t<Session>& s) {
+ hobject_t begin = info.pgid.pgid.get_hobj_start();
+ hobject_t end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
+ add_backoff(s, begin, end);
+ }
+public:
+ void release_pg_backoffs() {
+ hobject_t begin = info.pgid.pgid.get_hobj_start();
+ hobject_t end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
+ release_backoffs(begin, end);
+ }
+
+ // -- scrub --
+protected:
+ bool scrub_after_recovery;
+
+ int active_pushes;
+
+ void repair_object(
+ const hobject_t &soid,
+ const std::list<std::pair<ScrubMap::object, pg_shard_t> > &ok_peers,
+ const std::set<pg_shard_t> &bad_peers);
+
+ [[nodiscard]] bool ops_blocked_by_scrub() const;
+ [[nodiscard]] Scrub::scrub_prio_t is_scrub_blocking_ops() const;
+
+ void _repair_oinfo_oid(ScrubMap &map);
+ void _scan_rollback_obs(const std::vector<ghobject_t> &rollback_obs);
+ /**
+ * returns true if [begin, end) is good to scrub at this time
+ * a false return value obliges the implementer to requeue scrub when the
+ * condition preventing scrub clears
+ */
+ virtual bool _range_available_for_scrub(
+ const hobject_t &begin, const hobject_t &end) = 0;
+
+ /**
+ * Initiate the process that will create our scrub map for the Primary.
+ * (triggered by MSG_OSD_REP_SCRUB)
+ */
+ void replica_scrub(OpRequestRef op, ThreadPool::TPHandle &handle);
+
+ // -- recovery state --
+
+ struct QueuePeeringEvt : Context {
+ PGRef pg;
+ PGPeeringEventRef evt;
+
+ template <class EVT>
+ QueuePeeringEvt(PG *pg, epoch_t epoch, EVT evt) :
+ pg(pg), evt(std::make_shared<PGPeeringEvent>(epoch, epoch, evt)) {}
+
+ QueuePeeringEvt(PG *pg, PGPeeringEventRef evt) :
+ pg(pg), evt(std::move(evt)) {}
+
+ void finish(int r) override {
+ pg->lock();
+ pg->queue_peering_event(std::move(evt));
+ pg->unlock();
+ }
+ };
+
+
+public:
+ int pg_stat_adjust(osd_stat_t *new_stat);
+protected:
+ bool delete_needs_sleep = false;
+
+protected:
+ bool state_test(uint64_t m) const { return recovery_state.state_test(m); }
+ void state_set(uint64_t m) { recovery_state.state_set(m); }
+ void state_clear(uint64_t m) { recovery_state.state_clear(m); }
+
+ bool is_complete() const {
+ return recovery_state.is_complete();
+ }
+ bool should_send_notify() const {
+ return recovery_state.should_send_notify();
+ }
+
+ bool is_active() const { return recovery_state.is_active(); }
+ bool is_activating() const { return recovery_state.is_activating(); }
+ bool is_peering() const { return recovery_state.is_peering(); }
+ bool is_down() const { return recovery_state.is_down(); }
+ bool is_recovery_unfound() const { return recovery_state.is_recovery_unfound(); }
+ bool is_backfill_unfound() const { return recovery_state.is_backfill_unfound(); }
+ bool is_incomplete() const { return recovery_state.is_incomplete(); }
+ bool is_clean() const { return recovery_state.is_clean(); }
+ bool is_degraded() const { return recovery_state.is_degraded(); }
+ bool is_undersized() const { return recovery_state.is_undersized(); }
+ bool is_scrubbing() const { return state_test(PG_STATE_SCRUBBING); } // Primary only
+ bool is_remapped() const { return recovery_state.is_remapped(); }
+ bool is_peered() const { return recovery_state.is_peered(); }
+ bool is_recovering() const { return recovery_state.is_recovering(); }
+ bool is_premerge() const { return recovery_state.is_premerge(); }
+ bool is_repair() const { return recovery_state.is_repair(); }
+ bool is_laggy() const { return state_test(PG_STATE_LAGGY); }
+ bool is_wait() const { return state_test(PG_STATE_WAIT); }
+
+ bool is_empty() const { return recovery_state.is_empty(); }
+
+ // pg on-disk state
+ void do_pending_flush();
+
+public:
+ void prepare_write(
+ pg_info_t &info,
+ pg_info_t &last_written_info,
+ PastIntervals &past_intervals,
+ PGLog &pglog,
+ bool dirty_info,
+ bool dirty_big_info,
+ bool need_write_epoch,
+ ObjectStore::Transaction &t) override;
+
+ void write_if_dirty(PeeringCtx &rctx) {
+ write_if_dirty(rctx.transaction);
+ }
+protected:
+ void write_if_dirty(ObjectStore::Transaction& t) {
+ recovery_state.write_if_dirty(t);
+ }
+
+ PGLog::IndexedLog projected_log;
+ bool check_in_progress_op(
+ const osd_reqid_t &r,
+ eversion_t *version,
+ version_t *user_version,
+ int *return_code,
+ std::vector<pg_log_op_return_item_t> *op_returns) const;
+ eversion_t projected_last_update;
+ eversion_t get_next_version() const {
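+    // next version to assign: current osdmap epoch, one past the last
+    // projected (in-flight) update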
+ eversion_t at_version(
+ get_osdmap_epoch(),
+ projected_last_update.version+1);
+ ceph_assert(at_version > info.last_update);
+ ceph_assert(at_version > recovery_state.get_pg_log().get_head());
+ ceph_assert(at_version > projected_last_update);
+ return at_version;
+ }
+
+ bool check_log_for_corruption(ObjectStore *store);
+
+ std::string get_corrupt_pg_log_name() const;
+
+ void update_snap_map(
+ const std::vector<pg_log_entry_t> &log_entries,
+ ObjectStore::Transaction& t);
+
+ void filter_snapc(std::vector<snapid_t> &snaps);
+
+ virtual void kick_snap_trim() = 0;
+ virtual void snap_trimmer_scrub_complete() = 0;
+
+ void queue_recovery();
+ void queue_scrub_after_repair();
+ unsigned int get_scrub_priority();
+
+ bool try_flush_or_schedule_async() override;
+ void start_flush_on_transaction(
+ ObjectStore::Transaction &t) override;
+
+ void update_history(const pg_history_t& history) {
+ recovery_state.update_history(history);
+ }
+
+ // OpRequest queueing
+ bool can_discard_op(OpRequestRef& op);
+ bool can_discard_scan(OpRequestRef op);
+ bool can_discard_backfill(OpRequestRef op);
+ bool can_discard_request(OpRequestRef& op);
+
+ template<typename T, int MSGTYPE>
+ bool can_discard_replica_op(OpRequestRef& op);
+
+ bool old_peering_msg(epoch_t reply_epoch, epoch_t query_epoch);
+ bool old_peering_evt(PGPeeringEventRef evt) {
+ return old_peering_msg(evt->get_epoch_sent(), evt->get_epoch_requested());
+ }
+ bool have_same_or_newer_map(epoch_t e) {
+ return e <= get_osdmap_epoch();
+ }
+
+ bool op_has_sufficient_caps(OpRequestRef& op);
+
+ // abstract bits
+ friend struct FlushState;
+
+ friend ostream& operator<<(ostream& out, const PG& pg);
+
+protected:
+ PeeringState recovery_state;
+
+ // ref to recovery_state.pool
+ const PGPool &pool;
+
+ // ref to recovery_state.info
+ const pg_info_t &info;
+};
+
+#endif
diff --git a/src/osd/PGBackend.cc b/src/osd/PGBackend.cc
new file mode 100644
index 000000000..ef2eb5381
--- /dev/null
+++ b/src/osd/PGBackend.cc
@@ -0,0 +1,1324 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013,2014 Inktank Storage, Inc.
+ * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
+ *
+ * Author: Loic Dachary <loic@dachary.org>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#include "common/errno.h"
+#include "common/scrub_types.h"
+#include "ReplicatedBackend.h"
+#include "ScrubStore.h"
+#include "ECBackend.h"
+#include "PGBackend.h"
+#include "OSD.h"
+#include "erasure-code/ErasureCodePlugin.h"
+#include "OSDMap.h"
+#include "PGLog.h"
+#include "common/LogClient.h"
+#include "messages/MOSDPGRecoveryDelete.h"
+#include "messages/MOSDPGRecoveryDeleteReply.h"
+
+using std::list;
+using std::make_pair;
+using std::map;
+using std::ostream;
+using std::ostringstream;
+using std::pair;
+using std::set;
+using std::string;
+using std::stringstream;
+using std::vector;
+
+using ceph::bufferlist;
+using ceph::bufferptr;
+using ceph::ErasureCodeProfile;
+using ceph::ErasureCodeInterfaceRef;
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_osd
+#define DOUT_PREFIX_ARGS this
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, this)
+static ostream& _prefix(std::ostream *_dout, PGBackend *pgb) {
+ return pgb->get_parent()->gen_dbg_prefix(*_dout);
+}
+
+void PGBackend::recover_delete_object(const hobject_t &oid, eversion_t v,
+ RecoveryHandle *h)
+{
+ ceph_assert(get_parent()->get_acting_recovery_backfill_shards().size() > 0);
+ for (const auto& shard : get_parent()->get_acting_recovery_backfill_shards()) {
+ if (shard == get_parent()->whoami_shard())
+ continue;
+ if (get_parent()->get_shard_missing(shard).is_missing(oid)) {
+ dout(20) << __func__ << " will remove " << oid << " " << v << " from "
+ << shard << dendl;
+ h->deletes[shard].push_back(make_pair(oid, v));
+ get_parent()->begin_peer_recover(shard, oid);
+ }
+ }
+}
+
+void PGBackend::send_recovery_deletes(int prio,
+ const map<pg_shard_t, vector<pair<hobject_t, eversion_t> > > &deletes)
+{
+ epoch_t min_epoch = get_parent()->get_last_peering_reset_epoch();
+ for (const auto& p : deletes) {
+ const auto& shard = p.first;
+ const auto& objects = p.second;
+ ConnectionRef con = get_parent()->get_con_osd_cluster(
+ shard.osd,
+ get_osdmap_epoch());
+ if (!con)
+ continue;
+ auto it = objects.begin();
+ while (it != objects.end()) {
+ uint64_t cost = 0;
+ uint64_t deletes = 0;
+ spg_t target_pg = spg_t(get_parent()->get_info().pgid.pgid, shard.shard);
+ MOSDPGRecoveryDelete *msg =
+ new MOSDPGRecoveryDelete(get_parent()->whoami_shard(),
+ target_pg,
+ get_osdmap_epoch(),
+ min_epoch);
+ msg->set_priority(prio);
+
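+      // batch objects into this message until we hit the per-message cost
+      // or object-count limit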
+ while (it != objects.end() &&
+ cost < cct->_conf->osd_max_push_cost &&
+ deletes < cct->_conf->osd_max_push_objects) {
+        dout(20) << __func__ << ": sending recovery delete " << it->first
+ << " " << it->second << " to osd." << shard << dendl;
+ msg->objects.push_back(*it);
+ cost += cct->_conf->osd_push_per_object_cost;
+ ++deletes;
+ ++it;
+ }
+
+ msg->set_cost(cost);
+ get_parent()->send_message_osd_cluster(msg, con);
+ }
+ }
+}
+
+bool PGBackend::handle_message(OpRequestRef op)
+{
+ switch (op->get_req()->get_type()) {
+ case MSG_OSD_PG_RECOVERY_DELETE:
+ handle_recovery_delete(op);
+ return true;
+
+ case MSG_OSD_PG_RECOVERY_DELETE_REPLY:
+ handle_recovery_delete_reply(op);
+ return true;
+
+ default:
+ break;
+ }
+
+ return _handle_message(op);
+}
+
+void PGBackend::handle_recovery_delete(OpRequestRef op)
+{
+ auto m = op->get_req<MOSDPGRecoveryDelete>();
+ ceph_assert(m->get_type() == MSG_OSD_PG_RECOVERY_DELETE);
+ dout(20) << __func__ << " " << op << dendl;
+
+ op->mark_started();
+
+ C_GatherBuilder gather(cct);
+ for (const auto &p : m->objects) {
+ get_parent()->remove_missing_object(p.first, p.second, gather.new_sub());
+ }
+
+ auto reply = make_message<MOSDPGRecoveryDeleteReply>();
+ reply->from = get_parent()->whoami_shard();
+ reply->set_priority(m->get_priority());
+ reply->pgid = spg_t(get_parent()->get_info().pgid.pgid, m->from.shard);
+ reply->map_epoch = m->map_epoch;
+ reply->min_epoch = m->min_epoch;
+ reply->objects = m->objects;
+ ConnectionRef conn = m->get_connection();
+
+ gather.set_finisher(new LambdaContext(
+ [=](int r) {
+ if (r != -EAGAIN) {
+ get_parent()->send_message_osd_cluster(reply, conn.get());
+ }
+ }));
+ gather.activate();
+}
+
+void PGBackend::handle_recovery_delete_reply(OpRequestRef op)
+{
+ auto m = op->get_req<MOSDPGRecoveryDeleteReply>();
+ ceph_assert(m->get_type() == MSG_OSD_PG_RECOVERY_DELETE_REPLY);
+ dout(20) << __func__ << " " << op << dendl;
+
+ for (const auto &p : m->objects) {
+ ObjectRecoveryInfo recovery_info;
+ hobject_t oid = p.first;
+ recovery_info.version = p.second;
+ get_parent()->on_peer_recover(m->from, oid, recovery_info);
+ bool peers_recovered = true;
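+    // only declare global recovery once every other acting/backfill shard
+    // has also finished deleting this object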
+ for (const auto& shard : get_parent()->get_acting_recovery_backfill_shards()) {
+ if (shard == get_parent()->whoami_shard())
+ continue;
+ if (get_parent()->get_shard_missing(shard).is_missing(oid)) {
+ dout(20) << __func__ << " " << oid << " still missing on at least "
+ << shard << dendl;
+ peers_recovered = false;
+ break;
+ }
+ }
+ if (peers_recovered && !get_parent()->get_local_missing().is_missing(oid)) {
+ dout(20) << __func__ << " completed recovery, local_missing = "
+ << get_parent()->get_local_missing() << dendl;
+ object_stat_sum_t stat_diff;
+ stat_diff.num_objects_recovered = 1;
+ get_parent()->on_global_recover(p.first, stat_diff, true);
+ }
+ }
+}
+
+void PGBackend::rollback(
+ const pg_log_entry_t &entry,
+ ObjectStore::Transaction *t)
+{
+
+ struct RollbackVisitor : public ObjectModDesc::Visitor {
+ const hobject_t &hoid;
+ PGBackend *pg;
+ ObjectStore::Transaction t;
+ RollbackVisitor(
+ const hobject_t &hoid,
+ PGBackend *pg) : hoid(hoid), pg(pg) {}
+ void append(uint64_t old_size) override {
+ ObjectStore::Transaction temp;
+ pg->rollback_append(hoid, old_size, &temp);
+ temp.append(t);
+ temp.swap(t);
+ }
+ void setattrs(map<string, std::optional<bufferlist> > &attrs) override {
+ ObjectStore::Transaction temp;
+ pg->rollback_setattrs(hoid, attrs, &temp);
+ temp.append(t);
+ temp.swap(t);
+ }
+ void rmobject(version_t old_version) override {
+ ObjectStore::Transaction temp;
+ pg->rollback_stash(hoid, old_version, &temp);
+ temp.append(t);
+ temp.swap(t);
+ }
+ void try_rmobject(version_t old_version) override {
+ ObjectStore::Transaction temp;
+ pg->rollback_try_stash(hoid, old_version, &temp);
+ temp.append(t);
+ temp.swap(t);
+ }
+ void create() override {
+ ObjectStore::Transaction temp;
+ pg->rollback_create(hoid, &temp);
+ temp.append(t);
+ temp.swap(t);
+ }
+ void update_snaps(const set<snapid_t> &snaps) override {
+ ObjectStore::Transaction temp;
+ pg->get_parent()->pgb_set_object_snap_mapping(hoid, snaps, &temp);
+ temp.append(t);
+ temp.swap(t);
+ }
+ void rollback_extents(
+ version_t gen,
+ const vector<pair<uint64_t, uint64_t> > &extents) override {
+ ObjectStore::Transaction temp;
+ pg->rollback_extents(gen, extents, hoid, &temp);
+ temp.append(t);
+ temp.swap(t);
+ }
+ };
+
+ ceph_assert(entry.mod_desc.can_rollback());
+ RollbackVisitor vis(entry.soid, this);
+ entry.mod_desc.visit(&vis);
+ t->append(vis.t);
+}
+
+struct Trimmer : public ObjectModDesc::Visitor {
+ const hobject_t &soid;
+ PGBackend *pg;
+ ObjectStore::Transaction *t;
+ Trimmer(
+ const hobject_t &soid,
+ PGBackend *pg,
+ ObjectStore::Transaction *t)
+ : soid(soid), pg(pg), t(t) {}
+ void rmobject(version_t old_version) override {
+ pg->trim_rollback_object(
+ soid,
+ old_version,
+ t);
+ }
+ // try_rmobject defaults to rmobject
+ void rollback_extents(
+ version_t gen,
+ const vector<pair<uint64_t, uint64_t> > &extents) override {
+ pg->trim_rollback_object(
+ soid,
+ gen,
+ t);
+ }
+};
+
+void PGBackend::rollforward(
+ const pg_log_entry_t &entry,
+ ObjectStore::Transaction *t)
+{
+ auto dpp = get_parent()->get_dpp();
+ ldpp_dout(dpp, 20) << __func__ << ": entry=" << entry << dendl;
+ if (!entry.can_rollback())
+ return;
+ Trimmer trimmer(entry.soid, this, t);
+ entry.mod_desc.visit(&trimmer);
+}
+
+void PGBackend::trim(
+ const pg_log_entry_t &entry,
+ ObjectStore::Transaction *t)
+{
+ if (!entry.can_rollback())
+ return;
+ Trimmer trimmer(entry.soid, this, t);
+ entry.mod_desc.visit(&trimmer);
+}
+
+void PGBackend::try_stash(
+ const hobject_t &hoid,
+ version_t v,
+ ObjectStore::Transaction *t)
+{
+ t->try_rename(
+ coll,
+ ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
+ ghobject_t(hoid, v, get_parent()->whoami_shard().shard));
+}
+
+void PGBackend::remove(
+ const hobject_t &hoid,
+ ObjectStore::Transaction *t) {
+ ceph_assert(!hoid.is_temp());
+ t->remove(
+ coll,
+ ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
+ get_parent()->pgb_clear_object_snap_mapping(hoid, t);
+}
+
+void PGBackend::on_change_cleanup(ObjectStore::Transaction *t)
+{
+ dout(10) << __func__ << dendl;
+ // clear temp
+ for (set<hobject_t>::iterator i = temp_contents.begin();
+ i != temp_contents.end();
+ ++i) {
+ dout(10) << __func__ << ": Removing oid "
+ << *i << " from the temp collection" << dendl;
+ t->remove(
+ coll,
+ ghobject_t(*i, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
+ }
+ temp_contents.clear();
+}
+
+int PGBackend::objects_list_partial(
+ const hobject_t &begin,
+ int min,
+ int max,
+ vector<hobject_t> *ls,
+ hobject_t *next)
+{
+ ceph_assert(ls);
+  // Start with the smallest generation to make sure the result list
+  // includes the marker object (it might have multiple generations,
+  // which would be filtered out).
+ ghobject_t _next;
+ if (!begin.is_min())
+ _next = ghobject_t(begin, 0, get_parent()->whoami_shard().shard);
+ ls->reserve(max);
+ int r = 0;
+
+ if (min > max)
+ min = max;
+
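+  // Keep listing until at least 'min' usable objects have been collected or
+  // the collection is exhausted; pgmeta, temp and generational entries are
+  // skipped below.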
+ while (!_next.is_max() && ls->size() < (unsigned)min) {
+ vector<ghobject_t> objects;
+ if (HAVE_FEATURE(parent->min_upacting_features(),
+ OSD_FIXED_COLLECTION_LIST)) {
+ r = store->collection_list(
+ ch,
+ _next,
+ ghobject_t::get_max(),
+ max - ls->size(),
+ &objects,
+ &_next);
+ } else {
+ r = store->collection_list_legacy(
+ ch,
+ _next,
+ ghobject_t::get_max(),
+ max - ls->size(),
+ &objects,
+ &_next);
+ }
+ if (r != 0) {
+ derr << __func__ << " list collection " << ch << " got: " << cpp_strerror(r) << dendl;
+ break;
+ }
+ for (vector<ghobject_t>::iterator i = objects.begin();
+ i != objects.end();
+ ++i) {
+ if (i->is_pgmeta() || i->hobj.is_temp()) {
+ continue;
+ }
+ if (i->is_no_gen()) {
+ ls->push_back(i->hobj);
+ }
+ }
+ }
+ if (r == 0)
+ *next = _next.hobj;
+ return r;
+}
+
+int PGBackend::objects_list_range(
+ const hobject_t &start,
+ const hobject_t &end,
+ vector<hobject_t> *ls,
+ vector<ghobject_t> *gen_obs)
+{
+ ceph_assert(ls);
+ vector<ghobject_t> objects;
+ int r;
+ if (HAVE_FEATURE(parent->min_upacting_features(),
+ OSD_FIXED_COLLECTION_LIST)) {
+ r = store->collection_list(
+ ch,
+ ghobject_t(start, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
+ ghobject_t(end, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
+ INT_MAX,
+ &objects,
+ NULL);
+ } else {
+ r = store->collection_list_legacy(
+ ch,
+ ghobject_t(start, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
+ ghobject_t(end, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
+ INT_MAX,
+ &objects,
+ NULL);
+ }
+ ls->reserve(objects.size());
+ for (vector<ghobject_t>::iterator i = objects.begin();
+ i != objects.end();
+ ++i) {
+ if (i->is_pgmeta() || i->hobj.is_temp()) {
+ continue;
+ }
+ if (i->is_no_gen()) {
+ ls->push_back(i->hobj);
+ } else if (gen_obs) {
+ gen_obs->push_back(*i);
+ }
+ }
+ return r;
+}
+
+int PGBackend::objects_get_attr(
+ const hobject_t &hoid,
+ const string &attr,
+ bufferlist *out)
+{
+ bufferptr bp;
+ int r = store->getattr(
+ ch,
+ ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
+ attr.c_str(),
+ bp);
+ if (r >= 0 && out) {
+ out->clear();
+ out->push_back(std::move(bp));
+ }
+ return r;
+}
+
+int PGBackend::objects_get_attrs(
+ const hobject_t &hoid,
+ map<string, bufferlist> *out)
+{
+ return store->getattrs(
+ ch,
+ ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
+ *out);
+}
+
+void PGBackend::rollback_setattrs(
+ const hobject_t &hoid,
+ map<string, std::optional<bufferlist> > &old_attrs,
+ ObjectStore::Transaction *t) {
+ map<string, bufferlist> to_set;
+ ceph_assert(!hoid.is_temp());
+ for (map<string, std::optional<bufferlist> >::iterator i = old_attrs.begin();
+ i != old_attrs.end();
+ ++i) {
+ if (i->second) {
+ to_set[i->first] = *(i->second);
+ } else {
+ t->rmattr(
+ coll,
+ ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
+ i->first);
+ }
+ }
+ t->setattrs(
+ coll,
+ ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
+ to_set);
+}
+
+void PGBackend::rollback_append(
+ const hobject_t &hoid,
+ uint64_t old_size,
+ ObjectStore::Transaction *t) {
+ ceph_assert(!hoid.is_temp());
+ t->truncate(
+ coll,
+ ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
+ old_size);
+}
+
+void PGBackend::rollback_stash(
+ const hobject_t &hoid,
+ version_t old_version,
+ ObjectStore::Transaction *t) {
+ ceph_assert(!hoid.is_temp());
+ t->remove(
+ coll,
+ ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
+ t->collection_move_rename(
+ coll,
+ ghobject_t(hoid, old_version, get_parent()->whoami_shard().shard),
+ coll,
+ ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
+}
+
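+// Like rollback_stash(), but uses try_rename() so a missing stashed clone is
+// tolerated rather than treated as an error.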
+void PGBackend::rollback_try_stash(
+ const hobject_t &hoid,
+ version_t old_version,
+ ObjectStore::Transaction *t) {
+ ceph_assert(!hoid.is_temp());
+ t->remove(
+ coll,
+ ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
+ t->try_rename(
+ coll,
+ ghobject_t(hoid, old_version, get_parent()->whoami_shard().shard),
+ ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
+}
+
+void PGBackend::rollback_extents(
+ version_t gen,
+ const vector<pair<uint64_t, uint64_t> > &extents,
+ const hobject_t &hoid,
+ ObjectStore::Transaction *t) {
+ auto shard = get_parent()->whoami_shard().shard;
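+  // Copy the preserved ranges back from the stashed (gen) object onto the
+  // head object, then drop the stash.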
+ for (auto &&extent: extents) {
+ t->clone_range(
+ coll,
+ ghobject_t(hoid, gen, shard),
+ ghobject_t(hoid, ghobject_t::NO_GEN, shard),
+ extent.first,
+ extent.second,
+ extent.first);
+ }
+ t->remove(
+ coll,
+ ghobject_t(hoid, gen, shard));
+}
+
+void PGBackend::trim_rollback_object(
+ const hobject_t &hoid,
+ version_t old_version,
+ ObjectStore::Transaction *t) {
+ ceph_assert(!hoid.is_temp());
+ t->remove(
+ coll, ghobject_t(hoid, old_version, get_parent()->whoami_shard().shard));
+}
+
+PGBackend *PGBackend::build_pg_backend(
+ const pg_pool_t &pool,
+ const map<string,string>& profile,
+ Listener *l,
+ coll_t coll,
+ ObjectStore::CollectionHandle &ch,
+ ObjectStore *store,
+ CephContext *cct)
+{
+ ErasureCodeProfile ec_profile = profile;
+ switch (pool.type) {
+ case pg_pool_t::TYPE_REPLICATED: {
+ return new ReplicatedBackend(l, coll, ch, store, cct);
+ }
+ case pg_pool_t::TYPE_ERASURE: {
+ ErasureCodeInterfaceRef ec_impl;
+ stringstream ss;
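+    // Load the erasure code plugin named in the pool's profile and obtain an
+    // implementation handle in ec_impl.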
+ ceph::ErasureCodePluginRegistry::instance().factory(
+ profile.find("plugin")->second,
+ cct->_conf.get_val<std::string>("erasure_code_dir"),
+ ec_profile,
+ &ec_impl,
+ &ss);
+ ceph_assert(ec_impl);
+ return new ECBackend(
+ l,
+ coll,
+ ch,
+ store,
+ cct,
+ ec_impl,
+ pool.stripe_width);
+ }
+ default:
+ ceph_abort();
+ return NULL;
+ }
+}
+
+int PGBackend::be_scan_list(
+ ScrubMap &map,
+ ScrubMapBuilder &pos)
+{
+ dout(10) << __func__ << " " << pos << dendl;
+ ceph_assert(!pos.done());
+ ceph_assert(pos.pos < pos.ls.size());
+ hobject_t& poid = pos.ls[pos.pos];
+
+ struct stat st;
+ int r = store->stat(
+ ch,
+ ghobject_t(
+ poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
+ &st,
+ true);
+ if (r == 0) {
+ ScrubMap::object &o = map.objects[poid];
+ o.size = st.st_size;
+ ceph_assert(!o.negative);
+ store->getattrs(
+ ch,
+ ghobject_t(
+ poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
+ o.attrs);
+
+ if (pos.deep) {
+ r = be_deep_scrub(poid, map, pos, o);
+ }
+ dout(25) << __func__ << " " << poid << dendl;
+ } else if (r == -ENOENT) {
+ dout(25) << __func__ << " " << poid << " got " << r
+ << ", skipping" << dendl;
+ } else if (r == -EIO) {
+ dout(25) << __func__ << " " << poid << " got " << r
+ << ", stat_error" << dendl;
+ ScrubMap::object &o = map.objects[poid];
+ o.stat_error = true;
+ } else {
+ derr << __func__ << " got: " << cpp_strerror(r) << dendl;
+ ceph_abort();
+ }
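+  // be_deep_scrub() may return -EINPROGRESS; keep the current position so the
+  // object is revisited on the next call.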
+ if (r == -EINPROGRESS) {
+ return -EINPROGRESS;
+ }
+ pos.next_object();
+ return 0;
+}
+
+bool PGBackend::be_compare_scrub_objects(
+ pg_shard_t auth_shard,
+ const ScrubMap::object &auth,
+ const object_info_t& auth_oi,
+ const ScrubMap::object &candidate,
+ shard_info_wrapper &shard_result,
+ inconsistent_obj_wrapper &obj_result,
+ ostream &errorstream,
+ bool has_snapset)
+{
+ enum { CLEAN, FOUND_ERROR } error = CLEAN;
+ if (auth.digest_present && candidate.digest_present) {
+ if (auth.digest != candidate.digest) {
+ if (error != CLEAN)
+ errorstream << ", ";
+ error = FOUND_ERROR;
+ errorstream << "data_digest 0x" << std::hex << candidate.digest
+ << " != data_digest 0x" << auth.digest << std::dec
+ << " from shard " << auth_shard;
+ obj_result.set_data_digest_mismatch();
+ }
+ }
+ if (auth.omap_digest_present && candidate.omap_digest_present) {
+ if (auth.omap_digest != candidate.omap_digest) {
+ if (error != CLEAN)
+ errorstream << ", ";
+ error = FOUND_ERROR;
+ errorstream << "omap_digest 0x" << std::hex << candidate.omap_digest
+ << " != omap_digest 0x" << auth.omap_digest << std::dec
+ << " from shard " << auth_shard;
+ obj_result.set_omap_digest_mismatch();
+ }
+ }
+ if (parent->get_pool().is_replicated()) {
+ if (auth_oi.is_data_digest() && candidate.digest_present) {
+ if (auth_oi.data_digest != candidate.digest) {
+ if (error != CLEAN)
+ errorstream << ", ";
+ error = FOUND_ERROR;
+ errorstream << "data_digest 0x" << std::hex << candidate.digest
+ << " != data_digest 0x" << auth_oi.data_digest << std::dec
+ << " from auth oi " << auth_oi;
+ shard_result.set_data_digest_mismatch_info();
+ }
+ }
+ if (auth_oi.is_omap_digest() && candidate.omap_digest_present) {
+ if (auth_oi.omap_digest != candidate.omap_digest) {
+ if (error != CLEAN)
+ errorstream << ", ";
+ error = FOUND_ERROR;
+ errorstream << "omap_digest 0x" << std::hex << candidate.omap_digest
+ << " != omap_digest 0x" << auth_oi.omap_digest << std::dec
+ << " from auth oi " << auth_oi;
+ shard_result.set_omap_digest_mismatch_info();
+ }
+ }
+ }
+ if (candidate.stat_error)
+ return error == FOUND_ERROR;
+ if (!shard_result.has_info_missing()
+ && !shard_result.has_info_corrupted()) {
+ bufferlist can_bl, auth_bl;
+ auto can_attr = candidate.attrs.find(OI_ATTR);
+ auto auth_attr = auth.attrs.find(OI_ATTR);
+
+ ceph_assert(auth_attr != auth.attrs.end());
+ ceph_assert(can_attr != candidate.attrs.end());
+
+ can_bl.push_back(can_attr->second);
+ auth_bl.push_back(auth_attr->second);
+ if (!can_bl.contents_equal(auth_bl)) {
+ if (error != CLEAN)
+ errorstream << ", ";
+ error = FOUND_ERROR;
+ obj_result.set_object_info_inconsistency();
+ errorstream << "object info inconsistent ";
+ }
+ }
+ if (has_snapset) {
+ if (!shard_result.has_snapset_missing()
+ && !shard_result.has_snapset_corrupted()) {
+ bufferlist can_bl, auth_bl;
+ auto can_attr = candidate.attrs.find(SS_ATTR);
+ auto auth_attr = auth.attrs.find(SS_ATTR);
+
+ ceph_assert(auth_attr != auth.attrs.end());
+ ceph_assert(can_attr != candidate.attrs.end());
+
+ can_bl.push_back(can_attr->second);
+ auth_bl.push_back(auth_attr->second);
+ if (!can_bl.contents_equal(auth_bl)) {
+ if (error != CLEAN)
+ errorstream << ", ";
+ error = FOUND_ERROR;
+ obj_result.set_snapset_inconsistency();
+ errorstream << "snapset inconsistent ";
+ }
+ }
+ }
+ if (parent->get_pool().is_erasure()) {
+ if (!shard_result.has_hinfo_missing()
+ && !shard_result.has_hinfo_corrupted()) {
+ bufferlist can_bl, auth_bl;
+ auto can_hi = candidate.attrs.find(ECUtil::get_hinfo_key());
+ auto auth_hi = auth.attrs.find(ECUtil::get_hinfo_key());
+
+ ceph_assert(auth_hi != auth.attrs.end());
+ ceph_assert(can_hi != candidate.attrs.end());
+
+ can_bl.push_back(can_hi->second);
+ auth_bl.push_back(auth_hi->second);
+ if (!can_bl.contents_equal(auth_bl)) {
+ if (error != CLEAN)
+ errorstream << ", ";
+ error = FOUND_ERROR;
+ obj_result.set_hinfo_inconsistency();
+ errorstream << "hinfo inconsistent ";
+ }
+ }
+ }
+ uint64_t oi_size = be_get_ondisk_size(auth_oi.size);
+ if (oi_size != candidate.size) {
+ if (error != CLEAN)
+ errorstream << ", ";
+ error = FOUND_ERROR;
+ errorstream << "size " << candidate.size
+ << " != size " << oi_size
+ << " from auth oi " << auth_oi;
+ shard_result.set_size_mismatch_info();
+ }
+ if (auth.size != candidate.size) {
+ if (error != CLEAN)
+ errorstream << ", ";
+ error = FOUND_ERROR;
+ errorstream << "size " << candidate.size
+ << " != size " << auth.size
+ << " from shard " << auth_shard;
+ obj_result.set_size_mismatch();
+ }
+  // If the replica is too large and we didn't already count it for this object.
+ if (candidate.size > cct->_conf->osd_max_object_size
+ && !obj_result.has_size_too_large()) {
+ if (error != CLEAN)
+ errorstream << ", ";
+ error = FOUND_ERROR;
+ errorstream << "size " << candidate.size
+ << " > " << cct->_conf->osd_max_object_size
+ << " is too large";
+ obj_result.set_size_too_large();
+ }
+ for (map<string,bufferptr>::const_iterator i = auth.attrs.begin();
+ i != auth.attrs.end();
+ ++i) {
+    // We check system keys separately
+ if (i->first == OI_ATTR || i->first[0] != '_')
+ continue;
+ if (!candidate.attrs.count(i->first)) {
+ if (error != CLEAN)
+ errorstream << ", ";
+ error = FOUND_ERROR;
+ errorstream << "attr name mismatch '" << i->first << "'";
+ obj_result.set_attr_name_mismatch();
+ } else if (candidate.attrs.find(i->first)->second.cmp(i->second)) {
+ if (error != CLEAN)
+ errorstream << ", ";
+ error = FOUND_ERROR;
+ errorstream << "attr value mismatch '" << i->first << "'";
+ obj_result.set_attr_value_mismatch();
+ }
+ }
+ for (map<string,bufferptr>::const_iterator i = candidate.attrs.begin();
+ i != candidate.attrs.end();
+ ++i) {
+    // We check system keys separately
+ if (i->first == OI_ATTR || i->first[0] != '_')
+ continue;
+ if (!auth.attrs.count(i->first)) {
+ if (error != CLEAN)
+ errorstream << ", ";
+ error = FOUND_ERROR;
+ errorstream << "attr name mismatch '" << i->first << "'";
+ obj_result.set_attr_name_mismatch();
+ }
+ }
+ return error == FOUND_ERROR;
+}
+
+static int dcount(const object_info_t &oi)
+{
+ int count = 0;
+ if (oi.is_data_digest())
+ count++;
+ if (oi.is_omap_digest())
+ count++;
+ return count;
+}
+
+map<pg_shard_t, ScrubMap *>::const_iterator
+ PGBackend::be_select_auth_object(
+ const hobject_t &obj,
+ const map<pg_shard_t,ScrubMap*> &maps,
+ object_info_t *auth_oi,
+ map<pg_shard_t, shard_info_wrapper> &shard_map,
+ bool &digest_match,
+ spg_t pgid,
+ ostream &errorstream)
+{
+ eversion_t auth_version;
+
+  // Create a list of shards with the primary first so it will be the auth
+  // copy, all other things being equal.
+ list<pg_shard_t> shards;
+ for (map<pg_shard_t, ScrubMap *>::const_iterator j = maps.begin();
+ j != maps.end();
+ ++j) {
+ if (j->first == get_parent()->whoami_shard())
+ continue;
+ shards.push_back(j->first);
+ }
+ shards.push_front(get_parent()->whoami_shard());
+
+ map<pg_shard_t, ScrubMap *>::const_iterator auth = maps.end();
+ digest_match = true;
+ for (auto &l : shards) {
+ ostringstream shard_errorstream;
+ bool error = false;
+ map<pg_shard_t, ScrubMap *>::const_iterator j = maps.find(l);
+ map<hobject_t, ScrubMap::object>::iterator i =
+ j->second->objects.find(obj);
+ if (i == j->second->objects.end()) {
+ continue;
+ }
+ auto& shard_info = shard_map[j->first];
+ if (j->first == get_parent()->whoami_shard())
+ shard_info.primary = true;
+ if (i->second.read_error) {
+ shard_info.set_read_error();
+ if (error)
+ shard_errorstream << ", ";
+ error = true;
+ shard_errorstream << "candidate had a read error";
+ }
+ if (i->second.ec_hash_mismatch) {
+ shard_info.set_ec_hash_mismatch();
+ if (error)
+ shard_errorstream << ", ";
+ error = true;
+ shard_errorstream << "candidate had an ec hash mismatch";
+ }
+ if (i->second.ec_size_mismatch) {
+ shard_info.set_ec_size_mismatch();
+ if (error)
+ shard_errorstream << ", ";
+ error = true;
+ shard_errorstream << "candidate had an ec size mismatch";
+ }
+
+ object_info_t oi;
+ bufferlist bl;
+ map<string, bufferptr>::iterator k;
+ SnapSet ss;
+ bufferlist ss_bl, hk_bl;
+
+ if (i->second.stat_error) {
+ shard_info.set_stat_error();
+ if (error)
+ shard_errorstream << ", ";
+ error = true;
+ shard_errorstream << "candidate had a stat error";
+      // With a stat_error there is no further checking;
+      // we don't need to also flag a missing object info attr.
+ goto out;
+ }
+
+ // We won't pick an auth copy if the snapset is missing or won't decode.
+ ceph_assert(!obj.is_snapdir());
+ if (obj.is_head()) {
+ k = i->second.attrs.find(SS_ATTR);
+ if (k == i->second.attrs.end()) {
+ shard_info.set_snapset_missing();
+ if (error)
+ shard_errorstream << ", ";
+ error = true;
+ shard_errorstream << "candidate had a missing snapset key";
+ } else {
+ ss_bl.push_back(k->second);
+ try {
+ auto bliter = ss_bl.cbegin();
+ decode(ss, bliter);
+ } catch (...) {
+ // invalid snapset, probably corrupt
+ shard_info.set_snapset_corrupted();
+ if (error)
+ shard_errorstream << ", ";
+ error = true;
+ shard_errorstream << "candidate had a corrupt snapset";
+ }
+ }
+ }
+
+ if (parent->get_pool().is_erasure()) {
+ ECUtil::HashInfo hi;
+ k = i->second.attrs.find(ECUtil::get_hinfo_key());
+ if (k == i->second.attrs.end()) {
+ shard_info.set_hinfo_missing();
+ if (error)
+ shard_errorstream << ", ";
+ error = true;
+ shard_errorstream << "candidate had a missing hinfo key";
+ } else {
+ hk_bl.push_back(k->second);
+ try {
+ auto bliter = hk_bl.cbegin();
+ decode(hi, bliter);
+ } catch (...) {
+	  // invalid hinfo, probably corrupt
+ shard_info.set_hinfo_corrupted();
+ if (error)
+ shard_errorstream << ", ";
+ error = true;
+ shard_errorstream << "candidate had a corrupt hinfo";
+ }
+ }
+ }
+
+ k = i->second.attrs.find(OI_ATTR);
+ if (k == i->second.attrs.end()) {
+ // no object info on object, probably corrupt
+ shard_info.set_info_missing();
+ if (error)
+ shard_errorstream << ", ";
+ error = true;
+ shard_errorstream << "candidate had a missing info key";
+ goto out;
+ }
+ bl.push_back(k->second);
+ try {
+ auto bliter = bl.cbegin();
+ decode(oi, bliter);
+ } catch (...) {
+ // invalid object info, probably corrupt
+ shard_info.set_info_corrupted();
+ if (error)
+ shard_errorstream << ", ";
+ error = true;
+ shard_errorstream << "candidate had a corrupt info";
+ goto out;
+ }
+
+ // This is automatically corrected in PG::_repair_oinfo_oid()
+ ceph_assert(oi.soid == obj);
+
+ if (i->second.size != be_get_ondisk_size(oi.size)) {
+ shard_info.set_obj_size_info_mismatch();
+ if (error)
+ shard_errorstream << ", ";
+ error = true;
+ shard_errorstream << "candidate size " << i->second.size << " info size "
+ << oi.size << " mismatch";
+ }
+
+ // digest_match will only be true if computed digests are the same
+ if (auth_version != eversion_t()
+ && auth->second->objects[obj].digest_present
+ && i->second.digest_present
+ && auth->second->objects[obj].digest != i->second.digest) {
+ digest_match = false;
+ dout(10) << __func__ << " digest_match = false, " << obj << " data_digest 0x" << std::hex << i->second.digest
+ << " != data_digest 0x" << auth->second->objects[obj].digest << std::dec
+ << dendl;
+ }
+
+    // Don't use this particular shard due to previous errors
+    // XXX: For now we can't pick one shard for repair and take the object
+    // info or snapset from another
+ if (shard_info.errors)
+ goto out;
+
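+    // Prefer the candidate with the newest version; on a tie, prefer the one
+    // with more digests recorded in its object_info (see dcount()).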
+ if (auth_version == eversion_t() || oi.version > auth_version ||
+ (oi.version == auth_version && dcount(oi) > dcount(*auth_oi))) {
+ auth = j;
+ *auth_oi = oi;
+ auth_version = oi.version;
+ }
+
+out:
+ if (error)
+ errorstream << pgid.pgid << " shard " << l << " soid " << obj
+ << " : " << shard_errorstream.str() << "\n";
+ // Keep scanning other shards
+ }
+ dout(10) << __func__ << ": selecting osd " << auth->first
+ << " for obj " << obj
+ << " with oi " << *auth_oi
+ << dendl;
+ return auth;
+}
+
+void PGBackend::be_compare_scrubmaps(
+ const map<pg_shard_t,ScrubMap*> &maps,
+ const set<hobject_t> &master_set,
+ bool repair,
+ map<hobject_t, set<pg_shard_t>> &missing,
+ map<hobject_t, set<pg_shard_t>> &inconsistent,
+ map<hobject_t, list<pg_shard_t>> &authoritative,
+ map<hobject_t, pair<std::optional<uint32_t>,
+ std::optional<uint32_t>>> &missing_digest,
+ int &shallow_errors, int &deep_errors,
+ Scrub::Store *store,
+ const spg_t& pgid,
+ const vector<int> &acting,
+ ostream &errorstream)
+{
+ utime_t now = ceph_clock_now();
+
+ // Check maps against master set and each other
+ for (set<hobject_t>::const_iterator k = master_set.begin();
+ k != master_set.end();
+ ++k) {
+ object_info_t auth_oi;
+ map<pg_shard_t, shard_info_wrapper> shard_map;
+
+ inconsistent_obj_wrapper object_error{*k};
+
+ bool digest_match;
+ map<pg_shard_t, ScrubMap *>::const_iterator auth =
+ be_select_auth_object(*k, maps, &auth_oi, shard_map, digest_match,
+ pgid, errorstream);
+
+ list<pg_shard_t> auth_list;
+ set<pg_shard_t> object_errors;
+ if (auth == maps.end()) {
+ object_error.set_version(0);
+ object_error.set_auth_missing(*k, maps, shard_map, shallow_errors,
+ deep_errors, get_parent()->whoami_shard());
+ if (object_error.has_deep_errors())
+ ++deep_errors;
+ else if (object_error.has_shallow_errors())
+ ++shallow_errors;
+ store->add_object_error(k->pool, object_error);
+ errorstream << pgid.pgid << " soid " << *k
+ << " : failed to pick suitable object info\n";
+ continue;
+ }
+ object_error.set_version(auth_oi.user_version);
+ ScrubMap::object& auth_object = auth->second->objects[*k];
+ set<pg_shard_t> cur_missing;
+ set<pg_shard_t> cur_inconsistent;
+ bool fix_digest = false;
+
+ for (auto j = maps.cbegin(); j != maps.cend(); ++j) {
+ if (j == auth)
+ shard_map[auth->first].selected_oi = true;
+ if (j->second->objects.count(*k)) {
+ shard_map[j->first].set_object(j->second->objects[*k]);
+ // Compare
+ stringstream ss;
+ bool found = be_compare_scrub_objects(auth->first,
+ auth_object,
+ auth_oi,
+ j->second->objects[*k],
+ shard_map[j->first],
+ object_error,
+ ss,
+ k->has_snapset());
+
+ dout(20) << __func__ << (repair ? " repair " : " ") << (parent->get_pool().is_replicated() ? "replicated " : "")
+ << (j == auth ? "auth" : "") << "shards " << shard_map.size() << (digest_match ? " digest_match " : " ")
+ << (shard_map[j->first].only_data_digest_mismatch_info() ? "'info mismatch info'" : "")
+ << dendl;
+	// If all replicas match but they don't match the object_info, we can
+	// repair it by using the missing_digest mechanism
+ if (repair && parent->get_pool().is_replicated() && j == auth && shard_map.size() > 1
+ && digest_match && shard_map[j->first].only_data_digest_mismatch_info()
+ && auth_object.digest_present) {
+ // Set in missing_digests
+ fix_digest = true;
+ // Clear the error
+ shard_map[j->first].clear_data_digest_mismatch_info();
+ errorstream << pgid << " soid " << *k << " : repairing object info data_digest" << "\n";
+ }
+ // Some errors might have already been set in be_select_auth_object()
+ if (shard_map[j->first].errors != 0) {
+ cur_inconsistent.insert(j->first);
+ if (shard_map[j->first].has_deep_errors())
+ ++deep_errors;
+ else
+ ++shallow_errors;
+ // Only true if be_compare_scrub_objects() found errors and put something
+ // in ss.
+ if (found)
+ errorstream << pgid << " shard " << j->first << " soid " << *k
+ << " : " << ss.str() << "\n";
+ } else if (found) {
+	  // Track a possible shard to use as authoritative, if needed.
+	  // There are errors, but they do not identify a specific bad shard.
+ object_errors.insert(j->first);
+ errorstream << pgid << " soid " << *k << " : " << ss.str() << "\n";
+ } else {
+	  // XXX: The auth shard might get here even though we don't know
+	  // that it has the "correct" data.
+ auth_list.push_back(j->first);
+ }
+ } else {
+ cur_missing.insert(j->first);
+ shard_map[j->first].set_missing();
+ shard_map[j->first].primary = (j->first == get_parent()->whoami_shard());
+ // Can't have any other errors if there is no information available
+ ++shallow_errors;
+ errorstream << pgid << " shard " << j->first << " " << *k << " : missing\n";
+ }
+ object_error.add_shard(j->first, shard_map[j->first]);
+ }
+
+ if (auth_list.empty()) {
+ if (object_errors.empty()) {
+ errorstream << pgid.pgid << " soid " << *k
+ << " : failed to pick suitable auth object\n";
+ goto out;
+ }
+ // Object errors exist and nothing in auth_list
+ // Prefer the auth shard otherwise take first from list.
+ pg_shard_t shard;
+ if (object_errors.count(auth->first)) {
+ shard = auth->first;
+ } else {
+ shard = *(object_errors.begin());
+ }
+ auth_list.push_back(shard);
+ object_errors.erase(shard);
+ }
+ // At this point auth_list is populated, so we add the object errors shards
+ // as inconsistent.
+ cur_inconsistent.insert(object_errors.begin(), object_errors.end());
+ if (!cur_missing.empty()) {
+ missing[*k] = cur_missing;
+ }
+ if (!cur_inconsistent.empty()) {
+ inconsistent[*k] = cur_inconsistent;
+ }
+
+ if (fix_digest) {
+ std::optional<uint32_t> data_digest, omap_digest;
+ ceph_assert(auth_object.digest_present);
+ data_digest = auth_object.digest;
+ if (auth_object.omap_digest_present) {
+ omap_digest = auth_object.omap_digest;
+ }
+ missing_digest[*k] = make_pair(data_digest, omap_digest);
+ }
+ if (!cur_inconsistent.empty() || !cur_missing.empty()) {
+ authoritative[*k] = auth_list;
+ } else if (!fix_digest && parent->get_pool().is_replicated()) {
+ enum {
+ NO = 0,
+ MAYBE = 1,
+ FORCE = 2,
+ } update = NO;
+
+ if (auth_object.digest_present && !auth_oi.is_data_digest()) {
+ dout(20) << __func__ << " missing data digest on " << *k << dendl;
+ update = MAYBE;
+ }
+ if (auth_object.omap_digest_present && !auth_oi.is_omap_digest()) {
+ dout(20) << __func__ << " missing omap digest on " << *k << dendl;
+ update = MAYBE;
+ }
+
+ // recorded digest != actual digest?
+ if (auth_oi.is_data_digest() && auth_object.digest_present &&
+ auth_oi.data_digest != auth_object.digest) {
+ ceph_assert(shard_map[auth->first].has_data_digest_mismatch_info());
+ errorstream << pgid << " recorded data digest 0x"
+ << std::hex << auth_oi.data_digest << " != on disk 0x"
+ << auth_object.digest << std::dec << " on " << auth_oi.soid
+ << "\n";
+ if (repair)
+ update = FORCE;
+ }
+ if (auth_oi.is_omap_digest() && auth_object.omap_digest_present &&
+ auth_oi.omap_digest != auth_object.omap_digest) {
+ ceph_assert(shard_map[auth->first].has_omap_digest_mismatch_info());
+ errorstream << pgid << " recorded omap digest 0x"
+ << std::hex << auth_oi.omap_digest << " != on disk 0x"
+ << auth_object.omap_digest << std::dec
+ << " on " << auth_oi.soid << "\n";
+ if (repair)
+ update = FORCE;
+ }
+
+ if (update != NO) {
+ utime_t age = now - auth_oi.local_mtime;
+ if (update == FORCE ||
+ age > cct->_conf->osd_deep_scrub_update_digest_min_age) {
+ std::optional<uint32_t> data_digest, omap_digest;
+ if (auth_object.digest_present) {
+ data_digest = auth_object.digest;
+ dout(20) << __func__ << " will update data digest on " << *k << dendl;
+ }
+ if (auth_object.omap_digest_present) {
+ omap_digest = auth_object.omap_digest;
+ dout(20) << __func__ << " will update omap digest on " << *k << dendl;
+ }
+ missing_digest[*k] = make_pair(data_digest, omap_digest);
+ } else {
+ dout(20) << __func__ << " missing digest but age " << age
+ << " < " << cct->_conf->osd_deep_scrub_update_digest_min_age
+ << " on " << *k << dendl;
+ }
+ }
+ }
+out:
+ if (object_error.has_deep_errors())
+ ++deep_errors;
+ else if (object_error.has_shallow_errors())
+ ++shallow_errors;
+ if (object_error.errors || object_error.union_shards.errors) {
+ store->add_object_error(k->pool, object_error);
+ }
+ }
+}
+
+void PGBackend::be_omap_checks(const map<pg_shard_t,ScrubMap*> &maps,
+ const set<hobject_t> &master_set,
+ omap_stat_t& omap_stats,
+ ostream &warnstream) const
+{
+ bool needs_omap_check = false;
+ for (const auto& map : maps) {
+ if (map.second->has_large_omap_object_errors || map.second->has_omap_keys) {
+ needs_omap_check = true;
+ break;
+ }
+ }
+
+ if (!needs_omap_check) {
+ return; // Nothing to do
+ }
+
+ // Iterate through objects and update omap stats
+ for (const auto& k : master_set) {
+ for (const auto& map : maps) {
+ if (map.first != get_parent()->primary_shard()) {
+ // Only set omap stats for the primary
+ continue;
+ }
+ auto it = map.second->objects.find(k);
+ if (it == map.second->objects.end())
+ continue;
+ ScrubMap::object& obj = it->second;
+ omap_stats.omap_bytes += obj.object_omap_bytes;
+ omap_stats.omap_keys += obj.object_omap_keys;
+ if (obj.large_omap_object_found) {
+ pg_t pg;
+ auto osdmap = get_osdmap();
+ osdmap->map_to_pg(k.pool, k.oid.name, k.get_key(), k.nspace, &pg);
+ pg_t mpg = osdmap->raw_pg_to_pg(pg);
+ omap_stats.large_omap_objects++;
+ warnstream << "Large omap object found. Object: " << k
+ << " PG: " << pg << " (" << mpg << ")"
+ << " Key count: " << obj.large_omap_object_key_count
+ << " Size (bytes): " << obj.large_omap_object_value_size
+ << '\n';
+ break;
+ }
+ }
+ }
+}
diff --git a/src/osd/PGBackend.h b/src/osd/PGBackend.h
new file mode 100644
index 000000000..12bdfc0d1
--- /dev/null
+++ b/src/osd/PGBackend.h
@@ -0,0 +1,641 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013,2014 Inktank Storage, Inc.
+ * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
+ *
+ * Author: Loic Dachary <loic@dachary.org>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef PGBACKEND_H
+#define PGBACKEND_H
+
+#include "osd_types.h"
+#include "common/WorkQueue.h"
+#include "include/Context.h"
+#include "os/ObjectStore.h"
+#include "common/LogClient.h"
+#include <string>
+#include "PGTransaction.h"
+#include "common/ostream_temp.h"
+
+namespace Scrub {
+ class Store;
+}
+struct shard_info_wrapper;
+struct inconsistent_obj_wrapper;
+
+//forward declaration
+class OSDMap;
+class PGLog;
+typedef std::shared_ptr<const OSDMap> OSDMapRef;
+
+ /**
+ * PGBackend
+ *
+ * PGBackend defines an interface for logic handling IO and
+ * replication on RADOS objects. The PGBackend implementation
+ * is responsible for:
+ *
+ * 1) Handling client operations
+ * 2) Handling object recovery
+ * 3) Handling object access
+ * 4) Handling scrub, deep-scrub, repair
+ */
+ class PGBackend {
+ public:
+ CephContext* cct;
+ protected:
+ ObjectStore *store;
+ const coll_t coll;
+ ObjectStore::CollectionHandle &ch;
+ public:
+ /**
+ * Provides interfaces for PGBackend callbacks
+ *
+ * The intention is that the parent calls into the PGBackend
+ * implementation holding a lock and that the callbacks are
+ * called under the same locks.
+ */
+ class Listener {
+ public:
+ /// Debugging
+ virtual DoutPrefixProvider *get_dpp() = 0;
+
+ /// Recovery
+
+ /**
+ * Called with the transaction recovering oid
+ */
+ virtual void on_local_recover(
+ const hobject_t &oid,
+ const ObjectRecoveryInfo &recovery_info,
+ ObjectContextRef obc,
+ bool is_delete,
+ ObjectStore::Transaction *t
+ ) = 0;
+
+ /**
+ * Called when transaction recovering oid is durable and
+ * applied on all replicas
+ */
+ virtual void on_global_recover(
+ const hobject_t &oid,
+ const object_stat_sum_t &stat_diff,
+ bool is_delete
+ ) = 0;
+
+ /**
+ * Called when peer is recovered
+ */
+ virtual void on_peer_recover(
+ pg_shard_t peer,
+ const hobject_t &oid,
+ const ObjectRecoveryInfo &recovery_info
+ ) = 0;
+
+ virtual void begin_peer_recover(
+ pg_shard_t peer,
+ const hobject_t oid) = 0;
+
+ virtual void apply_stats(
+ const hobject_t &soid,
+ const object_stat_sum_t &delta_stats) = 0;
+
+ /**
+     * Called when a read from a set of replicas/primary fails
+ */
+ virtual void on_failed_pull(
+ const std::set<pg_shard_t> &from,
+ const hobject_t &soid,
+ const eversion_t &v
+ ) = 0;
+
+ /**
+ * Called when a pull on soid cannot be completed due to
+ * down peers
+ */
+ virtual void cancel_pull(
+ const hobject_t &soid) = 0;
+
+ /**
+ * Called to remove an object.
+ */
+ virtual void remove_missing_object(
+ const hobject_t &oid,
+ eversion_t v,
+ Context *on_complete) = 0;
+
+ /**
+ * Bless a context
+ *
+ * Wraps a context in whatever outer layers the parent usually
+ * uses to call into the PGBackend
+ */
+ virtual Context *bless_context(Context *c) = 0;
+ virtual GenContext<ThreadPool::TPHandle&> *bless_gencontext(
+ GenContext<ThreadPool::TPHandle&> *c) = 0;
+ virtual GenContext<ThreadPool::TPHandle&> *bless_unlocked_gencontext(
+ GenContext<ThreadPool::TPHandle&> *c) = 0;
+
+ virtual void send_message(int to_osd, Message *m) = 0;
+ virtual void queue_transaction(
+ ObjectStore::Transaction&& t,
+ OpRequestRef op = OpRequestRef()
+ ) = 0;
+ virtual void queue_transactions(
+ std::vector<ObjectStore::Transaction>& tls,
+ OpRequestRef op = OpRequestRef()
+ ) = 0;
+ virtual epoch_t get_interval_start_epoch() const = 0;
+ virtual epoch_t get_last_peering_reset_epoch() const = 0;
+
+ virtual const std::set<pg_shard_t> &get_acting_recovery_backfill_shards() const = 0;
+ virtual const std::set<pg_shard_t> &get_acting_shards() const = 0;
+ virtual const std::set<pg_shard_t> &get_backfill_shards() const = 0;
+
+ virtual std::ostream& gen_dbg_prefix(std::ostream& out) const = 0;
+
+ virtual const std::map<hobject_t, std::set<pg_shard_t>> &get_missing_loc_shards()
+ const = 0;
+
+ virtual const pg_missing_tracker_t &get_local_missing() const = 0;
+ virtual void add_local_next_event(const pg_log_entry_t& e) = 0;
+ virtual const std::map<pg_shard_t, pg_missing_t> &get_shard_missing()
+ const = 0;
+ virtual const pg_missing_const_i * maybe_get_shard_missing(
+ pg_shard_t peer) const {
+ if (peer == primary_shard()) {
+ return &get_local_missing();
+ } else {
+ std::map<pg_shard_t, pg_missing_t>::const_iterator i =
+ get_shard_missing().find(peer);
+ if (i == get_shard_missing().end()) {
+ return nullptr;
+ } else {
+ return &(i->second);
+ }
+ }
+ }
+ virtual const pg_missing_const_i &get_shard_missing(pg_shard_t peer) const {
+ auto m = maybe_get_shard_missing(peer);
+ ceph_assert(m);
+ return *m;
+ }
+
+ virtual const std::map<pg_shard_t, pg_info_t> &get_shard_info() const = 0;
+ virtual const pg_info_t &get_shard_info(pg_shard_t peer) const {
+ if (peer == primary_shard()) {
+ return get_info();
+ } else {
+ std::map<pg_shard_t, pg_info_t>::const_iterator i =
+ get_shard_info().find(peer);
+ ceph_assert(i != get_shard_info().end());
+ return i->second;
+ }
+ }
+
+ virtual const PGLog &get_log() const = 0;
+ virtual bool pgb_is_primary() const = 0;
+ virtual const OSDMapRef& pgb_get_osdmap() const = 0;
+ virtual epoch_t pgb_get_osdmap_epoch() const = 0;
+ virtual const pg_info_t &get_info() const = 0;
+ virtual const pg_pool_t &get_pool() const = 0;
+
+ virtual ObjectContextRef get_obc(
+ const hobject_t &hoid,
+ const std::map<std::string, ceph::buffer::list> &attrs) = 0;
+
+ virtual bool try_lock_for_read(
+ const hobject_t &hoid,
+ ObcLockManager &manager) = 0;
+
+ virtual void release_locks(ObcLockManager &manager) = 0;
+
+ virtual void op_applied(
+ const eversion_t &applied_version) = 0;
+
+ virtual bool should_send_op(
+ pg_shard_t peer,
+ const hobject_t &hoid) = 0;
+
+ virtual bool pg_is_undersized() const = 0;
+ virtual bool pg_is_repair() const = 0;
+
+ virtual void log_operation(
+ std::vector<pg_log_entry_t>&& logv,
+ const std::optional<pg_hit_set_history_t> &hset_history,
+ const eversion_t &trim_to,
+ const eversion_t &roll_forward_to,
+ const eversion_t &min_last_complete_ondisk,
+ bool transaction_applied,
+ ObjectStore::Transaction &t,
+ bool async = false) = 0;
+
+ virtual void pgb_set_object_snap_mapping(
+ const hobject_t &soid,
+ const std::set<snapid_t> &snaps,
+ ObjectStore::Transaction *t) = 0;
+
+ virtual void pgb_clear_object_snap_mapping(
+ const hobject_t &soid,
+ ObjectStore::Transaction *t) = 0;
+
+ virtual void update_peer_last_complete_ondisk(
+ pg_shard_t fromosd,
+ eversion_t lcod) = 0;
+
+ virtual void update_last_complete_ondisk(
+ eversion_t lcod) = 0;
+
+ virtual void update_stats(
+ const pg_stat_t &stat) = 0;
+
+ virtual void schedule_recovery_work(
+ GenContext<ThreadPool::TPHandle&> *c) = 0;
+
+ virtual pg_shard_t whoami_shard() const = 0;
+ int whoami() const {
+ return whoami_shard().osd;
+ }
+ spg_t whoami_spg_t() const {
+ return get_info().pgid;
+ }
+
+ virtual spg_t primary_spg_t() const = 0;
+ virtual pg_shard_t primary_shard() const = 0;
+ virtual uint64_t min_peer_features() const = 0;
+ virtual uint64_t min_upacting_features() const = 0;
+ virtual hobject_t get_temp_recovery_object(const hobject_t& target,
+ eversion_t version) = 0;
+
+ virtual void send_message_osd_cluster(
+ int peer, Message *m, epoch_t from_epoch) = 0;
+ virtual void send_message_osd_cluster(
+ std::vector<std::pair<int, Message*>>& messages, epoch_t from_epoch) = 0;
+ virtual void send_message_osd_cluster(
+ MessageRef, Connection *con) = 0;
+ virtual void send_message_osd_cluster(
+ Message *m, const ConnectionRef& con) = 0;
+ virtual ConnectionRef get_con_osd_cluster(int peer, epoch_t from_epoch) = 0;
+ virtual entity_name_t get_cluster_msgr_name() = 0;
+
+ virtual PerfCounters *get_logger() = 0;
+
+ virtual ceph_tid_t get_tid() = 0;
+
+ virtual OstreamTemp clog_error() = 0;
+ virtual OstreamTemp clog_warn() = 0;
+
+ virtual bool check_failsafe_full() = 0;
+
+ virtual bool pg_is_repair() = 0;
+ virtual void inc_osd_stat_repaired() = 0;
+ virtual bool pg_is_remote_backfilling() = 0;
+ virtual void pg_add_local_num_bytes(int64_t num_bytes) = 0;
+ virtual void pg_sub_local_num_bytes(int64_t num_bytes) = 0;
+ virtual void pg_add_num_bytes(int64_t num_bytes) = 0;
+ virtual void pg_sub_num_bytes(int64_t num_bytes) = 0;
+ virtual bool maybe_preempt_replica_scrub(const hobject_t& oid) = 0;
+ virtual ~Listener() {}
+ };
+ Listener *parent;
+ Listener *get_parent() const { return parent; }
+ PGBackend(CephContext* cct, Listener *l, ObjectStore *store, const coll_t &coll,
+ ObjectStore::CollectionHandle &ch) :
+ cct(cct),
+ store(store),
+ coll(coll),
+ ch(ch),
+ parent(l) {}
+ bool is_primary() const { return get_parent()->pgb_is_primary(); }
+ const OSDMapRef& get_osdmap() const { return get_parent()->pgb_get_osdmap(); }
+ epoch_t get_osdmap_epoch() const { return get_parent()->pgb_get_osdmap_epoch(); }
+ const pg_info_t &get_info() { return get_parent()->get_info(); }
+
+ std::ostream& gen_prefix(std::ostream& out) const {
+ return parent->gen_dbg_prefix(out);
+ }
+
+ /**
+ * RecoveryHandle
+ *
+   * We may want to recover multiple objects in the same set of
+ * messages. RecoveryHandle is an interface for the opaque
+ * object used by the implementation to store the details of
+ * the pending recovery operations.
+ */
+ struct RecoveryHandle {
+ bool cache_dont_need;
+ std::map<pg_shard_t, std::vector<std::pair<hobject_t, eversion_t> > > deletes;
+
+ RecoveryHandle(): cache_dont_need(false) {}
+ virtual ~RecoveryHandle() {}
+ };
+
+ /// Get a fresh recovery operation
+ virtual RecoveryHandle *open_recovery_op() = 0;
+
+ /// run_recovery_op: finish the operation represented by h
+ virtual void run_recovery_op(
+ RecoveryHandle *h, ///< [in] op to finish
+ int priority ///< [in] msg priority
+ ) = 0;
+
+ void recover_delete_object(const hobject_t &oid, eversion_t v,
+ RecoveryHandle *h);
+ void send_recovery_deletes(int prio,
+ const std::map<pg_shard_t, std::vector<std::pair<hobject_t, eversion_t> > > &deletes);
+
+ /**
+ * recover_object
+ *
+ * Triggers a recovery operation on the specified hobject_t
+ * onreadable must be called before onwriteable
+ *
+ * On each replica (primary included), get_parent()->on_not_missing()
+ * must be called when the transaction finalizing the recovery
+ * is queued. Similarly, get_parent()->on_readable() must be called
+ * when the transaction is applied in the backing store.
+ *
+ * get_parent()->on_not_degraded() should be called on the primary
+ * when writes can resume on the object.
+ *
+ * obc may be NULL if the primary lacks the object.
+ *
+ * head may be NULL only if the head/snapdir is missing
+ *
+   * @param missing [in] set of info, missing pairs for queried nodes
+ * @param overlaps [in] mapping of object to file offset overlaps
+ */
+ virtual int recover_object(
+ const hobject_t &hoid, ///< [in] object to recover
+ eversion_t v, ///< [in] version to recover
+ ObjectContextRef head, ///< [in] context of the head/snapdir object
+ ObjectContextRef obc, ///< [in] context of the object
+ RecoveryHandle *h ///< [in,out] handle to attach recovery op to
+ ) = 0;
+
+ /**
+ * true if PGBackend can handle this message while inactive
+ *
+ * If it returns true, handle_message *must* also return true
+ */
+ virtual bool can_handle_while_inactive(OpRequestRef op) = 0;
+
+ /// gives PGBackend a crack at an incoming message
+ bool handle_message(
+ OpRequestRef op ///< [in] message received
+ ); ///< @return true if the message was handled
+
+ /// the variant of handle_message that is overridden by child classes
+ virtual bool _handle_message(OpRequestRef op) = 0;
+
+ virtual void check_recovery_sources(const OSDMapRef& osdmap) = 0;
+
+
+ /**
+ * clean up any temporary on-disk state due to a pg interval change
+ */
+ void on_change_cleanup(ObjectStore::Transaction *t);
+ /**
+ * implementation should clear itself, contexts blessed prior to on_change
+ * won't be called after on_change()
+ */
+ virtual void on_change() = 0;
+ virtual void clear_recovery_state() = 0;
+
+ virtual IsPGRecoverablePredicate *get_is_recoverable_predicate() const = 0;
+ virtual IsPGReadablePredicate *get_is_readable_predicate() const = 0;
+ virtual int get_ec_data_chunk_count() const { return 0; };
+ virtual int get_ec_stripe_chunk_size() const { return 0; };
+
+ virtual void dump_recovery_info(ceph::Formatter *f) const = 0;
+
+ private:
+ std::set<hobject_t> temp_contents;
+ public:
+ // Track contents of temp collection, clear on reset
+ void add_temp_obj(const hobject_t &oid) {
+ temp_contents.insert(oid);
+ }
+ void add_temp_objs(const std::set<hobject_t> &oids) {
+ temp_contents.insert(oids.begin(), oids.end());
+ }
+ void clear_temp_obj(const hobject_t &oid) {
+ temp_contents.erase(oid);
+ }
+ void clear_temp_objs(const std::set<hobject_t> &oids) {
+ for (std::set<hobject_t>::const_iterator i = oids.begin();
+ i != oids.end();
+ ++i) {
+ temp_contents.erase(*i);
+ }
+ }
+
+ virtual ~PGBackend() {}
+
+ /// execute implementation specific transaction
+ virtual void submit_transaction(
+ const hobject_t &hoid, ///< [in] object
+ const object_stat_sum_t &delta_stats,///< [in] stat change
+ const eversion_t &at_version, ///< [in] version
+ PGTransactionUPtr &&t, ///< [in] trans to execute (move)
+ const eversion_t &trim_to, ///< [in] trim log to here
+ const eversion_t &min_last_complete_ondisk, ///< [in] lower bound on
+ /// committed version
+ std::vector<pg_log_entry_t>&& log_entries, ///< [in] log entries for t
+ /// [in] hitset history (if updated with this transaction)
+ std::optional<pg_hit_set_history_t> &hset_history,
+ Context *on_all_commit, ///< [in] called when all commit
+ ceph_tid_t tid, ///< [in] tid
+ osd_reqid_t reqid, ///< [in] reqid
+ OpRequestRef op ///< [in] op
+ ) = 0;
+
+ /// submit callback to be called in order with pending writes
+ virtual void call_write_ordered(std::function<void(void)> &&cb) = 0;
+
+ void try_stash(
+ const hobject_t &hoid,
+ version_t v,
+ ObjectStore::Transaction *t);
+
+ void rollback(
+ const pg_log_entry_t &entry,
+ ObjectStore::Transaction *t);
+
+ friend class LRBTrimmer;
+ void rollforward(
+ const pg_log_entry_t &entry,
+ ObjectStore::Transaction *t);
+
+ void trim(
+ const pg_log_entry_t &entry,
+ ObjectStore::Transaction *t);
+
+ void remove(
+ const hobject_t &hoid,
+ ObjectStore::Transaction *t);
+
+ protected:
+
+ void handle_recovery_delete(OpRequestRef op);
+ void handle_recovery_delete_reply(OpRequestRef op);
+
+ /// Reapply old attributes
+ void rollback_setattrs(
+ const hobject_t &hoid,
+ std::map<std::string, std::optional<ceph::buffer::list> > &old_attrs,
+ ObjectStore::Transaction *t);
+
+ /// Truncate object to rollback append
+ virtual void rollback_append(
+ const hobject_t &hoid,
+ uint64_t old_size,
+ ObjectStore::Transaction *t);
+
+ /// Unstash object to rollback stash
+ void rollback_stash(
+ const hobject_t &hoid,
+ version_t old_version,
+ ObjectStore::Transaction *t);
+
+ /// Unstash object to rollback stash
+ void rollback_try_stash(
+ const hobject_t &hoid,
+ version_t old_version,
+ ObjectStore::Transaction *t);
+
+ /// Delete object to rollback create
+ void rollback_create(
+ const hobject_t &hoid,
+ ObjectStore::Transaction *t) {
+ remove(hoid, t);
+ }
+
+ /// Clone the extents back into place
+ void rollback_extents(
+ version_t gen,
+ const std::vector<std::pair<uint64_t, uint64_t> > &extents,
+ const hobject_t &hoid,
+ ObjectStore::Transaction *t);
+ public:
+
+ /// Trim object stashed at version
+ void trim_rollback_object(
+ const hobject_t &hoid,
+ version_t gen,
+ ObjectStore::Transaction *t);
+
+  /// List objects in collection
+ int objects_list_partial(
+ const hobject_t &begin,
+ int min,
+ int max,
+ std::vector<hobject_t> *ls,
+ hobject_t *next);
+
+ int objects_list_range(
+ const hobject_t &start,
+ const hobject_t &end,
+ std::vector<hobject_t> *ls,
+ std::vector<ghobject_t> *gen_obs=0);
+
+ int objects_get_attr(
+ const hobject_t &hoid,
+ const std::string &attr,
+ ceph::buffer::list *out);
+
+ virtual int objects_get_attrs(
+ const hobject_t &hoid,
+ std::map<std::string, ceph::buffer::list> *out);
+
+ virtual int objects_read_sync(
+ const hobject_t &hoid,
+ uint64_t off,
+ uint64_t len,
+ uint32_t op_flags,
+ ceph::buffer::list *bl) = 0;
+
+ virtual int objects_readv_sync(
+ const hobject_t &hoid,
+ std::map<uint64_t, uint64_t>&& m,
+ uint32_t op_flags,
+ ceph::buffer::list *bl) {
+ return -EOPNOTSUPP;
+ }
+
+ virtual void objects_read_async(
+ const hobject_t &hoid,
+ const std::list<std::pair<boost::tuple<uint64_t, uint64_t, uint32_t>,
+ std::pair<ceph::buffer::list*, Context*> > > &to_read,
+ Context *on_complete, bool fast_read = false) = 0;
+
+ virtual bool auto_repair_supported() const = 0;
+ int be_scan_list(
+ ScrubMap &map,
+ ScrubMapBuilder &pos);
+ bool be_compare_scrub_objects(
+ pg_shard_t auth_shard,
+ const ScrubMap::object &auth,
+ const object_info_t& auth_oi,
+ const ScrubMap::object &candidate,
+ shard_info_wrapper& shard_error,
+ inconsistent_obj_wrapper &result,
+ std::ostream &errorstream,
+ bool has_snapset);
+ std::map<pg_shard_t, ScrubMap *>::const_iterator be_select_auth_object(
+ const hobject_t &obj,
+ const std::map<pg_shard_t,ScrubMap*> &maps,
+ object_info_t *auth_oi,
+ std::map<pg_shard_t, shard_info_wrapper> &shard_map,
+ bool &digest_match,
+ spg_t pgid,
+ std::ostream &errorstream);
+ void be_compare_scrubmaps(
+ const std::map<pg_shard_t,ScrubMap*> &maps,
+ const std::set<hobject_t> &master_set,
+ bool repair,
+ std::map<hobject_t, std::set<pg_shard_t>> &missing,
+ std::map<hobject_t, std::set<pg_shard_t>> &inconsistent,
+ std::map<hobject_t, std::list<pg_shard_t>> &authoritative,
+ std::map<hobject_t, std::pair<std::optional<uint32_t>,
+ std::optional<uint32_t>>> &missing_digest,
+ int &shallow_errors, int &deep_errors,
+ Scrub::Store *store,
+ const spg_t& pgid,
+ const std::vector<int> &acting,
+ std::ostream &errorstream);
+ virtual uint64_t be_get_ondisk_size(
+ uint64_t logical_size) = 0;
+ virtual int be_deep_scrub(
+ const hobject_t &oid,
+ ScrubMap &map,
+ ScrubMapBuilder &pos,
+ ScrubMap::object &o) = 0;
+ void be_omap_checks(
+ const std::map<pg_shard_t,ScrubMap*> &maps,
+ const std::set<hobject_t> &master_set,
+ omap_stat_t& omap_stats,
+ std::ostream &warnstream) const;
+
+ static PGBackend *build_pg_backend(
+ const pg_pool_t &pool,
+ const std::map<std::string,std::string>& profile,
+ Listener *l,
+ coll_t coll,
+ ObjectStore::CollectionHandle &ch,
+ ObjectStore *store,
+ CephContext *cct);
+};
+
+#endif
diff --git a/src/osd/PGLog.cc b/src/osd/PGLog.cc
new file mode 100644
index 000000000..c881dbabe
--- /dev/null
+++ b/src/osd/PGLog.cc
@@ -0,0 +1,1189 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ * Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com>
+ *
+ * Author: Loic Dachary <loic@dachary.org>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "PGLog.h"
+#include "include/unordered_map.h"
+#include "common/ceph_context.h"
+
+using std::make_pair;
+using std::map;
+using std::ostream;
+using std::set;
+using std::string;
+
+using ceph::bufferlist;
+using ceph::decode;
+using ceph::encode;
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_osd
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, this)
+
+static ostream& _prefix(std::ostream *_dout, const PGLog *pglog)
+{
+ return pglog->gen_prefix(*_dout);
+}
+
+//////////////////// PGLog::IndexedLog ////////////////////
+
+void PGLog::IndexedLog::split_out_child(
+ pg_t child_pgid,
+ unsigned split_bits,
+ PGLog::IndexedLog *target)
+{
+ unindex();
+ *target = IndexedLog(pg_log_t::split_out_child(child_pgid, split_bits));
+ index();
+ target->index();
+ reset_rollback_info_trimmed_to_riter();
+}
+
+void PGLog::IndexedLog::trim(
+ CephContext* cct,
+ eversion_t s,
+ set<eversion_t> *trimmed,
+ set<string>* trimmed_dups,
+ eversion_t *write_from_dups)
+{
+ lgeneric_subdout(cct, osd, 10) << "IndexedLog::trim s=" << s << dendl;
+ ceph_assert(s <= can_rollback_to);
+ if (complete_to != log.end())
+ lgeneric_subdout(cct, osd, 20) << " complete_to " << complete_to->version << dendl;
+
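+  // Log entries with version >= earliest_dup_version are preserved as dup
+  // entries when they are trimmed (bounded by osd_pg_log_dups_tracked).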
+ auto earliest_dup_version =
+ log.rbegin()->version.version < cct->_conf->osd_pg_log_dups_tracked
+ ? 0u
+ : log.rbegin()->version.version - cct->_conf->osd_pg_log_dups_tracked + 1;
+
+ lgeneric_subdout(cct, osd, 20) << "earliest_dup_version = " << earliest_dup_version << dendl;
+ while (!log.empty()) {
+ const pg_log_entry_t &e = *log.begin();
+ if (e.version > s)
+ break;
+ lgeneric_subdout(cct, osd, 20) << "trim " << e << dendl;
+ if (trimmed)
+ trimmed->emplace(e.version);
+
+ unindex(e); // remove from index,
+
+ // add to dup list
+ if (e.version.version >= earliest_dup_version) {
+ if (write_from_dups != nullptr && *write_from_dups > e.version) {
+ lgeneric_subdout(cct, osd, 20) << "updating write_from_dups from " << *write_from_dups << " to " << e.version << dendl;
+ *write_from_dups = e.version;
+ }
+ dups.push_back(pg_log_dup_t(e));
+ index(dups.back());
+ uint32_t idx = 0;
+ for (const auto& extra : e.extra_reqids) {
+ int return_code = e.return_code;
+ if (return_code >= 0) {
+ auto it = e.extra_reqid_return_codes.find(idx);
+ if (it != e.extra_reqid_return_codes.end()) {
+ return_code = it->second;
+ // FIXME: we aren't setting op_returns for these extra_reqids
+ }
+ }
+ ++idx;
+
+ // note: extras have the same version as outer op
+ dups.push_back(pg_log_dup_t(e.version, extra.second,
+ extra.first, return_code));
+ index(dups.back());
+ }
+ }
+
+ bool reset_complete_to = false;
+ // we are trimming past complete_to, so reset complete_to
+ if (complete_to != log.end() && e.version >= complete_to->version)
+ reset_complete_to = true;
+ if (rollback_info_trimmed_to_riter == log.rend() ||
+ e.version == rollback_info_trimmed_to_riter->version) {
+ log.pop_front();
+ rollback_info_trimmed_to_riter = log.rend();
+ } else {
+ log.pop_front();
+ }
+
+ // reset complete_to to the beginning of the log
+ if (reset_complete_to) {
+ complete_to = log.begin();
+ if (complete_to != log.end()) {
+ lgeneric_subdout(cct, osd, 20) << " moving complete_to to "
+ << log.begin()->version << dendl;
+ } else {
+ lgeneric_subdout(cct, osd, 20) << " log is now empty" << dendl;
+ }
+ }
+ }
+
+ // we can hit an inflated `dups` b/c of https://tracker.ceph.com/issues/53729
+ // the idea is to slowly trim them over a prolonged period of time and mix
+ // omap deletes with writes (if we're here, a new log entry got added) to
+ // neither: 1) blow size of single Transaction nor 2) generate-n-accumulate
+ // large amount of tombstones in BlueStore's RocksDB.
+ // if trimming immediately is a must, then the ceph-objectstore-tool is
+ // the way to go.
+ const size_t max_dups = cct->_conf->osd_pg_log_dups_tracked;
+ for (size_t max_dups_to_trim = cct->_conf->osd_pg_log_trim_max;
+ max_dups_to_trim > 0 && dups.size() > max_dups;
+ max_dups_to_trim--) {
+ const auto& e = *dups.begin();
+ lgeneric_subdout(cct, osd, 20) << "trim dup " << e << dendl;
+ if (trimmed_dups)
+ trimmed_dups->insert(e.get_key_name());
+ unindex(e);
+ dups.pop_front();
+ }
+
+ // raise tail?
+ if (tail < s)
+ tail = s;
+ lgeneric_subdout(cct, osd, 20) << "IndexedLog::trim after trim"
+ << " dups.size()=" << dups.size()
+ << " tail=" << tail
+ << " s=" << s << dendl;
+}
+
+ostream& PGLog::IndexedLog::print(ostream& out) const
+{
+ out << *this << std::endl;
+ for (auto p = log.begin(); p != log.end(); ++p) {
+ out << *p << " " <<
+ (logged_object(p->soid) ? "indexed" : "NOT INDEXED") <<
+ std::endl;
+ ceph_assert(!p->reqid_is_indexed() || logged_req(p->reqid));
+ }
+
+ for (auto p = dups.begin(); p != dups.end(); ++p) {
+ out << *p << std::endl;
+ }
+
+ return out;
+}
+
+//////////////////// PGLog ////////////////////
+
+void PGLog::reset_backfill()
+{
+ missing.clear();
+}
+
+void PGLog::clear() {
+ missing.clear();
+ log.clear();
+ log_keys_debug.clear();
+ undirty();
+}
+
+void PGLog::clear_info_log(
+ spg_t pgid,
+ ObjectStore::Transaction *t) {
+ coll_t coll(pgid);
+ t->remove(coll, pgid.make_pgmeta_oid());
+}
+
+void PGLog::trim(
+ eversion_t trim_to,
+ pg_info_t &info,
+ bool transaction_applied,
+ bool async)
+{
+ dout(10) << __func__ << " proposed trim_to = " << trim_to << dendl;
+ // trim?
+ if (trim_to > log.tail) {
+ dout(10) << __func__ << " missing = " << missing.num_missing() << dendl;
+ // Don't assert for async_recovery_targets or backfill_targets
+ // or whenever there are missing items
+ if (transaction_applied && !async && (missing.num_missing() == 0))
+ ceph_assert(trim_to <= info.last_complete);
+
+ dout(10) << "trim " << log << " to " << trim_to << dendl;
+ log.trim(cct, trim_to, &trimmed, &trimmed_dups, &write_from_dups);
+ info.log_tail = log.tail;
+ if (log.complete_to != log.log.end())
+ dout(10) << " after trim complete_to " << log.complete_to->version << dendl;
+ }
+}
+
+void PGLog::proc_replica_log(
+ pg_info_t &oinfo,
+ const pg_log_t &olog,
+ pg_missing_t& omissing,
+ pg_shard_t from) const
+{
+ dout(10) << "proc_replica_log for osd." << from << ": "
+ << oinfo << " " << olog << " " << omissing << dendl;
+
+ if (olog.head < log.tail) {
+ dout(10) << __func__ << ": osd." << from << " does not overlap, not looking "
+ << "for divergent objects" << dendl;
+ return;
+ }
+ if (olog.head == log.head) {
+ dout(10) << __func__ << ": osd." << from << " same log head, not looking "
+ << "for divergent objects" << dendl;
+ return;
+ }
+
+ /*
+ basically what we're doing here is rewinding the remote log,
+ dropping divergent entries, until we find something that matches
+ our master log. we then reset last_update to reflect the new
+ point up to which missing is accurate.
+
+ later, in activate(), missing will get wound forward again and
+ we will send the peer enough log to arrive at the same state.
+ */
+
+ for (auto i = omissing.get_items().begin();
+ i != omissing.get_items().end();
+ ++i) {
+ dout(20) << " before missing " << i->first << " need " << i->second.need
+ << " have " << i->second.have << dendl;
+ }
+
+ auto first_non_divergent = log.log.rbegin();
+ while (1) {
+ if (first_non_divergent == log.log.rend())
+ break;
+ if (first_non_divergent->version <= olog.head) {
+ dout(20) << "merge_log point (usually last shared) is "
+ << *first_non_divergent << dendl;
+ break;
+ }
+ ++first_non_divergent;
+ }
+
+ /* Because olog.head >= log.tail, we know that both pgs must at least have
+ * the event represented by log.tail. Similarly, because log.head >= olog.tail,
+ * we know that the event represented by olog.tail must be common to both logs.
+ * Furthermore, the event represented by a log tail was necessarily trimmed,
+ * thus neither olog.tail nor log.tail can be divergent. It's
+ * possible that olog/log contain no actual events between olog.head and
+ * max(log.tail, olog.tail), however, since they might have been split out.
+ * Thus, if we cannot find an event e such that
+ * log.tail <= e.version <= log.head, the last_update must actually be
+ * max(log.tail, olog.tail).
+ */
+ eversion_t limit = std::max(olog.tail, log.tail);
+ eversion_t lu =
+ (first_non_divergent == log.log.rend() ||
+ first_non_divergent->version < limit) ?
+ limit :
+ first_non_divergent->version;
+
+  // We merge and adjust the replica's log: roll back the rollbackable divergent
+  // entries, remove the unrollbackable ones and mark the corresponding objects
+  // as missing. The rollback boundary must be the crt of the olog that is going
+  // to be merged. The replica log's (olog) crt will not be modified, so it can
+  // be passed to _merge_divergent_entries() directly.
+ IndexedLog folog(olog);
+ auto divergent = folog.rewind_from_head(lu);
+ _merge_divergent_entries(
+ folog,
+ divergent,
+ oinfo,
+ olog.get_can_rollback_to(),
+ omissing,
+ 0,
+ this);
+
+ if (lu < oinfo.last_update) {
+ dout(10) << " peer osd." << from << " last_update now " << lu << dendl;
+ oinfo.last_update = lu;
+ }
+
+ if (omissing.have_missing()) {
+ eversion_t first_missing =
+ omissing.get_items().at(omissing.get_rmissing().begin()->second).need;
+ oinfo.last_complete = eversion_t();
+ for (auto i = olog.log.begin(); i != olog.log.end(); ++i) {
+ if (i->version < first_missing)
+ oinfo.last_complete = i->version;
+ else
+ break;
+ }
+ } else {
+ oinfo.last_complete = oinfo.last_update;
+ }
+} // proc_replica_log
+
+/**
+ * rewind divergent entries at the head of the log
+ *
+ * This rewinds entries off the head of our log that are divergent.
+ * This is used by replicas during activation.
+ *
+ * @param newhead new head to rewind to
+ */
+void PGLog::rewind_divergent_log(eversion_t newhead,
+ pg_info_t &info, LogEntryHandler *rollbacker,
+ bool &dirty_info, bool &dirty_big_info)
+{
+ dout(10) << "rewind_divergent_log truncate divergent future " <<
+ newhead << dendl;
+
+ // We need to preserve the original crt before it gets updated in rewind_from_head().
+ // Later, in merge_object_divergent_entries(), we use it to check whether we can rollback
+ // a divergent entry or not.
+ eversion_t original_crt = log.get_can_rollback_to();
+ dout(20) << __func__ << " original_crt = " << original_crt << dendl;
+ if (info.last_complete > newhead)
+ info.last_complete = newhead;
+
+ auto divergent = log.rewind_from_head(newhead);
+ if (!divergent.empty()) {
+ mark_dirty_from(divergent.front().version);
+ }
+ for (auto &&entry: divergent) {
+ dout(10) << "rewind_divergent_log future divergent " << entry << dendl;
+ }
+ info.last_update = newhead;
+
+ _merge_divergent_entries(
+ log,
+ divergent,
+ info,
+ original_crt,
+ missing,
+ rollbacker,
+ this);
+
+ dirty_info = true;
+ dirty_big_info = true;
+}
+
+void PGLog::merge_log(pg_info_t &oinfo, pg_log_t&& olog, pg_shard_t fromosd,
+ pg_info_t &info, LogEntryHandler *rollbacker,
+ bool &dirty_info, bool &dirty_big_info)
+{
+ dout(10) << "merge_log " << olog << " from osd." << fromosd
+ << " into " << log << dendl;
+
+ // Check preconditions
+
+ // If our log is empty, the incoming log needs to have not been trimmed.
+ ceph_assert(!log.null() || olog.tail == eversion_t());
+ // The logs must overlap.
+ ceph_assert(log.head >= olog.tail && olog.head >= log.tail);
+
+ for (auto i = missing.get_items().begin();
+ i != missing.get_items().end();
+ ++i) {
+ dout(20) << "pg_missing_t sobject: " << i->first << dendl;
+ }
+
+ bool changed = false;
+
+ // extend on tail?
+ // this is just filling in history. It does not affect our
+ // missing set, as that should already be consistent with our
+ // current log.
+ eversion_t orig_tail = log.tail;
+ if (olog.tail < log.tail) {
+ dout(10) << "merge_log extending tail to " << olog.tail << dendl;
+ auto from = olog.log.begin();
+ auto to = from;
+ eversion_t last;
+ for (; to != olog.log.end(); ++to) {
+ if (to->version > log.tail)
+ break;
+ log.index(*to);
+ dout(15) << *to << dendl;
+ last = to->version;
+ }
+ mark_dirty_to(last);
+
+ // splice into our log.
+ log.log.splice(log.log.begin(),
+ std::move(olog.log), from, to);
+
+ info.log_tail = log.tail = olog.tail;
+ changed = true;
+ }
+
+ if (oinfo.stats.reported_seq < info.stats.reported_seq || // make sure reported always increases
+ oinfo.stats.reported_epoch < info.stats.reported_epoch) {
+ oinfo.stats.reported_seq = info.stats.reported_seq;
+ oinfo.stats.reported_epoch = info.stats.reported_epoch;
+ }
+ if (info.last_backfill.is_max())
+ info.stats = oinfo.stats;
+ info.hit_set = oinfo.hit_set;
+
+ // do we have divergent entries to throw out?
+ if (olog.head < log.head) {
+ rewind_divergent_log(olog.head, info, rollbacker, dirty_info, dirty_big_info);
+ changed = true;
+ }
+
+ // extend on head?
+ if (olog.head > log.head) {
+ dout(10) << "merge_log extending head to " << olog.head << dendl;
+
+ // find start point in olog
+ auto to = olog.log.end();
+ auto from = olog.log.end();
+ eversion_t lower_bound = std::max(olog.tail, orig_tail);
+ while (1) {
+ if (from == olog.log.begin())
+ break;
+ --from;
+ dout(20) << " ? " << *from << dendl;
+ if (from->version <= log.head) {
+ lower_bound = std::max(lower_bound, from->version);
+ ++from;
+ break;
+ }
+ }
+ dout(20) << "merge_log cut point (usually last shared) is "
+ << lower_bound << dendl;
+ mark_dirty_from(lower_bound);
+
+ // We need to preserve the original crt before it gets updated in rewind_from_head().
+ // Later, in merge_object_divergent_entries(), we use it to check whether we can rollback
+ // a divergent entry or not.
+ eversion_t original_crt = log.get_can_rollback_to();
+ dout(20) << __func__ << " original_crt = " << original_crt << dendl;
+ auto divergent = log.rewind_from_head(lower_bound);
+ // move aside divergent items
+ for (auto &&oe: divergent) {
+ dout(10) << "merge_log divergent " << oe << dendl;
+ }
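+ // Advance can_rollback_to to the rewound head (the cut point), letting the
+ // handler roll forward (finalize) the rollback state of the entries we keep.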
+ log.roll_forward_to(log.head, rollbacker);
+
+ mempool::osd_pglog::list<pg_log_entry_t> new_entries;
+ new_entries.splice(new_entries.end(), olog.log, from, to);
+ append_log_entries_update_missing(
+ info.last_backfill,
+ new_entries,
+ false,
+ &log,
+ missing,
+ rollbacker,
+ this);
+
+ _merge_divergent_entries(
+ log,
+ divergent,
+ info,
+ original_crt,
+ missing,
+ rollbacker,
+ this);
+
+ info.last_update = log.head = olog.head;
+
+ // We cannot rollback into the new log entries
+ log.skip_can_rollback_to_to_head();
+
+ info.last_user_version = oinfo.last_user_version;
+ info.purged_snaps = oinfo.purged_snaps;
+ // update num_missing too
+ // we might have appended some more missing objects above
+ info.stats.stats.sum.num_objects_missing = missing.num_missing();
+
+ changed = true;
+ }
+
+ // now handle dups
+ if (merge_log_dups(olog)) {
+ changed = true;
+ }
+
+ dout(10) << "merge_log result " << log << " " << missing <<
+ " changed=" << changed << dendl;
+
+ if (changed) {
+ dirty_info = true;
+ dirty_big_info = true;
+ }
+}
+
+
+// returns true if any changes were made to log.dups
+bool PGLog::merge_log_dups(const pg_log_t& olog) {
+ dout(5) << __func__
+ << " log.dups.size()=" << log.dups.size()
+ << "olog.dups.size()=" << olog.dups.size() << dendl;
+ bool changed = false;
+
+ if (!olog.dups.empty()) {
+ if (log.dups.empty()) {
+ dout(10) << "merge_log copying olog dups to log " <<
+ olog.dups.front().version << " to " <<
+ olog.dups.back().version << dendl;
+ changed = true;
+ dirty_from_dups = eversion_t();
+ dirty_to_dups = eversion_t::max();
+ // since our log.dups is empty just copy them
+ for (const auto& i : olog.dups) {
+ log.dups.push_back(i);
+ log.index(log.dups.back());
+ }
+ } else {
+ // since our log.dups is not empty try to extend on each end
+
+ if (olog.dups.back().version > log.dups.back().version) {
+ // extend the dups's tail (i.e., newer dups)
+ dout(10) << "merge_log extending dups tail to " <<
+ olog.dups.back().version << dendl;
+ changed = true;
+
+ auto log_tail_version = log.dups.back().version;
+
+ auto insert_cursor = log.dups.end();
+ eversion_t last_shared = eversion_t::max();
+ for (auto i = olog.dups.crbegin(); i != olog.dups.crend(); ++i) {
+ if (i->version <= log_tail_version) break;
+ log.dups.insert(insert_cursor, *i);
+ last_shared = i->version;
+
+ auto prev = insert_cursor;
+ --prev;
+ // be sure to pass reference of copy in log.dups
+ log.index(*prev);
+
+ --insert_cursor; // make sure we insert in reverse order
+ }
+ mark_dirty_from_dups(last_shared);
+ }
+
+ if (olog.dups.front().version < log.dups.front().version) {
+ // extend the dups's head (i.e., older dups)
+ dout(10) << "merge_log extending dups head to " <<
+ olog.dups.front().version << dendl;
+ changed = true;
+
+ eversion_t last;
+ auto insert_cursor = log.dups.begin();
+ for (auto i = olog.dups.cbegin(); i != olog.dups.cend(); ++i) {
+ if (i->version >= insert_cursor->version) break;
+ log.dups.insert(insert_cursor, *i);
+ last = i->version;
+ auto prev = insert_cursor;
+ --prev;
+ // be sure to pass address of copy in log.dups
+ log.index(*prev);
+ }
+ mark_dirty_to_dups(last);
+ }
+ }
+ }
+
+ // remove any dup entries that overlap with pglog
+ if (!log.dups.empty() && log.dups.back().version > log.tail) {
+ dout(10) << "merge_log removed dups overlapping log entries (" <<
+ log.tail << "," << log.dups.back().version << "]" << dendl;
+ changed = true;
+
+ while (!log.dups.empty() && log.dups.back().version > log.tail) {
+ log.unindex(log.dups.back());
+ mark_dirty_from_dups(log.dups.back().version);
+ log.dups.pop_back();
+ }
+ }
+
+ dout(5) << "end of " << __func__ << " changed=" << changed
+ << " log.dups.size()=" << log.dups.size()
+ << " olog.dups.size()=" << olog.dups.size() << dendl;
+
+ return changed;
+}
+
+void PGLog::check() {
+ if (!pg_log_debug)
+ return;
+ if (log.log.size() != log_keys_debug.size()) {
+ derr << "log.log.size() != log_keys_debug.size()" << dendl;
+ derr << "actual log:" << dendl;
+ for (auto i = log.log.begin(); i != log.log.end(); ++i) {
+ derr << " " << *i << dendl;
+ }
+ derr << "log_keys_debug:" << dendl;
+ for (auto i = log_keys_debug.begin();
+ i != log_keys_debug.end();
+ ++i) {
+ derr << " " << *i << dendl;
+ }
+ }
+ ceph_assert(log.log.size() == log_keys_debug.size());
+ for (auto i = log.log.begin(); i != log.log.end(); ++i) {
+ ceph_assert(log_keys_debug.count(i->get_key_name()));
+ }
+}
+
+// non-static
+void PGLog::write_log_and_missing(
+ ObjectStore::Transaction& t,
+ map<string,bufferlist> *km,
+ const coll_t& coll,
+ const ghobject_t &log_oid,
+ bool require_rollback)
+{
+ if (needs_write()) {
+ dout(6) << "write_log_and_missing with: "
+ << "dirty_to: " << dirty_to
+ << ", dirty_from: " << dirty_from
+ << ", writeout_from: " << writeout_from
+ << ", trimmed: " << trimmed
+ << ", trimmed_dups: " << trimmed_dups
+ << ", clear_divergent_priors: " << clear_divergent_priors
+ << dendl;
+ _write_log_and_missing(
+ t, km, log, coll, log_oid,
+ dirty_to,
+ dirty_from,
+ writeout_from,
+ std::move(trimmed),
+ std::move(trimmed_dups),
+ missing,
+ !touched_log,
+ require_rollback,
+ clear_divergent_priors,
+ dirty_to_dups,
+ dirty_from_dups,
+ write_from_dups,
+ &may_include_deletes_in_missing_dirty,
+ (pg_log_debug ? &log_keys_debug : nullptr),
+ this);
+ undirty();
+ } else {
+ dout(10) << "log is not dirty" << dendl;
+ }
+}
+
+// static
+void PGLog::write_log_and_missing_wo_missing(
+ ObjectStore::Transaction& t,
+ map<string,bufferlist> *km,
+ pg_log_t &log,
+ const coll_t& coll, const ghobject_t &log_oid,
+ map<eversion_t, hobject_t> &divergent_priors,
+ bool require_rollback,
+ const DoutPrefixProvider *dpp
+ )
+{
+ _write_log_and_missing_wo_missing(
+ t, km, log, coll, log_oid,
+ divergent_priors, eversion_t::max(), eversion_t(), eversion_t(),
+ true, true, require_rollback,
+ eversion_t::max(), eversion_t(), eversion_t(), nullptr, dpp);
+}
+
+// static
+void PGLog::write_log_and_missing(
+ ObjectStore::Transaction& t,
+ map<string,bufferlist> *km,
+ pg_log_t &log,
+ const coll_t& coll,
+ const ghobject_t &log_oid,
+ const pg_missing_tracker_t &missing,
+ bool require_rollback,
+ bool *may_include_deletes_in_missing_dirty,
+ const DoutPrefixProvider *dpp)
+{
+ _write_log_and_missing(
+ t, km, log, coll, log_oid,
+ eversion_t::max(),
+ eversion_t(),
+ eversion_t(),
+ set<eversion_t>(),
+ set<string>(),
+ missing,
+ true, require_rollback, false,
+ eversion_t::max(),
+ eversion_t(),
+ eversion_t(),
+ may_include_deletes_in_missing_dirty, nullptr, dpp);
+}
+
+// static
+void PGLog::_write_log_and_missing_wo_missing(
+ ObjectStore::Transaction& t,
+ map<string,bufferlist> *km,
+ pg_log_t &log,
+ const coll_t& coll, const ghobject_t &log_oid,
+ map<eversion_t, hobject_t> &divergent_priors,
+ eversion_t dirty_to,
+ eversion_t dirty_from,
+ eversion_t writeout_from,
+ bool dirty_divergent_priors,
+ bool touch_log,
+ bool require_rollback,
+ eversion_t dirty_to_dups,
+ eversion_t dirty_from_dups,
+ eversion_t write_from_dups,
+ set<string> *log_keys_debug,
+ const DoutPrefixProvider *dpp
+ )
+{
+ ldpp_dout(dpp, 10) << "_write_log_and_missing_wo_missing, clearing up to " << dirty_to
+ << " dirty_to_dups=" << dirty_to_dups
+ << " dirty_from_dups=" << dirty_from_dups
+ << " write_from_dups=" << write_from_dups << dendl;
+ if (touch_log)
+ t.touch(coll, log_oid);
+ if (dirty_to != eversion_t()) {
+ t.omap_rmkeyrange(
+ coll, log_oid,
+ eversion_t().get_key_name(), dirty_to.get_key_name());
+ clear_up_to(log_keys_debug, dirty_to.get_key_name());
+ }
+ if (dirty_to != eversion_t::max() && dirty_from != eversion_t::max()) {
+ // dout(10) << "write_log_and_missing, clearing from " << dirty_from << dendl;
+ t.omap_rmkeyrange(
+ coll, log_oid,
+ dirty_from.get_key_name(), eversion_t::max().get_key_name());
+ clear_after(log_keys_debug, dirty_from.get_key_name());
+ }
+
+ for (auto p = log.log.begin();
+ p != log.log.end() && p->version <= dirty_to;
+ ++p) {
+ bufferlist bl(sizeof(*p) * 2);
+ p->encode_with_checksum(bl);
+ (*km)[p->get_key_name()] = std::move(bl);
+ }
+
+ for (auto p = log.log.rbegin();
+ p != log.log.rend() &&
+ (p->version >= dirty_from || p->version >= writeout_from) &&
+ p->version >= dirty_to;
+ ++p) {
+ bufferlist bl(sizeof(*p) * 2);
+ p->encode_with_checksum(bl);
+ (*km)[p->get_key_name()] = std::move(bl);
+ }
+
+ if (log_keys_debug) {
+ for (auto i = (*km).begin();
+ i != (*km).end();
+ ++i) {
+ if (i->first[0] == '_')
+ continue;
+ ceph_assert(!log_keys_debug->count(i->first));
+ log_keys_debug->insert(i->first);
+ }
+ }
+
+ // process dups after log_keys_debug is filled, so dups do not
+ // end up in that set
+ if (dirty_to_dups != eversion_t()) {
+ pg_log_dup_t min, dirty_to_dup;
+ dirty_to_dup.version = dirty_to_dups;
+ ldpp_dout(dpp, 10) << __func__ << " remove dups min=" << min.get_key_name()
+ << " to dirty_to_dup=" << dirty_to_dup.get_key_name() << dendl;
+ t.omap_rmkeyrange(
+ coll, log_oid,
+ min.get_key_name(), dirty_to_dup.get_key_name());
+ }
+ if (dirty_to_dups != eversion_t::max() && dirty_from_dups != eversion_t::max()) {
+ pg_log_dup_t max, dirty_from_dup;
+ max.version = eversion_t::max();
+ dirty_from_dup.version = dirty_from_dups;
+ ldpp_dout(dpp, 10) << __func__ << " remove dups dirty_from_dup="
+ << dirty_from_dup.get_key_name()
+ << " to max=" << max.get_key_name() << dendl;
+ t.omap_rmkeyrange(
+ coll, log_oid,
+ dirty_from_dup.get_key_name(), max.get_key_name());
+ }
+
+ ldpp_dout(dpp, 10) << __func__ << " going to encode log.dups.size()="
+ << log.dups.size() << dendl;
+ for (const auto& entry : log.dups) {
+ if (entry.version > dirty_to_dups)
+ break;
+ bufferlist bl;
+ encode(entry, bl);
+ (*km)[entry.get_key_name()] = std::move(bl);
+ }
+ ldpp_dout(dpp, 10) << __func__ << " 1st round encoded log.dups.size()="
+ << log.dups.size() << dendl;
+ for (auto p = log.dups.rbegin();
+ p != log.dups.rend() &&
+ (p->version >= dirty_from_dups || p->version >= write_from_dups) &&
+ p->version >= dirty_to_dups;
+ ++p) {
+ bufferlist bl;
+ encode(*p, bl);
+ (*km)[p->get_key_name()] = std::move(bl);
+ }
+ ldpp_dout(dpp, 10) << __func__ << " 2st round encoded log.dups.size()="
+ << log.dups.size() << dendl;
+
+ if (dirty_divergent_priors) {
+ ldpp_dout(dpp, 10) << "write_log_and_missing: writing divergent_priors"
+ << dendl;
+ encode(divergent_priors, (*km)["divergent_priors"]);
+ }
+ if (require_rollback) {
+ encode(
+ log.get_can_rollback_to(),
+ (*km)["can_rollback_to"]);
+ encode(
+ log.get_rollback_info_trimmed_to(),
+ (*km)["rollback_info_trimmed_to"]);
+ }
+ ldpp_dout(dpp, 10) << "end of " << __func__ << dendl;
+}
+
+// static
+void PGLog::_write_log_and_missing(
+ ObjectStore::Transaction& t,
+ map<string,bufferlist>* km,
+ pg_log_t &log,
+ const coll_t& coll, const ghobject_t &log_oid,
+ eversion_t dirty_to,
+ eversion_t dirty_from,
+ eversion_t writeout_from,
+ set<eversion_t> &&trimmed,
+ set<string> &&trimmed_dups,
+ const pg_missing_tracker_t &missing,
+ bool touch_log,
+ bool require_rollback,
+ bool clear_divergent_priors,
+ eversion_t dirty_to_dups,
+ eversion_t dirty_from_dups,
+ eversion_t write_from_dups,
+ bool *may_include_deletes_in_missing_dirty, // in/out param
+ set<string> *log_keys_debug,
+ const DoutPrefixProvider *dpp
+ ) {
+ ldpp_dout(dpp, 10) << __func__ << " clearing up to " << dirty_to
+ << " dirty_to_dups=" << dirty_to_dups
+ << " dirty_from_dups=" << dirty_from_dups
+ << " write_from_dups=" << write_from_dups
+ << " trimmed_dups.size()=" << trimmed_dups.size() << dendl;
+ set<string> to_remove;
+ to_remove.swap(trimmed_dups);
+ for (auto& t : trimmed) {
+ string key = t.get_key_name();
+ if (log_keys_debug) {
+ auto it = log_keys_debug->find(key);
+ ceph_assert(it != log_keys_debug->end());
+ log_keys_debug->erase(it);
+ }
+ to_remove.emplace(std::move(key));
+ }
+ trimmed.clear();
+
+ if (touch_log)
+ t.touch(coll, log_oid);
+ if (dirty_to != eversion_t()) {
+ t.omap_rmkeyrange(
+ coll, log_oid,
+ eversion_t().get_key_name(), dirty_to.get_key_name());
+ clear_up_to(log_keys_debug, dirty_to.get_key_name());
+ }
+ if (dirty_to != eversion_t::max() && dirty_from != eversion_t::max()) {
+ ldpp_dout(dpp, 10) << "write_log_and_missing, clearing from "
+ << dirty_from << dendl;
+ t.omap_rmkeyrange(
+ coll, log_oid,
+ dirty_from.get_key_name(), eversion_t::max().get_key_name());
+ clear_after(log_keys_debug, dirty_from.get_key_name());
+ }
+
+ for (auto p = log.log.begin();
+ p != log.log.end() && p->version <= dirty_to;
+ ++p) {
+ bufferlist bl(sizeof(*p) * 2);
+ p->encode_with_checksum(bl);
+ (*km)[p->get_key_name()] = std::move(bl);
+ }
+
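+ // Second pass: rewrite entries from the head of the log down to whichever of
+ // dirty_from / writeout_from is older, without going below dirty_to (that
+ // range was already written by the pass above).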
+ for (auto p = log.log.rbegin();
+ p != log.log.rend() &&
+ (p->version >= dirty_from || p->version >= writeout_from) &&
+ p->version >= dirty_to;
+ ++p) {
+ bufferlist bl(sizeof(*p) * 2);
+ p->encode_with_checksum(bl);
+ (*km)[p->get_key_name()] = std::move(bl);
+ }
+
+ if (log_keys_debug) {
+ for (auto i = (*km).begin();
+ i != (*km).end();
+ ++i) {
+ if (i->first[0] == '_')
+ continue;
+ ceph_assert(!log_keys_debug->count(i->first));
+ log_keys_debug->insert(i->first);
+ }
+ }
+
+ // process dups after log_keys_debug is filled, so dups do not
+ // end up in that set
+ if (dirty_to_dups != eversion_t()) {
+ pg_log_dup_t min, dirty_to_dup;
+ dirty_to_dup.version = dirty_to_dups;
+ ldpp_dout(dpp, 10) << __func__ << " remove dups min=" << min.get_key_name()
+ << " to dirty_to_dup=" << dirty_to_dup.get_key_name() << dendl;
+ t.omap_rmkeyrange(
+ coll, log_oid,
+ min.get_key_name(), dirty_to_dup.get_key_name());
+ }
+ if (dirty_to_dups != eversion_t::max() && dirty_from_dups != eversion_t::max()) {
+ pg_log_dup_t max, dirty_from_dup;
+ max.version = eversion_t::max();
+ dirty_from_dup.version = dirty_from_dups;
+ ldpp_dout(dpp, 10) << __func__ << " remove dups dirty_from_dup="
+ << dirty_from_dup.get_key_name()
+ << " to max=" << max.get_key_name() << dendl;
+ t.omap_rmkeyrange(
+ coll, log_oid,
+ dirty_from_dup.get_key_name(), max.get_key_name());
+ }
+
+ ldpp_dout(dpp, 10) << __func__ << " going to encode log.dups.size()="
+ << log.dups.size() << dendl;
+ for (const auto& entry : log.dups) {
+ if (entry.version > dirty_to_dups)
+ break;
+ bufferlist bl;
+ encode(entry, bl);
+ (*km)[entry.get_key_name()] = std::move(bl);
+ }
+ ldpp_dout(dpp, 10) << __func__ << " 1st round encoded log.dups.size()="
+ << log.dups.size() << dendl;
+
+ for (auto p = log.dups.rbegin();
+ p != log.dups.rend() &&
+ (p->version >= dirty_from_dups || p->version >= write_from_dups) &&
+ p->version >= dirty_to_dups;
+ ++p) {
+ bufferlist bl;
+ encode(*p, bl);
+ (*km)[p->get_key_name()] = std::move(bl);
+ }
+ ldpp_dout(dpp, 10) << __func__ << " 2st round encoded log.dups.size()="
+ << log.dups.size() << dendl;
+
+ if (clear_divergent_priors) {
+ ldpp_dout(dpp, 10) << "write_log_and_missing: writing divergent_priors"
+ << dendl;
+ to_remove.insert("divergent_priors");
+ }
+ // since we encode individual missing items instead of a whole
+ // missing set, we need another key to store this bit of state
+ if (*may_include_deletes_in_missing_dirty) {
+ (*km)["may_include_deletes_in_missing"] = bufferlist();
+ *may_include_deletes_in_missing_dirty = false;
+ }
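+ // Persist only the missing entries that changed since the last write: emit a
+ // "missing/<oid>" omap key while the object is still missing, otherwise queue
+ // the key for removal.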
+ missing.get_changed(
+ [&](const hobject_t &obj) {
+ string key = string("missing/") + obj.to_str();
+ pg_missing_item item;
+ if (!missing.is_missing(obj, &item)) {
+ to_remove.insert(key);
+ } else {
+ encode(make_pair(obj, item), (*km)[key], CEPH_FEATUREMASK_SERVER_OCTOPUS);
+ }
+ });
+ if (require_rollback) {
+ encode(
+ log.get_can_rollback_to(),
+ (*km)["can_rollback_to"]);
+ encode(
+ log.get_rollback_info_trimmed_to(),
+ (*km)["rollback_info_trimmed_to"]);
+ }
+
+ if (!to_remove.empty())
+ t.omap_rmkeys(coll, log_oid, to_remove);
+ ldpp_dout(dpp, 10) << "end of " << __func__ << dendl;
+}
+
+void PGLog::rebuild_missing_set_with_deletes(
+ ObjectStore *store,
+ ObjectStore::CollectionHandle& ch,
+ const pg_info_t &info)
+{
+ // save entries not generated from the current log (e.g. added due
+ // to repair, EIO handling, or divergent_priors).
+ map<hobject_t, pg_missing_item> extra_missing;
+ for (const auto& p : missing.get_items()) {
+ if (!log.logged_object(p.first)) {
+ dout(20) << __func__ << " extra missing entry: " << p.first
+ << " " << p.second << dendl;
+ extra_missing[p.first] = p.second;
+ }
+ }
+ missing.clear();
+
+ // go through the log and add items that are not present or older
+ // versions on disk, just as if we were reading the log + metadata
+ // off disk originally
+ set<hobject_t> did;
+ for (auto i = log.log.rbegin();
+ i != log.log.rend();
+ ++i) {
+ if (i->version <= info.last_complete)
+ break;
+ if (i->soid > info.last_backfill ||
+ i->is_error() ||
+ did.find(i->soid) != did.end())
+ continue;
+ did.insert(i->soid);
+
+ bufferlist bv;
+ int r = store->getattr(
+ ch,
+ ghobject_t(i->soid, ghobject_t::NO_GEN, info.pgid.shard),
+ OI_ATTR,
+ bv);
+ dout(20) << __func__ << " check for log entry: " << *i << " = " << r << dendl;
+
+ if (r >= 0) {
+ object_info_t oi(bv);
+ dout(20) << __func__ << " store version = " << oi.version << dendl;
+ if (oi.version < i->version) {
+ missing.add(i->soid, i->version, oi.version, i->is_delete());
+ }
+ } else {
+ missing.add(i->soid, i->version, eversion_t(), i->is_delete());
+ }
+ }
+
+ for (const auto& p : extra_missing) {
+ missing.add(p.first, p.second.need, p.second.have, p.second.is_delete());
+ }
+
+ set_missing_may_contain_deletes();
+}
+
+#ifdef WITH_SEASTAR
+
+namespace {
+ struct FuturizedStoreLogReader {
+ crimson::os::FuturizedStore &store;
+ const pg_info_t &info;
+ PGLog::IndexedLog &log;
+ std::set<std::string>* log_keys_debug = NULL;
+ pg_missing_tracker_t &missing;
+ const DoutPrefixProvider *dpp;
+
+ eversion_t on_disk_can_rollback_to;
+ eversion_t on_disk_rollback_info_trimmed_to;
+
+ std::map<eversion_t, hobject_t> divergent_priors;
+ bool must_rebuild = false;
+ std::list<pg_log_entry_t> entries;
+ std::list<pg_log_dup_t> dups;
+
+ std::optional<std::string> next;
+
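+ // Dispatch one pgmeta omap key/value: metadata keys ("can_rollback_to",
+ // "rollback_info_trimmed_to", "may_include_deletes_in_missing",
+ // "divergent_priors"), per-object "missing/..." keys, "dup_..." keys holding
+ // pg_log_dup_t records, and everything else is a regular log entry.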
+ void process_entry(crimson::os::FuturizedStore::OmapIteratorRef &p) {
+ if (p->key()[0] == '_')
+ return;
+ // Copy ceph::buffer::list before creating iterator
+ auto bl = p->value();
+ auto bp = bl.cbegin();
+ if (p->key() == "divergent_priors") {
+ decode(divergent_priors, bp);
+ ldpp_dout(dpp, 20) << "read_log_and_missing " << divergent_priors.size()
+ << " divergent_priors" << dendl;
+ ceph_assert("crimson shouldn't have had divergent_priors" == 0);
+ } else if (p->key() == "can_rollback_to") {
+ decode(on_disk_can_rollback_to, bp);
+ } else if (p->key() == "rollback_info_trimmed_to") {
+ decode(on_disk_rollback_info_trimmed_to, bp);
+ } else if (p->key() == "may_include_deletes_in_missing") {
+ missing.may_include_deletes = true;
+ } else if (p->key().substr(0, 7) == std::string("missing")) {
+ hobject_t oid;
+ pg_missing_item item;
+ decode(oid, bp);
+ decode(item, bp);
+ if (item.is_delete()) {
+ ceph_assert(missing.may_include_deletes);
+ }
+ missing.add(oid, std::move(item));
+ } else if (p->key().substr(0, 4) == std::string("dup_")) {
+ pg_log_dup_t dup;
+ decode(dup, bp);
+ if (!dups.empty()) {
+ ceph_assert(dups.back().version < dup.version);
+ }
+ dups.push_back(dup);
+ } else {
+ pg_log_entry_t e;
+ e.decode_with_checksum(bp);
+ ldpp_dout(dpp, 20) << "read_log_and_missing " << e << dendl;
+ if (!entries.empty()) {
+ pg_log_entry_t last_e(entries.back());
+ ceph_assert(last_e.version.version < e.version.version);
+ ceph_assert(last_e.version.epoch <= e.version.epoch);
+ }
+ entries.push_back(e);
+ if (log_keys_debug)
+ log_keys_debug->insert(e.get_key_name());
+ }
+ }
+
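+ // Read the whole pgmeta omap once, feeding each key through process_entry(),
+ // then assemble the IndexedLog from the collected entries and dups.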
+ seastar::future<> read(crimson::os::CollectionRef ch,
+ ghobject_t pgmeta_oid) {
+ // will get overridden if recorded
+ on_disk_can_rollback_to = info.last_update;
+ missing.may_include_deletes = false;
+
+ return store.get_omap_iterator(ch, pgmeta_oid).then([this](auto iter) {
+ return seastar::do_until([iter] { return !iter->valid(); },
+ [iter, this]() mutable {
+ process_entry(iter);
+ return iter->next();
+ });
+ }).then([this] {
+ log = PGLog::IndexedLog(
+ info.last_update,
+ info.log_tail,
+ on_disk_can_rollback_to,
+ on_disk_rollback_info_trimmed_to,
+ std::move(entries),
+ std::move(dups));
+ });
+ }
+ };
+}
+
+seastar::future<> PGLog::read_log_and_missing_crimson(
+ crimson::os::FuturizedStore &store,
+ crimson::os::CollectionRef ch,
+ const pg_info_t &info,
+ IndexedLog &log,
+ std::set<std::string>* log_keys_debug,
+ pg_missing_tracker_t &missing,
+ ghobject_t pgmeta_oid,
+ const DoutPrefixProvider *dpp)
+{
+ ldpp_dout(dpp, 20) << "read_log_and_missing coll "
+ << ch->get_cid()
+ << " " << pgmeta_oid << dendl;
+ return seastar::do_with(FuturizedStoreLogReader{
+ store, info, log, log_keys_debug,
+ missing, dpp},
+ [ch, pgmeta_oid](FuturizedStoreLogReader& reader) {
+ return reader.read(ch, pgmeta_oid);
+ });
+}
+
+#endif
diff --git a/src/osd/PGLog.h b/src/osd/PGLog.h
new file mode 100644
index 000000000..69ca1d20c
--- /dev/null
+++ b/src/osd/PGLog.h
@@ -0,0 +1,1697 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ * Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com>
+ *
+ * Author: Loic Dachary <loic@dachary.org>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+#pragma once
+
+// re-include our assert to clobber boost's
+#include "include/ceph_assert.h"
+#include "include/common_fwd.h"
+#include "osd_types.h"
+#include "os/ObjectStore.h"
+#include <list>
+
+#ifdef WITH_SEASTAR
+#include <seastar/core/future.hh>
+#include "crimson/os/futurized_store.h"
+#include "crimson/os/cyanstore/cyan_collection.h"
+#endif
+
+/** @name PG Log
+ *
+ * The pg log serves three primary purposes:
+ *
+ * 1) improving recovery speed
+ *
+ * 2) detecting duplicate ops
+ *
+ * 3) making erasure coded updates safe
+ *
+ * For (1), the main data type is pg_log_entry_t. This is indexed in
+ * memory by the IndexedLog class - this is where most of the logic
+ * surrounding pg log is kept, even though the low level types are in
+ * src/osd/osd_types.h
+ *
+ * (2) uses a type which is a subset of the full log entry, containing
+ * just the pieces we need to identify and respond to a duplicate
+ * request.
+ *
+ * As we trim the log, we convert pg_log_entry_t to smaller
+ * pg_log_dup_t, and finally remove them once we reach a higher
+ * limit. This is controlled by a few options:
+ *
+ * osd_min_pg_log_entries osd_max_pg_log_entries
+ * osd_pg_log_dups_tracked
+ *
+ * For example, with a min of 100, max of 1000, and dups tracked of
+ * 3000, the log entries and dups stored would span the following
+ * versions, assuming the current earliest is version 1:
+ *
+ * version: 3000..2001 [ pg log entries ]  2000..1 [ pg log dups ]
+ *
+ * after osd_pg_log_trim_min subsequent writes to this PG, the log
+ * would be trimmed to look like:
+ *
+ * version: 3100..2101 [ pg log entries ]  2100..101 [ pg log dups ]
+ *
+ * (3) means tracking the previous state of an object, so that we can
+ * rollback to that prior state if necessary. It's only used for
+ * erasure coding. Consider an erasure code of 4+2, for example.
+ *
+ * This means we split the object into 4 pieces (called shards) and
+ * compute 2 parity shards. Each of these shards is stored on a
+ * separate OSD. As long as 4 shards are the same version, we can
+ * recover the remaining 2 by computation. Imagine during a write, 3
+ * of the osds go down and restart, resulting in shards 0,1,2
+ * reflecting version A and shards 3,4,5 reflecting version B, after
+ * the write.
+ *
+ * If we had no way to reconstruct version A for another shard, we
+ * would have lost the object.
+ *
+ * The actual data for rollback is stored in a look-aside object and
+ * is removed once the EC write commits on all shards. The pg log just
+ * stores the versions so we can tell how far we can rollback, and a
+ * description of the type of operation for each log entry. Beyond
+ * the pg log, see PGBackend::Trimmer and PGBackend::RollbackVisitor
+ * for more details on this.
+ *
+ * An important implication of this is that although the pg log length
+ * is normally bounded, under extreme conditions, with many EC I/Os
+ * outstanding, the log may grow beyond that point because we need to
+ * keep the rollback information for all outstanding EC I/O.
+ *
+ * For more on pg log bounds, see where it is calculated in
+ * PeeringState::calc_trim_to_aggressive().
+ *
+ * For more details on how peering uses the pg log, and architectural
+ * reasons for its existence, see:
+ *
+ * doc/dev/osd_internals/log_based_pg.rst
+ *
+ */
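+
+ /* A minimal sketch of the retention arithmetic described above, assuming
+  * (as in the example) that osd_pg_log_dups_tracked bounds log entries and
+  * dups together; epochs and the osd_min_pg_log_entries / osd_pg_log_trim_min
+  * refinements are ignored:
+  *
+  *   struct retained_ranges {
+  *     uint64_t entries_from, entries_to;  // full pg_log_entry_t records
+  *     uint64_t dups_from, dups_to;        // pg_log_dup_t records
+  *   };
+  *
+  *   retained_ranges compute(uint64_t head, uint64_t max_entries,
+  *                           uint64_t dups_tracked) {
+  *     retained_ranges r;
+  *     r.entries_to = head;                      // e.g. 3000
+  *     r.entries_from = head - max_entries + 1;  // e.g. 2001
+  *     r.dups_to = r.entries_from - 1;           // e.g. 2000
+  *     r.dups_from = head - dups_tracked + 1;    // e.g. 1
+  *     return r;
+  *   }
+  */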
+
+constexpr auto PGLOG_INDEXED_OBJECTS = 1 << 0;
+constexpr auto PGLOG_INDEXED_CALLER_OPS = 1 << 1;
+constexpr auto PGLOG_INDEXED_EXTRA_CALLER_OPS = 1 << 2;
+constexpr auto PGLOG_INDEXED_DUPS = 1 << 3;
+constexpr auto PGLOG_INDEXED_ALL = PGLOG_INDEXED_OBJECTS
+ | PGLOG_INDEXED_CALLER_OPS
+ | PGLOG_INDEXED_EXTRA_CALLER_OPS
+ | PGLOG_INDEXED_DUPS;
+
+struct PGLog : DoutPrefixProvider {
+ std::ostream& gen_prefix(std::ostream& out) const override {
+ return out;
+ }
+ unsigned get_subsys() const override {
+ return static_cast<unsigned>(ceph_subsys_osd);
+ }
+ CephContext *get_cct() const override {
+ return cct;
+ }
+
+ ////////////////////////////// sub classes //////////////////////////////
+ struct LogEntryHandler {
+ virtual void rollback(
+ const pg_log_entry_t &entry) = 0;
+ virtual void rollforward(
+ const pg_log_entry_t &entry) = 0;
+ virtual void trim(
+ const pg_log_entry_t &entry) = 0;
+ virtual void remove(
+ const hobject_t &hoid) = 0;
+ virtual void try_stash(
+ const hobject_t &hoid,
+ version_t v) = 0;
+ virtual ~LogEntryHandler() {}
+ };
+ using LogEntryHandlerRef = std::unique_ptr<LogEntryHandler>;
+
+public:
+ /**
+ * IndexedLog - adds in-memory index of the log, by oid.
+ * plus some methods to manipulate it all.
+ */
+ struct IndexedLog : public pg_log_t {
+ mutable ceph::unordered_map<hobject_t,pg_log_entry_t*> objects; // ptrs into log. be careful!
+ mutable ceph::unordered_map<osd_reqid_t,pg_log_entry_t*> caller_ops;
+ mutable ceph::unordered_multimap<osd_reqid_t,pg_log_entry_t*> extra_caller_ops;
+ mutable ceph::unordered_map<osd_reqid_t,pg_log_dup_t*> dup_index;
+
+ // recovery pointers
+ std::list<pg_log_entry_t>::iterator complete_to; // not inclusive of referenced item
+ version_t last_requested = 0; // last object requested by primary
+
+ //
+ private:
+ mutable __u16 indexed_data = 0;
+ /**
+ * rollback_info_trimmed_to_riter points to the first log entry <=
+ * rollback_info_trimmed_to
+ *
+ * It's a reverse_iterator because rend() is a natural representation for
+ * tail, and rbegin() works nicely for head.
+ */
+ mempool::osd_pglog::list<pg_log_entry_t>::reverse_iterator
+ rollback_info_trimmed_to_riter;
+
+ /*
+ * return true if we need to mark the pglog as dirty
+ */
+ template <typename F>
+ bool advance_can_rollback_to(eversion_t to, F &&f) {
+ bool dirty_log = to > can_rollback_to || to > rollback_info_trimmed_to;
+ if (dirty_log) {
+ if (to > can_rollback_to)
+ can_rollback_to = to;
+
+ if (to > rollback_info_trimmed_to)
+ rollback_info_trimmed_to = to;
+ }
+
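+ // Walk rollback_info_trimmed_to_riter toward the head, over the entries now
+ // covered by rollback_info_trimmed_to, handing each one to f (e.g. to trim
+ // or roll forward its rollback data).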
+ while (rollback_info_trimmed_to_riter != log.rbegin()) {
+ --rollback_info_trimmed_to_riter;
+ if (rollback_info_trimmed_to_riter->version > rollback_info_trimmed_to) {
+ ++rollback_info_trimmed_to_riter;
+ break;
+ }
+ f(*rollback_info_trimmed_to_riter);
+ }
+
+ return dirty_log;
+ }
+
+ void reset_rollback_info_trimmed_to_riter() {
+ rollback_info_trimmed_to_riter = log.rbegin();
+ while (rollback_info_trimmed_to_riter != log.rend() &&
+ rollback_info_trimmed_to_riter->version > rollback_info_trimmed_to)
+ ++rollback_info_trimmed_to_riter;
+ }
+
+ // indexes objects, caller ops and extra caller ops
+ public:
+ IndexedLog() :
+ complete_to(log.end()),
+ last_requested(0),
+ indexed_data(0),
+ rollback_info_trimmed_to_riter(log.rbegin())
+ { }
+
+ template <typename... Args>
+ explicit IndexedLog(Args&&... args) :
+ pg_log_t(std::forward<Args>(args)...),
+ complete_to(log.end()),
+ last_requested(0),
+ indexed_data(0),
+ rollback_info_trimmed_to_riter(log.rbegin())
+ {
+ reset_rollback_info_trimmed_to_riter();
+ index();
+ }
+
+ IndexedLog(const IndexedLog &rhs) :
+ pg_log_t(rhs),
+ complete_to(log.end()),
+ last_requested(rhs.last_requested),
+ indexed_data(0),
+ rollback_info_trimmed_to_riter(log.rbegin())
+ {
+ reset_rollback_info_trimmed_to_riter();
+ index(rhs.indexed_data);
+ }
+
+ IndexedLog &operator=(const IndexedLog &rhs) {
+ this->~IndexedLog();
+ new (this) IndexedLog(rhs);
+ return *this;
+ }
+
+ void trim_rollback_info_to(eversion_t to, LogEntryHandler *h) {
+ advance_can_rollback_to(
+ to,
+ [&](pg_log_entry_t &entry) {
+ h->trim(entry);
+ });
+ }
+ bool roll_forward_to(eversion_t to, LogEntryHandler *h) {
+ return advance_can_rollback_to(
+ to,
+ [&](pg_log_entry_t &entry) {
+ h->rollforward(entry);
+ });
+ }
+
+ void skip_can_rollback_to_to_head() {
+ advance_can_rollback_to(head, [&](const pg_log_entry_t &entry) {});
+ }
+
+ mempool::osd_pglog::list<pg_log_entry_t> rewind_from_head(eversion_t newhead) {
+ auto divergent = pg_log_t::rewind_from_head(newhead);
+ index();
+ reset_rollback_info_trimmed_to_riter();
+ return divergent;
+ }
+
+ template <typename T>
+ void scan_log_after(
+ const eversion_t &bound, ///< [in] scan entries > bound
+ T &&f) const {
+ auto iter = log.rbegin();
+ while (iter != log.rend() && iter->version > bound)
+ ++iter;
+
+ while (true) {
+ if (iter == log.rbegin())
+ break;
+ f(*(--iter));
+ }
+ }
+
+ /****/
+ void claim_log_and_clear_rollback_info(const pg_log_t& o) {
+ // we must have already trimmed the old entries
+ ceph_assert(rollback_info_trimmed_to == head);
+ ceph_assert(rollback_info_trimmed_to_riter == log.rbegin());
+
+ *this = IndexedLog(o);
+
+ skip_can_rollback_to_to_head();
+ index();
+ }
+
+ void split_out_child(
+ pg_t child_pgid,
+ unsigned split_bits,
+ IndexedLog *target);
+
+ void zero() {
+ // we must have already trimmed the old entries
+ ceph_assert(rollback_info_trimmed_to == head);
+ ceph_assert(rollback_info_trimmed_to_riter == log.rbegin());
+
+ unindex();
+ pg_log_t::clear();
+ rollback_info_trimmed_to_riter = log.rbegin();
+ reset_recovery_pointers();
+ }
+ void clear() {
+ skip_can_rollback_to_to_head();
+ zero();
+ }
+ void reset_recovery_pointers() {
+ complete_to = log.end();
+ last_requested = 0;
+ }
+
+ bool logged_object(const hobject_t& oid) const {
+ if (!(indexed_data & PGLOG_INDEXED_OBJECTS)) {
+ index_objects();
+ }
+ return objects.count(oid);
+ }
+
+ bool logged_req(const osd_reqid_t &r) const {
+ if (!(indexed_data & PGLOG_INDEXED_CALLER_OPS)) {
+ index_caller_ops();
+ }
+ if (!caller_ops.count(r)) {
+ if (!(indexed_data & PGLOG_INDEXED_EXTRA_CALLER_OPS)) {
+ index_extra_caller_ops();
+ }
+ return extra_caller_ops.count(r);
+ }
+ return true;
+ }
+
+ bool get_request(
+ const osd_reqid_t &r,
+ eversion_t *version,
+ version_t *user_version,
+ int *return_code,
+ std::vector<pg_log_op_return_item_t> *op_returns) const
+ {
+ ceph_assert(version);
+ ceph_assert(user_version);
+ ceph_assert(return_code);
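+ // Lookup order: the exact caller_ops index first, then reqids recorded in
+ // other entries' extra_reqids, and finally the dup_index built from trimmed
+ // pg_log_dup_t entries.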
+ if (!(indexed_data & PGLOG_INDEXED_CALLER_OPS)) {
+ index_caller_ops();
+ }
+ auto p = caller_ops.find(r);
+ if (p != caller_ops.end()) {
+ *version = p->second->version;
+ *user_version = p->second->user_version;
+ *return_code = p->second->return_code;
+ *op_returns = p->second->op_returns;
+ return true;
+ }
+
+ // warning: we will return *a* request for this reqid, but not
+ // necessarily the most recent.
+ if (!(indexed_data & PGLOG_INDEXED_EXTRA_CALLER_OPS)) {
+ index_extra_caller_ops();
+ }
+ p = extra_caller_ops.find(r);
+ if (p != extra_caller_ops.end()) {
+ uint32_t idx = 0;
+ for (auto i = p->second->extra_reqids.begin();
+ i != p->second->extra_reqids.end();
+ ++idx, ++i) {
+ if (i->first == r) {
+ *version = p->second->version;
+ *user_version = i->second;
+ *return_code = p->second->return_code;
+ *op_returns = p->second->op_returns;
+ if (*return_code >= 0) {
+ auto it = p->second->extra_reqid_return_codes.find(idx);
+ if (it != p->second->extra_reqid_return_codes.end()) {
+ *return_code = it->second;
+ }
+ }
+ return true;
+ }
+ }
+ ceph_abort_msg("in extra_caller_ops but not extra_reqids");
+ }
+
+ if (!(indexed_data & PGLOG_INDEXED_DUPS)) {
+ index_dups();
+ }
+ auto q = dup_index.find(r);
+ if (q != dup_index.end()) {
+ *version = q->second->version;
+ *user_version = q->second->user_version;
+ *return_code = q->second->return_code;
+ *op_returns = q->second->op_returns;
+ return true;
+ }
+
+ return false;
+ }
+
+ bool has_write_since(const hobject_t &oid, const eversion_t &bound) const {
+ for (auto i = log.rbegin(); i != log.rend(); ++i) {
+ if (i->version <= bound)
+ return false;
+ if (i->soid.get_head() == oid.get_head())
+ return true;
+ }
+ return false;
+ }
+
+ /// get a (bounded) list of recent reqids for the given object
+ void get_object_reqids(const hobject_t& oid, unsigned max,
+ mempool::osd_pglog::vector<std::pair<osd_reqid_t, version_t> > *pls,
+ mempool::osd_pglog::map<uint32_t, int> *return_codes) const {
+ // make sure object is present at least once before we do an
+ // O(n) search.
+ if (!(indexed_data & PGLOG_INDEXED_OBJECTS)) {
+ index_objects();
+ }
+ if (objects.count(oid) == 0)
+ return;
+
+ for (auto i = log.rbegin(); i != log.rend(); ++i) {
+ if (i->soid == oid) {
+ if (i->reqid_is_indexed()) {
+ if (i->op == pg_log_entry_t::ERROR) {
+ // propagate op errors to the cache tier's PG log
+ return_codes->emplace(pls->size(), i->return_code);
+ }
+ pls->push_back(std::make_pair(i->reqid, i->user_version));
+ }
+
+ pls->insert(pls->end(), i->extra_reqids.begin(), i->extra_reqids.end());
+ if (pls->size() >= max) {
+ if (pls->size() > max) {
+ pls->resize(max);
+ }
+ return;
+ }
+ }
+ }
+ }
+
+ void index(__u16 to_index = PGLOG_INDEXED_ALL) const {
+ // if to_index is 0, no need to run any of this code, especially
+ // loop below; this can happen with copy constructor for
+ // IndexedLog (and indirectly through assignment operator)
+ if (!to_index) return;
+
+ if (to_index & PGLOG_INDEXED_OBJECTS)
+ objects.clear();
+ if (to_index & PGLOG_INDEXED_CALLER_OPS)
+ caller_ops.clear();
+ if (to_index & PGLOG_INDEXED_EXTRA_CALLER_OPS)
+ extra_caller_ops.clear();
+ if (to_index & PGLOG_INDEXED_DUPS) {
+ dup_index.clear();
+ for (auto& i : dups) {
+ dup_index[i.reqid] = const_cast<pg_log_dup_t*>(&i);
+ }
+ }
+
+ constexpr __u16 any_log_entry_index =
+ PGLOG_INDEXED_OBJECTS |
+ PGLOG_INDEXED_CALLER_OPS |
+ PGLOG_INDEXED_EXTRA_CALLER_OPS;
+
+ if (to_index & any_log_entry_index) {
+ for (auto i = log.begin(); i != log.end(); ++i) {
+ if (to_index & PGLOG_INDEXED_OBJECTS) {
+ if (i->object_is_indexed()) {
+ objects[i->soid] = const_cast<pg_log_entry_t*>(&(*i));
+ }
+ }
+
+ if (to_index & PGLOG_INDEXED_CALLER_OPS) {
+ if (i->reqid_is_indexed()) {
+ caller_ops[i->reqid] = const_cast<pg_log_entry_t*>(&(*i));
+ }
+ }
+
+ if (to_index & PGLOG_INDEXED_EXTRA_CALLER_OPS) {
+ for (auto j = i->extra_reqids.begin();
+ j != i->extra_reqids.end();
+ ++j) {
+ extra_caller_ops.insert(
+ std::make_pair(j->first, const_cast<pg_log_entry_t*>(&(*i))));
+ }
+ }
+ }
+ }
+
+ indexed_data |= to_index;
+ }
+
+ void index_objects() const {
+ index(PGLOG_INDEXED_OBJECTS);
+ }
+
+ void index_caller_ops() const {
+ index(PGLOG_INDEXED_CALLER_OPS);
+ }
+
+ void index_extra_caller_ops() const {
+ index(PGLOG_INDEXED_EXTRA_CALLER_OPS);
+ }
+
+ void index_dups() const {
+ index(PGLOG_INDEXED_DUPS);
+ }
+
+ void index(pg_log_entry_t& e) {
+ if ((indexed_data & PGLOG_INDEXED_OBJECTS) && e.object_is_indexed()) {
+ if (objects.count(e.soid) == 0 ||
+ objects[e.soid]->version < e.version)
+ objects[e.soid] = &e;
+ }
+ if (indexed_data & PGLOG_INDEXED_CALLER_OPS) {
+ // divergent merge_log indexes new before unindexing old
+ if (e.reqid_is_indexed()) {
+ caller_ops[e.reqid] = &e;
+ }
+ }
+ if (indexed_data & PGLOG_INDEXED_EXTRA_CALLER_OPS) {
+ for (auto j = e.extra_reqids.begin();
+ j != e.extra_reqids.end();
+ ++j) {
+ extra_caller_ops.insert(std::make_pair(j->first, &e));
+ }
+ }
+ }
+
+ void unindex() {
+ objects.clear();
+ caller_ops.clear();
+ extra_caller_ops.clear();
+ dup_index.clear();
+ indexed_data = 0;
+ }
+
+ void unindex(const pg_log_entry_t& e) {
+ // NOTE: this only works if we remove from the _tail_ of the log!
+ if (indexed_data & PGLOG_INDEXED_OBJECTS) {
+ auto it = objects.find(e.soid);
+ if (it != objects.end() && it->second->version == e.version)
+ objects.erase(it);
+ }
+ if (e.reqid_is_indexed()) {
+ if (indexed_data & PGLOG_INDEXED_CALLER_OPS) {
+ auto it = caller_ops.find(e.reqid);
+ // divergent merge_log indexes new before unindexing old
+ if (it != caller_ops.end() && it->second == &e)
+ caller_ops.erase(it);
+ }
+ }
+ if (indexed_data & PGLOG_INDEXED_EXTRA_CALLER_OPS) {
+ for (auto j = e.extra_reqids.begin();
+ j != e.extra_reqids.end();
+ ++j) {
+ for (auto k = extra_caller_ops.find(j->first);
+ k != extra_caller_ops.end() && k->first == j->first;
+ ++k) {
+ if (k->second == &e) {
+ extra_caller_ops.erase(k);
+ break;
+ }
+ }
+ }
+ }
+ }
+
+ void index(pg_log_dup_t& e) {
+ if (indexed_data & PGLOG_INDEXED_DUPS) {
+ dup_index[e.reqid] = &e;
+ }
+ }
+
+ void unindex(const pg_log_dup_t& e) {
+ if (indexed_data & PGLOG_INDEXED_DUPS) {
+ auto i = dup_index.find(e.reqid);
+ if (i != dup_index.end()) {
+ dup_index.erase(i);
+ }
+ }
+ }
+
+ // actors
+ void add(const pg_log_entry_t& e, bool applied = true) {
+ if (!applied) {
+ ceph_assert(get_can_rollback_to() == head);
+ }
+
+ // make sure our buffers don't pin bigger buffers
+ e.mod_desc.trim_bl();
+
+ // add to log
+ log.push_back(e);
+
+ // riter previously pointed to the previous entry
+ if (rollback_info_trimmed_to_riter == log.rbegin())
+ ++rollback_info_trimmed_to_riter;
+
+ ceph_assert(e.version > head);
+ ceph_assert(head.version == 0 || e.version.version > head.version);
+ head = e.version;
+
+ // to our index
+ if ((indexed_data & PGLOG_INDEXED_OBJECTS) && e.object_is_indexed()) {
+ objects[e.soid] = &(log.back());
+ }
+ if (indexed_data & PGLOG_INDEXED_CALLER_OPS) {
+ if (e.reqid_is_indexed()) {
+ caller_ops[e.reqid] = &(log.back());
+ }
+ }
+
+ if (indexed_data & PGLOG_INDEXED_EXTRA_CALLER_OPS) {
+ for (auto j = e.extra_reqids.begin();
+ j != e.extra_reqids.end();
+ ++j) {
+ extra_caller_ops.insert(std::make_pair(j->first, &(log.back())));
+ }
+ }
+
+ if (!applied) {
+ skip_can_rollback_to_to_head();
+ }
+ } // add
+
+ void trim(
+ CephContext* cct,
+ eversion_t s,
+ std::set<eversion_t> *trimmed,
+ std::set<std::string>* trimmed_dups,
+ eversion_t *write_from_dups);
+
+ std::ostream& print(std::ostream& out) const;
+ }; // IndexedLog
+
+
+protected:
+ //////////////////// data members ////////////////////
+
+ pg_missing_tracker_t missing;
+ IndexedLog log;
+
+ eversion_t dirty_to; ///< must clear/writeout all keys <= dirty_to
+ eversion_t dirty_from; ///< must clear/writeout all keys >= dirty_from
+ eversion_t writeout_from; ///< must write out keys >= writeout_from
+ std::set<eversion_t> trimmed; ///< must clear keys in trimmed
+ eversion_t dirty_to_dups; ///< must clear/writeout all dups <= dirty_to_dups
+ eversion_t dirty_from_dups; ///< must clear/writeout all dups >= dirty_from_dups
+ eversion_t write_from_dups; ///< must write keys >= write_from_dups
+ std::set<std::string> trimmed_dups; ///< must clear keys in trimmed_dups
+ CephContext *cct;
+ bool pg_log_debug;
+ /// Log is clean on (dirty_to, dirty_from)
+ bool touched_log;
+ bool dirty_log;
+ bool clear_divergent_priors;
+ bool may_include_deletes_in_missing_dirty = false;
+
+ void mark_dirty_to(eversion_t to) {
+ if (to > dirty_to)
+ dirty_to = to;
+ }
+ void mark_dirty_from(eversion_t from) {
+ if (from < dirty_from)
+ dirty_from = from;
+ }
+ void mark_writeout_from(eversion_t from) {
+ if (from < writeout_from)
+ writeout_from = from;
+ }
+ void mark_dirty_to_dups(eversion_t to) {
+ if (to > dirty_to_dups)
+ dirty_to_dups = to;
+ }
+ void mark_dirty_from_dups(eversion_t from) {
+ if (from < dirty_from_dups)
+ dirty_from_dups = from;
+ }
+public:
+ bool needs_write() const {
+ return !touched_log || is_dirty();
+ }
+
+ bool is_dirty() const {
+ return dirty_log ||
+ (dirty_to != eversion_t()) ||
+ (dirty_from != eversion_t::max()) ||
+ (writeout_from != eversion_t::max()) ||
+ !(trimmed.empty()) ||
+ !missing.is_clean() ||
+ !(trimmed_dups.empty()) ||
+ (dirty_to_dups != eversion_t()) ||
+ (dirty_from_dups != eversion_t::max()) ||
+ (write_from_dups != eversion_t::max()) ||
+ may_include_deletes_in_missing_dirty;
+ }
+
+ void mark_log_for_rewrite() {
+ mark_dirty_to(eversion_t::max());
+ mark_dirty_from(eversion_t());
+ mark_dirty_to_dups(eversion_t::max());
+ mark_dirty_from_dups(eversion_t());
+ touched_log = false;
+ }
+ bool get_may_include_deletes_in_missing_dirty() const {
+ return may_include_deletes_in_missing_dirty;
+ }
+protected:
+
+ /// DEBUG
+ std::set<std::string> log_keys_debug;
+ static void clear_after(std::set<std::string> *log_keys_debug, const std::string &lb) {
+ if (!log_keys_debug)
+ return;
+ for (auto i = log_keys_debug->lower_bound(lb);
+ i != log_keys_debug->end();
+ log_keys_debug->erase(i++));
+ }
+ static void clear_up_to(std::set<std::string> *log_keys_debug, const std::string &ub) {
+ if (!log_keys_debug)
+ return;
+ for (auto i = log_keys_debug->begin();
+ i != log_keys_debug->end() && *i < ub;
+ log_keys_debug->erase(i++));
+ }
+
+ void check();
+ void undirty() {
+ dirty_to = eversion_t();
+ dirty_from = eversion_t::max();
+ touched_log = true;
+ dirty_log = false;
+ trimmed.clear();
+ trimmed_dups.clear();
+ writeout_from = eversion_t::max();
+ check();
+ missing.flush();
+ dirty_to_dups = eversion_t();
+ dirty_from_dups = eversion_t::max();
+ write_from_dups = eversion_t::max();
+ }
+public:
+
+ // cppcheck-suppress noExplicitConstructor
+ PGLog(CephContext *cct) :
+ dirty_from(eversion_t::max()),
+ writeout_from(eversion_t::max()),
+ dirty_from_dups(eversion_t::max()),
+ write_from_dups(eversion_t::max()),
+ cct(cct),
+ pg_log_debug(!(cct && !(cct->_conf->osd_debug_pg_log_writeout))),
+ touched_log(false),
+ dirty_log(false),
+ clear_divergent_priors(false)
+ { }
+
+ void reset_backfill();
+
+ void clear();
+
+ //////////////////// get or set missing ////////////////////
+
+ const pg_missing_tracker_t& get_missing() const { return missing; }
+
+ void missing_add(const hobject_t& oid, eversion_t need, eversion_t have, bool is_delete=false) {
+ missing.add(oid, need, have, is_delete);
+ }
+
+ void missing_add_next_entry(const pg_log_entry_t& e) {
+ missing.add_next_event(e);
+ }
+
+ //////////////////// get or set log ////////////////////
+
+ const IndexedLog &get_log() const { return log; }
+
+ const eversion_t &get_tail() const { return log.tail; }
+
+ void set_tail(eversion_t tail) { log.tail = tail; }
+
+ const eversion_t &get_head() const { return log.head; }
+
+ void set_head(eversion_t head) { log.head = head; }
+
+ void set_last_requested(version_t last_requested) {
+ log.last_requested = last_requested;
+ }
+
+ void index() { log.index(); }
+
+ void unindex() { log.unindex(); }
+
+ void add(const pg_log_entry_t& e, bool applied = true) {
+ mark_writeout_from(e.version);
+ log.add(e, applied);
+ }
+
+ void reset_recovery_pointers() { log.reset_recovery_pointers(); }
+
+ static void clear_info_log(
+ spg_t pgid,
+ ObjectStore::Transaction *t);
+
+ void trim(
+ eversion_t trim_to,
+ pg_info_t &info,
+ bool transaction_applied = true,
+ bool async = false);
+
+ void roll_forward_to(
+ eversion_t roll_forward_to,
+ LogEntryHandler *h) {
+ if (log.roll_forward_to(
+ roll_forward_to,
+ h))
+ dirty_log = true;
+ }
+
+ eversion_t get_can_rollback_to() const {
+ return log.get_can_rollback_to();
+ }
+
+ void roll_forward(LogEntryHandler *h) {
+ roll_forward_to(
+ log.head,
+ h);
+ }
+
+ void skip_rollforward() {
+ log.skip_can_rollback_to_to_head();
+ }
+
+ //////////////////// get or set log & missing ////////////////////
+
+ void reset_backfill_claim_log(const pg_log_t &o, LogEntryHandler *h) {
+ log.trim_rollback_info_to(log.head, h);
+ log.claim_log_and_clear_rollback_info(o);
+ missing.clear();
+ mark_dirty_to(eversion_t::max());
+ mark_dirty_to_dups(eversion_t::max());
+ }
+
+ void split_into(
+ pg_t child_pgid,
+ unsigned split_bits,
+ PGLog *opg_log) {
+ log.split_out_child(child_pgid, split_bits, &opg_log->log);
+ missing.split_into(child_pgid, split_bits, &(opg_log->missing));
+ opg_log->mark_dirty_to(eversion_t::max());
+ opg_log->mark_dirty_to_dups(eversion_t::max());
+ mark_dirty_to(eversion_t::max());
+ mark_dirty_to_dups(eversion_t::max());
+ if (missing.may_include_deletes) {
+ opg_log->set_missing_may_contain_deletes();
+ }
+ }
+
+ void merge_from(
+ const std::vector<PGLog*>& sources,
+ eversion_t last_update) {
+ unindex();
+ missing.clear();
+
+ std::vector<pg_log_t*> slogs;
+ for (auto s : sources) {
+ slogs.push_back(&s->log);
+ }
+ log.merge_from(slogs, last_update);
+
+ index();
+
+ mark_log_for_rewrite();
+ }
+
+ void recover_got(hobject_t oid, eversion_t v, pg_info_t &info) {
+ if (missing.is_missing(oid, v)) {
+ missing.got(oid, v);
+ info.stats.stats.sum.num_objects_missing = missing.num_missing();
+
+ // raise last_complete?
+ if (missing.get_items().empty()) {
+ log.complete_to = log.log.end();
+ info.last_complete = info.last_update;
+ }
+ auto oldest_need = missing.get_oldest_need();
+ while (log.complete_to != log.log.end()) {
+ if (oldest_need <= log.complete_to->version)
+ break;
+ if (info.last_complete < log.complete_to->version)
+ info.last_complete = log.complete_to->version;
+ ++log.complete_to;
+ }
+ }
+
+ ceph_assert(log.get_can_rollback_to() >= v);
+ }
+
+ void reset_complete_to(pg_info_t *info) {
+ if (log.log.empty()) // caller is split_into()
+ return;
+ log.complete_to = log.log.begin();
+ ceph_assert(log.complete_to != log.log.end());
+ auto oldest_need = missing.get_oldest_need();
+ if (oldest_need != eversion_t()) {
+ while (log.complete_to->version < oldest_need) {
+ ++log.complete_to;
+ ceph_assert(log.complete_to != log.log.end());
+ }
+ }
+ if (!info)
+ return;
+ if (log.complete_to == log.log.begin()) {
+ info->last_complete = eversion_t();
+ } else {
+ --log.complete_to;
+ info->last_complete = log.complete_to->version;
+ ++log.complete_to;
+ }
+ }
+
+ void activate_not_complete(pg_info_t &info) {
+ reset_complete_to(&info);
+ log.last_requested = 0;
+ }
+
+ void proc_replica_log(pg_info_t &oinfo,
+ const pg_log_t &olog,
+ pg_missing_t& omissing, pg_shard_t from) const;
+
+ void set_missing_may_contain_deletes() {
+ missing.may_include_deletes = true;
+ may_include_deletes_in_missing_dirty = true;
+ }
+
+ void rebuild_missing_set_with_deletes(ObjectStore *store,
+ ObjectStore::CollectionHandle& ch,
+ const pg_info_t &info);
+
+protected:
+ static void split_by_object(
+ mempool::osd_pglog::list<pg_log_entry_t> &entries,
+ std::map<hobject_t, mempool::osd_pglog::list<pg_log_entry_t>> *out_entries) {
+ while (!entries.empty()) {
+ auto &out_list = (*out_entries)[entries.front().soid];
+ out_list.splice(out_list.end(), entries, entries.begin());
+ }
+ }
+
+ /**
+ * _merge_object_divergent_entries
+ *
+ * There are 5 distinct cases:
+ * 1) There is a more recent update: in this case we assume we adjusted the
+ * store and missing during merge_log
+ * 2) The first entry in the divergent sequence is a create. This might
+ * either be because the object is a clone or because prior_version is
+ * eversion_t(). In this case the object does not exist and we must
+ * adjust missing and the store to match.
+ * 3) We are currently missing the object. In this case, we adjust the
+ * missing to our prior_version taking care to add a divergent_prior
+ * if necessary
+ * 4) We can rollback all of the entries. In this case, we do so using
+ * the rollbacker and return -- the object does not go into missing.
+ * 5) We cannot rollback at least 1 of the entries. In this case, we
+ * clear the object out of the store and add a missing entry at
+ * prior_version taking care to add a divergent_prior if
+ * necessary.
+ */
+ template <typename missing_type>
+ static void _merge_object_divergent_entries(
+ const IndexedLog &log, ///< [in] log to merge against
+ const hobject_t &hoid, ///< [in] object we are merging
+ const mempool::osd_pglog::list<pg_log_entry_t> &orig_entries, ///< [in] entries for hoid to merge
+ const pg_info_t &info, ///< [in] info for merging entries
+ eversion_t olog_can_rollback_to, ///< [in] rollback boundary of input IndexedLog
+ missing_type &missing, ///< [in,out] missing to adjust, use
+ LogEntryHandler *rollbacker, ///< [in] optional rollbacker object
+ const DoutPrefixProvider *dpp ///< [in] logging provider
+ ) {
+ ldpp_dout(dpp, 20) << __func__ << ": merging hoid " << hoid
+ << " entries: " << orig_entries << dendl;
+
+ if (hoid > info.last_backfill) {
+ ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid << " after last_backfill"
+ << dendl;
+ return;
+ }
+
+ // entries is non-empty
+ ceph_assert(!orig_entries.empty());
+ // strip out and ignore ERROR entries
+ mempool::osd_pglog::list<pg_log_entry_t> entries;
+ eversion_t last;
+ bool seen_non_error = false;
+ for (auto i = orig_entries.begin();
+ i != orig_entries.end();
+ ++i) {
+ // all entries are on hoid
+ ceph_assert(i->soid == hoid);
+ // if we have not yet seen a non-error entry and this entry is not an error,
+ // then this is the first non-error entry
+ bool first_non_error = ! seen_non_error && ! i->is_error();
+ if (! i->is_error() ) {
+ // see a non error entry now
+ seen_non_error = true;
+ }
+
+ // No need to check the first entry since its prior_version is unavailable
+ // in the list
+ // No need to check if the prior_version is the minimal version
+ // No need to check the first non-error entry since the leading error
+ // entries are not its prior version
+ if (i != orig_entries.begin() && i->prior_version != eversion_t() &&
+ ! first_non_error) {
+ // in increasing order of version
+ ceph_assert(i->version > last);
+ // prior_version correct (unless it is an ERROR entry)
+ ceph_assert(i->prior_version == last || i->is_error());
+ }
+ if (i->is_error()) {
+ ldpp_dout(dpp, 20) << __func__ << ": ignoring " << *i << dendl;
+ } else {
+ ldpp_dout(dpp, 20) << __func__ << ": keeping " << *i << dendl;
+ entries.push_back(*i);
+ last = i->version;
+ }
+ }
+ if (entries.empty()) {
+ ldpp_dout(dpp, 10) << __func__ << ": no non-ERROR entries" << dendl;
+ return;
+ }
+
+ const eversion_t prior_version = entries.begin()->prior_version;
+ const eversion_t first_divergent_update = entries.begin()->version;
+ const eversion_t last_divergent_update = entries.rbegin()->version;
+ const bool object_not_in_store =
+ !missing.is_missing(hoid) &&
+ entries.rbegin()->is_delete();
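+ // The newest divergent entry is a delete and the object is not missing, so
+ // it is already absent from the local store; remember that so we can skip
+ // the redundant remove() calls below.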
+ ldpp_dout(dpp, 10) << __func__ << ": hoid " << " object_not_in_store: "
+ << object_not_in_store << dendl;
+ ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid
+ << " prior_version: " << prior_version
+ << " first_divergent_update: " << first_divergent_update
+ << " last_divergent_update: " << last_divergent_update
+ << dendl;
+
+ auto objiter = log.objects.find(hoid);
+ if (objiter != log.objects.end() &&
+ objiter->second->version >= first_divergent_update) {
+ /// Case 1)
+ ldpp_dout(dpp, 10) << __func__ << ": more recent entry found: "
+ << *objiter->second << ", already merged" << dendl;
+
+ ceph_assert(objiter->second->version > last_divergent_update);
+
+ // ensure missing has been updated appropriately
+ if (objiter->second->is_update() ||
+ (missing.may_include_deletes && objiter->second->is_delete())) {
+ ceph_assert(missing.is_missing(hoid) &&
+ missing.get_items().at(hoid).need == objiter->second->version);
+ } else {
+ ceph_assert(!missing.is_missing(hoid));
+ }
+ missing.revise_have(hoid, eversion_t());
+ missing.mark_fully_dirty(hoid);
+ if (rollbacker) {
+ if (!object_not_in_store) {
+ rollbacker->remove(hoid);
+ }
+ for (auto &&i: entries) {
+ rollbacker->trim(i);
+ }
+ }
+ return;
+ }
+
+ ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid
+ <<" has no more recent entries in log" << dendl;
+ if (prior_version == eversion_t() || entries.front().is_clone()) {
+ /// Case 2)
+ ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid
+ << " prior_version or op type indicates creation,"
+ << " deleting"
+ << dendl;
+ if (missing.is_missing(hoid))
+ missing.rm(missing.get_items().find(hoid));
+ if (rollbacker) {
+ if (!object_not_in_store) {
+ rollbacker->remove(hoid);
+ }
+ for (auto &&i: entries) {
+ rollbacker->trim(i);
+ }
+ }
+ return;
+ }
+
+ if (missing.is_missing(hoid)) {
+ /// Case 3)
+ ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid
+ << " missing, " << missing.get_items().at(hoid)
+ << " adjusting" << dendl;
+
+ if (missing.get_items().at(hoid).have == prior_version) {
+ ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid
+ << " missing.have is prior_version " << prior_version
+ << " removing from missing" << dendl;
+ missing.rm(missing.get_items().find(hoid));
+ } else {
+ ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid
+ << " missing.have is " << missing.get_items().at(hoid).have
+ << ", adjusting" << dendl;
+ missing.revise_need(hoid, prior_version, false);
+ if (prior_version <= info.log_tail) {
+ ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid
+ << " prior_version " << prior_version
+ << " <= info.log_tail "
+ << info.log_tail << dendl;
+ }
+ }
+ if (rollbacker) {
+ for (auto &&i: entries) {
+ rollbacker->trim(i);
+ }
+ }
+ return;
+ }
+
+ ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid
+ << " must be rolled back or recovered,"
+ << " attempting to rollback"
+ << dendl;
+ bool can_rollback = true;
+ // We are going to make an important decision based on the
+ // olog_can_rollback_to value we have received, so log it for reference.
+ ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid
+ << " olog_can_rollback_to: "
+ << olog_can_rollback_to << dendl;
+ /// Distinguish between 4) and 5)
+ for (auto i = entries.rbegin(); i != entries.rend(); ++i) {
+ if (!i->can_rollback() || i->version <= olog_can_rollback_to) {
+ ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid << " cannot rollback "
+ << *i << dendl;
+ can_rollback = false;
+ break;
+ }
+ }
+
+ if (can_rollback) {
+ /// Case 4)
+ for (auto i = entries.rbegin(); i != entries.rend(); ++i) {
+ ceph_assert(i->can_rollback() && i->version > olog_can_rollback_to);
+ ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid
+ << " rolling back " << *i << dendl;
+ if (rollbacker)
+ rollbacker->rollback(*i);
+ }
+ ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid
+ << " rolled back" << dendl;
+ return;
+ } else {
+ /// Case 5)
+ ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid << " cannot roll back, "
+ << "removing and adding to missing" << dendl;
+ if (rollbacker) {
+ if (!object_not_in_store)
+ rollbacker->remove(hoid);
+ for (auto &&i: entries) {
+ rollbacker->trim(i);
+ }
+ }
+ missing.add(hoid, prior_version, eversion_t(), false);
+ if (prior_version <= info.log_tail) {
+ ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid
+ << " prior_version " << prior_version
+ << " <= info.log_tail "
+ << info.log_tail << dendl;
+ }
+ }
+ }
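+ // To summarize how the branches above resolve the divergent entries for
+ // hoid (a reading of the code, kept here as a quick reference):
+ //   Case 1: a newer log entry for hoid exists       -> already merged; fix
+ //           up missing and trim the divergent entries
+ //   Case 2: prior_version is null or the first entry is a clone -> the
+ //           object was created by the divergent entries; remove it
+ //   Case 3: hoid is already missing                  -> revise need down to
+ //           prior_version (or drop the missing entry if we have it)
+ //   Case 4: every divergent entry can be rolled back -> roll them all back
+ //   Case 5: otherwise                                -> remove the object
+ //           and add it to missing at prior_version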
+
+ /// Merge all entries using above
+ template <typename missing_type>
+ static void _merge_divergent_entries(
+ const IndexedLog &log, ///< [in] log to merge against
+ mempool::osd_pglog::list<pg_log_entry_t> &entries, ///< [in] entries to merge
+ const pg_info_t &oinfo, ///< [in] info for merging entries
+ eversion_t olog_can_rollback_to, ///< [in] rollback boundary of input IndexedLog
+ missing_type &omissing, ///< [in,out] missing set to adjust
+ LogEntryHandler *rollbacker, ///< [in] optional rollbacker object
+ const DoutPrefixProvider *dpp ///< [in] logging provider
+ ) {
+ std::map<hobject_t, mempool::osd_pglog::list<pg_log_entry_t> > split;
+ split_by_object(entries, &split);
+ for (auto i = split.begin(); i != split.end(); ++i) {
+ _merge_object_divergent_entries(
+ log,
+ i->first,
+ i->second,
+ oinfo,
+ olog_can_rollback_to,
+ omissing,
+ rollbacker,
+ dpp);
+ }
+ }
+
+ /**
+ * Exists for use in TestPGLog for simply testing single divergent log
+ * cases
+ */
+ void merge_old_entry(
+ ObjectStore::Transaction& t,
+ const pg_log_entry_t& oe,
+ const pg_info_t& info,
+ LogEntryHandler *rollbacker) {
+ mempool::osd_pglog::list<pg_log_entry_t> entries;
+ entries.push_back(oe);
+ _merge_object_divergent_entries(
+ log,
+ oe.soid,
+ entries,
+ info,
+ log.get_can_rollback_to(),
+ missing,
+ rollbacker,
+ this);
+ }
+
+ bool merge_log_dups(const pg_log_t& olog);
+
+public:
+
+ void rewind_divergent_log(eversion_t newhead,
+ pg_info_t &info,
+ LogEntryHandler *rollbacker,
+ bool &dirty_info,
+ bool &dirty_big_info);
+
+ void merge_log(pg_info_t &oinfo,
+ pg_log_t&& olog,
+ pg_shard_t from,
+ pg_info_t &info, LogEntryHandler *rollbacker,
+ bool &dirty_info, bool &dirty_big_info);
+
+ template <typename missing_type>
+ static bool append_log_entries_update_missing(
+ const hobject_t &last_backfill,
+ const mempool::osd_pglog::list<pg_log_entry_t> &entries,
+ bool maintain_rollback,
+ IndexedLog *log,
+ missing_type &missing,
+ LogEntryHandler *rollbacker,
+ const DoutPrefixProvider *dpp) {
+ bool invalidate_stats = false;
+ if (log && !entries.empty()) {
+ ceph_assert(log->head < entries.begin()->version);
+ }
+ for (auto p = entries.begin(); p != entries.end(); ++p) {
+ invalidate_stats = invalidate_stats || !p->is_error();
+ if (log) {
+ ldpp_dout(dpp, 20) << "update missing, append " << *p << dendl;
+ log->add(*p);
+ }
+ if (p->soid <= last_backfill &&
+ !p->is_error()) {
+ if (missing.may_include_deletes) {
+ missing.add_next_event(*p);
+ } else {
+ if (p->is_delete()) {
+ missing.rm(p->soid, p->version);
+ } else {
+ missing.add_next_event(*p);
+ }
+ if (rollbacker) {
+ // hack to match PG::mark_all_unfound_lost
+ if (maintain_rollback && p->is_lost_delete() && p->can_rollback()) {
+ rollbacker->try_stash(p->soid, p->version.version);
+ } else if (p->is_delete()) {
+ rollbacker->remove(p->soid);
+ }
+ }
+ }
+ }
+ }
+ return invalidate_stats;
+ }
+ bool append_new_log_entries(
+ const hobject_t &last_backfill,
+ const mempool::osd_pglog::list<pg_log_entry_t> &entries,
+ LogEntryHandler *rollbacker) {
+ bool invalidate_stats = append_log_entries_update_missing(
+ last_backfill,
+ entries,
+ true,
+ &log,
+ missing,
+ rollbacker,
+ this);
+ if (!entries.empty()) {
+ mark_writeout_from(entries.begin()->version);
+ if (entries.begin()->is_lost_delete()) {
+ // hack: since lost deletes queue recovery directly, and don't
+ // go through activate_not_complete() again, our complete_to
+ // iterator may still point at log.end(). Reset it to point
+ // before these new lost_delete entries. This only occurs
+ // when lost+delete entries are initially added, which is
+ // always in a std::list of solely lost_delete entries, so it is
+ // sufficient to check whether the first entry is a
+ // lost_delete
+ reset_complete_to(nullptr);
+ }
+ }
+ return invalidate_stats;
+ }
+
+ void write_log_and_missing(
+ ObjectStore::Transaction& t,
+ std::map<std::string,ceph::buffer::list> *km,
+ const coll_t& coll,
+ const ghobject_t &log_oid,
+ bool require_rollback);
+
+ static void write_log_and_missing_wo_missing(
+ ObjectStore::Transaction& t,
+ std::map<std::string,ceph::buffer::list>* km,
+ pg_log_t &log,
+ const coll_t& coll,
+ const ghobject_t &log_oid, std::map<eversion_t, hobject_t> &divergent_priors,
+ bool require_rollback,
+ const DoutPrefixProvider *dpp = nullptr);
+
+ static void write_log_and_missing(
+ ObjectStore::Transaction& t,
+ std::map<std::string,ceph::buffer::list>* km,
+ pg_log_t &log,
+ const coll_t& coll,
+ const ghobject_t &log_oid,
+ const pg_missing_tracker_t &missing,
+ bool require_rollback,
+ bool *rebuilt_missing_set_with_deletes,
+ const DoutPrefixProvider *dpp = nullptr);
+
+ static void _write_log_and_missing_wo_missing(
+ ObjectStore::Transaction& t,
+ std::map<std::string,ceph::buffer::list>* km,
+ pg_log_t &log,
+ const coll_t& coll, const ghobject_t &log_oid,
+ std::map<eversion_t, hobject_t> &divergent_priors,
+ eversion_t dirty_to,
+ eversion_t dirty_from,
+ eversion_t writeout_from,
+ bool dirty_divergent_priors,
+ bool touch_log,
+ bool require_rollback,
+ eversion_t dirty_to_dups,
+ eversion_t dirty_from_dups,
+ eversion_t write_from_dups,
+ std::set<std::string> *log_keys_debug,
+ const DoutPrefixProvider *dpp = nullptr
+ );
+
+ static void _write_log_and_missing(
+ ObjectStore::Transaction& t,
+ std::map<std::string,ceph::buffer::list>* km,
+ pg_log_t &log,
+ const coll_t& coll, const ghobject_t &log_oid,
+ eversion_t dirty_to,
+ eversion_t dirty_from,
+ eversion_t writeout_from,
+ std::set<eversion_t> &&trimmed,
+ std::set<std::string> &&trimmed_dups,
+ const pg_missing_tracker_t &missing,
+ bool touch_log,
+ bool require_rollback,
+ bool clear_divergent_priors,
+ eversion_t dirty_to_dups,
+ eversion_t dirty_from_dups,
+ eversion_t write_from_dups,
+ bool *may_include_deletes_in_missing_dirty,
+ std::set<std::string> *log_keys_debug,
+ const DoutPrefixProvider *dpp = nullptr
+ );
+
+ void read_log_and_missing(
+ ObjectStore *store,
+ ObjectStore::CollectionHandle& ch,
+ ghobject_t pgmeta_oid,
+ const pg_info_t &info,
+ std::ostringstream &oss,
+ bool tolerate_divergent_missing_log,
+ bool debug_verify_stored_missing = false
+ ) {
+ return read_log_and_missing(
+ cct, store, ch, pgmeta_oid, info,
+ log, missing, oss,
+ tolerate_divergent_missing_log,
+ &clear_divergent_priors,
+ this,
+ (pg_log_debug ? &log_keys_debug : nullptr),
+ debug_verify_stored_missing);
+ }
+
+ template <typename missing_type>
+ static void read_log_and_missing(
+ CephContext *cct,
+ ObjectStore *store,
+ ObjectStore::CollectionHandle &ch,
+ ghobject_t pgmeta_oid,
+ const pg_info_t &info,
+ IndexedLog &log,
+ missing_type &missing,
+ std::ostringstream &oss,
+ bool tolerate_divergent_missing_log,
+ bool *clear_divergent_priors = nullptr,
+ const DoutPrefixProvider *dpp = nullptr,
+ std::set<std::string> *log_keys_debug = nullptr,
+ bool debug_verify_stored_missing = false
+ ) {
+ ldpp_dout(dpp, 10) << "read_log_and_missing coll " << ch->cid
+ << " " << pgmeta_oid << dendl;
+ size_t total_dups = 0;
+
+ // legacy?
+ struct stat st;
+ int r = store->stat(ch, pgmeta_oid, &st);
+ ceph_assert(r == 0);
+ ceph_assert(st.st_size == 0);
+
+ // will get overridden below if it had been recorded
+ eversion_t on_disk_can_rollback_to = info.last_update;
+ eversion_t on_disk_rollback_info_trimmed_to = eversion_t();
+ ObjectMap::ObjectMapIterator p = store->get_omap_iterator(ch,
+ pgmeta_oid);
+ std::map<eversion_t, hobject_t> divergent_priors;
+ bool must_rebuild = false;
+ missing.may_include_deletes = false;
+ std::list<pg_log_entry_t> entries;
+ std::list<pg_log_dup_t> dups;
+ const auto NUM_DUPS_WARN_THRESHOLD = 2*cct->_conf->osd_pg_log_dups_tracked;
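+ // The pgmeta omap keys decoded by the loop below, for reference:
+ //   "_..."                            non-log keys, skipped
+ //   "divergent_priors"                legacy map; forces a missing rebuild
+ //   "can_rollback_to"                 on-disk rollback boundary
+ //   "rollback_info_trimmed_to"        on-disk rollback trim point
+ //   "may_include_deletes_in_missing"  marker flag for the missing set
+ //   "missing..."                      one pg_missing_item per object
+ //   "dup_..."                         pg_log_dup_t entries
+ //   anything else                     a checksummed pg_log_entry_t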
+ if (p) {
+ using ceph::decode;
+ for (p->seek_to_first(); p->valid() ; p->next()) {
+ // non-log pgmeta_oid keys are prefixed with _; skip those
+ if (p->key()[0] == '_')
+ continue;
+ auto bl = p->value(); // Copy ceph::buffer::list before creating iterator
+ auto bp = bl.cbegin();
+ if (p->key() == "divergent_priors") {
+ decode(divergent_priors, bp);
+ ldpp_dout(dpp, 20) << "read_log_and_missing " << divergent_priors.size()
+ << " divergent_priors" << dendl;
+ must_rebuild = true;
+ debug_verify_stored_missing = false;
+ } else if (p->key() == "can_rollback_to") {
+ decode(on_disk_can_rollback_to, bp);
+ } else if (p->key() == "rollback_info_trimmed_to") {
+ decode(on_disk_rollback_info_trimmed_to, bp);
+ } else if (p->key() == "may_include_deletes_in_missing") {
+ missing.may_include_deletes = true;
+ } else if (p->key().substr(0, 7) == std::string("missing")) {
+ hobject_t oid;
+ pg_missing_item item;
+ decode(oid, bp);
+ decode(item, bp);
+ ldpp_dout(dpp, 20) << "read_log_and_missing " << item << dendl;
+ if (item.is_delete()) {
+ ceph_assert(missing.may_include_deletes);
+ }
+ missing.add(oid, std::move(item));
+ } else if (p->key().substr(0, 4) == std::string("dup_")) {
+ ++total_dups;
+ pg_log_dup_t dup;
+ decode(dup, bp);
+ if (!dups.empty()) {
+ ceph_assert(dups.back().version < dup.version);
+ }
+ if (dups.size() == NUM_DUPS_WARN_THRESHOLD) {
+ ldpp_dout(dpp, 0) << "read_log_and_missing WARN num of dups exceeded "
+ << NUM_DUPS_WARN_THRESHOLD << "."
+ << " You can be hit by THE DUPS BUG"
+ << " https://tracker.ceph.com/issues/53729."
+ << " Consider ceph-objectstore-tool --op trim-pg-log-dups"
+ << dendl;
+ }
+ dups.push_back(dup);
+ } else {
+ pg_log_entry_t e;
+ e.decode_with_checksum(bp);
+ ldpp_dout(dpp, 20) << "read_log_and_missing " << e << dendl;
+ if (!entries.empty()) {
+ pg_log_entry_t last_e(entries.back());
+ ceph_assert(last_e.version.version < e.version.version);
+ ceph_assert(last_e.version.epoch <= e.version.epoch);
+ }
+ entries.push_back(e);
+ if (log_keys_debug)
+ log_keys_debug->insert(e.get_key_name());
+ }
+ }
+ }
+ log = IndexedLog(
+ info.last_update,
+ info.log_tail,
+ on_disk_can_rollback_to,
+ on_disk_rollback_info_trimmed_to,
+ std::move(entries),
+ std::move(dups));
+
+ if (must_rebuild || debug_verify_stored_missing) {
+ // build missing
+ if (debug_verify_stored_missing || info.last_complete < info.last_update) {
+ ldpp_dout(dpp, 10)
+ << "read_log_and_missing checking for missing items over interval ("
+ << info.last_complete
+ << "," << info.last_update << "]" << dendl;
+
+ std::set<hobject_t> did;
+ std::set<hobject_t> checked;
+ std::set<hobject_t> skipped;
+ for (auto i = log.log.rbegin(); i != log.log.rend(); ++i) {
+ if (i->soid > info.last_backfill)
+ continue;
+ if (i->is_error())
+ continue;
+ if (did.count(i->soid)) continue;
+ did.insert(i->soid);
+
+ if (!missing.may_include_deletes && i->is_delete())
+ continue;
+
+ ceph::buffer::list bv;
+ int r = store->getattr(
+ ch,
+ ghobject_t(i->soid, ghobject_t::NO_GEN, info.pgid.shard),
+ OI_ATTR,
+ bv);
+ if (r >= 0) {
+ object_info_t oi(bv);
+ if (oi.version < i->version) {
+ ldpp_dout(dpp, 15) << "read_log_and_missing missing " << *i
+ << " (have " << oi.version << ")"
+ << " clean_regions " << i->clean_regions << dendl;
+
+ if (debug_verify_stored_missing) {
+ auto miter = missing.get_items().find(i->soid);
+ ceph_assert(miter != missing.get_items().end());
+ ceph_assert(miter->second.need == i->version);
+ // the 'have' version is reset if an object is deleted,
+ // then created again
+ ceph_assert(miter->second.have == oi.version || miter->second.have == eversion_t());
+ checked.insert(i->soid);
+ } else {
+ missing.add(i->soid, i->version, oi.version, i->is_delete());
+ }
+ }
+ } else {
+ ldpp_dout(dpp, 15) << "read_log_and_missing missing " << *i << dendl;
+ if (debug_verify_stored_missing) {
+ auto miter = missing.get_items().find(i->soid);
+ if (i->is_delete()) {
+ ceph_assert(miter == missing.get_items().end() ||
+ (miter->second.need == i->version &&
+ miter->second.have == eversion_t()));
+ } else {
+ ceph_assert(miter != missing.get_items().end());
+ ceph_assert(miter->second.need == i->version);
+ ceph_assert(miter->second.have == eversion_t());
+ }
+ checked.insert(i->soid);
+ } else {
+ missing.add(i->soid, i->version, eversion_t(), i->is_delete());
+ }
+ }
+ }
+ if (debug_verify_stored_missing) {
+ for (auto &&i: missing.get_items()) {
+ if (checked.count(i.first))
+ continue;
+ if (i.first > info.last_backfill) {
+ ldpp_dout(dpp, -1) << __func__ << ": invalid missing std::set entry "
+ << "found before last_backfill: "
+ << i.first << " " << i.second
+ << " last_backfill = " << info.last_backfill
+ << dendl;
+ ceph_abort_msg("invalid missing std::set entry found");
+ }
+ ceph::buffer::list bv;
+ int r = store->getattr(
+ ch,
+ ghobject_t(i.first, ghobject_t::NO_GEN, info.pgid.shard),
+ OI_ATTR,
+ bv);
+ if (r >= 0) {
+ object_info_t oi(bv);
+ ceph_assert(oi.version == i.second.have || eversion_t() == i.second.have);
+ } else {
+ ceph_assert(i.second.is_delete() || eversion_t() == i.second.have);
+ }
+ }
+ } else {
+ ceph_assert(must_rebuild);
+ for (auto i = divergent_priors.rbegin();
+ i != divergent_priors.rend();
+ ++i) {
+ if (i->first <= info.last_complete) break;
+ if (i->second > info.last_backfill)
+ continue;
+ if (did.count(i->second)) continue;
+ did.insert(i->second);
+ ceph::buffer::list bv;
+ int r = store->getattr(
+ ch,
+ ghobject_t(i->second, ghobject_t::NO_GEN, info.pgid.shard),
+ OI_ATTR,
+ bv);
+ if (r >= 0) {
+ object_info_t oi(bv);
+ /**
+ * 1) we see this entry in the divergent priors mapping
+ * 2) we didn't see an entry for this object in the log
+ *
+ * From 1 & 2 we know that either the object does not exist
+ * or it is at the version specified in the divergent_priors
+ * map since the object would have been deleted atomically
+ * with the addition of the divergent_priors entry, an older
+ * version would not have been recovered, and a newer version
+ * would show up in the log above.
+ */
+ /**
+ * Unfortunately the assessment above is incorrect because of
+ * http://tracker.ceph.com/issues/17916 (we were incorrectly
+ * not removing the divergent_priors std::set from disk state!),
+ * so let's check that.
+ */
+ if (oi.version > i->first && tolerate_divergent_missing_log) {
+ ldpp_dout(dpp, 0) << "read_log divergent_priors entry (" << *i
+ << ") inconsistent with disk state (" << oi
+ << "), assuming it is tracker.ceph.com/issues/17916"
+ << dendl;
+ } else {
+ ceph_assert(oi.version == i->first);
+ }
+ } else {
+ ldpp_dout(dpp, 15) << "read_log_and_missing missing " << *i << dendl;
+ missing.add(i->second, i->first, eversion_t(), false);
+ }
+ }
+ }
+ if (clear_divergent_priors)
+ (*clear_divergent_priors) = true;
+ }
+ }
+
+ if (!must_rebuild) {
+ if (clear_divergent_priors)
+ (*clear_divergent_priors) = false;
+ missing.flush();
+ }
+ ldpp_dout(dpp, 10) << "read_log_and_missing done coll " << ch->cid
+ << " total_dups=" << total_dups
+ << " log.dups.size()=" << log.dups.size() << dendl;
+ } // static read_log_and_missing
+
+#ifdef WITH_SEASTAR
+ seastar::future<> read_log_and_missing_crimson(
+ crimson::os::FuturizedStore &store,
+ crimson::os::CollectionRef ch,
+ const pg_info_t &info,
+ ghobject_t pgmeta_oid
+ ) {
+ return read_log_and_missing_crimson(
+ store, ch, info,
+ log, (pg_log_debug ? &log_keys_debug : nullptr),
+ missing, pgmeta_oid, this);
+ }
+
+ static seastar::future<> read_log_and_missing_crimson(
+ crimson::os::FuturizedStore &store,
+ crimson::os::CollectionRef ch,
+ const pg_info_t &info,
+ IndexedLog &log,
+ std::set<std::string>* log_keys_debug,
+ pg_missing_tracker_t &missing,
+ ghobject_t pgmeta_oid,
+ const DoutPrefixProvider *dpp = nullptr);
+
+#endif
+
+}; // struct PGLog
diff --git a/src/osd/PGPeeringEvent.cc b/src/osd/PGPeeringEvent.cc
new file mode 100644
index 000000000..2d28c6f84
--- /dev/null
+++ b/src/osd/PGPeeringEvent.cc
@@ -0,0 +1,17 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "include/mempool.h"
+#include "osd/PGPeeringEvent.h"
+#include "messages/MOSDPGLog.h"
+
+MEMPOOL_DEFINE_OBJECT_FACTORY(PGPeeringEvent, pg_peering_evt, osd);
+
+MLogRec::MLogRec(pg_shard_t from, MOSDPGLog *msg)
+ : from(from), msg(msg) {}
+
+void MLogRec::print(std::ostream *out) const
+{
+ *out << "MLogRec from " << from << " ";
+ msg->inner_print(*out);
+}
diff --git a/src/osd/PGPeeringEvent.h b/src/osd/PGPeeringEvent.h
new file mode 100644
index 000000000..2828880f6
--- /dev/null
+++ b/src/osd/PGPeeringEvent.h
@@ -0,0 +1,220 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <boost/statechart/event.hpp>
+
+#include "osd/osd_types.h"
+
+class MOSDPGLog;
+
+/// what we need to instantiate a pg
+struct PGCreateInfo {
+ spg_t pgid;
+ epoch_t epoch = 0;
+ pg_history_t history;
+ PastIntervals past_intervals;
+ bool by_mon;
+ PGCreateInfo(spg_t p, epoch_t e,
+ const pg_history_t& h,
+ const PastIntervals& pi,
+ bool mon)
+ : pgid(p), epoch(e), history(h), past_intervals(pi), by_mon(mon) {}
+};
+
+class PGPeeringEvent {
+ epoch_t epoch_sent;
+ epoch_t epoch_requested;
+ std::string desc;
+public:
+ boost::intrusive_ptr< const boost::statechart::event_base > evt;
+ bool requires_pg;
+ std::unique_ptr<PGCreateInfo> create_info;
+ MEMPOOL_CLASS_HELPERS();
+ template <class T>
+ PGPeeringEvent(
+ epoch_t epoch_sent,
+ epoch_t epoch_requested,
+ const T &evt_,
+ bool req = true,
+ PGCreateInfo *ci = 0)
+ : epoch_sent(epoch_sent),
+ epoch_requested(epoch_requested),
+ evt(evt_.intrusive_from_this()),
+ requires_pg(req),
+ create_info(ci) {
+ std::stringstream out;
+ out << "epoch_sent: " << epoch_sent
+ << " epoch_requested: " << epoch_requested << " ";
+ evt_.print(&out);
+ if (create_info) {
+ out << " +create_info";
+ }
+ desc = out.str();
+ }
+ epoch_t get_epoch_sent() const {
+ return epoch_sent;
+ }
+ epoch_t get_epoch_requested() const {
+ return epoch_requested;
+ }
+ const boost::statechart::event_base &get_event() const {
+ return *evt;
+ }
+ const std::string& get_desc() const {
+ return desc;
+ }
+};
+typedef std::shared_ptr<PGPeeringEvent> PGPeeringEventRef;
+typedef std::unique_ptr<PGPeeringEvent> PGPeeringEventURef;
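+// A sketch of how an event is typically wrapped for delivery (the epoch,
+// osd and trim_to values below are placeholders, not taken from this file):
+//
+//   PGPeeringEvent evt(
+//     map_epoch,        // epoch_sent
+//     map_epoch,        // epoch_requested
+//     MTrim(map_epoch, from_osd, shard_id_t(0), trim_to));
+//   // evt.get_desc() then reads roughly
+//   //   "epoch_sent: E epoch_requested: E MTrim epoch E from ... trim_to ..."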
+
+struct MInfoRec : boost::statechart::event< MInfoRec > {
+ pg_shard_t from;
+ pg_info_t info;
+ epoch_t msg_epoch;
+ std::optional<pg_lease_t> lease;
+ std::optional<pg_lease_ack_t> lease_ack;
+ MInfoRec(pg_shard_t from, const pg_info_t &info, epoch_t msg_epoch,
+ std::optional<pg_lease_t> l = {},
+ std::optional<pg_lease_ack_t> la = {})
+ : from(from), info(info), msg_epoch(msg_epoch),
+ lease(l), lease_ack(la) {}
+ void print(std::ostream *out) const {
+ *out << "MInfoRec from " << from << " info: " << info;
+ if (lease) {
+ *out << " " << *lease;
+ }
+ if (lease_ack) {
+ *out << " " << *lease_ack;
+ }
+ }
+};
+
+struct MLogRec : boost::statechart::event< MLogRec > {
+ pg_shard_t from;
+ boost::intrusive_ptr<MOSDPGLog> msg;
+ MLogRec(pg_shard_t from, MOSDPGLog *msg);
+ void print(std::ostream *out) const;
+};
+
+struct MNotifyRec : boost::statechart::event< MNotifyRec > {
+ spg_t pgid;
+ pg_shard_t from;
+ pg_notify_t notify;
+ uint64_t features;
+ MNotifyRec(spg_t p, pg_shard_t from, const pg_notify_t &notify, uint64_t f)
+ : pgid(p), from(from), notify(notify), features(f) {}
+ void print(std::ostream *out) const {
+ *out << "MNotifyRec " << pgid << " from " << from << " notify: " << notify
+ << " features: 0x" << std::hex << features << std::dec;
+ }
+};
+
+struct MQuery : boost::statechart::event< MQuery > {
+ spg_t pgid;
+ pg_shard_t from;
+ pg_query_t query;
+ epoch_t query_epoch;
+ MQuery(spg_t p, pg_shard_t from, const pg_query_t &query, epoch_t query_epoch)
+ : pgid(p), from(from), query(query), query_epoch(query_epoch) {}
+ void print(std::ostream *out) const {
+ *out << "MQuery " << pgid << " from " << from
+ << " query_epoch " << query_epoch
+ << " query: " << query;
+ }
+};
+
+struct MTrim : boost::statechart::event<MTrim> {
+ epoch_t epoch;
+ int from;
+ shard_id_t shard;
+ eversion_t trim_to;
+ MTrim(epoch_t epoch, int from, shard_id_t shard, eversion_t trim_to)
+ : epoch(epoch), from(from), shard(shard), trim_to(trim_to) {}
+ void print(std::ostream *out) const {
+ *out << "MTrim epoch " << epoch << " from " << from << " shard " << shard
+ << " trim_to " << trim_to;
+ }
+};
+
+struct MLease : boost::statechart::event<MLease> {
+ epoch_t epoch;
+ int from;
+ pg_lease_t lease;
+ MLease(epoch_t epoch, int from, pg_lease_t l)
+ : epoch(epoch), from(from), lease(l) {}
+ void print(std::ostream *out) const {
+ *out << "MLease epoch " << epoch << " from osd." << from << " " << lease;
+ }
+};
+
+struct MLeaseAck : boost::statechart::event<MLeaseAck> {
+ epoch_t epoch;
+ int from;
+ pg_lease_ack_t lease_ack;
+ MLeaseAck(epoch_t epoch, int from, pg_lease_ack_t l)
+ : epoch(epoch), from(from), lease_ack(l) {}
+ void print(std::ostream *out) const {
+ *out << "MLeaseAck epoch " << epoch << " from osd." << from
+ << " " << lease_ack;
+ }
+};
+
+struct RequestBackfillPrio : boost::statechart::event< RequestBackfillPrio > {
+ unsigned priority;
+ int64_t primary_num_bytes;
+ int64_t local_num_bytes;
+ explicit RequestBackfillPrio(unsigned prio, int64_t pbytes, int64_t lbytes) :
+ boost::statechart::event< RequestBackfillPrio >(),
+ priority(prio), primary_num_bytes(pbytes), local_num_bytes(lbytes) {}
+ void print(std::ostream *out) const {
+ *out << "RequestBackfillPrio: priority " << priority
+ << " primary bytes " << primary_num_bytes
+ << " local bytes " << local_num_bytes;
+ }
+};
+
+struct RequestRecoveryPrio : boost::statechart::event< RequestRecoveryPrio > {
+ unsigned priority;
+ explicit RequestRecoveryPrio(unsigned prio) :
+ boost::statechart::event< RequestRecoveryPrio >(),
+ priority(prio) {}
+ void print(std::ostream *out) const {
+ *out << "RequestRecoveryPrio: priority " << priority;
+ }
+};
+
+#define TrivialEvent(T) struct T : boost::statechart::event< T > { \
+ T() : boost::statechart::event< T >() {} \
+ void print(std::ostream *out) const { \
+ *out << #T; \
+ } \
+ };
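+// For reference, TrivialEvent(NullEvt) below expands to:
+//
+//   struct NullEvt : boost::statechart::event< NullEvt > {
+//     NullEvt() : boost::statechart::event< NullEvt >() {}
+//     void print(std::ostream *out) const {
+//       *out << "NullEvt";
+//     }
+//   };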
+
+TrivialEvent(NullEvt)
+TrivialEvent(RemoteBackfillReserved)
+TrivialEvent(RemoteReservationRejectedTooFull)
+TrivialEvent(RemoteReservationRevokedTooFull)
+TrivialEvent(RemoteReservationRevoked)
+TrivialEvent(RemoteReservationCanceled)
+TrivialEvent(RemoteRecoveryReserved)
+TrivialEvent(RecoveryDone)
+
+struct DeferRecovery : boost::statechart::event<DeferRecovery> {
+ float delay;
+ explicit DeferRecovery(float delay) : delay(delay) {}
+ void print(std::ostream *out) const {
+ *out << "DeferRecovery: delay " << delay;
+ }
+};
+
+struct DeferBackfill : boost::statechart::event<DeferBackfill> {
+ float delay;
+ explicit DeferBackfill(float delay) : delay(delay) {}
+ void print(std::ostream *out) const {
+ *out << "DeferBackfill: delay " << delay;
+ }
+};
+
+TrivialEvent(RenewLease)
diff --git a/src/osd/PGStateUtils.cc b/src/osd/PGStateUtils.cc
new file mode 100644
index 000000000..5dbe78eb7
--- /dev/null
+++ b/src/osd/PGStateUtils.cc
@@ -0,0 +1,57 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "PGStateUtils.h"
+#include "common/Clock.h"
+
+using ceph::Formatter;
+
+/*------NamedState----*/
+NamedState::NamedState(PGStateHistory *pgsh, const char *state_name)
+ : pgsh(pgsh), state_name(state_name), enter_time(ceph_clock_now()) {
+ if(pgsh) {
+ pgsh->enter(enter_time, state_name);
+ }
+}
+
+NamedState::~NamedState() {
+ if(pgsh) {
+ pgsh->exit(state_name);
+ }
+}
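+// NamedState is an RAII hook: a state object that derives from it records
+// its own lifetime in the PGStateHistory purely through construction and
+// destruction.  A minimal sketch (the state name here is hypothetical):
+//
+//   struct Started : NamedState {
+//     explicit Started(PGStateHistory *h) : NamedState(h, "Started") {}
+//     // destruction runs ~NamedState(), which calls pgsh->exit("Started")
+//   };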
+
+/*---------PGStateHistory---------*/
+void PGStateHistory::enter(const utime_t entime, const char* state)
+{
+ if (pi == nullptr) {
+ pi = std::make_unique<PGStateInstance>();
+ }
+ pi->enter_state(entime, state);
+}
+
+void PGStateHistory::exit(const char* state) {
+ pi->setepoch(es.get_osdmap_epoch());
+ pi->exit_state(ceph_clock_now());
+ if (pi->empty()) {
+ reset();
+ }
+}
+
+void PGStateHistory::dump(Formatter* f) const {
+ f->open_array_section("history");
+ for (auto pi = buffer.begin(); pi != buffer.end(); ++pi) {
+ f->open_object_section("epochs");
+ f->dump_stream("epoch") << (*pi)->this_epoch;
+ f->open_array_section("states");
+ for (auto she : (*pi)->state_history) {
+ f->open_object_section("state");
+ f->dump_string("state", std::get<2>(she));
+ f->dump_stream("enter") << std::get<0>(she);
+ f->dump_stream("exit") << std::get<1>(she);
+ f->close_section();
+ }
+ f->close_section();
+ f->close_section();
+ }
+ f->close_section();
+}
diff --git a/src/osd/PGStateUtils.h b/src/osd/PGStateUtils.h
new file mode 100644
index 000000000..952464641
--- /dev/null
+++ b/src/osd/PGStateUtils.h
@@ -0,0 +1,85 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "include/utime.h"
+#include "common/Formatter.h"
+
+#include <stack>
+#include <vector>
+#include <boost/circular_buffer.hpp>
+
+class PGStateHistory;
+
+struct EpochSource {
+ virtual epoch_t get_osdmap_epoch() const = 0;
+ virtual ~EpochSource() {}
+};
+
+struct NamedState {
+ PGStateHistory *pgsh;
+ const char *state_name;
+ utime_t enter_time;
+ const char *get_state_name() { return state_name; }
+ NamedState(
+ PGStateHistory *pgsh,
+ const char *state_name_);
+ virtual ~NamedState();
+};
+
+using state_history_entry = std::tuple<utime_t, utime_t, const char*>;
+using embedded_state = std::pair<utime_t, const char*>;
+
+struct PGStateInstance {
+ // Time spent in pg states
+
+ void setepoch(const epoch_t current_epoch) {
+ this_epoch = current_epoch;
+ }
+
+ void enter_state(const utime_t entime, const char* state) {
+ embedded_states.push(std::make_pair(entime, state));
+ }
+
+ void exit_state(const utime_t extime) {
+ embedded_state this_state = embedded_states.top();
+ state_history.push_back(state_history_entry{
+ this_state.first, extime, this_state.second});
+ embedded_states.pop();
+ }
+
+ bool empty() const {
+ return embedded_states.empty();
+ }
+
+ epoch_t this_epoch;
+ std::vector<state_history_entry> state_history;
+ std::stack<embedded_state> embedded_states;
+};
+
+class PGStateHistory {
+public:
+ PGStateHistory(const EpochSource &es) : buffer(10), es(es) {}
+
+ void enter(const utime_t entime, const char* state);
+
+ void exit(const char* state);
+
+ void reset() {
+ buffer.push_back(std::move(pi));
+ pi = nullptr;
+ }
+
+ void dump(ceph::Formatter* f) const;
+
+ const char *get_current_state() const {
+ if (pi == nullptr) return "unknown";
+ return std::get<1>(pi->embedded_states.top());
+ }
+
+private:
+ std::unique_ptr<PGStateInstance> pi;
+ boost::circular_buffer<std::unique_ptr<PGStateInstance>> buffer;
+ const EpochSource &es;
+};
diff --git a/src/osd/PGTransaction.h b/src/osd/PGTransaction.h
new file mode 100644
index 000000000..3b5b9e72c
--- /dev/null
+++ b/src/osd/PGTransaction.h
@@ -0,0 +1,601 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+#ifndef PGTRANSACTION_H
+#define PGTRANSACTION_H
+
+#include <map>
+#include <memory>
+#include <optional>
+
+#include "common/hobject.h"
+#include "osd/osd_types.h"
+#include "osd/osd_internal_types.h"
+#include "common/interval_map.h"
+#include "common/inline_variant.h"
+
+/**
+ * This class represents transactions which can be submitted to
+ * a PGBackend. For expediency, there are some constraints on
+ * the operations submitted:
+ * 1) Rename sources may only be referenced prior to the rename
+ * operation to the destination.
+ * 2) The graph formed by edges of source->destination for clones
+ * (Create) and Renames must be acyclic.
+ * 3) clone_range sources must not be modified by the same
+ * transaction
+ */
+class PGTransaction {
+public:
+ std::map<hobject_t, ObjectContextRef> obc_map;
+
+ class ObjectOperation {
+ public:
+ struct Init
+ {
+ struct None {};
+ struct Create {};
+ struct Clone {
+ hobject_t source;
+ };
+ struct Rename {
+ hobject_t source; // must be temp object
+ };
+ };
+ using InitType = boost::variant<
+ Init::None,
+ Init::Create,
+ Init::Clone,
+ Init::Rename>;
+
+ InitType init_type = Init::None();
+ bool delete_first = false;
+
+ /**
+ * is_none() && is_delete() indicates that we are deleting an
+ * object which already exists and not recreating it. delete_first means
+ * that the transaction logically removes the object.
+
+ * There are really 4 cases:
+
+ * 1) We are modifying an existing object (is_none() &&
+ * !is_delete())
+ * a) If it's an append, we just write into the log entry the old size
+ * b) If it's an actual overwrite, we save the old versions of the
+ * extents being overwritten and write those offsets into the log
+ * entry
+ * 2) We are removing and then recreating an object (!is_none() && is_delete())
+ * -- stash
+ * 3) We are removing an object (is_none() && is_delete()) -- stash
+ * 4) We are creating an object (!is_none() && !is_delete()) -- create (no
+ * stash)
+ *
+ * Create, Clone, Rename are the three ways we can recreate it.
+ * ECBackend transaction planning needs this context
+ * to figure out how to perform the transaction.
+ */
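+ /* Equivalently, in terms of the two fields above:
+ *
+ *   init_type                   delete_first   meaning
+ *   Init::None                  false          modify existing object (1)
+ *   Init::Create/Clone/Rename   true           remove, then recreate  (2)
+ *   Init::None                  true           remove only            (3)
+ *   Init::Create/Clone/Rename   false          create a new object    (4)
+ */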
+ bool deletes_first() const {
+ return delete_first;
+ }
+ bool is_delete() const {
+ return boost::get<Init::None>(&init_type) != nullptr && delete_first;
+ }
+ bool is_none() const {
+ return boost::get<Init::None>(&init_type) != nullptr && !delete_first;
+ }
+ bool is_fresh_object() const {
+ return boost::get<Init::None>(&init_type) == nullptr;
+ }
+ bool is_rename() const {
+ return boost::get<Init::Rename>(&init_type) != nullptr;
+ }
+ bool has_source(hobject_t *source = nullptr) const {
+ return match(
+ init_type,
+ [&](const Init::Clone &op) -> bool {
+ if (source)
+ *source = op.source;
+ return true;
+ },
+ [&](const Init::Rename &op) -> bool {
+ if (source)
+ *source = op.source;
+ return true;
+ },
+ [&](const Init::None &) -> bool { return false; },
+ [&](const Init::Create &) -> bool { return false; });
+ }
+
+ bool clear_omap = false;
+
+ /**
+ * truncate
+ * <lowest, last> ?
+ *
+ * truncate is represented as a pair because in the event of
+ * multiple truncates within a single transaction we need to
+ * remember the lowest truncate and the final object size
+ * (the last truncate). We also adjust the buffers map
+ * to account for truncates overriding previous writes */
+ std::optional<std::pair<uint64_t, uint64_t> > truncate = std::nullopt;
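+ // For example (see truncate() below), within a single transaction:
+ //   truncate(hoid, 100); truncate(hoid, 4096); -> truncate = {100, 4096}
+ //   truncate(hoid, 4096); truncate(hoid, 100); -> truncate = {100, 100}
+ // i.e. the first element is the lowest offset truncated to and the second
+ // is the final object size.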
+
+ std::map<std::string, std::optional<ceph::buffer::list> > attr_updates;
+
+ enum class OmapUpdateType {Remove, Insert, RemoveRange};
+ std::vector<std::pair<OmapUpdateType, ceph::buffer::list> > omap_updates;
+
+ std::optional<ceph::buffer::list> omap_header;
+
+ /// (old, new) -- only valid with no truncate or buffer updates
+ std::optional<std::pair<std::set<snapid_t>, std::set<snapid_t>>> updated_snaps;
+
+ struct alloc_hint_t {
+ uint64_t expected_object_size;
+ uint64_t expected_write_size;
+ uint32_t flags;
+ };
+ std::optional<alloc_hint_t> alloc_hint;
+
+ struct BufferUpdate {
+ struct Write {
+ ceph::buffer::list buffer;
+ uint32_t fadvise_flags;
+ };
+ struct Zero {
+ uint64_t len;
+ };
+ struct CloneRange {
+ hobject_t from;
+ uint64_t offset;
+ uint64_t len;
+ };
+ };
+ using BufferUpdateType = boost::variant<
+ BufferUpdate::Write,
+ BufferUpdate::Zero,
+ BufferUpdate::CloneRange>;
+
+ private:
+ struct SplitMerger {
+ BufferUpdateType split(
+ uint64_t offset,
+ uint64_t len,
+ const BufferUpdateType &bu) const {
+ return match(
+ bu,
+ [&](const BufferUpdate::Write &w) -> BufferUpdateType {
+ ceph::buffer::list bl;
+ bl.substr_of(w.buffer, offset, len);
+ return BufferUpdate::Write{bl, w.fadvise_flags};
+ },
+ [&](const BufferUpdate::Zero &) -> BufferUpdateType {
+ return BufferUpdate::Zero{len};
+ },
+ [&](const BufferUpdate::CloneRange &c) -> BufferUpdateType {
+ return BufferUpdate::CloneRange{c.from, c.offset + offset, len};
+ });
+ }
+ uint64_t length(
+ const BufferUpdateType &left) const {
+ return match(
+ left,
+ [&](const BufferUpdate::Write &w) -> uint64_t {
+ return w.buffer.length();
+ },
+ [&](const BufferUpdate::Zero &z) -> uint64_t {
+ return z.len;
+ },
+ [&](const BufferUpdate::CloneRange &c) -> uint64_t {
+ return c.len;
+ });
+ }
+ bool can_merge(
+ const BufferUpdateType &left,
+ const BufferUpdateType &right) const {
+ return match(
+ left,
+ [&](const BufferUpdate::Write &w) -> bool {
+ auto r = boost::get<BufferUpdate::Write>(&right);
+ return r != nullptr && (w.fadvise_flags == r->fadvise_flags);
+ },
+ [&](const BufferUpdate::Zero &) -> bool {
+ auto r = boost::get<BufferUpdate::Zero>(&right);
+ return r != nullptr;
+ },
+ [&](const BufferUpdate::CloneRange &c) -> bool {
+ return false;
+ });
+ }
+ BufferUpdateType merge(
+ BufferUpdateType &&left,
+ BufferUpdateType &&right) const {
+ return match(
+ left,
+ [&](const BufferUpdate::Write &w) -> BufferUpdateType {
+ auto r = boost::get<BufferUpdate::Write>(&right);
+ ceph_assert(r && w.fadvise_flags == r->fadvise_flags);
+ ceph::buffer::list bl = w.buffer;
+ bl.append(r->buffer);
+ return BufferUpdate::Write{bl, w.fadvise_flags};
+ },
+ [&](const BufferUpdate::Zero &z) -> BufferUpdateType {
+ auto r = boost::get<BufferUpdate::Zero>(&right);
+ ceph_assert(r);
+ return BufferUpdate::Zero{z.len + r->len};
+ },
+ [&](const BufferUpdate::CloneRange &c) -> BufferUpdateType {
+ ceph_abort_msg("violates can_merge condition");
+ return left;
+ });
+ }
+ };
+ public:
+ using buffer_update_type = interval_map<
+ uint64_t, BufferUpdateType, SplitMerger>;
+ buffer_update_type buffer_updates;
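+ // buffer_updates stores byte-range updates keyed by offset; SplitMerger
+ // above controls how neighbouring entries combine.  In short:
+ //   - adjacent Writes with identical fadvise_flags merge, concatenating
+ //     their buffers
+ //   - adjacent Zeros merge into one longer Zero
+ //   - CloneRange never merges
+ //   - inserting over part of an existing entry splits it, keeping only the
+ //     sub-ranges that were not overwritten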
+
+ friend class PGTransaction;
+ };
+ std::map<hobject_t, ObjectOperation> op_map;
+private:
+ ObjectOperation &get_object_op_for_modify(const hobject_t &hoid) {
+ auto &op = op_map[hoid];
+ ceph_assert(!op.is_delete());
+ return op;
+ }
+ ObjectOperation &get_object_op(const hobject_t &hoid) {
+ return op_map[hoid];
+ }
+public:
+ void add_obc(
+ ObjectContextRef obc) {
+ ceph_assert(obc);
+ obc_map[obc->obs.oi.soid] = obc;
+ }
+ /// Sets up state for new object
+ void create(
+ const hobject_t &hoid
+ ) {
+ auto &op = op_map[hoid];
+ ceph_assert(op.is_none() || op.is_delete());
+ op.init_type = ObjectOperation::Init::Create();
+ }
+
+ /// Sets up state for target cloned from source
+ void clone(
+ const hobject_t &target, ///< [in] obj to clone to
+ const hobject_t &source ///< [in] obj to clone from
+ ) {
+ auto &op = op_map[target];
+ ceph_assert(op.is_none() || op.is_delete());
+ op.init_type = ObjectOperation::Init::Clone{source};
+ }
+
+ /// Sets up state for target renamed from source
+ void rename(
+ const hobject_t &target, ///< [in] to, must not exist, be non-temp
+ const hobject_t &source ///< [in] source (must be a temp object)
+ ) {
+ ceph_assert(source.is_temp());
+ ceph_assert(!target.is_temp());
+ auto &op = op_map[target];
+ ceph_assert(op.is_none() || op.is_delete());
+
+ bool del_first = op.is_delete();
+ auto iter = op_map.find(source);
+ if (iter != op_map.end()) {
+ op = iter->second;
+ op_map.erase(iter);
+ op.delete_first = del_first;
+ }
+
+ op.init_type = ObjectOperation::Init::Rename{source};
+ }
+
+ /// Remove -- must not be called on rename target
+ void remove(
+ const hobject_t &hoid ///< [in] obj to remove
+ ) {
+ auto &op = get_object_op_for_modify(hoid);
+ if (!op.is_fresh_object()) {
+ ceph_assert(!op.updated_snaps);
+ op = ObjectOperation();
+ op.delete_first = true;
+ } else {
+ ceph_assert(!op.is_rename());
+ op_map.erase(hoid); // make it a noop if it's a fresh object
+ }
+ }
+
+ void update_snaps(
+ const hobject_t &hoid, ///< [in] object for snaps
+ const std::set<snapid_t> &old_snaps,///< [in] old snaps value
+ const std::set<snapid_t> &new_snaps ///< [in] new snaps value
+ ) {
+ auto &op = get_object_op(hoid);
+ ceph_assert(!op.updated_snaps);
+ ceph_assert(op.buffer_updates.empty());
+ ceph_assert(!op.truncate);
+ op.updated_snaps = make_pair(
+ old_snaps,
+ new_snaps);
+ }
+
+ /// Clears, truncates
+ void omap_clear(
+ const hobject_t &hoid ///< [in] object to clear omap
+ ) {
+ auto &op = get_object_op_for_modify(hoid);
+ op.clear_omap = true;
+ op.omap_updates.clear();
+ op.omap_header = std::nullopt;
+ }
+ void truncate(
+ const hobject_t &hoid, ///< [in] object
+ uint64_t off ///< [in] offset to truncate to
+ ) {
+ auto &op = get_object_op_for_modify(hoid);
+ ceph_assert(!op.updated_snaps);
+ op.buffer_updates.erase(
+ off,
+ std::numeric_limits<uint64_t>::max() - off);
+ if (!op.truncate || off < op.truncate->first) {
+ op.truncate = std::pair<uint64_t, uint64_t>(off, off);
+ } else {
+ op.truncate->second = off;
+ }
+ }
+
+ /// Attr ops
+ void setattrs(
+ const hobject_t &hoid, ///< [in] object to write
+ std::map<std::string, ceph::buffer::list> &attrs ///< [in] attrs, may be cleared
+ ) {
+ auto &op = get_object_op_for_modify(hoid);
+ for (auto &&i: attrs) {
+ auto& d = op.attr_updates[i.first];
+ d = i.second;
+ d->rebuild();
+ }
+ }
+ void setattr(
+ const hobject_t &hoid, ///< [in] object to write
+ const std::string &attrname, ///< [in] attr to write
+ ceph::buffer::list &bl ///< [in] val to write, may be claimed
+ ) {
+ auto &op = get_object_op_for_modify(hoid);
+ auto& d = op.attr_updates[attrname];
+ d = bl;
+ d->rebuild();
+ }
+ void rmattr(
+ const hobject_t &hoid, ///< [in] object to write
+ const std::string &attrname ///< [in] attr to remove
+ ) {
+ auto &op = get_object_op_for_modify(hoid);
+ op.attr_updates[attrname] = std::nullopt;
+ }
+
+ /// set alloc hint
+ void set_alloc_hint(
+ const hobject_t &hoid, ///< [in] object (must exist)
+ uint64_t expected_object_size, ///< [in]
+ uint64_t expected_write_size,
+ uint32_t flags
+ ) {
+ auto &op = get_object_op_for_modify(hoid);
+ op.alloc_hint = ObjectOperation::alloc_hint_t{
+ expected_object_size, expected_write_size, flags};
+ }
+
+ /// Buffer updates
+ void write(
+ const hobject_t &hoid, ///< [in] object to write
+ uint64_t off, ///< [in] off at which to write
+ uint64_t len, ///< [in] len to write from bl
+ ceph::buffer::list &bl, ///< [in] bl to write will be claimed to len
+ uint32_t fadvise_flags = 0 ///< [in] fadvise hint
+ ) {
+ auto &op = get_object_op_for_modify(hoid);
+ ceph_assert(!op.updated_snaps);
+ ceph_assert(len > 0);
+ ceph_assert(len == bl.length());
+ op.buffer_updates.insert(
+ off,
+ len,
+ ObjectOperation::BufferUpdate::Write{bl, fadvise_flags});
+ }
+ void clone_range(
+ const hobject_t &from, ///< [in] from
+ const hobject_t &to, ///< [in] to
+ uint64_t fromoff, ///< [in] offset
+ uint64_t len, ///< [in] len
+ uint64_t tooff ///< [in] offset
+ ) {
+ auto &op = get_object_op_for_modify(to);
+ ceph_assert(!op.updated_snaps);
+ op.buffer_updates.insert(
+ tooff,
+ len,
+ ObjectOperation::BufferUpdate::CloneRange{from, fromoff, len});
+ }
+ void zero(
+ const hobject_t &hoid, ///< [in] object
+ uint64_t off, ///< [in] offset to start zeroing at
+ uint64_t len ///< [in] amount to zero
+ ) {
+ auto &op = get_object_op_for_modify(hoid);
+ ceph_assert(!op.updated_snaps);
+ op.buffer_updates.insert(
+ off,
+ len,
+ ObjectOperation::BufferUpdate::Zero{len});
+ }
+
+ /// Omap updates
+ void omap_setkeys(
+ const hobject_t &hoid, ///< [in] object to write
+ ceph::buffer::list &keys_bl ///< [in] encoded map<string, ceph::buffer::list>
+ ) {
+ auto &op = get_object_op_for_modify(hoid);
+ op.omap_updates.emplace_back(
+ std::make_pair(
+ ObjectOperation::OmapUpdateType::Insert,
+ keys_bl));
+ }
+ void omap_setkeys(
+ const hobject_t &hoid, ///< [in] object to write
+ std::map<std::string, ceph::buffer::list> &keys ///< [in] omap keys, may be cleared
+ ) {
+ using ceph::encode;
+ ceph::buffer::list bl;
+ encode(keys, bl);
+ omap_setkeys(hoid, bl);
+ }
+ void omap_rmkeys(
+ const hobject_t &hoid, ///< [in] object to write
+ ceph::buffer::list &keys_bl ///< [in] encode set<string>
+ ) {
+ auto &op = get_object_op_for_modify(hoid);
+ op.omap_updates.emplace_back(
+ std::make_pair(
+ ObjectOperation::OmapUpdateType::Remove,
+ keys_bl));
+ }
+ void omap_rmkeys(
+ const hobject_t &hoid, ///< [in] object to write
+ std::set<std::string> &keys ///< [in] omap keys, may be cleared
+ ) {
+ using ceph::encode;
+ ceph::buffer::list bl;
+ encode(keys, bl);
+ omap_rmkeys(hoid, bl);
+ }
+ void omap_rmkeyrange(
+ const hobject_t &hoid, ///< [in] object to write
+ ceph::buffer::list &range_bl ///< [in] encode string[2]
+ ) {
+ auto &op = get_object_op_for_modify(hoid);
+ op.omap_updates.emplace_back(
+ std::make_pair(
+ ObjectOperation::OmapUpdateType::RemoveRange,
+ range_bl));
+ }
+ void omap_rmkeyrange(
+ const hobject_t &hoid, ///< [in] object to write
+ std::string& key_begin, ///< [in] first key in range
+ std::string& key_end ///< [in] first key past range, range is [first,last)
+ ) {
+ ceph::buffer::list bl;
+ ::encode(key_begin, bl);
+ ::encode(key_end, bl);
+ omap_rmkeyrange(hoid, bl);
+ }
+ void omap_setheader(
+ const hobject_t &hoid, ///< [in] object to write
+ ceph::buffer::list &header ///< [in] header
+ ) {
+ auto &op = get_object_op_for_modify(hoid);
+ op.omap_header = header;
+ }
+
+ bool empty() const {
+ return op_map.empty();
+ }
+
+ uint64_t get_bytes_written() const {
+ uint64_t ret = 0;
+ for (auto &&i: op_map) {
+ for (auto &&j: i.second.buffer_updates) {
+ ret += j.get_len();
+ }
+ }
+ return ret;
+ }
+
+ void nop(
+ const hobject_t &hoid ///< [in] obj to which we are doing nothing
+ ) {
+ get_object_op_for_modify(hoid);
+ }
+
+ /* Calls t() on all pair<hobject_t, ObjectOperation> & such that clone/rename
+ * sinks are always called before clone sources
+ *
+ * TODO: add a fast path for the single object case and possibly the single
+ * object clone from source case (make_writeable made a clone).
+ *
+ * This structure only requires that the source->sink graph be acyclic.
+ * This is much more general than is actually required by PrimaryLogPG.
+ * Only 4 flavors of multi-object transactions actually happen:
+ * 1) rename temp -> object for copyfrom
+ * 2) clone head -> clone, modify head for make_writeable on normal head write
+ * 3) clone clone -> head for rollback
+ * 4) 2 + 3
+ *
+ * We can bypass the below logic for single object transactions trivially
+ * (including case 1 above since temp doesn't show up again).
+ * For 2-3, we could add something ad-hoc to ensure that they happen in the
+ * right order, but it actually seems easier to just do the graph construction.
+ */
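+ /* Worked example for flavor 2 above (clone head -> clone, then write head),
+ * with hypothetical oids "head" and "cloneoid":
+ *   op_map = { head: Init::None + buffer updates,
+ *              cloneoid: Init::Clone{source = head} }
+ *   - dgraph[head] = [cloneoid]; head has no source, so stack = [head]
+ *   - visiting head splices its children to the front: stack = [cloneoid, head]
+ *   - cloneoid is now a leaf, so t() runs on cloneoid first, then on head,
+ *     i.e. the clone (sink) is handled before its source.
+ */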
+ template <typename T>
+ void safe_create_traverse(T &&t) {
+ std::map<hobject_t, std::list<hobject_t>> dgraph;
+ std::list<hobject_t> stack;
+
+ // Populate stack with roots, dgraph with edges
+ for (auto &&opair: op_map) {
+ hobject_t source;
+ if (opair.second.has_source(&source)) {
+ auto &l = dgraph[source];
+ if (l.empty() && !op_map.count(source)) {
+ /* Source oids not in op_map need to be added as roots
+ * (but only once!) */
+ stack.push_back(source);
+ }
+ l.push_back(opair.first);
+ } else {
+ stack.push_back(opair.first);
+ }
+ }
+
+ /* Why don't we need to worry about accessing the same node
+ * twice? dgraph nodes always have in-degree at most 1 because
+ * the inverse graph nodes (source->dest) can have out-degree
+ * at most 1 (only one possible source). We do a post-order
+ * depth-first traversal here to ensure we call f on children
+ * before parents.
+ */
+ while (!stack.empty()) {
+ hobject_t &cur = stack.front();
+ auto diter = dgraph.find(cur);
+ if (diter == dgraph.end()) {
+ /* Leaf: pop and call t() */
+ auto opiter = op_map.find(cur);
+ if (opiter != op_map.end())
+ t(*opiter);
+ stack.pop_front();
+ } else {
+ /* Internal node: push children onto stack, remove edge,
+ * recurse. When this node is encountered again, it'll
+ * be a leaf */
+ ceph_assert(!diter->second.empty());
+ stack.splice(stack.begin(), diter->second);
+ dgraph.erase(diter);
+ }
+ }
+ }
+};
+using PGTransactionUPtr = std::unique_ptr<PGTransaction>;
+
+#endif
diff --git a/src/osd/PeeringState.cc b/src/osd/PeeringState.cc
new file mode 100644
index 000000000..9709f3ce1
--- /dev/null
+++ b/src/osd/PeeringState.cc
@@ -0,0 +1,7607 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "PGPeeringEvent.h"
+#include "common/ceph_releases.h"
+#include "common/dout.h"
+#include "PeeringState.h"
+
+#include "messages/MOSDPGRemove.h"
+#include "messages/MBackfillReserve.h"
+#include "messages/MRecoveryReserve.h"
+#include "messages/MOSDScrubReserve.h"
+#include "messages/MOSDPGInfo.h"
+#include "messages/MOSDPGInfo2.h"
+#include "messages/MOSDPGTrim.h"
+#include "messages/MOSDPGLog.h"
+#include "messages/MOSDPGNotify.h"
+#include "messages/MOSDPGNotify2.h"
+#include "messages/MOSDPGQuery.h"
+#include "messages/MOSDPGQuery2.h"
+#include "messages/MOSDPGLease.h"
+#include "messages/MOSDPGLeaseAck.h"
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_osd
+
+using std::dec;
+using std::hex;
+using std::make_pair;
+using std::map;
+using std::ostream;
+using std::pair;
+using std::set;
+using std::string;
+using std::stringstream;
+using std::vector;
+
+using ceph::Formatter;
+using ceph::make_message;
+
+BufferedRecoveryMessages::BufferedRecoveryMessages(
+ ceph_release_t r,
+ PeeringCtx &ctx)
+ : require_osd_release(r) {
+ // steal messages from ctx
+ message_map.swap(ctx.message_map);
+}
+
+void BufferedRecoveryMessages::send_notify(int to, const pg_notify_t &n)
+{
+ if (require_osd_release >= ceph_release_t::octopus) {
+ spg_t pgid(n.info.pgid.pgid, n.to);
+ send_osd_message(to, make_message<MOSDPGNotify2>(pgid, n));
+ } else {
+ send_osd_message(to, make_message<MOSDPGNotify>(n.epoch_sent, vector{n}));
+ }
+}
+
+void BufferedRecoveryMessages::send_query(
+ int to,
+ spg_t to_spgid,
+ const pg_query_t &q)
+{
+ if (require_osd_release >= ceph_release_t::octopus) {
+ send_osd_message(to,
+ make_message<MOSDPGQuery2>(to_spgid, q));
+ } else {
+ auto m = make_message<MOSDPGQuery>(
+ q.epoch_sent,
+ MOSDPGQuery::pg_list_t{{to_spgid, q}});
+ send_osd_message(to, m);
+ }
+}
+
+void BufferedRecoveryMessages::send_info(
+ int to,
+ spg_t to_spgid,
+ epoch_t min_epoch,
+ epoch_t cur_epoch,
+ const pg_info_t &info,
+ std::optional<pg_lease_t> lease,
+ std::optional<pg_lease_ack_t> lease_ack)
+{
+ if (require_osd_release >= ceph_release_t::octopus) {
+ send_osd_message(
+ to,
+ make_message<MOSDPGInfo2>(
+ to_spgid,
+ info,
+ cur_epoch,
+ min_epoch,
+ lease,
+ lease_ack)
+ );
+ } else {
+ send_osd_message(
+ to,
+ make_message<MOSDPGInfo>(
+ cur_epoch,
+ vector{pg_notify_t{to_spgid.shard,
+ info.pgid.shard,
+ min_epoch, cur_epoch,
+ info, PastIntervals{}}})
+ );
+ }
+}
+
+void PGPool::update(OSDMapRef map)
+{
+ const pg_pool_t *pi = map->get_pg_pool(id);
+ if (!pi) {
+ return; // pool has been deleted
+ }
+ info = *pi;
+ name = map->get_pool_name(id);
+
+ bool updated = false;
+ if ((map->get_epoch() != cached_epoch + 1) ||
+ (pi->get_snap_epoch() == map->get_epoch())) {
+ updated = true;
+ }
+
+ if (info.is_pool_snaps_mode() && updated) {
+ snapc = pi->get_snap_context();
+ }
+ cached_epoch = map->get_epoch();
+}
+
+/*-------------Peering State Helpers----------------*/
+#undef dout_prefix
+#define dout_prefix (dpp->gen_prefix(*_dout))
+#undef psdout
+#define psdout(x) ldout(cct, x)
+
+PeeringState::PeeringState(
+ CephContext *cct,
+ pg_shard_t pg_whoami,
+ spg_t spgid,
+ const PGPool &_pool,
+ OSDMapRef curmap,
+ DoutPrefixProvider *dpp,
+ PeeringListener *pl)
+ : state_history(*pl),
+ cct(cct),
+ spgid(spgid),
+ dpp(dpp),
+ pl(pl),
+ orig_ctx(0),
+ osdmap_ref(curmap),
+ pool(_pool),
+ pg_whoami(pg_whoami),
+ info(spgid),
+ pg_log(cct),
+ missing_loc(spgid, this, dpp, cct),
+ machine(this, cct, spgid, dpp, pl, &state_history)
+{
+ machine.initiate();
+}
+
+void PeeringState::start_handle(PeeringCtx *new_ctx) {
+ ceph_assert(!rctx);
+ ceph_assert(!orig_ctx);
+ orig_ctx = new_ctx;
+ if (new_ctx) {
+ if (messages_pending_flush) {
+ rctx.emplace(*messages_pending_flush, *new_ctx);
+ } else {
+ rctx.emplace(*new_ctx);
+ }
+ rctx->start_time = ceph_clock_now();
+ }
+}
+
+void PeeringState::begin_block_outgoing() {
+ ceph_assert(!messages_pending_flush);
+ ceph_assert(orig_ctx);
+ ceph_assert(rctx);
+ messages_pending_flush = BufferedRecoveryMessages(
+ orig_ctx->require_osd_release);
+ rctx.emplace(*messages_pending_flush, *orig_ctx);
+}
+
+void PeeringState::clear_blocked_outgoing() {
+ ceph_assert(orig_ctx);
+ ceph_assert(rctx);
+ messages_pending_flush = std::optional<BufferedRecoveryMessages>();
+}
+
+void PeeringState::end_block_outgoing() {
+ ceph_assert(messages_pending_flush);
+ ceph_assert(orig_ctx);
+ ceph_assert(rctx);
+
+ orig_ctx->accept_buffered_messages(*messages_pending_flush);
+ rctx.emplace(*orig_ctx);
+ messages_pending_flush = std::optional<BufferedRecoveryMessages>();
+}
+
+void PeeringState::end_handle() {
+ if (rctx) {
+ utime_t dur = ceph_clock_now() - rctx->start_time;
+ machine.event_time += dur;
+ }
+
+ machine.event_count++;
+ rctx = std::nullopt;
+ orig_ctx = NULL;
+}
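+// Taken together, the asserts above imply this call sequence per delivered
+// event (a sketch; the caller names are illustrative):
+//
+//   start_handle(&ctx);          // bind rctx to the caller's PeeringCtx
+//   ...                          // state machine work, optionally bracketed by
+//   begin_block_outgoing(); ...; end_block_outgoing();  // buffer, then flush
+//   end_handle();                // account event time, drop rctx/orig_ctx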
+
+void PeeringState::check_recovery_sources(const OSDMapRef& osdmap)
+{
+ /*
+ * check that any peers we are planning to (or currently) pulling
+ * objects from are dealt with.
+ */
+ missing_loc.check_recovery_sources(osdmap);
+ pl->check_recovery_sources(osdmap);
+
+ for (auto i = peer_log_requested.begin(); i != peer_log_requested.end();) {
+ if (!osdmap->is_up(i->osd)) {
+ psdout(10) << "peer_log_requested removing " << *i << dendl;
+ peer_log_requested.erase(i++);
+ } else {
+ ++i;
+ }
+ }
+
+ for (auto i = peer_missing_requested.begin();
+ i != peer_missing_requested.end();) {
+ if (!osdmap->is_up(i->osd)) {
+ psdout(10) << "peer_missing_requested removing " << *i << dendl;
+ peer_missing_requested.erase(i++);
+ } else {
+ ++i;
+ }
+ }
+}
+
+void PeeringState::update_history(const pg_history_t& new_history)
+{
+ auto mnow = pl->get_mnow();
+ info.history.refresh_prior_readable_until_ub(mnow, prior_readable_until_ub);
+ if (info.history.merge(new_history)) {
+ psdout(20) << __func__ << " advanced history from " << new_history << dendl;
+ dirty_info = true;
+ if (info.history.last_epoch_clean >= info.history.same_interval_since) {
+ psdout(20) << __func__ << " clearing past_intervals" << dendl;
+ past_intervals.clear();
+ dirty_big_info = true;
+ }
+ prior_readable_until_ub = info.history.get_prior_readable_until_ub(mnow);
+ if (prior_readable_until_ub != ceph::signedspan::zero()) {
+ dout(20) << __func__
+ << " prior_readable_until_ub " << prior_readable_until_ub
+ << " (mnow " << mnow << " + "
+ << info.history.prior_readable_until_ub << ")" << dendl;
+ }
+ }
+ pl->on_info_history_change();
+}
+
+hobject_t PeeringState::earliest_backfill() const
+{
+ hobject_t e = hobject_t::get_max();
+ for (const pg_shard_t& bt : get_backfill_targets()) {
+ const pg_info_t &pi = get_peer_info(bt);
+ e = std::min(pi.last_backfill, e);
+ }
+ return e;
+}
+
+void PeeringState::purge_strays()
+{
+ if (is_premerge()) {
+ psdout(10) << "purge_strays " << stray_set << " but premerge, doing nothing"
+ << dendl;
+ return;
+ }
+ if (cct->_conf.get_val<bool>("osd_debug_no_purge_strays")) {
+ return;
+ }
+ psdout(10) << "purge_strays " << stray_set << dendl;
+
+ bool removed = false;
+ for (auto p = stray_set.begin(); p != stray_set.end(); ++p) {
+ ceph_assert(!is_acting_recovery_backfill(*p));
+ if (get_osdmap()->is_up(p->osd)) {
+ psdout(10) << "sending PGRemove to osd." << *p << dendl;
+ vector<spg_t> to_remove;
+ to_remove.push_back(spg_t(info.pgid.pgid, p->shard));
+ auto m = make_message<MOSDPGRemove>(
+ get_osdmap_epoch(),
+ to_remove);
+ pl->send_cluster_message(p->osd, m, get_osdmap_epoch());
+ } else {
+ psdout(10) << "not sending PGRemove to down osd." << *p << dendl;
+ }
+ peer_missing.erase(*p);
+ peer_info.erase(*p);
+ missing_loc.remove_stray_recovery_sources(*p);
+ peer_purged.insert(*p);
+ removed = true;
+ }
+
+ // if we removed anyone, update peers (which include peer_info)
+ if (removed)
+ update_heartbeat_peers();
+
+ stray_set.clear();
+
+ // clear _requested maps; we may have to peer() again if we discover
+ // (more) stray content
+ peer_log_requested.clear();
+ peer_missing_requested.clear();
+}
+
+void PeeringState::query_unfound(Formatter *f, string state)
+{
+ psdout(20) << "Enter PeeringState common QueryUnfound" << dendl;
+ {
+ f->dump_string("state", state);
+ f->dump_bool("available_might_have_unfound", true);
+ f->open_array_section("might_have_unfound");
+ for (auto p = might_have_unfound.begin();
+ p != might_have_unfound.end();
+ ++p) {
+ if (peer_missing.count(*p)) {
+ ; // Ignore already probed OSDs
+ } else {
+ f->open_object_section("osd");
+ f->dump_stream("osd") << *p;
+ if (peer_missing_requested.count(*p)) {
+ f->dump_string("status", "querying");
+ } else if (!get_osdmap()->is_up(p->osd)) {
+ f->dump_string("status", "osd is down");
+ } else {
+ f->dump_string("status", "not queried");
+ }
+ f->close_section();
+ }
+ }
+ f->close_section();
+ }
+ psdout(20) << "Exit PeeringState common QueryUnfound" << dendl;
+ return;
+}
+
+bool PeeringState::proc_replica_info(
+ pg_shard_t from, const pg_info_t &oinfo, epoch_t send_epoch)
+{
+ auto p = peer_info.find(from);
+ if (p != peer_info.end() && p->second.last_update == oinfo.last_update) {
+ psdout(10) << " got dup osd." << from << " info "
+ << oinfo << ", identical to ours" << dendl;
+ return false;
+ }
+
+ if (!get_osdmap()->has_been_up_since(from.osd, send_epoch)) {
+ psdout(10) << " got info " << oinfo << " from down osd." << from
+ << " discarding" << dendl;
+ return false;
+ }
+
+ psdout(10) << " got osd." << from << " " << oinfo << dendl;
+ ceph_assert(is_primary());
+ peer_info[from] = oinfo;
+ might_have_unfound.insert(from);
+
+ update_history(oinfo.history);
+
+ // stray?
+ if (!is_up(from) && !is_acting(from)) {
+ psdout(10) << " osd." << from << " has stray content: " << oinfo << dendl;
+ stray_set.insert(from);
+ if (is_clean()) {
+ purge_strays();
+ }
+ }
+
+ // was this a new info? if so, update peers!
+ if (p == peer_info.end())
+ update_heartbeat_peers();
+
+ return true;
+}
+
+
+void PeeringState::remove_down_peer_info(const OSDMapRef &osdmap)
+{
+ // Remove any downed osds from peer_info
+ bool removed = false;
+ auto p = peer_info.begin();
+ while (p != peer_info.end()) {
+ if (!osdmap->is_up(p->first.osd)) {
+ psdout(10) << " dropping down osd." << p->first << " info " << p->second << dendl;
+ peer_missing.erase(p->first);
+ peer_log_requested.erase(p->first);
+ peer_missing_requested.erase(p->first);
+ peer_info.erase(p++);
+ removed = true;
+ } else
+ ++p;
+ }
+
+ // Remove any downed osds from peer_purged so we can re-purge if necessary
+ auto it = peer_purged.begin();
+ while (it != peer_purged.end()) {
+ if (!osdmap->is_up(it->osd)) {
+ psdout(10) << " dropping down osd." << *it << " from peer_purged" << dendl;
+ peer_purged.erase(it++);
+ } else {
+ ++it;
+ }
+ }
+
+ // if we removed anyone, update peers (which include peer_info)
+ if (removed)
+ update_heartbeat_peers();
+
+ check_recovery_sources(osdmap);
+}
+
+void PeeringState::update_heartbeat_peers()
+{
+ if (!is_primary())
+ return;
+
+ set<int> new_peers;
+ for (unsigned i=0; i<acting.size(); i++) {
+ if (acting[i] != CRUSH_ITEM_NONE)
+ new_peers.insert(acting[i]);
+ }
+ for (unsigned i=0; i<up.size(); i++) {
+ if (up[i] != CRUSH_ITEM_NONE)
+ new_peers.insert(up[i]);
+ }
+ for (auto p = peer_info.begin(); p != peer_info.end(); ++p) {
+ new_peers.insert(p->first.osd);
+ }
+ pl->update_heartbeat_peers(std::move(new_peers));
+}
+
+void PeeringState::write_if_dirty(ObjectStore::Transaction& t)
+{
+ pl->prepare_write(
+ info,
+ last_written_info,
+ past_intervals,
+ pg_log,
+ dirty_info,
+ dirty_big_info,
+ last_persisted_osdmap < get_osdmap_epoch(),
+ t);
+ if (dirty_info || dirty_big_info) {
+ last_persisted_osdmap = get_osdmap_epoch();
+ last_written_info = info;
+ dirty_info = false;
+ dirty_big_info = false;
+ }
+}
+
+void PeeringState::advance_map(
+ OSDMapRef osdmap, OSDMapRef lastmap,
+ vector<int>& newup, int up_primary,
+ vector<int>& newacting, int acting_primary,
+ PeeringCtx &rctx)
+{
+ ceph_assert(lastmap == osdmap_ref);
+ psdout(10) << "handle_advance_map "
+ << newup << "/" << newacting
+ << " -- " << up_primary << "/" << acting_primary
+ << dendl;
+
+ update_osdmap_ref(osdmap);
+ pool.update(osdmap);
+
+ AdvMap evt(
+ osdmap, lastmap, newup, up_primary,
+ newacting, acting_primary);
+ handle_event(evt, &rctx);
+ if (pool.info.last_change == osdmap_ref->get_epoch()) {
+ pl->on_pool_change();
+ }
+ readable_interval = pool.get_readable_interval(cct->_conf);
+ last_require_osd_release = osdmap->require_osd_release;
+}
+
+void PeeringState::activate_map(PeeringCtx &rctx)
+{
+ psdout(10) << __func__ << dendl;
+ ActMap evt;
+ handle_event(evt, &rctx);
+ if (osdmap_ref->get_epoch() - last_persisted_osdmap >
+ cct->_conf->osd_pg_epoch_persisted_max_stale) {
+ psdout(20) << __func__ << ": Dirtying info: last_persisted is "
+ << last_persisted_osdmap
+ << " while current is " << osdmap_ref->get_epoch() << dendl;
+ dirty_info = true;
+ } else {
+ psdout(20) << __func__ << ": Not dirtying info: last_persisted is "
+ << last_persisted_osdmap
+ << " while current is " << osdmap_ref->get_epoch() << dendl;
+ }
+ write_if_dirty(rctx.transaction);
+
+ if (get_osdmap()->check_new_blocklist_entries()) {
+ pl->check_blocklisted_watchers();
+ }
+}
+
+void PeeringState::set_last_peering_reset()
+{
+ psdout(20) << "set_last_peering_reset " << get_osdmap_epoch() << dendl;
+ if (last_peering_reset != get_osdmap_epoch()) {
+ last_peering_reset = get_osdmap_epoch();
+ psdout(10) << "Clearing blocked outgoing recovery messages" << dendl;
+ clear_blocked_outgoing();
+ if (!pl->try_flush_or_schedule_async()) {
+ psdout(10) << "Beginning to block outgoing recovery messages" << dendl;
+ begin_block_outgoing();
+ } else {
+ psdout(10) << "Not blocking outgoing recovery messages" << dendl;
+ }
+ }
+}
+
+void PeeringState::complete_flush()
+{
+ flushes_in_progress--;
+ if (flushes_in_progress == 0) {
+ pl->on_flushed();
+ }
+}
+
+void PeeringState::check_full_transition(OSDMapRef lastmap, OSDMapRef osdmap)
+{
+ const pg_pool_t *pi = osdmap->get_pg_pool(info.pgid.pool());
+ if (!pi) {
+ return; // pool deleted
+ }
+ bool changed = false;
+ if (pi->has_flag(pg_pool_t::FLAG_FULL)) {
+ const pg_pool_t *opi = lastmap->get_pg_pool(info.pgid.pool());
+ if (!opi || !opi->has_flag(pg_pool_t::FLAG_FULL)) {
+ psdout(10) << " pool was marked full in " << osdmap->get_epoch() << dendl;
+ changed = true;
+ }
+ }
+ if (changed) {
+ info.history.last_epoch_marked_full = osdmap->get_epoch();
+ dirty_info = true;
+ }
+}
+
+bool PeeringState::should_restart_peering(
+ int newupprimary,
+ int newactingprimary,
+ const vector<int>& newup,
+ const vector<int>& newacting,
+ OSDMapRef lastmap,
+ OSDMapRef osdmap)
+{
+ if (PastIntervals::is_new_interval(
+ primary.osd,
+ newactingprimary,
+ acting,
+ newacting,
+ up_primary.osd,
+ newupprimary,
+ up,
+ newup,
+ osdmap.get(),
+ lastmap.get(),
+ info.pgid.pgid)) {
+ psdout(20) << "new interval newup " << newup
+ << " newacting " << newacting << dendl;
+ return true;
+ }
+ if (!lastmap->is_up(pg_whoami.osd) && osdmap->is_up(pg_whoami.osd)) {
+ psdout(10) << __func__ << " osd transitioned from down -> up"
+ << dendl;
+ return true;
+ }
+ return false;
+}
+
+/* Called before initializing peering during advance_map */
+void PeeringState::start_peering_interval(
+ const OSDMapRef lastmap,
+ const vector<int>& newup, int new_up_primary,
+ const vector<int>& newacting, int new_acting_primary,
+ ObjectStore::Transaction &t)
+{
+ const OSDMapRef osdmap = get_osdmap();
+
+ set_last_peering_reset();
+
+ vector<int> oldacting, oldup;
+ int oldrole = get_role();
+
+ if (is_primary()) {
+ pl->clear_ready_to_merge();
+ }
+
+
+ pg_shard_t old_acting_primary = get_primary();
+ pg_shard_t old_up_primary = up_primary;
+ bool was_old_primary = is_primary();
+ bool was_old_nonprimary = is_nonprimary();
+
+ acting.swap(oldacting);
+ up.swap(oldup);
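+  // stash the previous up/acting sets; the new mappings are installed just
+  // below, and the old values are only needed for interval/history checks
+  // and logging further down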
+ init_primary_up_acting(
+ newup,
+ newacting,
+ new_up_primary,
+ new_acting_primary);
+
+ if (info.stats.up != up ||
+ info.stats.acting != acting ||
+ info.stats.up_primary != new_up_primary ||
+ info.stats.acting_primary != new_acting_primary) {
+ info.stats.up = up;
+ info.stats.up_primary = new_up_primary;
+ info.stats.acting = acting;
+ info.stats.acting_primary = new_acting_primary;
+ info.stats.mapping_epoch = osdmap->get_epoch();
+ }
+
+ pl->clear_publish_stats();
+
+ // This will now be remapped during a backfill in cases
+ // that it would not have been before.
+ if (up != acting)
+ state_set(PG_STATE_REMAPPED);
+ else
+ state_clear(PG_STATE_REMAPPED);
+
+ int role = osdmap->calc_pg_role(pg_whoami, acting);
+ set_role(role);
+
+ // did acting, up, primary|acker change?
+ if (!lastmap) {
+ psdout(10) << " no lastmap" << dendl;
+ dirty_info = true;
+ dirty_big_info = true;
+ info.history.same_interval_since = osdmap->get_epoch();
+ } else {
+ std::stringstream debug;
+ ceph_assert(info.history.same_interval_since != 0);
+ bool new_interval = PastIntervals::check_new_interval(
+ old_acting_primary.osd,
+ new_acting_primary,
+ oldacting, newacting,
+ old_up_primary.osd,
+ new_up_primary,
+ oldup, newup,
+ info.history.same_interval_since,
+ info.history.last_epoch_clean,
+ osdmap.get(),
+ lastmap.get(),
+ info.pgid.pgid,
+ missing_loc.get_recoverable_predicate(),
+ &past_intervals,
+ &debug);
+ psdout(10) << __func__ << ": check_new_interval output: "
+ << debug.str() << dendl;
+ if (new_interval) {
+ if (osdmap->get_epoch() == pl->oldest_stored_osdmap() &&
+ info.history.last_epoch_clean < osdmap->get_epoch()) {
+ psdout(10) << " map gap, clearing past_intervals and faking" << dendl;
+ // our information is incomplete and useless; someone else was clean
+ // after everything we know if osdmaps were trimmed.
+ past_intervals.clear();
+ } else {
+ psdout(10) << " noting past " << past_intervals << dendl;
+ }
+ dirty_info = true;
+ dirty_big_info = true;
+ info.history.same_interval_since = osdmap->get_epoch();
+ if (osdmap->have_pg_pool(info.pgid.pgid.pool()) &&
+ info.pgid.pgid.is_split(lastmap->get_pg_num(info.pgid.pgid.pool()),
+ osdmap->get_pg_num(info.pgid.pgid.pool()),
+ nullptr)) {
+ info.history.last_epoch_split = osdmap->get_epoch();
+ }
+ }
+ }
+
+ if (old_up_primary != up_primary ||
+ oldup != up) {
+ info.history.same_up_since = osdmap->get_epoch();
+ }
+ // this comparison includes primary rank via pg_shard_t
+ if (old_acting_primary != get_primary()) {
+ info.history.same_primary_since = osdmap->get_epoch();
+ }
+
+ on_new_interval();
+ pl->on_info_history_change();
+
+ psdout(1) << __func__ << " up " << oldup << " -> " << up
+ << ", acting " << oldacting << " -> " << acting
+ << ", acting_primary " << old_acting_primary << " -> "
+ << new_acting_primary
+ << ", up_primary " << old_up_primary << " -> " << new_up_primary
+ << ", role " << oldrole << " -> " << role
+ << ", features acting " << acting_features
+ << " upacting " << upacting_features
+ << dendl;
+
+ // deactivate.
+ state_clear(PG_STATE_ACTIVE);
+ state_clear(PG_STATE_PEERED);
+ state_clear(PG_STATE_PREMERGE);
+ state_clear(PG_STATE_DOWN);
+ state_clear(PG_STATE_RECOVERY_WAIT);
+ state_clear(PG_STATE_RECOVERY_TOOFULL);
+ state_clear(PG_STATE_RECOVERING);
+
+ peer_purged.clear();
+ acting_recovery_backfill.clear();
+
+ // reset primary/replica state?
+ if (was_old_primary || is_primary()) {
+ pl->clear_want_pg_temp();
+ } else if (was_old_nonprimary || is_nonprimary()) {
+ pl->clear_want_pg_temp();
+ }
+ clear_primary_state();
+
+ pl->on_change(t);
+
+ ceph_assert(!deleting);
+
+ // should we tell the primary we are here?
+ send_notify = !is_primary();
+
+ if (role != oldrole ||
+ was_old_primary != is_primary()) {
+ // did primary change?
+ if (was_old_primary != is_primary()) {
+ state_clear(PG_STATE_CLEAN);
+ }
+
+ pl->on_role_change();
+ } else {
+ // no role change.
+ // did primary change?
+ if (get_primary() != old_acting_primary) {
+ psdout(10) << oldacting << " -> " << acting
+ << ", acting primary "
+ << old_acting_primary << " -> " << get_primary()
+ << dendl;
+ } else {
+ // primary is the same.
+ if (is_primary()) {
+ // i am (still) primary. but my replica set changed.
+ state_clear(PG_STATE_CLEAN);
+
+ psdout(10) << oldacting << " -> " << acting
+ << ", replicas changed" << dendl;
+ }
+ }
+ }
+
+ if (acting.empty() && !up.empty() && up_primary == pg_whoami) {
+ psdout(10) << " acting empty, but i am up[0], clearing pg_temp" << dendl;
+ pl->queue_want_pg_temp(acting);
+ }
+}
+
+void PeeringState::on_new_interval()
+{
+ dout(20) << __func__ << dendl;
+ const OSDMapRef osdmap = get_osdmap();
+
+ // initialize features
+ acting_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
+ upacting_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
+ for (auto p = acting.begin(); p != acting.end(); ++p) {
+ if (*p == CRUSH_ITEM_NONE)
+ continue;
+ uint64_t f = osdmap->get_xinfo(*p).features;
+ acting_features &= f;
+ upacting_features &= f;
+ }
+ for (auto p = up.begin(); p != up.end(); ++p) {
+ if (*p == CRUSH_ITEM_NONE)
+ continue;
+ upacting_features &= osdmap->get_xinfo(*p).features;
+ }
+ psdout(20) << __func__ << " upacting_features 0x" << std::hex
+ << upacting_features << std::dec
+ << " from " << acting << "+" << up << dendl;
+
+ psdout(20) << __func__ << " checking missing set deletes flag. missing = "
+ << get_pg_log().get_missing() << dendl;
+
+ if (!pg_log.get_missing().may_include_deletes &&
+ !perform_deletes_during_peering()) {
+ pl->rebuild_missing_set_with_deletes(pg_log);
+ }
+ ceph_assert(
+ pg_log.get_missing().may_include_deletes ==
+ !perform_deletes_during_peering());
+
+ init_hb_stamps();
+
+ // update lease bounds for a new interval
+ auto mnow = pl->get_mnow();
+ prior_readable_until_ub = std::max(prior_readable_until_ub,
+ readable_until_ub);
+ prior_readable_until_ub = info.history.refresh_prior_readable_until_ub(
+ mnow, prior_readable_until_ub);
+ psdout(10) << __func__ << " prior_readable_until_ub "
+ << prior_readable_until_ub << " (mnow " << mnow << " + "
+ << info.history.prior_readable_until_ub << ")" << dendl;
+ prior_readable_down_osds.clear(); // we populate this when we build the priorset
+
+ readable_until =
+ readable_until_ub =
+ readable_until_ub_sent =
+ readable_until_ub_from_primary = ceph::signedspan::zero();
+
+ acting_readable_until_ub.clear();
+ if (is_primary()) {
+ acting_readable_until_ub.resize(acting.size(), ceph::signedspan::zero());
+ }
+
+ pl->on_new_interval();
+}
+
+void PeeringState::init_primary_up_acting(
+ const vector<int> &newup,
+ const vector<int> &newacting,
+ int new_up_primary,
+ int new_acting_primary)
+{
+ actingset.clear();
+ acting = newacting;
+ for (uint8_t i = 0; i < acting.size(); ++i) {
+ if (acting[i] != CRUSH_ITEM_NONE)
+ actingset.insert(
+ pg_shard_t(
+ acting[i],
+ pool.info.is_erasure() ? shard_id_t(i) : shard_id_t::NO_SHARD));
+ }
+ upset.clear();
+ up = newup;
+ for (uint8_t i = 0; i < up.size(); ++i) {
+ if (up[i] != CRUSH_ITEM_NONE)
+ upset.insert(
+ pg_shard_t(
+ up[i],
+ pool.info.is_erasure() ? shard_id_t(i) : shard_id_t::NO_SHARD));
+ }
+ if (!pool.info.is_erasure()) {
+ // replicated
+ up_primary = pg_shard_t(new_up_primary, shard_id_t::NO_SHARD);
+ primary = pg_shard_t(new_acting_primary, shard_id_t::NO_SHARD);
+ } else {
+ // erasure
+ up_primary = pg_shard_t();
+ primary = pg_shard_t();
+ for (uint8_t i = 0; i < up.size(); ++i) {
+ if (up[i] == new_up_primary) {
+ up_primary = pg_shard_t(up[i], shard_id_t(i));
+ break;
+ }
+ }
+ for (uint8_t i = 0; i < acting.size(); ++i) {
+ if (acting[i] == new_acting_primary) {
+ primary = pg_shard_t(acting[i], shard_id_t(i));
+ break;
+ }
+ }
+ ceph_assert(up_primary.osd == new_up_primary);
+ ceph_assert(primary.osd == new_acting_primary);
+ }
+}
+
+void PeeringState::init_hb_stamps()
+{
+ if (is_primary()) {
+ // we care about all other osds in the acting set
+ hb_stamps.resize(acting.size() - 1);
+ unsigned i = 0;
+ for (auto p : acting) {
+ if (p == CRUSH_ITEM_NONE || p == get_primary().osd) {
+ continue;
+ }
+ hb_stamps[i++] = pl->get_hb_stamps(p);
+ }
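+    // trim to the number of peer stamps actually collected; NONE slots and
+    // ourself were skipped above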
+ hb_stamps.resize(i);
+ } else if (is_nonprimary()) {
+ // we care about just the primary
+ hb_stamps.resize(1);
+ hb_stamps[0] = pl->get_hb_stamps(get_primary().osd);
+ } else {
+ hb_stamps.clear();
+ }
+ dout(10) << __func__ << " now " << hb_stamps << dendl;
+}
+
+
+void PeeringState::clear_recovery_state()
+{
+ async_recovery_targets.clear();
+ backfill_targets.clear();
+}
+
+void PeeringState::clear_primary_state()
+{
+ psdout(10) << "clear_primary_state" << dendl;
+
+ // clear peering state
+ stray_set.clear();
+ peer_log_requested.clear();
+ peer_missing_requested.clear();
+ peer_info.clear();
+ peer_bytes.clear();
+ peer_missing.clear();
+ peer_last_complete_ondisk.clear();
+ peer_activated.clear();
+ min_last_complete_ondisk = eversion_t();
+ pg_trim_to = eversion_t();
+ might_have_unfound.clear();
+ need_up_thru = false;
+ missing_loc.clear();
+ pg_log.reset_recovery_pointers();
+
+ clear_recovery_state();
+
+ last_update_ondisk = eversion_t();
+ missing_loc.clear();
+ pl->clear_primary_state();
+}
+
+/// return [start,end) bounds for required past_intervals
+static pair<epoch_t, epoch_t> get_required_past_interval_bounds(
+ const pg_info_t &info,
+ epoch_t oldest_map) {
+ epoch_t start = std::max(
+ info.history.last_epoch_clean ? info.history.last_epoch_clean :
+ info.history.epoch_pool_created,
+ oldest_map);
+ epoch_t end = std::max(
+ info.history.same_interval_since,
+ info.history.epoch_pool_created);
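+  // e.g. last_epoch_clean=100, oldest_map=150, same_interval_since=180
+  // yields [150, 180): the epoch range that past_intervals must cover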
+ return make_pair(start, end);
+}
+
+
+void PeeringState::check_past_interval_bounds() const
+{
+ auto oldest_epoch = pl->oldest_stored_osdmap();
+ auto rpib = get_required_past_interval_bounds(
+ info,
+ oldest_epoch);
+ if (rpib.first >= rpib.second) {
+ // do not warn if the start bound is dictated by oldest_map; the
+ // past intervals are presumably appropriate given the pg info.
+ if (!past_intervals.empty() &&
+ rpib.first > oldest_epoch) {
+ pl->get_clog_error() << info.pgid << " required past_interval bounds are"
+ << " empty [" << rpib << ") but past_intervals is not: "
+ << past_intervals;
+ derr << info.pgid << " required past_interval bounds are"
+ << " empty [" << rpib << ") but past_intervals is not: "
+ << past_intervals << dendl;
+ }
+ } else {
+ if (past_intervals.empty()) {
+ pl->get_clog_error() << info.pgid << " required past_interval bounds are"
+ << " not empty [" << rpib << ") but past_intervals "
+ << past_intervals << " is empty";
+ derr << info.pgid << " required past_interval bounds are"
+ << " not empty [" << rpib << ") but past_intervals "
+ << past_intervals << " is empty" << dendl;
+ ceph_assert(!past_intervals.empty());
+ }
+
+ auto apib = past_intervals.get_bounds();
+ if (apib.first > rpib.first) {
+ pl->get_clog_error() << info.pgid << " past_intervals [" << apib
+ << ") start interval does not contain the required"
+ << " bound [" << rpib << ") start";
+ derr << info.pgid << " past_intervals [" << apib
+ << ") start interval does not contain the required"
+ << " bound [" << rpib << ") start" << dendl;
+ ceph_abort_msg("past_interval start interval mismatch");
+ }
+ if (apib.second != rpib.second) {
+      pl->get_clog_error() << info.pgid << " past_interval bound [" << apib
+ << ") end does not match required [" << rpib
+ << ") end";
+      derr << info.pgid << " past_interval bound [" << apib
+ << ") end does not match required [" << rpib
+ << ") end" << dendl;
+ ceph_abort_msg("past_interval end mismatch");
+ }
+ }
+}
+
+int PeeringState::clamp_recovery_priority(int priority, int pool_recovery_priority, int max)
+{
+ static_assert(OSD_RECOVERY_PRIORITY_MIN < OSD_RECOVERY_PRIORITY_MAX, "Invalid priority range");
+ static_assert(OSD_RECOVERY_PRIORITY_MIN >= 0, "Priority range must match unsigned type");
+
+ ceph_assert(max <= OSD_RECOVERY_PRIORITY_MAX);
+
+ // User can't set this too high anymore, but might be a legacy value
+ if (pool_recovery_priority > OSD_POOL_PRIORITY_MAX)
+ pool_recovery_priority = OSD_POOL_PRIORITY_MAX;
+ if (pool_recovery_priority < OSD_POOL_PRIORITY_MIN)
+ pool_recovery_priority = OSD_POOL_PRIORITY_MIN;
+ // Shift range from min to max to 0 to max - min
+ pool_recovery_priority += (0 - OSD_POOL_PRIORITY_MIN);
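+  // e.g. if OSD_POOL_PRIORITY_MIN were -10, a pool priority of -3 shifts to 7
+  // and +10 shifts to 20 before being added to the incoming priority below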
+ ceph_assert(pool_recovery_priority >= 0 && pool_recovery_priority <= (OSD_POOL_PRIORITY_MAX - OSD_POOL_PRIORITY_MIN));
+
+ priority += pool_recovery_priority;
+
+ // Clamp to valid range
+ if (priority > max) {
+ return max;
+ } else if (priority < OSD_RECOVERY_PRIORITY_MIN) {
+ return OSD_RECOVERY_PRIORITY_MIN;
+ } else {
+ return priority;
+ }
+}
+
+unsigned PeeringState::get_recovery_priority()
+{
+ // a higher value -> a higher priority
+ int ret = OSD_RECOVERY_PRIORITY_BASE;
+ int base = ret;
+
+ if (state & PG_STATE_FORCED_RECOVERY) {
+ ret = OSD_RECOVERY_PRIORITY_FORCED;
+ } else {
+ // XXX: This priority boost isn't so much about inactive, but about data-at-risk
+ if (is_degraded() && info.stats.avail_no_missing.size() < pool.info.min_size) {
+ base = OSD_RECOVERY_INACTIVE_PRIORITY_BASE;
+ // inactive: no. of replicas < min_size, highest priority since it blocks IO
+ ret = base + (pool.info.min_size - info.stats.avail_no_missing.size());
+ }
+
+ int64_t pool_recovery_priority = 0;
+ pool.info.opts.get(pool_opts_t::RECOVERY_PRIORITY, &pool_recovery_priority);
+
+ ret = clamp_recovery_priority(ret, pool_recovery_priority, max_prio_map[base]);
+ }
+ psdout(20) << __func__ << " recovery priority is " << ret << dendl;
+ return static_cast<unsigned>(ret);
+}
+
+unsigned PeeringState::get_backfill_priority()
+{
+ // a higher value -> a higher priority
+ int ret = OSD_BACKFILL_PRIORITY_BASE;
+ int base = ret;
+
+ if (state & PG_STATE_FORCED_BACKFILL) {
+ ret = OSD_BACKFILL_PRIORITY_FORCED;
+ } else {
+ if (actingset.size() < pool.info.min_size) {
+ base = OSD_BACKFILL_INACTIVE_PRIORITY_BASE;
+ // inactive: no. of replicas < min_size, highest priority since it blocks IO
+ ret = base + (pool.info.min_size - actingset.size());
+
+ } else if (is_undersized()) {
+ // undersized: OSD_BACKFILL_DEGRADED_PRIORITY_BASE + num missing replicas
+ ceph_assert(pool.info.size > actingset.size());
+ base = OSD_BACKFILL_DEGRADED_PRIORITY_BASE;
+ ret = base + (pool.info.size - actingset.size());
+
+ } else if (is_degraded()) {
+ // degraded: baseline degraded
+ base = ret = OSD_BACKFILL_DEGRADED_PRIORITY_BASE;
+ }
+
+ // Adjust with pool's recovery priority
+ int64_t pool_recovery_priority = 0;
+ pool.info.opts.get(pool_opts_t::RECOVERY_PRIORITY, &pool_recovery_priority);
+
+ ret = clamp_recovery_priority(ret, pool_recovery_priority, max_prio_map[base]);
+ }
+
+ psdout(20) << __func__ << " backfill priority is " << ret << dendl;
+ return static_cast<unsigned>(ret);
+}
+
+unsigned PeeringState::get_delete_priority()
+{
+ auto state = get_osdmap()->get_state(pg_whoami.osd);
+ if (state & (CEPH_OSD_BACKFILLFULL |
+ CEPH_OSD_FULL)) {
+ return OSD_DELETE_PRIORITY_FULL;
+ } else if (state & CEPH_OSD_NEARFULL) {
+ return OSD_DELETE_PRIORITY_FULLISH;
+ } else {
+ return OSD_DELETE_PRIORITY_NORMAL;
+ }
+}
+
+bool PeeringState::set_force_recovery(bool b)
+{
+ bool did = false;
+ if (b) {
+ if (!(state & PG_STATE_FORCED_RECOVERY) &&
+ (state & (PG_STATE_DEGRADED |
+ PG_STATE_RECOVERY_WAIT |
+ PG_STATE_RECOVERING))) {
+ psdout(20) << __func__ << " set" << dendl;
+ state_set(PG_STATE_FORCED_RECOVERY);
+ pl->publish_stats_to_osd();
+ did = true;
+ }
+ } else if (state & PG_STATE_FORCED_RECOVERY) {
+ psdout(20) << __func__ << " clear" << dendl;
+ state_clear(PG_STATE_FORCED_RECOVERY);
+ pl->publish_stats_to_osd();
+ did = true;
+ }
+ if (did) {
+ psdout(20) << __func__ << " state " << get_current_state()
+ << dendl;
+ pl->update_local_background_io_priority(get_recovery_priority());
+ }
+ return did;
+}
+
+bool PeeringState::set_force_backfill(bool b)
+{
+ bool did = false;
+ if (b) {
+ if (!(state & PG_STATE_FORCED_BACKFILL) &&
+ (state & (PG_STATE_DEGRADED |
+ PG_STATE_BACKFILL_WAIT |
+ PG_STATE_BACKFILLING))) {
+ psdout(10) << __func__ << " set" << dendl;
+ state_set(PG_STATE_FORCED_BACKFILL);
+ pl->publish_stats_to_osd();
+ did = true;
+ }
+ } else if (state & PG_STATE_FORCED_BACKFILL) {
+ psdout(10) << __func__ << " clear" << dendl;
+ state_clear(PG_STATE_FORCED_BACKFILL);
+ pl->publish_stats_to_osd();
+ did = true;
+ }
+ if (did) {
+ psdout(20) << __func__ << " state " << get_current_state()
+ << dendl;
+ pl->update_local_background_io_priority(get_backfill_priority());
+ }
+ return did;
+}
+
+void PeeringState::schedule_renew_lease()
+{
+ pl->schedule_renew_lease(
+ last_peering_reset,
+ readable_interval / 2);
+}
+
+void PeeringState::send_lease()
+{
+ epoch_t epoch = pl->get_osdmap_epoch();
+ for (auto peer : actingset) {
+ if (peer == pg_whoami) {
+ continue;
+ }
+ pl->send_cluster_message(
+ peer.osd,
+ make_message<MOSDPGLease>(epoch,
+ spg_t(spgid.pgid, peer.shard),
+ get_lease()),
+ epoch);
+ }
+}
+
+void PeeringState::proc_lease(const pg_lease_t& l)
+{
+ if (!HAVE_FEATURE(upacting_features, SERVER_OCTOPUS)) {
+ psdout(20) << __func__ << " no-op, upacting_features 0x" << std::hex
+ << upacting_features << std::dec
+ << " does not include SERVER_OCTOPUS" << dendl;
+ return;
+ }
+ if (!is_nonprimary()) {
+ psdout(20) << __func__ << " no-op, !nonprimary" << dendl;
+ return;
+ }
+ psdout(10) << __func__ << " " << l << dendl;
+ if (l.readable_until_ub > readable_until_ub_from_primary) {
+ readable_until_ub_from_primary = l.readable_until_ub;
+ }
+
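+  // translate the primary's readable_until into this OSD's monotonic clock;
+  // subtracting the *upper* bound of the clock delta keeps the local value
+  // conservative (it can only be earlier than what the primary granted)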
+ ceph::signedspan ru = ceph::signedspan::zero();
+ if (l.readable_until != ceph::signedspan::zero() &&
+ hb_stamps[0]->peer_clock_delta_ub) {
+ ru = l.readable_until - *hb_stamps[0]->peer_clock_delta_ub;
+ psdout(20) << " peer_clock_delta_ub " << *hb_stamps[0]->peer_clock_delta_ub
+ << " -> ru " << ru << dendl;
+ }
+ if (ru > readable_until) {
+ readable_until = ru;
+ psdout(20) << __func__ << " readable_until now " << readable_until << dendl;
+ // NOTE: if we ever decide to block/queue ops on the replica,
+ // we'll need to wake them up here.
+ }
+
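+  // for the upper bound go the other way: the *lower* bound of the clock
+  // delta (or, lacking an estimate, mnow + interval) errs on the late side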
+ ceph::signedspan ruub;
+ if (hb_stamps[0]->peer_clock_delta_lb) {
+ ruub = l.readable_until_ub - *hb_stamps[0]->peer_clock_delta_lb;
+ psdout(20) << " peer_clock_delta_lb " << *hb_stamps[0]->peer_clock_delta_lb
+ << " -> ruub " << ruub << dendl;
+ } else {
+ ruub = pl->get_mnow() + l.interval;
+ psdout(20) << " no peer_clock_delta_lb -> ruub " << ruub << dendl;
+ }
+ if (ruub > readable_until_ub) {
+ readable_until_ub = ruub;
+ psdout(20) << __func__ << " readable_until_ub now " << readable_until_ub
+ << dendl;
+ }
+}
+
+void PeeringState::proc_lease_ack(int from, const pg_lease_ack_t& a)
+{
+ if (!HAVE_FEATURE(upacting_features, SERVER_OCTOPUS)) {
+ return;
+ }
+ auto now = pl->get_mnow();
+ bool was_min = false;
+ for (unsigned i = 0; i < acting.size(); ++i) {
+ if (from == acting[i]) {
+ // the lease_ack value is based on the primary's clock
+ if (a.readable_until_ub > acting_readable_until_ub[i]) {
+ if (acting_readable_until_ub[i] == readable_until) {
+ was_min = true;
+ }
+ acting_readable_until_ub[i] = a.readable_until_ub;
+ break;
+ }
+ }
+ }
+ if (was_min) {
+ auto old_ru = readable_until;
+ recalc_readable_until();
+ if (now < old_ru) {
+ pl->recheck_readable();
+ }
+ }
+}
+
+void PeeringState::proc_renew_lease()
+{
+ if (!HAVE_FEATURE(upacting_features, SERVER_OCTOPUS)) {
+ return;
+ }
+ renew_lease(pl->get_mnow());
+ send_lease();
+ schedule_renew_lease();
+}
+
+void PeeringState::recalc_readable_until()
+{
+ assert(is_primary());
+ ceph::signedspan min = readable_until_ub_sent;
+ for (unsigned i = 0; i < acting.size(); ++i) {
+ if (acting[i] == pg_whoami.osd || acting[i] == CRUSH_ITEM_NONE) {
+ continue;
+ }
+ dout(20) << __func__ << " peer osd." << acting[i]
+ << " ruub " << acting_readable_until_ub[i] << dendl;
+ if (acting_readable_until_ub[i] < min) {
+ min = acting_readable_until_ub[i];
+ }
+ }
+ readable_until = min;
+ readable_until_ub = min;
+ dout(20) << __func__ << " readable_until[_ub] " << readable_until
+ << " (sent " << readable_until_ub_sent << ")" << dendl;
+}
+
+bool PeeringState::check_prior_readable_down_osds(const OSDMapRef& map)
+{
+ if (!HAVE_FEATURE(upacting_features, SERVER_OCTOPUS)) {
+ return false;
+ }
+ bool changed = false;
+ auto p = prior_readable_down_osds.begin();
+ while (p != prior_readable_down_osds.end()) {
+ if (map->is_dead(*p)) {
+ dout(10) << __func__ << " prior_readable_down_osds osd." << *p
+ << " is dead as of epoch " << map->get_epoch()
+ << dendl;
+ p = prior_readable_down_osds.erase(p);
+ changed = true;
+ } else {
+ ++p;
+ }
+ }
+ if (changed && prior_readable_down_osds.empty()) {
+ psdout(10) << " empty prior_readable_down_osds, clearing ub" << dendl;
+ clear_prior_readable_until_ub();
+ return true;
+ }
+ return false;
+}
+
+bool PeeringState::adjust_need_up_thru(const OSDMapRef osdmap)
+{
+ epoch_t up_thru = osdmap->get_up_thru(pg_whoami.osd);
+ if (need_up_thru &&
+ up_thru >= info.history.same_interval_since) {
+ psdout(10) << "adjust_need_up_thru now "
+ << up_thru << ", need_up_thru now false" << dendl;
+ need_up_thru = false;
+ return true;
+ }
+ return false;
+}
+
+PastIntervals::PriorSet PeeringState::build_prior()
+{
+ if (1) {
+ // sanity check
+ for (auto it = peer_info.begin(); it != peer_info.end(); ++it) {
+ ceph_assert(info.history.last_epoch_started >=
+ it->second.history.last_epoch_started);
+ }
+ }
+
+ const OSDMap &osdmap = *get_osdmap();
+ PastIntervals::PriorSet prior = past_intervals.get_prior_set(
+ pool.info.is_erasure(),
+ info.history.last_epoch_started,
+ &missing_loc.get_recoverable_predicate(),
+ [&](epoch_t start, int osd, epoch_t *lost_at) {
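+      // classify this osd for the prior interval: still UP, DNE (no longer
+      // in the map), LOST (marked lost after the interval began), or DOWN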
+ const osd_info_t *pinfo = 0;
+ if (osdmap.exists(osd)) {
+ pinfo = &osdmap.get_info(osd);
+ if (lost_at)
+ *lost_at = pinfo->lost_at;
+ }
+
+ if (osdmap.is_up(osd)) {
+ return PastIntervals::UP;
+ } else if (!pinfo) {
+ return PastIntervals::DNE;
+ } else if (pinfo->lost_at > start) {
+ return PastIntervals::LOST;
+ } else {
+ return PastIntervals::DOWN;
+ }
+ },
+ up,
+ acting,
+ dpp);
+
+ if (prior.pg_down) {
+ state_set(PG_STATE_DOWN);
+ }
+
+ if (get_osdmap()->get_up_thru(pg_whoami.osd) <
+ info.history.same_interval_since) {
+ psdout(10) << "up_thru " << get_osdmap()->get_up_thru(pg_whoami.osd)
+ << " < same_since " << info.history.same_interval_since
+ << ", must notify monitor" << dendl;
+ need_up_thru = true;
+ } else {
+ psdout(10) << "up_thru " << get_osdmap()->get_up_thru(pg_whoami.osd)
+ << " >= same_since " << info.history.same_interval_since
+ << ", all is well" << dendl;
+ need_up_thru = false;
+ }
+ pl->set_probe_targets(prior.probe);
+ return prior;
+}
+
+bool PeeringState::needs_recovery() const
+{
+ ceph_assert(is_primary());
+
+ auto &missing = pg_log.get_missing();
+
+ if (missing.num_missing()) {
+ psdout(10) << __func__ << " primary has " << missing.num_missing()
+ << " missing" << dendl;
+ return true;
+ }
+
+ ceph_assert(!acting_recovery_backfill.empty());
+ for (const pg_shard_t& peer : acting_recovery_backfill) {
+ if (peer == get_primary()) {
+ continue;
+ }
+ auto pm = peer_missing.find(peer);
+ if (pm == peer_missing.end()) {
+ psdout(10) << __func__ << " osd." << peer << " doesn't have missing set"
+ << dendl;
+ continue;
+ }
+ if (pm->second.num_missing()) {
+ psdout(10) << __func__ << " osd." << peer << " has "
+ << pm->second.num_missing() << " missing" << dendl;
+ return true;
+ }
+ }
+
+ psdout(10) << __func__ << " is recovered" << dendl;
+ return false;
+}
+
+bool PeeringState::needs_backfill() const
+{
+ ceph_assert(is_primary());
+
+  // Only OSDs in the backfill_targets vector can possibly need backfill.
+ for (const pg_shard_t& peer : backfill_targets) {
+ auto pi = peer_info.find(peer);
+ ceph_assert(pi != peer_info.end());
+ if (!pi->second.last_backfill.is_max()) {
+ psdout(10) << __func__ << " osd." << peer
+ << " has last_backfill " << pi->second.last_backfill << dendl;
+ return true;
+ }
+ }
+
+ psdout(10) << __func__ << " does not need backfill" << dendl;
+ return false;
+}
+
+/*
+ * Returns true if every OSD in might_have_unfound has either been queried
+ * or is marked lost.
+ */
+bool PeeringState::all_unfound_are_queried_or_lost(
+ const OSDMapRef osdmap) const
+{
+ ceph_assert(is_primary());
+
+ auto peer = might_have_unfound.begin();
+ auto mend = might_have_unfound.end();
+ for (; peer != mend; ++peer) {
+ if (peer_missing.count(*peer))
+ continue;
+ auto iter = peer_info.find(*peer);
+ if (iter != peer_info.end() &&
+ (iter->second.is_empty() || iter->second.dne()))
+ continue;
+ if (!osdmap->exists(peer->osd))
+ continue;
+ const osd_info_t &osd_info(osdmap->get_info(peer->osd));
+ if (osd_info.lost_at <= osd_info.up_from) {
+ // If there is even one OSD in might_have_unfound that isn't lost, we
+ // still might retrieve our unfound.
+ return false;
+ }
+ }
+ psdout(10) << "all_unfound_are_queried_or_lost all of might_have_unfound "
+ << might_have_unfound
+ << " have been queried or are marked lost" << dendl;
+ return true;
+}
+
+
+void PeeringState::reject_reservation()
+{
+ pl->unreserve_recovery_space();
+ pl->send_cluster_message(
+ primary.osd,
+ make_message<MBackfillReserve>(
+ MBackfillReserve::REJECT_TOOFULL,
+ spg_t(info.pgid.pgid, primary.shard),
+ get_osdmap_epoch()),
+ get_osdmap_epoch());
+}
+
+/**
+ * find_best_info
+ *
+ * Returns an iterator to the best info in infos sorted by:
+ *  1) Prefer newer last_update (oldest for ec/rollback pools)
+ *  2) Prefer longer tail if it brings another info into contiguity
+ *  3) Prefer complete infos over those with missing objects
+ *  4) Prefer current primary
+ */
+map<pg_shard_t, pg_info_t>::const_iterator PeeringState::find_best_info(
+ const map<pg_shard_t, pg_info_t> &infos,
+ bool restrict_to_up_acting,
+ bool *history_les_bound) const
+{
+ ceph_assert(history_les_bound);
+ /* See doc/dev/osd_internals/last_epoch_started.rst before attempting
+ * to make changes to this process. Also, make sure to update it
+ * when you find bugs! */
+ epoch_t max_last_epoch_started_found = 0;
+ for (auto i = infos.begin(); i != infos.end(); ++i) {
+ if (!cct->_conf->osd_find_best_info_ignore_history_les &&
+ max_last_epoch_started_found < i->second.history.last_epoch_started) {
+ *history_les_bound = true;
+ max_last_epoch_started_found = i->second.history.last_epoch_started;
+ }
+ if (!i->second.is_incomplete() &&
+ max_last_epoch_started_found < i->second.last_epoch_started) {
+ *history_les_bound = false;
+ max_last_epoch_started_found = i->second.last_epoch_started;
+ }
+ }
+ eversion_t min_last_update_acceptable = eversion_t::max();
+ for (auto i = infos.begin(); i != infos.end(); ++i) {
+ if (max_last_epoch_started_found <= i->second.last_epoch_started) {
+ if (min_last_update_acceptable > i->second.last_update)
+ min_last_update_acceptable = i->second.last_update;
+ }
+ }
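+  // peers whose last_update falls below this floor may be missing writes
+  // committed in the newest last_epoch_started, so they are rejected below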
+ if (min_last_update_acceptable == eversion_t::max())
+ return infos.end();
+
+ auto best = infos.end();
+ // find osd with newest last_update (oldest for ec_pool).
+ // if there are multiples, prefer
+ // - a longer tail, if it brings another peer into log contiguity
+ // - the current primary
+ for (auto p = infos.begin(); p != infos.end(); ++p) {
+ if (restrict_to_up_acting && !is_up(p->first) &&
+ !is_acting(p->first))
+ continue;
+ // Only consider peers with last_update >= min_last_update_acceptable
+ if (p->second.last_update < min_last_update_acceptable)
+ continue;
+ // Disqualify anyone with a too old last_epoch_started
+ if (p->second.last_epoch_started < max_last_epoch_started_found)
+ continue;
+ // Disqualify anyone who is incomplete (not fully backfilled)
+ if (p->second.is_incomplete())
+ continue;
+ if (best == infos.end()) {
+ best = p;
+ continue;
+ }
+ // Prefer newer last_update
+ if (pool.info.require_rollback()) {
+ if (p->second.last_update > best->second.last_update)
+ continue;
+ if (p->second.last_update < best->second.last_update) {
+ best = p;
+ continue;
+ }
+ } else {
+ if (p->second.last_update < best->second.last_update)
+ continue;
+ if (p->second.last_update > best->second.last_update) {
+ best = p;
+ continue;
+ }
+ }
+
+ // Prefer longer tail
+ if (p->second.log_tail > best->second.log_tail) {
+ continue;
+ } else if (p->second.log_tail < best->second.log_tail) {
+ best = p;
+ continue;
+ }
+
+ if (!p->second.has_missing() && best->second.has_missing()) {
+ psdout(10) << __func__ << " prefer osd." << p->first
+ << " because it is complete while best has missing"
+ << dendl;
+ best = p;
+ continue;
+ } else if (p->second.has_missing() && !best->second.has_missing()) {
+ psdout(10) << __func__ << " skipping osd." << p->first
+ << " because it has missing while best is complete"
+ << dendl;
+ continue;
+ } else {
+ // both are complete or have missing
+ // fall through
+ }
+
+ // prefer current primary (usually the caller), all things being equal
+ if (p->first == pg_whoami) {
+ psdout(10) << "calc_acting prefer osd." << p->first
+ << " because it is current primary" << dendl;
+ best = p;
+ continue;
+ }
+ }
+ return best;
+}
+
+void PeeringState::calc_ec_acting(
+ map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
+ unsigned size,
+ const vector<int> &acting,
+ const vector<int> &up,
+ const map<pg_shard_t, pg_info_t> &all_info,
+ bool restrict_to_up_acting,
+ vector<int> *_want,
+ set<pg_shard_t> *backfill,
+ set<pg_shard_t> *acting_backfill,
+ ostream &ss)
+{
+ vector<int> want(size, CRUSH_ITEM_NONE);
+ map<shard_id_t, set<pg_shard_t> > all_info_by_shard;
+ for (auto i = all_info.begin();
+ i != all_info.end();
+ ++i) {
+ all_info_by_shard[i->first.shard].insert(i->first);
+ }
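+  // fill each shard position independently: keep up[i] if it is usable,
+  // otherwise mark it for backfill and fall back to acting[i], then (unless
+  // restricted to up/acting) to any stray peer holding the same shard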
+ for (uint8_t i = 0; i < want.size(); ++i) {
+ ss << "For position " << (unsigned)i << ": ";
+ if (up.size() > (unsigned)i && up[i] != CRUSH_ITEM_NONE &&
+ !all_info.find(pg_shard_t(up[i], shard_id_t(i)))->second.is_incomplete() &&
+ all_info.find(pg_shard_t(up[i], shard_id_t(i)))->second.last_update >=
+ auth_log_shard->second.log_tail) {
+ ss << " selecting up[i]: " << pg_shard_t(up[i], shard_id_t(i)) << std::endl;
+ want[i] = up[i];
+ continue;
+ }
+ if (up.size() > (unsigned)i && up[i] != CRUSH_ITEM_NONE) {
+ ss << " backfilling up[i]: " << pg_shard_t(up[i], shard_id_t(i))
+ << " and ";
+ backfill->insert(pg_shard_t(up[i], shard_id_t(i)));
+ }
+
+ if (acting.size() > (unsigned)i && acting[i] != CRUSH_ITEM_NONE &&
+ !all_info.find(pg_shard_t(acting[i], shard_id_t(i)))->second.is_incomplete() &&
+ all_info.find(pg_shard_t(acting[i], shard_id_t(i)))->second.last_update >=
+ auth_log_shard->second.log_tail) {
+ ss << " selecting acting[i]: " << pg_shard_t(acting[i], shard_id_t(i)) << std::endl;
+ want[i] = acting[i];
+ } else if (!restrict_to_up_acting) {
+ for (auto j = all_info_by_shard[shard_id_t(i)].begin();
+ j != all_info_by_shard[shard_id_t(i)].end();
+ ++j) {
+ ceph_assert(j->shard == i);
+ if (!all_info.find(*j)->second.is_incomplete() &&
+ all_info.find(*j)->second.last_update >=
+ auth_log_shard->second.log_tail) {
+ ss << " selecting stray: " << *j << std::endl;
+ want[i] = j->osd;
+ break;
+ }
+ }
+ if (want[i] == CRUSH_ITEM_NONE)
+ ss << " failed to fill position " << (int)i << std::endl;
+ }
+ }
+
+ for (uint8_t i = 0; i < want.size(); ++i) {
+ if (want[i] != CRUSH_ITEM_NONE) {
+ acting_backfill->insert(pg_shard_t(want[i], shard_id_t(i)));
+ }
+ }
+ acting_backfill->insert(backfill->begin(), backfill->end());
+ _want->swap(want);
+}
+
+std::pair<map<pg_shard_t, pg_info_t>::const_iterator, eversion_t>
+PeeringState::select_replicated_primary(
+ map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
+ uint64_t force_auth_primary_missing_objects,
+ const std::vector<int> &up,
+ pg_shard_t up_primary,
+ const map<pg_shard_t, pg_info_t> &all_info,
+ const OSDMapRef osdmap,
+ ostream &ss)
+{
+ pg_shard_t auth_log_shard_id = auth_log_shard->first;
+
+ ss << __func__ << " newest update on osd." << auth_log_shard_id
+ << " with " << auth_log_shard->second << std::endl;
+
+ // select primary
+ auto primary = all_info.find(up_primary);
+ if (up.size() &&
+ !primary->second.is_incomplete() &&
+ primary->second.last_update >=
+ auth_log_shard->second.log_tail) {
+ if (HAVE_FEATURE(osdmap->get_up_osd_features(), SERVER_NAUTILUS)) {
+ auto approx_missing_objects =
+ primary->second.stats.stats.sum.num_objects_missing;
+ auto auth_version = auth_log_shard->second.last_update.version;
+ auto primary_version = primary->second.last_update.version;
+ if (auth_version > primary_version) {
+ approx_missing_objects += auth_version - primary_version;
+ } else {
+ approx_missing_objects += primary_version - auth_version;
+ }
+ if ((uint64_t)approx_missing_objects >
+ force_auth_primary_missing_objects) {
+ primary = auth_log_shard;
+        ss << "up_primary: " << up_primary << " has approximate "
+ << approx_missing_objects
+ << "(>" << force_auth_primary_missing_objects <<") "
+ << "missing objects, osd." << auth_log_shard_id
+ << " selected as primary instead"
+ << std::endl;
+ } else {
+        ss << "up_primary: " << up_primary << " selected as primary"
+ << std::endl;
+ }
+ } else {
+      ss << "up_primary: " << up_primary << " selected as primary" << std::endl;
+ }
+ } else {
+ ceph_assert(!auth_log_shard->second.is_incomplete());
+ ss << "up[0] needs backfill, osd." << auth_log_shard_id
+ << " selected as primary instead" << std::endl;
+ primary = auth_log_shard;
+ }
+
+ ss << __func__ << " primary is osd." << primary->first
+ << " with " << primary->second << std::endl;
+
+ /* We include auth_log_shard->second.log_tail because in GetLog,
+ * we will request logs back to the min last_update over our
+ * acting_backfill set, which will result in our log being extended
+ * as far backwards as necessary to pick up any peers which can
+ * be log recovered by auth_log_shard's log */
+ eversion_t oldest_auth_log_entry =
+ std::min(primary->second.log_tail, auth_log_shard->second.log_tail);
+
+ return std::make_pair(primary, oldest_auth_log_entry);
+}
+
+
+/**
+ * calculate the desired acting set.
+ *
+ * Choose an appropriate acting set. Prefer up[0], unless it is
+ * incomplete, or another osd has a longer tail that allows us to
+ * bring other up nodes up to date.
+ */
+void PeeringState::calc_replicated_acting(
+ map<pg_shard_t, pg_info_t>::const_iterator primary,
+ eversion_t oldest_auth_log_entry,
+ unsigned size,
+ const vector<int> &acting,
+ const vector<int> &up,
+ pg_shard_t up_primary,
+ const map<pg_shard_t, pg_info_t> &all_info,
+ bool restrict_to_up_acting,
+ vector<int> *want,
+ set<pg_shard_t> *backfill,
+ set<pg_shard_t> *acting_backfill,
+ const OSDMapRef osdmap,
+ const PGPool& pool,
+ ostream &ss)
+{
+ ss << __func__ << (restrict_to_up_acting ? " restrict_to_up_acting" : "")
+ << std::endl;
+
+ want->push_back(primary->first.osd);
+ acting_backfill->insert(primary->first);
+
+ // select replicas that have log contiguity with primary.
+ // prefer up, then acting, then any peer_info osds
+ for (auto i : up) {
+ pg_shard_t up_cand = pg_shard_t(i, shard_id_t::NO_SHARD);
+ if (up_cand == primary->first)
+ continue;
+ const pg_info_t &cur_info = all_info.find(up_cand)->second;
+ if (cur_info.is_incomplete() ||
+ cur_info.last_update < oldest_auth_log_entry) {
+ ss << " shard " << up_cand << " (up) backfill " << cur_info << std::endl;
+ backfill->insert(up_cand);
+ acting_backfill->insert(up_cand);
+ } else {
+ want->push_back(i);
+ acting_backfill->insert(up_cand);
+ ss << " osd." << i << " (up) accepted " << cur_info << std::endl;
+ }
+ }
+
+ if (want->size() >= size) {
+ return;
+ }
+
+ std::vector<std::pair<eversion_t, int>> candidate_by_last_update;
+ candidate_by_last_update.reserve(acting.size());
+ // This no longer has backfill OSDs, but they are covered above.
+ for (auto i : acting) {
+ pg_shard_t acting_cand(i, shard_id_t::NO_SHARD);
+ // skip up osds we already considered above
+ if (acting_cand == primary->first)
+ continue;
+ auto up_it = find(up.begin(), up.end(), i);
+ if (up_it != up.end())
+ continue;
+
+ const pg_info_t &cur_info = all_info.find(acting_cand)->second;
+ if (cur_info.is_incomplete() ||
+ cur_info.last_update < oldest_auth_log_entry) {
+ ss << " shard " << acting_cand << " (acting) REJECTED "
+ << cur_info << std::endl;
+ } else {
+ candidate_by_last_update.emplace_back(cur_info.last_update, i);
+ }
+ }
+
+ auto sort_by_eversion =[](const std::pair<eversion_t, int> &lhs,
+ const std::pair<eversion_t, int> &rhs) {
+ return lhs.first > rhs.first;
+ };
+ // sort by last_update, in descending order.
+ std::sort(candidate_by_last_update.begin(),
+ candidate_by_last_update.end(), sort_by_eversion);
+ for (auto &p: candidate_by_last_update) {
+ ceph_assert(want->size() < size);
+ want->push_back(p.second);
+ pg_shard_t s = pg_shard_t(p.second, shard_id_t::NO_SHARD);
+ acting_backfill->insert(s);
+ ss << " shard " << s << " (acting) accepted "
+ << all_info.find(s)->second << std::endl;
+ if (want->size() >= size) {
+ return;
+ }
+ }
+
+ if (restrict_to_up_acting) {
+ return;
+ }
+ candidate_by_last_update.clear();
+ candidate_by_last_update.reserve(all_info.size()); // overestimate but fine
+ // continue to search stray to find more suitable peers
+ for (auto &i : all_info) {
+ // skip up osds we already considered above
+ if (i.first == primary->first)
+ continue;
+ auto up_it = find(up.begin(), up.end(), i.first.osd);
+ if (up_it != up.end())
+ continue;
+ auto acting_it = find(
+ acting.begin(), acting.end(), i.first.osd);
+ if (acting_it != acting.end())
+ continue;
+
+ if (i.second.is_incomplete() ||
+ i.second.last_update < oldest_auth_log_entry) {
+ ss << " shard " << i.first << " (stray) REJECTED " << i.second
+ << std::endl;
+ } else {
+ candidate_by_last_update.emplace_back(
+ i.second.last_update, i.first.osd);
+ }
+ }
+
+ if (candidate_by_last_update.empty()) {
+ // save us some effort
+ return;
+ }
+
+ // sort by last_update, in descending order.
+ std::sort(candidate_by_last_update.begin(),
+ candidate_by_last_update.end(), sort_by_eversion);
+
+ for (auto &p: candidate_by_last_update) {
+ ceph_assert(want->size() < size);
+ want->push_back(p.second);
+ pg_shard_t s = pg_shard_t(p.second, shard_id_t::NO_SHARD);
+ acting_backfill->insert(s);
+ ss << " shard " << s << " (stray) accepted "
+ << all_info.find(s)->second << std::endl;
+ if (want->size() >= size) {
+ return;
+ }
+ }
+}
+
+// Defines osd preference order: acting set, then larger last_update
+using osd_ord_t = std::tuple<bool, eversion_t>; // <acting, last_update>
+using osd_id_t = int;
+
+class bucket_candidates_t {
+ std::deque<std::pair<osd_ord_t, osd_id_t>> osds;
+ int selected = 0;
+
+public:
+ void add_osd(osd_ord_t ord, osd_id_t osd) {
+ // osds will be added in smallest to largest order
+ assert(osds.empty() || osds.back().first <= ord);
+ osds.push_back(std::make_pair(ord, osd));
+ }
+ osd_id_t pop_osd() {
+ ceph_assert(!is_empty());
+ auto ret = osds.front();
+ osds.pop_front();
+ return ret.second;
+ }
+
+ void inc_selected() { selected++; }
+ unsigned get_num_selected() const { return selected; }
+
+ osd_ord_t get_ord() const {
+ return osds.empty() ? std::make_tuple(false, eversion_t())
+ : osds.front().first;
+ }
+
+ bool is_empty() const { return osds.empty(); }
+
+ bool operator<(const bucket_candidates_t &rhs) const {
+ return std::make_tuple(-selected, get_ord()) <
+ std::make_tuple(-rhs.selected, rhs.get_ord());
+ }
+
+ friend std::ostream &operator<<(std::ostream &, const bucket_candidates_t &);
+};
+
+std::ostream &operator<<(std::ostream &lhs, const bucket_candidates_t &cand)
+{
+ return lhs << "candidates[" << cand.osds << "]";
+}
+
+class bucket_heap_t {
+ using elem_t = std::reference_wrapper<bucket_candidates_t>;
+ std::vector<elem_t> heap;
+
+ // Max heap -- should emit buckets in order of preference
+ struct comp {
+ bool operator()(const elem_t &lhs, const elem_t &rhs) {
+ return lhs.get() < rhs.get();
+ }
+ };
+public:
+ void push_if_nonempty(elem_t e) {
+ if (!e.get().is_empty()) {
+ heap.push_back(e);
+ std::push_heap(heap.begin(), heap.end(), comp());
+ }
+ }
+ elem_t pop() {
+ std::pop_heap(heap.begin(), heap.end(), comp());
+ auto ret = heap.back();
+ heap.pop_back();
+ return ret;
+ }
+
+ bool is_empty() const { return heap.empty(); }
+};
+
+/**
+ * calc_replicated_acting_stretch
+ *
+ * Choose an acting set using as much of the up set as possible; filling
+ * in the remaining slots so as to maximize the number of crush buckets at
+ * level pool.info.peering_crush_bucket_barrier represented.
+ *
+ * Stretch clusters are a bit special: while they have a "size" the
+ * same way as normal pools, if we happen to lose a data center
+ * (we call it a "stretch bucket", but really it'll be a data center or
+ * a cloud availability zone), we don't actually want to shove
+ * 2 DC's worth of replication into a single site -- it won't fit!
+ * So we locally calculate a bucket_max, based
+ * on the targeted number of stretch buckets for the pool and
+ * its size. Then we won't pull more than bucket_max from any
+ * given ancestor even if it leaves us undersized.
+ *
+ * There are two distinct phases: (commented below)
+ */
+void PeeringState::calc_replicated_acting_stretch(
+ map<pg_shard_t, pg_info_t>::const_iterator primary,
+ eversion_t oldest_auth_log_entry,
+ unsigned size,
+ const vector<int> &acting,
+ const vector<int> &up,
+ pg_shard_t up_primary,
+ const map<pg_shard_t, pg_info_t> &all_info,
+ bool restrict_to_up_acting,
+ vector<int> *want,
+ set<pg_shard_t> *backfill,
+ set<pg_shard_t> *acting_backfill,
+ const OSDMapRef osdmap,
+ const PGPool& pool,
+ ostream &ss)
+{
+ ceph_assert(want);
+ ceph_assert(acting_backfill);
+ ceph_assert(backfill);
+ ss << __func__ << (restrict_to_up_acting ? " restrict_to_up_acting" : "")
+ << std::endl;
+
+ auto used = [want](int osd) {
+ return std::find(want->begin(), want->end(), osd) != want->end();
+ };
+
+ auto usable_info = [&](const auto &cur_info) mutable {
+ return !(cur_info.is_incomplete() ||
+ cur_info.last_update < oldest_auth_log_entry);
+ };
+
+ auto osd_info = [&](int osd) mutable -> const pg_info_t & {
+ pg_shard_t cand = pg_shard_t(osd, shard_id_t::NO_SHARD);
+ const pg_info_t &cur_info = all_info.find(cand)->second;
+ return cur_info;
+ };
+
+ auto usable_osd = [&](int osd) mutable {
+ return usable_info(osd_info(osd));
+ };
+
+ std::map<int, bucket_candidates_t> ancestors;
+ auto get_ancestor = [&](int osd) mutable {
+ int ancestor = osdmap->crush->get_parent_of_type(
+ osd,
+ pool.info.peering_crush_bucket_barrier,
+ pool.info.crush_rule);
+ return &ancestors[ancestor];
+ };
+
+ unsigned bucket_max = pool.info.size / pool.info.peering_crush_bucket_target;
+ if (bucket_max * pool.info.peering_crush_bucket_target < pool.info.size) {
+ ++bucket_max;
+ }
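+  // i.e. bucket_max = ceil(size / bucket_target); e.g. size=4 over 2 stretch
+  // buckets allows at most 2 replicas per bucket, size=5 allows at most 3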
+
+ /* 1) Select all usable osds from the up set as well as the primary
+ *
+ * We also stash any unusable osds from up into backfill.
+ */
+ auto add_required = [&](int osd) {
+ if (!used(osd)) {
+ want->push_back(osd);
+ acting_backfill->insert(
+ pg_shard_t(osd, shard_id_t::NO_SHARD));
+ get_ancestor(osd)->inc_selected();
+ }
+ };
+ add_required(primary->first.osd);
+ ss << " osd " << primary->first.osd << " primary accepted "
+ << osd_info(primary->first.osd) << std::endl;
+ for (auto upcand: up) {
+ auto upshard = pg_shard_t(upcand, shard_id_t::NO_SHARD);
+ auto &curinfo = osd_info(upcand);
+ if (usable_osd(upcand)) {
+ ss << " osd " << upcand << " (up) accepted " << curinfo << std::endl;
+ add_required(upcand);
+ } else {
+ ss << " osd " << upcand << " (up) backfill " << curinfo << std::endl;
+ backfill->insert(upshard);
+ acting_backfill->insert(upshard);
+ }
+ }
+
+ if (want->size() >= pool.info.size) { // non-failed CRUSH mappings are valid
+ ss << " up set sufficient" << std::endl;
+ return;
+ }
+ ss << " up set insufficient, considering remaining osds" << std::endl;
+
+ /* 2) Fill out remaining slots from usable osds in all_info
+ * while maximizing the number of ancestor nodes at the
+ * barrier_id crush level.
+ */
+ {
+ std::vector<std::pair<osd_ord_t, osd_id_t>> candidates;
+ /* To do this, we first filter the set of usable osd into an ordered
+ * list of usable osds
+ */
+ auto get_osd_ord = [&](bool is_acting, const pg_info_t &info) -> osd_ord_t {
+ return std::make_tuple(
+ !is_acting /* acting should sort first */,
+ info.last_update);
+ };
+ for (auto &cand : acting) {
+ auto &cand_info = osd_info(cand);
+ if (!used(cand) && usable_info(cand_info)) {
+ ss << " acting candidate " << cand << " " << cand_info << std::endl;
+ candidates.push_back(std::make_pair(get_osd_ord(true, cand_info), cand));
+ }
+ }
+ if (!restrict_to_up_acting) {
+ for (auto &[cand, info] : all_info) {
+ if (!used(cand.osd) && usable_info(info) &&
+ (std::find(acting.begin(), acting.end(), cand.osd)
+ == acting.end())) {
+ ss << " other candidate " << cand << " " << info << std::endl;
+ candidates.push_back(
+ std::make_pair(get_osd_ord(false, info), cand.osd));
+ }
+ }
+ }
+ std::sort(candidates.begin(), candidates.end());
+
+ // We then filter these candidates by ancestor
+ std::for_each(candidates.begin(), candidates.end(), [&](auto cand) {
+ get_ancestor(cand.second)->add_osd(cand.first, cand.second);
+ });
+ }
+
+ auto pop_ancestor = [&](auto &ancestor) {
+ ceph_assert(!ancestor.is_empty());
+ auto osd = ancestor.pop_osd();
+
+ ss << " accepting candidate " << osd << std::endl;
+
+ ceph_assert(!used(osd));
+ ceph_assert(usable_osd(osd));
+
+ want->push_back(osd);
+ acting_backfill->insert(
+ pg_shard_t(osd, shard_id_t::NO_SHARD));
+ ancestor.inc_selected();
+ };
+
+ /* Next, we use the ancestors map to grab a descendant of the
+ * peering_crush_mandatory_member if not already represented.
+ *
+ * TODO: using 0 here to match other users. Prior to merge, I
+ * expect that this and other users should instead check against
+ * CRUSH_ITEM_NONE.
+ */
+ if (pool.info.peering_crush_mandatory_member != CRUSH_ITEM_NONE) {
+ auto aiter = ancestors.find(pool.info.peering_crush_mandatory_member);
+ if (aiter != ancestors.end() &&
+ !aiter->second.get_num_selected()) {
+ ss << " adding required ancestor " << aiter->first << std::endl;
+ ceph_assert(!aiter->second.is_empty()); // wouldn't exist otherwise
+ pop_ancestor(aiter->second);
+ }
+ }
+
+ /* We then place the ancestors in a heap ordered by fewest selected
+ * and then by the ordering token of the next osd */
+ bucket_heap_t aheap;
+ std::for_each(ancestors.begin(), ancestors.end(), [&](auto &anc) {
+ aheap.push_if_nonempty(anc.second);
+ });
+
+ /* and pull from this heap until it's empty or we have enough.
+ * "We have enough" is a sufficient check here for
+ * stretch_set_can_peer() because our heap sorting always
+ * pulls from ancestors with the least number of included OSDs,
+ * so if it is possible to satisfy the bucket_count constraints we
+ * will do so.
+ */
+ while (!aheap.is_empty() && want->size() < pool.info.size) {
+ auto next = aheap.pop();
+ pop_ancestor(next.get());
+ if (next.get().get_num_selected() < bucket_max) {
+ aheap.push_if_nonempty(next);
+ }
+ }
+
+ /* The end result is that we should have as many buckets covered as
+ * possible while respecting up, the primary selection,
+ * the pool size (given bucket count constraints),
+ * and the mandatory member.
+ */
+}
+
+
+bool PeeringState::recoverable(const vector<int> &want) const
+{
+ unsigned num_want_acting = 0;
+ set<pg_shard_t> have;
+ for (int i = 0; i < (int)want.size(); ++i) {
+ if (want[i] != CRUSH_ITEM_NONE) {
+ ++num_want_acting;
+ have.insert(
+ pg_shard_t(
+ want[i],
+ pool.info.is_erasure() ? shard_id_t(i) : shard_id_t::NO_SHARD));
+ }
+ }
+
+ if (num_want_acting < pool.info.min_size) {
+ const bool recovery_ec_pool_below_min_size=
+ HAVE_FEATURE(get_osdmap()->get_up_osd_features(), SERVER_OCTOPUS);
+
+ if (pool.info.is_erasure() && !recovery_ec_pool_below_min_size) {
+ psdout(10) << __func__ << " failed, ec recovery below min size not supported by pre-octopus" << dendl;
+ return false;
+ } else if (!cct->_conf.get_val<bool>("osd_allow_recovery_below_min_size")) {
+ psdout(10) << __func__ << " failed, recovery below min size not enabled" << dendl;
+ return false;
+ }
+ }
+ if (missing_loc.get_recoverable_predicate()(have)) {
+ return true;
+ } else {
+ psdout(10) << __func__ << " failed, not recoverable " << dendl;
+ return false;
+ }
+}
+
+void PeeringState::choose_async_recovery_ec(
+ const map<pg_shard_t, pg_info_t> &all_info,
+ const pg_info_t &auth_info,
+ vector<int> *want,
+ set<pg_shard_t> *async_recovery,
+ const OSDMapRef osdmap) const
+{
+ set<pair<int, pg_shard_t> > candidates_by_cost;
+ for (uint8_t i = 0; i < want->size(); ++i) {
+ if ((*want)[i] == CRUSH_ITEM_NONE)
+ continue;
+
+ // Considering log entries to recover is accurate enough for
+ // now. We could use minimum_to_decode_with_cost() later if
+ // necessary.
+ pg_shard_t shard_i((*want)[i], shard_id_t(i));
+ // do not include strays
+ if (stray_set.find(shard_i) != stray_set.end())
+ continue;
+ // Do not include an osd that is not up, since choosing it as
+ // an async_recovery_target will move it out of the acting set.
+ // This results in it being identified as a stray during peering,
+ // because it is no longer in the up or acting set.
+ if (!is_up(shard_i))
+ continue;
+ auto shard_info = all_info.find(shard_i)->second;
+ // for ec pools we rollback all entries past the authoritative
+ // last_update *before* activation. This is relatively inexpensive
+ // compared to recovery, since it is purely local, so treat shards
+ // past the authoritative last_update the same as those equal to it.
+ version_t auth_version = auth_info.last_update.version;
+ version_t candidate_version = shard_info.last_update.version;
+ if (HAVE_FEATURE(osdmap->get_up_osd_features(), SERVER_NAUTILUS)) {
+ auto approx_missing_objects =
+ shard_info.stats.stats.sum.num_objects_missing;
+ if (auth_version > candidate_version) {
+ approx_missing_objects += auth_version - candidate_version;
+ }
+ if (static_cast<uint64_t>(approx_missing_objects) >
+ cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) {
+ candidates_by_cost.emplace(approx_missing_objects, shard_i);
+ }
+ } else {
+ if (auth_version > candidate_version &&
+ (auth_version - candidate_version) > cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) {
+ candidates_by_cost.insert(make_pair(auth_version - candidate_version, shard_i));
+ }
+ }
+ }
+
+ psdout(20) << __func__ << " candidates by cost are: " << candidates_by_cost
+ << dendl;
+
+ // take out as many osds as we can for async recovery, in order of cost
+ for (auto rit = candidates_by_cost.rbegin();
+ rit != candidates_by_cost.rend(); ++rit) {
+ pg_shard_t cur_shard = rit->second;
+ vector<int> candidate_want(*want);
+ candidate_want[cur_shard.shard.id] = CRUSH_ITEM_NONE;
+ if (recoverable(candidate_want)) {
+ want->swap(candidate_want);
+ async_recovery->insert(cur_shard);
+ }
+ }
+ psdout(20) << __func__ << " result want=" << *want
+ << " async_recovery=" << *async_recovery << dendl;
+}
+
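+/**
+ * replicated-pool analogue of choose_async_recovery_ec
+ *
+ * Estimate each candidate's recovery cost from its missing-object count
+ * plus log divergence and drop the most expensive candidates from want,
+ * never shrinking below min_size and never in a way that would break
+ * stretch-mode peering.
+ */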
+void PeeringState::choose_async_recovery_replicated(
+ const map<pg_shard_t, pg_info_t> &all_info,
+ const pg_info_t &auth_info,
+ vector<int> *want,
+ set<pg_shard_t> *async_recovery,
+ const OSDMapRef osdmap) const
+{
+ set<pair<int, pg_shard_t> > candidates_by_cost;
+ for (auto osd_num : *want) {
+ pg_shard_t shard_i(osd_num, shard_id_t::NO_SHARD);
+ // do not include strays
+ if (stray_set.find(shard_i) != stray_set.end())
+ continue;
+ // Do not include an osd that is not up, since choosing it as
+ // an async_recovery_target will move it out of the acting set.
+ // This results in it being identified as a stray during peering,
+ // because it is no longer in the up or acting set.
+ if (!is_up(shard_i))
+ continue;
+ auto shard_info = all_info.find(shard_i)->second;
+ // use the approximate magnitude of the difference in length of
+ // logs plus historical missing objects as the cost of recovery
+ version_t auth_version = auth_info.last_update.version;
+ version_t candidate_version = shard_info.last_update.version;
+ if (HAVE_FEATURE(osdmap->get_up_osd_features(), SERVER_NAUTILUS)) {
+ auto approx_missing_objects =
+ shard_info.stats.stats.sum.num_objects_missing;
+ if (auth_version > candidate_version) {
+ approx_missing_objects += auth_version - candidate_version;
+ } else {
+ approx_missing_objects += candidate_version - auth_version;
+ }
+ if (static_cast<uint64_t>(approx_missing_objects) >
+ cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) {
+ candidates_by_cost.emplace(approx_missing_objects, shard_i);
+ }
+ } else {
+ size_t approx_entries;
+ if (auth_version > candidate_version) {
+ approx_entries = auth_version - candidate_version;
+ } else {
+ approx_entries = candidate_version - auth_version;
+ }
+ if (approx_entries > cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) {
+ candidates_by_cost.insert(make_pair(approx_entries, shard_i));
+ }
+ }
+ }
+
+ psdout(20) << __func__ << " candidates by cost are: " << candidates_by_cost
+ << dendl;
+ // take out as many osds as we can for async recovery, in order of cost
+ for (auto rit = candidates_by_cost.rbegin();
+ rit != candidates_by_cost.rend(); ++rit) {
+ if (want->size() <= pool.info.min_size) {
+ break;
+ }
+ pg_shard_t cur_shard = rit->second;
+ vector<int> candidate_want(*want);
+ for (auto it = candidate_want.begin(); it != candidate_want.end(); ++it) {
+ if (*it == cur_shard.osd) {
+ candidate_want.erase(it);
+ if (pool.info.stretch_set_can_peer(candidate_want, *osdmap, NULL)) {
+ // if we're in stretch mode, we can only remove the osd if it doesn't
+ // break peering limits.
+ want->swap(candidate_want);
+ async_recovery->insert(cur_shard);
+ }
+ break;
+ }
+ }
+ }
+
+ psdout(20) << __func__ << " result want=" << *want
+ << " async_recovery=" << *async_recovery << dendl;
+}
+
+/**
+ * choose acting
+ *
+ * calculate the desired acting, and request a change with the monitor
+ * if it differs from the current acting.
+ *
+ * if restrict_to_up_acting=true, we filter out anything that's not in
+ * up/acting. in order to lift this restriction, we need to
+ * 1) check whether it's worth switching the acting set any time we get
+ * a new pg info (not just here, when recovery finishes)
+ * 2) check whether anything in want_acting went down on each new map
+ * (and, if so, calculate a new want_acting)
+ * 3) remove the assertion in PG::PeeringState::Active::react(const AdvMap)
+ * TODO!
+ */
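+// Returns true if the current acting set already matches the desired one;
+// returns false if no usable acting set could be chosen or if a pg_temp
+// change was requested and we must wait for it to take effect.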
+bool PeeringState::choose_acting(pg_shard_t &auth_log_shard_id,
+ bool restrict_to_up_acting,
+ bool *history_les_bound,
+ bool request_pg_temp_change_only)
+{
+ map<pg_shard_t, pg_info_t> all_info(peer_info.begin(), peer_info.end());
+ all_info[pg_whoami] = info;
+
+ if (cct->_conf->subsys.should_gather<dout_subsys, 10>()) {
+ for (auto p = all_info.begin(); p != all_info.end(); ++p) {
+ psdout(10) << __func__ << " all_info osd." << p->first << " "
+ << p->second << dendl;
+ }
+ }
+
+ auto auth_log_shard = find_best_info(all_info, restrict_to_up_acting,
+ history_les_bound);
+
+ if (auth_log_shard == all_info.end()) {
+ if (up != acting) {
+ psdout(10) << __func__ << " no suitable info found (incomplete backfills?),"
+ << " reverting to up" << dendl;
+ want_acting = up;
+ vector<int> empty;
+ pl->queue_want_pg_temp(empty);
+ } else {
+ psdout(10) << __func__ << " failed" << dendl;
+ ceph_assert(want_acting.empty());
+ }
+ return false;
+ }
+
+ ceph_assert(!auth_log_shard->second.is_incomplete());
+ auth_log_shard_id = auth_log_shard->first;
+
+ set<pg_shard_t> want_backfill, want_acting_backfill;
+ vector<int> want;
+ stringstream ss;
+ if (pool.info.is_replicated()) {
+ auto [primary_shard, oldest_log] = select_replicated_primary(
+ auth_log_shard,
+ cct->_conf.get_val<uint64_t>(
+ "osd_force_auth_primary_missing_objects"),
+ up,
+ up_primary,
+ all_info,
+ get_osdmap(),
+ ss);
+ if (pool.info.is_stretch_pool()) {
+ calc_replicated_acting_stretch(
+ primary_shard,
+ oldest_log,
+ get_osdmap()->get_pg_size(info.pgid.pgid),
+ acting,
+ up,
+ up_primary,
+ all_info,
+ restrict_to_up_acting,
+ &want,
+ &want_backfill,
+ &want_acting_backfill,
+ get_osdmap(),
+ pool,
+ ss);
+ } else {
+ calc_replicated_acting(
+ primary_shard,
+ oldest_log,
+ get_osdmap()->get_pg_size(info.pgid.pgid),
+ acting,
+ up,
+ up_primary,
+ all_info,
+ restrict_to_up_acting,
+ &want,
+ &want_backfill,
+ &want_acting_backfill,
+ get_osdmap(),
+ pool,
+ ss);
+ }
+ } else {
+ calc_ec_acting(
+ auth_log_shard,
+ get_osdmap()->get_pg_size(info.pgid.pgid),
+ acting,
+ up,
+ all_info,
+ restrict_to_up_acting,
+ &want,
+ &want_backfill,
+ &want_acting_backfill,
+ ss);
+ }
+ psdout(10) << ss.str() << dendl;
+
+ if (!recoverable(want)) {
+ want_acting.clear();
+ return false;
+ }
+
+ set<pg_shard_t> want_async_recovery;
+ if (HAVE_FEATURE(get_osdmap()->get_up_osd_features(), SERVER_MIMIC)) {
+ if (pool.info.is_erasure()) {
+ choose_async_recovery_ec(
+ all_info, auth_log_shard->second, &want, &want_async_recovery,
+ get_osdmap());
+ } else {
+ choose_async_recovery_replicated(
+ all_info, auth_log_shard->second, &want, &want_async_recovery,
+ get_osdmap());
+ }
+ }
+ while (want.size() > pool.info.size) {
+ // async recovery should have taken out as many osds as it can.
+ // if not, then always evict the last peer
+ // (will get synchronously recovered later)
+ psdout(10) << __func__ << " evicting osd." << want.back()
+ << " from oversized want " << want << dendl;
+ want.pop_back();
+ }
+ if (want != acting) {
+ psdout(10) << __func__ << " want " << want << " != acting " << acting
+ << ", requesting pg_temp change" << dendl;
+ want_acting = want;
+
+ if (!cct->_conf->osd_debug_no_acting_change) {
+ if (want_acting == up) {
+ // There can't be any pending backfill if
+ // want is the same as crush map up OSDs.
+ ceph_assert(want_backfill.empty());
+ vector<int> empty;
+ pl->queue_want_pg_temp(empty);
+ } else
+ pl->queue_want_pg_temp(want);
+ }
+ return false;
+ }
+
+ if (request_pg_temp_change_only)
+ return true;
+ want_acting.clear();
+ acting_recovery_backfill = want_acting_backfill;
+ psdout(10) << "acting_recovery_backfill is "
+ << acting_recovery_backfill << dendl;
+ ceph_assert(
+ backfill_targets.empty() ||
+ backfill_targets == want_backfill);
+ if (backfill_targets.empty()) {
+ // Caller is GetInfo
+ backfill_targets = want_backfill;
+ }
+ // Adding !needs_recovery() to let the async_recovery_targets reset after recovery is complete
+ ceph_assert(
+ async_recovery_targets.empty() ||
+ async_recovery_targets == want_async_recovery ||
+ !needs_recovery());
+ if (async_recovery_targets.empty() || !needs_recovery()) {
+ async_recovery_targets = want_async_recovery;
+ }
+ // Will not change if already set because up would have had to change
+ // Verify that nothing in backfill is in stray_set
+ for (auto i = want_backfill.begin(); i != want_backfill.end(); ++i) {
+ ceph_assert(stray_set.find(*i) == stray_set.end());
+ }
+ psdout(10) << "choose_acting want=" << want << " backfill_targets="
+ << want_backfill << " async_recovery_targets="
+ << async_recovery_targets << dendl;
+ return true;
+}
+
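+// Sanity-check the in-memory pg log against pg_info and report any
+// mismatch (tail/head bounds, caller_ops size) to the cluster log.
+// This only reports inconsistencies; it does not repair anything.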
+void PeeringState::log_weirdness()
+{
+ if (pg_log.get_tail() != info.log_tail)
+ pl->get_clog_error() << info.pgid
+ << " info mismatch, log.tail " << pg_log.get_tail()
+ << " != info.log_tail " << info.log_tail;
+ if (pg_log.get_head() != info.last_update)
+ pl->get_clog_error() << info.pgid
+ << " info mismatch, log.head " << pg_log.get_head()
+ << " != info.last_update " << info.last_update;
+
+ if (!pg_log.get_log().empty()) {
+ // sloppy check
+ if ((pg_log.get_log().log.begin()->version <= pg_log.get_tail()))
+ pl->get_clog_error() << info.pgid
+ << " log bound mismatch, info (tail,head] ("
+ << pg_log.get_tail() << ","
+ << pg_log.get_head() << "]"
+ << " actual ["
+ << pg_log.get_log().log.begin()->version << ","
+ << pg_log.get_log().log.rbegin()->version << "]";
+ }
+
+ if (pg_log.get_log().caller_ops.size() > pg_log.get_log().log.size()) {
+ pl->get_clog_error() << info.pgid
+ << " caller_ops.size "
+ << pg_log.get_log().caller_ops.size()
+ << " > log size " << pg_log.get_log().log.size();
+ }
+}
+
+/*
+ * Process information from a replica to determine if it could have any
+ * objects that I need.
+ *
+ * TODO: if the missing set becomes very large, this could get expensive.
+ * Instead, we probably want to just iterate over our unfound set.
+ */
+bool PeeringState::search_for_missing(
+ const pg_info_t &oinfo, const pg_missing_t &omissing,
+ pg_shard_t from,
+ PeeringCtxWrapper &ctx)
+{
+ uint64_t num_unfound_before = missing_loc.num_unfound();
+ bool found_missing = missing_loc.add_source_info(
+ from, oinfo, omissing, ctx.handle);
+ if (found_missing && num_unfound_before != missing_loc.num_unfound())
+ pl->publish_stats_to_osd();
+ // avoid doing this if the peer is empty. This is a bit of paranoia
+ // to avoid doing something rash if add_source_info() above
+ // incorrectly decided we found something new. (if the peer has
+ // last_update=0'0 that's impossible.)
+ if (found_missing &&
+ oinfo.last_update != eversion_t()) {
+ pg_info_t tinfo(oinfo);
+ tinfo.pgid.shard = pg_whoami.shard;
+ ctx.send_info(
+ from.osd,
+ spg_t(info.pgid.pgid, from.shard),
+ get_osdmap_epoch(), // fixme: use lower epoch?
+ get_osdmap_epoch(),
+ tinfo);
+ }
+ return found_missing;
+}
+
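+// Query every plausible peer in might_have_unfound for its full log and
+// missing set, skipping peers that are down, purged, known empty, or
+// already queried. Returns true if at least one query was sent.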
+bool PeeringState::discover_all_missing(
+ BufferedRecoveryMessages &rctx)
+{
+ auto &missing = pg_log.get_missing();
+ uint64_t unfound = get_num_unfound();
+ bool any = false; // did we start any queries
+
+ psdout(10) << __func__ << " "
+ << missing.num_missing() << " missing, "
+ << unfound << " unfound"
+ << dendl;
+
+ auto m = might_have_unfound.begin();
+ auto mend = might_have_unfound.end();
+ for (; m != mend; ++m) {
+ pg_shard_t peer(*m);
+
+ if (!get_osdmap()->is_up(peer.osd)) {
+ psdout(20) << __func__ << " skipping down osd." << peer << dendl;
+ continue;
+ }
+
+ if (peer_purged.count(peer)) {
+ psdout(20) << __func__ << " skipping purged osd." << peer << dendl;
+ continue;
+ }
+
+ auto iter = peer_info.find(peer);
+ if (iter != peer_info.end() &&
+ (iter->second.is_empty() || iter->second.dne())) {
+ // ignore empty peers
+ continue;
+ }
+
+ // If we've requested any of this stuff, the pg_missing_t information
+ // should be on its way.
+ // TODO: coalesce requested_* into a single data structure
+ if (peer_missing.find(peer) != peer_missing.end()) {
+ psdout(20) << __func__ << ": osd." << peer
+ << ": we already have pg_missing_t" << dendl;
+ continue;
+ }
+ if (peer_log_requested.find(peer) != peer_log_requested.end()) {
+ psdout(20) << __func__ << ": osd." << peer
+ << ": in peer_log_requested" << dendl;
+ continue;
+ }
+ if (peer_missing_requested.find(peer) != peer_missing_requested.end()) {
+ psdout(20) << __func__ << ": osd." << peer
+ << ": in peer_missing_requested" << dendl;
+ continue;
+ }
+
+ // Request missing
+ psdout(10) << __func__ << ": osd." << peer << ": requesting pg_missing_t"
+ << dendl;
+ peer_missing_requested.insert(peer);
+ rctx.send_query(
+ peer.osd,
+ spg_t(info.pgid.pgid, peer.shard),
+ pg_query_t(
+ pg_query_t::FULLLOG,
+ peer.shard, pg_whoami.shard,
+ info.history, get_osdmap_epoch()));
+ any = true;
+ }
+ return any;
+}
+
+/* Build the might_have_unfound set.
+ *
+ * This is used by the primary OSD during recovery.
+ *
+ * This set tracks the OSDs which might have unfound objects that the primary
+ * OSD needs. As we receive pg_missing_t from each OSD in might_have_unfound, we
+ * will remove the OSD from the set.
+ */
+void PeeringState::build_might_have_unfound()
+{
+ ceph_assert(might_have_unfound.empty());
+ ceph_assert(is_primary());
+
+ psdout(10) << __func__ << dendl;
+
+ check_past_interval_bounds();
+
+ might_have_unfound = past_intervals.get_might_have_unfound(
+ pg_whoami,
+ pool.info.is_erasure());
+
+ // include any (stray) peers
+ for (auto p = peer_info.begin(); p != peer_info.end(); ++p)
+ might_have_unfound.insert(p->first);
+
+ psdout(15) << __func__ << ": built " << might_have_unfound << dendl;
+}
+
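+/**
+ * activate
+ *
+ * Transition the PG towards peered/active: update last_epoch_started,
+ * schedule the ActivateCommitted event, and, on the primary, initialize
+ * snap trimming state, bring each peer up to date with info/log/backfill
+ * messages, and seed missing_loc so recovery knows where objects can be
+ * found.
+ */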
+void PeeringState::activate(
+ ObjectStore::Transaction& t,
+ epoch_t activation_epoch,
+ PeeringCtxWrapper &ctx)
+{
+ ceph_assert(!is_peered());
+
+ // twiddle pg state
+ state_clear(PG_STATE_DOWN);
+
+ send_notify = false;
+
+ if (is_primary()) {
+ // only update primary last_epoch_started if we will go active
+ if (acting_set_writeable()) {
+ ceph_assert(cct->_conf->osd_find_best_info_ignore_history_les ||
+ info.last_epoch_started <= activation_epoch);
+ info.last_epoch_started = activation_epoch;
+ info.last_interval_started = info.history.same_interval_since;
+ }
+ } else if (is_acting(pg_whoami)) {
+ /* update last_epoch_started on acting replica to whatever the primary sent
+ * unless it's smaller (could happen if we are going peered rather than
+ * active, see doc/dev/osd_internals/last_epoch_started.rst) */
+ if (info.last_epoch_started < activation_epoch) {
+ info.last_epoch_started = activation_epoch;
+ info.last_interval_started = info.history.same_interval_since;
+ }
+ }
+
+ auto &missing = pg_log.get_missing();
+
+ min_last_complete_ondisk = eversion_t(0,0); // we don't know (yet)!
+ if (is_primary()) {
+ last_update_ondisk = info.last_update;
+ }
+ last_update_applied = info.last_update;
+ last_rollback_info_trimmed_to_applied = pg_log.get_can_rollback_to();
+
+ need_up_thru = false;
+
+ // write pg info, log
+ dirty_info = true;
+ dirty_big_info = true; // maybe
+
+ pl->schedule_event_on_commit(
+ t,
+ std::make_shared<PGPeeringEvent>(
+ get_osdmap_epoch(),
+ get_osdmap_epoch(),
+ ActivateCommitted(
+ get_osdmap_epoch(),
+ activation_epoch)));
+
+ // init complete pointer
+ if (missing.num_missing() == 0) {
+ psdout(10) << "activate - no missing, moving last_complete " << info.last_complete
+ << " -> " << info.last_update << dendl;
+ info.last_complete = info.last_update;
+ info.stats.stats.sum.num_objects_missing = 0;
+ pg_log.reset_recovery_pointers();
+ } else {
+ psdout(10) << "activate - not complete, " << missing << dendl;
+ info.stats.stats.sum.num_objects_missing = missing.num_missing();
+ pg_log.activate_not_complete(info);
+ }
+
+ log_weirdness();
+
+ if (is_primary()) {
+ // initialize snap_trimq
+ interval_set<snapid_t> to_trim;
+ auto& removed_snaps_queue = get_osdmap()->get_removed_snaps_queue();
+ auto p = removed_snaps_queue.find(info.pgid.pgid.pool());
+ if (p != removed_snaps_queue.end()) {
+ dout(20) << "activate - purged_snaps " << info.purged_snaps
+ << " removed_snaps " << p->second
+ << dendl;
+ for (auto q : p->second) {
+ to_trim.insert(q.first, q.second);
+ }
+ }
+ interval_set<snapid_t> purged;
+ purged.intersection_of(to_trim, info.purged_snaps);
+ to_trim.subtract(purged);
+
+ if (HAVE_FEATURE(upacting_features, SERVER_OCTOPUS)) {
+ renew_lease(pl->get_mnow());
+ // do not schedule until we are actually activated
+ }
+
+ // adjust purged_snaps: the PG may have been inactive while snaps were
+ // pruned from the removed_snaps_queue in the osdmap. update the local
+ // purged_snaps to reflect only those snaps that we thought were pruned
+ // and are still in the queue.
+ info.purged_snaps.swap(purged);
+
+ // start up replicas
+ if (prior_readable_down_osds.empty()) {
+ dout(10) << __func__ << " no prior_readable_down_osds to wait on, clearing ub"
+ << dendl;
+ clear_prior_readable_until_ub();
+ }
+ info.history.refresh_prior_readable_until_ub(pl->get_mnow(),
+ prior_readable_until_ub);
+
+ ceph_assert(!acting_recovery_backfill.empty());
+ for (auto i = acting_recovery_backfill.begin();
+ i != acting_recovery_backfill.end();
+ ++i) {
+ if (*i == pg_whoami) continue;
+ pg_shard_t peer = *i;
+ ceph_assert(peer_info.count(peer));
+ pg_info_t& pi = peer_info[peer];
+
+ psdout(10) << "activate peer osd." << peer << " " << pi << dendl;
+
+ MRef<MOSDPGLog> m;
+ ceph_assert(peer_missing.count(peer));
+ pg_missing_t& pm = peer_missing[peer];
+
+ bool needs_past_intervals = pi.dne();
+
+ // Save num_bytes for backfill reservation request, can't be negative
+ peer_bytes[peer] = std::max<int64_t>(0, pi.stats.stats.sum.num_bytes);
+
+ if (pi.last_update == info.last_update) {
+ // empty log
+ if (!pi.last_backfill.is_max())
+ pl->get_clog_info() << info.pgid << " continuing backfill to osd."
+ << peer
+ << " from (" << pi.log_tail << "," << pi.last_update
+ << "] " << pi.last_backfill
+ << " to " << info.last_update;
+ if (!pi.is_empty()) {
+ psdout(10) << "activate peer osd." << peer
+ << " is up to date, queueing in pending_activators" << dendl;
+ ctx.send_info(
+ peer.osd,
+ spg_t(info.pgid.pgid, peer.shard),
+ get_osdmap_epoch(), // fixme: use lower epoch?
+ get_osdmap_epoch(),
+ info,
+ get_lease());
+ } else {
+ psdout(10) << "activate peer osd." << peer
+ << " is up to date, but sending pg_log anyway" << dendl;
+ m = make_message<MOSDPGLog>(
+ i->shard, pg_whoami.shard,
+ get_osdmap_epoch(), info,
+ last_peering_reset);
+ }
+ } else if (
+ pg_log.get_tail() > pi.last_update ||
+ pi.last_backfill == hobject_t() ||
+ (backfill_targets.count(*i) && pi.last_backfill.is_max())) {
+ /* ^ This last case covers a situation where a replica is not contiguous
+ * with the auth_log, but is contiguous with this OSD's log. Reshuffling
+ * the active set to handle this would be tricky, so instead we just go
+ * ahead and backfill it anyway. This is probably preferable in any
+ * case since the replica in question would have to be significantly
+ * behind.
+ */
+ // backfill
+ pl->get_clog_debug() << info.pgid << " starting backfill to osd." << peer
+ << " from (" << pi.log_tail << "," << pi.last_update
+ << "] " << pi.last_backfill
+ << " to " << info.last_update;
+
+ pi.last_update = info.last_update;
+ pi.last_complete = info.last_update;
+ pi.set_last_backfill(hobject_t());
+ pi.last_epoch_started = info.last_epoch_started;
+ pi.last_interval_started = info.last_interval_started;
+ pi.history = info.history;
+ pi.hit_set = info.hit_set;
+ pi.stats.stats.clear();
+ pi.stats.stats.sum.num_bytes = peer_bytes[peer];
+
+ // initialize peer with our purged_snaps.
+ pi.purged_snaps = info.purged_snaps;
+
+ m = make_message<MOSDPGLog>(
+ i->shard, pg_whoami.shard,
+ get_osdmap_epoch(), pi,
+ last_peering_reset /* epoch to create pg at */);
+
+ // send some recent log, so that op dup detection works well.
+ m->log.copy_up_to(cct, pg_log.get_log(),
+ cct->_conf->osd_max_pg_log_entries);
+ m->info.log_tail = m->log.tail;
+ pi.log_tail = m->log.tail; // sigh...
+
+ pm.clear();
+ } else {
+ // catch up
+ ceph_assert(pg_log.get_tail() <= pi.last_update);
+ m = make_message<MOSDPGLog>(
+ i->shard, pg_whoami.shard,
+ get_osdmap_epoch(), info,
+ last_peering_reset /* epoch to create pg at */);
+ // send new stuff to append to replicas log
+ m->log.copy_after(cct, pg_log.get_log(), pi.last_update);
+ }
+
+ // share past_intervals if we are creating the pg on the replica
+ // based on whether our info for that peer was dne() *before*
+ // updating pi.history in the backfill block above.
+ if (m && needs_past_intervals)
+ m->past_intervals = past_intervals;
+
+ // update local version of peer's missing list!
+ if (m && pi.last_backfill != hobject_t()) {
+ for (auto p = m->log.log.begin(); p != m->log.log.end(); ++p) {
+ if (p->soid <= pi.last_backfill &&
+ !p->is_error()) {
+ if (perform_deletes_during_peering() && p->is_delete()) {
+ pm.rm(p->soid, p->version);
+ } else {
+ pm.add_next_event(*p);
+ }
+ }
+ }
+ }
+
+ if (m) {
+ dout(10) << "activate peer osd." << peer << " sending " << m->log
+ << dendl;
+ m->lease = get_lease();
+ pl->send_cluster_message(peer.osd, m, get_osdmap_epoch());
+ }
+
+ // peer now has
+ pi.last_update = info.last_update;
+
+ // update our missing
+ if (pm.num_missing() == 0) {
+ pi.last_complete = pi.last_update;
+ psdout(10) << "activate peer osd." << peer << " " << pi
+ << " uptodate" << dendl;
+ } else {
+ psdout(10) << "activate peer osd." << peer << " " << pi
+ << " missing " << pm << dendl;
+ }
+ }
+
+ // Set up missing_loc
+ set<pg_shard_t> complete_shards;
+ for (auto i = acting_recovery_backfill.begin();
+ i != acting_recovery_backfill.end();
+ ++i) {
+ psdout(20) << __func__ << " setting up missing_loc from shard " << *i
+ << " " << dendl;
+ if (*i == get_primary()) {
+ missing_loc.add_active_missing(missing);
+ if (!missing.have_missing())
+ complete_shards.insert(*i);
+ } else {
+ auto peer_missing_entry = peer_missing.find(*i);
+ ceph_assert(peer_missing_entry != peer_missing.end());
+ missing_loc.add_active_missing(peer_missing_entry->second);
+ if (!peer_missing_entry->second.have_missing() &&
+ peer_info[*i].last_backfill.is_max())
+ complete_shards.insert(*i);
+ }
+ }
+
+ // If necessary, create might_have_unfound to help us find our unfound objects.
+ // NOTE: It's important that we build might_have_unfound before trimming the
+ // past intervals.
+ might_have_unfound.clear();
+ if (needs_recovery()) {
+ // If only one shard has missing objects, we take a shortcut and add all
+ // the other shards as recovery sources. This is considered safe since
+ // the PGLogs have been merged locally, and it covers the vast majority
+ // of use cases, e.g. one OSD/host being down for a while for hardware
+ // repair.
+ if (complete_shards.size() + 1 == acting_recovery_backfill.size()) {
+ missing_loc.add_batch_sources_info(complete_shards, ctx.handle);
+ } else {
+ missing_loc.add_source_info(pg_whoami, info, pg_log.get_missing(),
+ ctx.handle);
+ for (auto i = acting_recovery_backfill.begin();
+ i != acting_recovery_backfill.end();
+ ++i) {
+ if (*i == pg_whoami) continue;
+ psdout(10) << __func__ << ": adding " << *i << " as a source" << dendl;
+ ceph_assert(peer_missing.count(*i));
+ ceph_assert(peer_info.count(*i));
+ missing_loc.add_source_info(
+ *i,
+ peer_info[*i],
+ peer_missing[*i],
+ ctx.handle);
+ }
+ }
+ for (auto i = peer_missing.begin(); i != peer_missing.end(); ++i) {
+ if (is_acting_recovery_backfill(i->first))
+ continue;
+ ceph_assert(peer_info.count(i->first));
+ search_for_missing(
+ peer_info[i->first],
+ i->second,
+ i->first,
+ ctx);
+ }
+
+ build_might_have_unfound();
+
+ // Always call now so update_calc_stats() will be accurate
+ discover_all_missing(ctx.msgs);
+
+ }
+
+ // if calculated, num_objects_degraded should reflect this too, unless
+ // nothing is missing and we are about to go clean.
+ if (get_osdmap()->get_pg_size(info.pgid.pgid) > actingset.size()) {
+ state_set(PG_STATE_UNDERSIZED);
+ }
+
+ state_set(PG_STATE_ACTIVATING);
+ pl->on_activate(std::move(to_trim));
+ }
+ if (acting_set_writeable()) {
+ PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
+ pg_log.roll_forward(rollbacker.get());
+ }
+}
+
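+// Push our current pg_info_t (plus a lease on octopus or later) to every
+// peer in acting_recovery_backfill so their history and
+// last_epoch_started stay in sync with the primary.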
+void PeeringState::share_pg_info()
+{
+ psdout(10) << "share_pg_info" << dendl;
+
+ info.history.refresh_prior_readable_until_ub(pl->get_mnow(),
+ prior_readable_until_ub);
+
+ // share new pg_info_t with replicas
+ ceph_assert(!acting_recovery_backfill.empty());
+ for (auto pg_shard : acting_recovery_backfill) {
+ if (pg_shard == pg_whoami) continue;
+ if (auto peer = peer_info.find(pg_shard); peer != peer_info.end()) {
+ peer->second.last_epoch_started = info.last_epoch_started;
+ peer->second.last_interval_started = info.last_interval_started;
+ peer->second.history.merge(info.history);
+ }
+ MessageRef m;
+ if (last_require_osd_release >= ceph_release_t::octopus) {
+ m = make_message<MOSDPGInfo2>(spg_t{info.pgid.pgid, pg_shard.shard},
+ info,
+ get_osdmap_epoch(),
+ get_osdmap_epoch(),
+ std::optional<pg_lease_t>{get_lease()},
+ std::nullopt);
+ } else {
+ m = make_message<MOSDPGInfo>(get_osdmap_epoch(),
+ MOSDPGInfo::pg_list_t{
+ pg_notify_t{pg_shard.shard,
+ pg_whoami.shard,
+ get_osdmap_epoch(),
+ get_osdmap_epoch(),
+ info,
+ past_intervals}});
+ }
+ pl->send_cluster_message(pg_shard.osd, m, get_osdmap_epoch());
+ }
+}
+
+void PeeringState::merge_log(
+ ObjectStore::Transaction& t, pg_info_t &oinfo, pg_log_t&& olog,
+ pg_shard_t from)
+{
+ PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
+ pg_log.merge_log(
+ oinfo, std::move(olog), from, info, rollbacker.get(),
+ dirty_info, dirty_big_info);
+}
+
+void PeeringState::rewind_divergent_log(
+ ObjectStore::Transaction& t, eversion_t newhead)
+{
+ PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
+ pg_log.rewind_divergent_log(
+ newhead, info, rollbacker.get(), dirty_info, dirty_big_info);
+}
+
+
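+// Called on a replica when the primary shares its pg_info_t: merge the
+// history, reset stale scrub error counters, and adopt the primary's
+// purged_snaps if they differ from ours.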
+void PeeringState::proc_primary_info(
+ ObjectStore::Transaction &t, const pg_info_t &oinfo)
+{
+ ceph_assert(!is_primary());
+
+ update_history(oinfo.history);
+ if (!info.stats.stats_invalid && info.stats.stats.sum.num_scrub_errors) {
+ info.stats.stats.sum.num_scrub_errors = 0;
+ info.stats.stats.sum.num_shallow_scrub_errors = 0;
+ info.stats.stats.sum.num_deep_scrub_errors = 0;
+ dirty_info = true;
+ }
+
+ if (!(info.purged_snaps == oinfo.purged_snaps)) {
+ psdout(10) << __func__ << " updating purged_snaps to "
+ << oinfo.purged_snaps
+ << dendl;
+ info.purged_snaps = oinfo.purged_snaps;
+ dirty_info = true;
+ dirty_big_info = true;
+ }
+}
+
+void PeeringState::proc_master_log(
+ ObjectStore::Transaction& t, pg_info_t &oinfo,
+ pg_log_t&& olog, pg_missing_t&& omissing, pg_shard_t from)
+{
+ psdout(10) << "proc_master_log for osd." << from << ": "
+ << olog << " " << omissing << dendl;
+ ceph_assert(!is_peered() && is_primary());
+
+ // merge log into our own log to build master log. no need to
+ // make any adjustments to their missing map; we are taking their
+ // log to be authoritative (i.e., their entries are by definition
+ // non-divergent).
+ merge_log(t, oinfo, std::move(olog), from);
+ peer_info[from] = oinfo;
+ psdout(10) << " peer osd." << from << " now " << oinfo
+ << " " << omissing << dendl;
+ might_have_unfound.insert(from);
+
+ // See doc/dev/osd_internals/last_epoch_started
+ if (oinfo.last_epoch_started > info.last_epoch_started) {
+ info.last_epoch_started = oinfo.last_epoch_started;
+ dirty_info = true;
+ }
+ if (oinfo.last_interval_started > info.last_interval_started) {
+ info.last_interval_started = oinfo.last_interval_started;
+ dirty_info = true;
+ }
+ update_history(oinfo.history);
+ ceph_assert(cct->_conf->osd_find_best_info_ignore_history_les ||
+ info.last_epoch_started >= info.history.last_epoch_started);
+
+ peer_missing[from].claim(std::move(omissing));
+}
+
+void PeeringState::proc_replica_log(
+ pg_info_t &oinfo,
+ const pg_log_t &olog,
+ pg_missing_t&& omissing,
+ pg_shard_t from)
+{
+ psdout(10) << "proc_replica_log for osd." << from << ": "
+ << oinfo << " " << olog << " " << omissing << dendl;
+
+ pg_log.proc_replica_log(oinfo, olog, omissing, from);
+
+ peer_info[from] = oinfo;
+ psdout(10) << " peer osd." << from << " now "
+ << oinfo << " " << omissing << dendl;
+ might_have_unfound.insert(from);
+
+ for (auto i = omissing.get_items().begin();
+ i != omissing.get_items().end();
+ ++i) {
+ psdout(20) << " after missing " << i->first
+ << " need " << i->second.need
+ << " have " << i->second.have << dendl;
+ }
+ peer_missing[from].claim(std::move(omissing));
+}
+
+void PeeringState::fulfill_info(
+ pg_shard_t from, const pg_query_t &query,
+ pair<pg_shard_t, pg_info_t> &notify_info)
+{
+ ceph_assert(from == primary);
+ ceph_assert(query.type == pg_query_t::INFO);
+
+ // info
+ psdout(10) << "sending info" << dendl;
+ notify_info = make_pair(from, info);
+}
+
+void PeeringState::fulfill_log(
+ pg_shard_t from, const pg_query_t &query, epoch_t query_epoch)
+{
+ psdout(10) << "log request from " << from << dendl;
+ ceph_assert(from == primary);
+ ceph_assert(query.type != pg_query_t::INFO);
+
+ auto mlog = make_message<MOSDPGLog>(
+ from.shard, pg_whoami.shard,
+ get_osdmap_epoch(),
+ info, query_epoch);
+ mlog->missing = pg_log.get_missing();
+
+ // primary -> other, when building master log
+ if (query.type == pg_query_t::LOG) {
+ psdout(10) << " sending info+missing+log since " << query.since
+ << dendl;
+ if (query.since != eversion_t() && query.since < pg_log.get_tail()) {
+ pl->get_clog_error() << info.pgid << " got broken pg_query_t::LOG since "
+ << query.since
+ << " when my log.tail is " << pg_log.get_tail()
+ << ", sending full log instead";
+ mlog->log = pg_log.get_log(); // primary should not have requested this!!
+ } else
+ mlog->log.copy_after(cct, pg_log.get_log(), query.since);
+ }
+ else if (query.type == pg_query_t::FULLLOG) {
+ psdout(10) << " sending info+missing+full log" << dendl;
+ mlog->log = pg_log.get_log();
+ }
+
+ psdout(10) << " sending " << mlog->log << " " << mlog->missing << dendl;
+
+ pl->send_cluster_message(from.osd, mlog, get_osdmap_epoch(), true);
+}
+
+void PeeringState::fulfill_query(const MQuery& query, PeeringCtxWrapper &rctx)
+{
+ if (query.query.type == pg_query_t::INFO) {
+ pair<pg_shard_t, pg_info_t> notify_info;
+ // note this refreshes our prior_readable_until_ub value
+ update_history(query.query.history);
+ fulfill_info(query.from, query.query, notify_info);
+ rctx.send_notify(
+ notify_info.first.osd,
+ pg_notify_t(
+ notify_info.first.shard, pg_whoami.shard,
+ query.query_epoch,
+ get_osdmap_epoch(),
+ notify_info.second,
+ past_intervals));
+ } else {
+ update_history(query.query.history);
+ fulfill_log(query.from, query.query, query.query_epoch);
+ }
+}
+
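+// If the acting set is at full pool size, mark the PG clean and record
+// last_epoch_clean; if we are merely peered and already clean, tell the
+// OSD we are ready to take part in a pending PG merge. Always shares the
+// updated info and republishes stats.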
+void PeeringState::try_mark_clean()
+{
+ if (actingset.size() == get_osdmap()->get_pg_size(info.pgid.pgid)) {
+ state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
+ state_set(PG_STATE_CLEAN);
+ info.history.last_epoch_clean = get_osdmap_epoch();
+ info.history.last_interval_clean = info.history.same_interval_since;
+ past_intervals.clear();
+ dirty_big_info = true;
+ dirty_info = true;
+ }
+
+ if (!is_active() && is_peered()) {
+ if (is_clean()) {
+ bool target;
+ if (pool.info.is_pending_merge(info.pgid.pgid, &target)) {
+ if (target) {
+ psdout(10) << "ready to merge (target)" << dendl;
+ pl->set_ready_to_merge_target(
+ info.last_update,
+ info.history.last_epoch_started,
+ info.history.last_epoch_clean);
+ } else {
+ psdout(10) << "ready to merge (source)" << dendl;
+ pl->set_ready_to_merge_source(info.last_update);
+ }
+ }
+ } else {
+ psdout(10) << "not clean, not ready to merge" << dendl;
+ // we should have notified OSD in Active state entry point
+ }
+ }
+
+ state_clear(PG_STATE_FORCED_RECOVERY | PG_STATE_FORCED_BACKFILL);
+
+ share_pg_info();
+ pl->publish_stats_to_osd();
+ clear_recovery_state();
+}
+
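+// Populate a child PeeringState when this PG splits: carve out the
+// child's share of the pg log, copy history and stats, restart backfill
+// unless we were fully backfilled, and recompute the child's up/acting
+// mapping from the current osdmap.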
+void PeeringState::split_into(
+ pg_t child_pgid, PeeringState *child, unsigned split_bits)
+{
+ child->update_osdmap_ref(get_osdmap());
+ child->pool = pool;
+
+ // Log
+ pg_log.split_into(child_pgid, split_bits, &(child->pg_log));
+ child->info.last_complete = info.last_complete;
+
+ info.last_update = pg_log.get_head();
+ child->info.last_update = child->pg_log.get_head();
+
+ child->info.last_user_version = info.last_user_version;
+
+ info.log_tail = pg_log.get_tail();
+ child->info.log_tail = child->pg_log.get_tail();
+
+ // reset last_complete, we might have modified pg_log & missing above
+ pg_log.reset_complete_to(&info);
+ child->pg_log.reset_complete_to(&child->info);
+
+ // Info
+ child->info.history = info.history;
+ child->info.history.epoch_created = get_osdmap_epoch();
+ child->info.purged_snaps = info.purged_snaps;
+
+ if (info.last_backfill.is_max()) {
+ child->info.set_last_backfill(hobject_t::get_max());
+ } else {
+ // restart backfill on parent and child to be safe. we could
+ // probably do better in the bitwise sort case, but it's more
+ // fragile (there may be special work to do on backfill completion
+ // in the future).
+ info.set_last_backfill(hobject_t());
+ child->info.set_last_backfill(hobject_t());
+ // restarting backfill implies that the missing set is empty,
+ // since it is only used for objects prior to last_backfill
+ pg_log.reset_backfill();
+ child->pg_log.reset_backfill();
+ }
+
+ child->info.stats = info.stats;
+ child->info.stats.parent_split_bits = split_bits;
+ info.stats.stats_invalid = true;
+ child->info.stats.stats_invalid = true;
+ child->info.last_epoch_started = info.last_epoch_started;
+ child->info.last_interval_started = info.last_interval_started;
+
+ // There can't be recovery/backfill going on now
+ int primary, up_primary;
+ vector<int> newup, newacting;
+ get_osdmap()->pg_to_up_acting_osds(
+ child->info.pgid.pgid, &newup, &up_primary, &newacting, &primary);
+ child->init_primary_up_acting(
+ newup,
+ newacting,
+ up_primary,
+ primary);
+ child->role = OSDMap::calc_pg_role(pg_whoami, child->acting);
+
+ // this comparison includes primary rank via pg_shard_t
+ if (get_primary() != child->get_primary())
+ child->info.history.same_primary_since = get_osdmap_epoch();
+
+ child->info.stats.up = newup;
+ child->info.stats.up_primary = up_primary;
+ child->info.stats.acting = newacting;
+ child->info.stats.acting_primary = primary;
+ child->info.stats.mapping_epoch = get_osdmap_epoch();
+
+ // History
+ child->past_intervals = past_intervals;
+
+ child->on_new_interval();
+
+ child->send_notify = !child->is_primary();
+
+ child->dirty_info = true;
+ child->dirty_big_info = true;
+ dirty_info = true;
+ dirty_big_info = true;
+}
+
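+// Absorb the source PGs into this (target) PG during a PG merge: roll
+// logs forward, combine stats and logs, and mark the result incomplete
+// (last_backfill reset) if any source or the target was not fully clean
+// and up to date.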
+void PeeringState::merge_from(
+ map<spg_t,PeeringState *>& sources,
+ PeeringCtx &rctx,
+ unsigned split_bits,
+ const pg_merge_meta_t& last_pg_merge_meta)
+{
+ bool incomplete = false;
+ if (info.last_complete != info.last_update ||
+ info.is_incomplete() ||
+ info.dne()) {
+ psdout(10) << __func__ << " target incomplete" << dendl;
+ incomplete = true;
+ }
+ if (last_pg_merge_meta.source_pgid != pg_t()) {
+ if (info.pgid.pgid != last_pg_merge_meta.source_pgid.get_parent()) {
+ psdout(10) << __func__ << " target doesn't match expected parent "
+ << last_pg_merge_meta.source_pgid.get_parent()
+ << " of source_pgid " << last_pg_merge_meta.source_pgid
+ << dendl;
+ incomplete = true;
+ }
+ if (info.last_update != last_pg_merge_meta.target_version) {
+ psdout(10) << __func__ << " target version doesn't match expected "
+ << last_pg_merge_meta.target_version << dendl;
+ incomplete = true;
+ }
+ }
+
+ PGLog::LogEntryHandlerRef handler{pl->get_log_handler(rctx.transaction)};
+ pg_log.roll_forward(handler.get());
+
+ info.last_complete = info.last_update; // to fake out trim()
+ pg_log.reset_recovery_pointers();
+ pg_log.trim(info.last_update, info);
+
+ vector<PGLog*> log_from;
+ for (auto& i : sources) {
+ auto& source = i.second;
+ if (!source) {
+ psdout(10) << __func__ << " source " << i.first << " missing" << dendl;
+ incomplete = true;
+ continue;
+ }
+ if (source->info.last_complete != source->info.last_update ||
+ source->info.is_incomplete() ||
+ source->info.dne()) {
+ psdout(10) << __func__ << " source " << source->pg_whoami
+ << " incomplete"
+ << dendl;
+ incomplete = true;
+ }
+ if (last_pg_merge_meta.source_pgid != pg_t()) {
+ if (source->info.pgid.pgid != last_pg_merge_meta.source_pgid) {
+ dout(10) << __func__ << " source " << source->info.pgid.pgid
+ << " doesn't match expected source pgid "
+ << last_pg_merge_meta.source_pgid << dendl;
+ incomplete = true;
+ }
+ if (source->info.last_update != last_pg_merge_meta.source_version) {
+ dout(10) << __func__ << " source version doesn't match expected "
+ << last_pg_merge_meta.target_version << dendl;
+ incomplete = true;
+ }
+ }
+
+ // prepare log
+ PGLog::LogEntryHandlerRef handler{
+ source->pl->get_log_handler(rctx.transaction)};
+ source->pg_log.roll_forward(handler.get());
+ source->info.last_complete = source->info.last_update; // to fake out trim()
+ source->pg_log.reset_recovery_pointers();
+ source->pg_log.trim(source->info.last_update, source->info);
+ log_from.push_back(&source->pg_log);
+
+ // combine stats
+ info.stats.add(source->info.stats);
+
+ // pull up last_update
+ info.last_update = std::max(info.last_update, source->info.last_update);
+
+ // adopt source's PastIntervals if target has none. we can do this since
+ // pgp_num has been reduced prior to the merge, so the OSD mappings for
+ // the PGs are identical.
+ if (past_intervals.empty() && !source->past_intervals.empty()) {
+ psdout(10) << __func__ << " taking source's past_intervals" << dendl;
+ past_intervals = source->past_intervals;
+ }
+ }
+
+ info.last_complete = info.last_update;
+ info.log_tail = info.last_update;
+ if (incomplete) {
+ info.last_backfill = hobject_t();
+ }
+
+ // merge logs
+ pg_log.merge_from(log_from, info.last_update);
+
+ // make sure we have a meaningful last_epoch_started/clean (if we were a
+ // placeholder)
+ if (info.history.epoch_created == 0) {
+ // start with (a) source's history, since these PGs *should* have been
+ // remapped in concert with each other...
+ info.history = sources.begin()->second->info.history;
+
+ // we use the last_epoch_{started,clean} we got from the caller, which
+ // are the epochs that were reported when the PGs were found to be
+ // ready to merge.
+ info.history.last_epoch_clean = last_pg_merge_meta.last_epoch_clean;
+ info.history.last_epoch_started = last_pg_merge_meta.last_epoch_started;
+ info.last_epoch_started = last_pg_merge_meta.last_epoch_started;
+ psdout(10) << __func__
+ << " set les/c to " << last_pg_merge_meta.last_epoch_started << "/"
+ << last_pg_merge_meta.last_epoch_clean
+ << " from pool last_dec_*, source pg history was "
+ << sources.begin()->second->info.history
+ << dendl;
+
+ // above we have pulled down source's history and we need to check
+ // history.epoch_created again to confirm that source is not a placeholder
+ // too. (peering requires a sane history.same_interval_since value for any
+ // non-newly created pg and below here we know we are basically iterating
+ // back a series of past maps to fake a merge process, hence we need to
+ // fix history.same_interval_since first so that start_peering_interval()
+ // will not complain)
+ if (info.history.epoch_created == 0) {
+ dout(10) << __func__ << " both merge target and source are placeholders,"
+ << " set sis to lec " << info.history.last_epoch_clean
+ << dendl;
+ info.history.same_interval_since = info.history.last_epoch_clean;
+ }
+
+ // if the past_intervals start is later than last_epoch_clean, it
+ // implies the source repeered again but the target didn't, or
+ // that the source became clean in a later epoch than the target.
+ // avoid the discrepancy by adjusting the interval start
+ // backwards to match so that check_past_interval_bounds() will
+ // not complain.
+ auto pib = past_intervals.get_bounds();
+ if (info.history.last_epoch_clean < pib.first) {
+ psdout(10) << __func__ << " last_epoch_clean "
+ << info.history.last_epoch_clean << " < past_interval start "
+ << pib.first << ", adjusting start backwards" << dendl;
+ past_intervals.adjust_start_backwards(info.history.last_epoch_clean);
+ }
+
+ // Similarly, if the same_interval_since value is later than
+ // last_epoch_clean, the next interval change will result in a
+ // past_interval start that is later than last_epoch_clean. This
+ // can happen if we use the pg_history values from the merge
+ // source. Adjust the same_interval_since value backwards if that
+ // happens. (We trust the les and lec values more because they came from
+ // the real target, whereas the history value we stole from the source.)
+ if (info.history.last_epoch_started < info.history.same_interval_since) {
+ psdout(10) << __func__ << " last_epoch_started "
+ << info.history.last_epoch_started << " < same_interval_since "
+ << info.history.same_interval_since
+ << ", adjusting pg_history backwards" << dendl;
+ info.history.same_interval_since = info.history.last_epoch_clean;
+ // make sure same_{up,primary}_since are <= same_interval_since
+ info.history.same_up_since = std::min(
+ info.history.same_up_since, info.history.same_interval_since);
+ info.history.same_primary_since = std::min(
+ info.history.same_primary_since, info.history.same_interval_since);
+ }
+ }
+
+ dirty_info = true;
+ dirty_big_info = true;
+}
+
+void PeeringState::start_split_stats(
+ const set<spg_t>& childpgs, vector<object_stat_sum_t> *out)
+{
+ out->resize(childpgs.size() + 1);
+ info.stats.stats.sum.split(*out);
+}
+
+void PeeringState::finish_split_stats(
+ const object_stat_sum_t& stats, ObjectStore::Transaction &t)
+{
+ info.stats.stats.sum = stats;
+ write_if_dirty(t);
+}
+
+void PeeringState::update_blocked_by()
+{
+ // set a max on the number of blocking peers we report. if we go
+ // over, report a random subset. keep the result sorted.
+ unsigned keep = std::min<unsigned>(
+ blocked_by.size(), cct->_conf->osd_max_pg_blocked_by);
+ unsigned skip = blocked_by.size() - keep;
+ info.stats.blocked_by.clear();
+ info.stats.blocked_by.resize(keep);
+ unsigned pos = 0;
+ for (auto p = blocked_by.begin(); p != blocked_by.end() && keep > 0; ++p) {
+ if (skip > 0 && (rand() % (skip + keep) < skip)) {
+ --skip;
+ } else {
+ info.stats.blocked_by[pos++] = *p;
+ --keep;
+ }
+ }
+}
+
+static bool find_shard(const set<pg_shard_t> & pgs, shard_id_t shard)
+{
+ for (auto&p : pgs)
+ if (p.shard == shard)
+ return true;
+ return false;
+}
+
+static pg_shard_t get_another_shard(const set<pg_shard_t> & pgs, pg_shard_t skip, shard_id_t shard)
+{
+ for (auto&p : pgs) {
+ if (p == skip)
+ continue;
+ if (p.shard == shard)
+ return p;
+ }
+ return pg_shard_t();
+}
+
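+// Recompute the derived fields of info.stats (log sizes, copy counts,
+// degraded/misplaced/unfound totals and the object location histogram)
+// from the current peer_info, peer_missing and missing_loc state.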
+void PeeringState::update_calc_stats()
+{
+ info.stats.version = info.last_update;
+ info.stats.created = info.history.epoch_created;
+ info.stats.last_scrub = info.history.last_scrub;
+ info.stats.last_scrub_stamp = info.history.last_scrub_stamp;
+ info.stats.last_deep_scrub = info.history.last_deep_scrub;
+ info.stats.last_deep_scrub_stamp = info.history.last_deep_scrub_stamp;
+ info.stats.last_clean_scrub_stamp = info.history.last_clean_scrub_stamp;
+ info.stats.last_epoch_clean = info.history.last_epoch_clean;
+
+ info.stats.log_size = pg_log.get_head().version - pg_log.get_tail().version;
+ info.stats.ondisk_log_size = info.stats.log_size;
+ info.stats.log_start = pg_log.get_tail();
+ info.stats.ondisk_log_start = pg_log.get_tail();
+ info.stats.snaptrimq_len = pl->get_snap_trimq_size();
+
+ unsigned num_shards = get_osdmap()->get_pg_size(info.pgid.pgid);
+
+ // In the rare case that upset is too large (usually transient), use its
+ // size as the target for the calculations below.
+ unsigned target = std::max(num_shards, (unsigned)upset.size());
+ // For an undersized PG, actingset may be larger than upset when OSDs are out
+ unsigned nrep = std::max(actingset.size(), upset.size());
+ // calc num_object_copies
+ info.stats.stats.calc_copies(std::max(target, nrep));
+ info.stats.stats.sum.num_objects_degraded = 0;
+ info.stats.stats.sum.num_objects_unfound = 0;
+ info.stats.stats.sum.num_objects_misplaced = 0;
+ info.stats.avail_no_missing.clear();
+ info.stats.object_location_counts.clear();
+
+ // We should never hit this condition, but if we end up hitting it,
+ // make sure to update num_objects and set PG_STATE_INCONSISTENT.
+ if (info.stats.stats.sum.num_objects < 0) {
+ psdout(0) << __func__ << " negative num_objects = "
+ << info.stats.stats.sum.num_objects << " setting it to 0 "
+ << dendl;
+ info.stats.stats.sum.num_objects = 0;
+ state_set(PG_STATE_INCONSISTENT);
+ }
+
+ if ((is_remapped() || is_undersized() || !is_clean()) &&
+ (is_peered()|| is_activating())) {
+ psdout(20) << __func__ << " actingset " << actingset << " upset "
+ << upset << " acting_recovery_backfill " << acting_recovery_backfill << dendl;
+
+ ceph_assert(!acting_recovery_backfill.empty());
+
+ bool estimate = false;
+
+ // NOTE: we only generate degraded, misplaced and unfound
+ // values for the summation, not individual stat categories.
+ int64_t num_objects = info.stats.stats.sum.num_objects;
+
+ // Objects missing from up nodes, sorted by # objects.
+ boost::container::flat_set<pair<int64_t,pg_shard_t>> missing_target_objects;
+ // Objects missing from nodes not in up, sorted by # objects
+ boost::container::flat_set<pair<int64_t,pg_shard_t>> acting_source_objects;
+
+ // Fill missing_target_objects/acting_source_objects
+
+ {
+ int64_t missing;
+
+ // Primary first
+ missing = pg_log.get_missing().num_missing();
+ ceph_assert(acting_recovery_backfill.count(pg_whoami));
+ if (upset.count(pg_whoami)) {
+ missing_target_objects.emplace(missing, pg_whoami);
+ } else {
+ acting_source_objects.emplace(missing, pg_whoami);
+ }
+ info.stats.stats.sum.num_objects_missing_on_primary = missing;
+ if (missing == 0)
+ info.stats.avail_no_missing.push_back(pg_whoami);
+ psdout(20) << __func__ << " shard " << pg_whoami
+ << " primary objects " << num_objects
+ << " missing " << missing
+ << dendl;
+ }
+
+ // All other peers
+ for (auto& peer : peer_info) {
+ // Primary should not be in the peer_info, skip if it is.
+ if (peer.first == pg_whoami) continue;
+ int64_t missing = 0;
+ int64_t peer_num_objects =
+ std::max((int64_t)0, peer.second.stats.stats.sum.num_objects);
+ // Backfill targets always track num_objects accurately;
+ // all other peers track missing accurately.
+ if (is_backfill_target(peer.first)) {
+ missing = std::max((int64_t)0, num_objects - peer_num_objects);
+ } else {
+ if (peer_missing.count(peer.first)) {
+ missing = peer_missing[peer.first].num_missing();
+ } else {
+ psdout(20) << __func__ << " no peer_missing found for "
+ << peer.first << dendl;
+ if (is_recovering()) {
+ estimate = true;
+ }
+ missing = std::max((int64_t)0, num_objects - peer_num_objects);
+ }
+ }
+ if (upset.count(peer.first)) {
+ missing_target_objects.emplace(missing, peer.first);
+ } else if (actingset.count(peer.first)) {
+ acting_source_objects.emplace(missing, peer.first);
+ }
+ peer.second.stats.stats.sum.num_objects_missing = missing;
+ if (missing == 0)
+ info.stats.avail_no_missing.push_back(peer.first);
+ psdout(20) << __func__ << " shard " << peer.first
+ << " objects " << peer_num_objects
+ << " missing " << missing
+ << dendl;
+ }
+
+ // Compute object_location_counts
+ for (auto& ml: missing_loc.get_missing_locs()) {
+ info.stats.object_location_counts[ml.second]++;
+ psdout(30) << __func__ << " " << ml.first << " object_location_counts["
+ << ml.second << "]=" << info.stats.object_location_counts[ml.second]
+ << dendl;
+ }
+ int64_t not_missing = num_objects - missing_loc.get_missing_locs().size();
+ if (not_missing) {
+ // During recovery we know upset == actingset and that it is being populated.
+ // During backfill we know that all non-missing objects are in the actingset.
+ info.stats.object_location_counts[actingset] = not_missing;
+ }
+ psdout(30) << __func__ << " object_location_counts["
+ << upset << "]=" << info.stats.object_location_counts[upset]
+ << dendl;
+ psdout(20) << __func__ << " object_location_counts "
+ << info.stats.object_location_counts << dendl;
+
+ // A misplaced object is not stored on the correct OSD
+ int64_t misplaced = 0;
+ // A degraded object has fewer replicas or EC shards than the pool specifies.
+ int64_t degraded = 0;
+
+ if (is_recovering()) {
+ for (auto& sml: missing_loc.get_missing_by_count()) {
+ for (auto& ml: sml.second) {
+ int missing_shards;
+ if (sml.first == shard_id_t::NO_SHARD) {
+ psdout(20) << __func__ << " ml " << ml.second
+ << " upset size " << upset.size()
+ << " up " << ml.first.up << dendl;
+ missing_shards = (int)upset.size() - ml.first.up;
+ } else {
+ // Handle shards not even in upset below
+ if (!find_shard(upset, sml.first))
+ continue;
+ missing_shards = std::max(0, 1 - ml.first.up);
+ psdout(20) << __func__
+ << " shard " << sml.first
+ << " ml " << ml.second
+ << " missing shards " << missing_shards << dendl;
+ }
+ int odegraded = ml.second * missing_shards;
+ // Copies on other osds, but limited to the number of possibly degraded shards
+ int more_osds = std::min(missing_shards, ml.first.other);
+ int omisplaced = ml.second * more_osds;
+ ceph_assert(omisplaced <= odegraded);
+ odegraded -= omisplaced;
+
+ misplaced += omisplaced;
+ degraded += odegraded;
+ }
+ }
+
+ psdout(20) << __func__ << " missing based degraded "
+ << degraded << dendl;
+ psdout(20) << __func__ << " missing based misplaced "
+ << misplaced << dendl;
+
+ // Handle undersized case
+ if (pool.info.is_replicated()) {
+ // Add degraded for missing targets (num_objects missing)
+ ceph_assert(target >= upset.size());
+ unsigned needed = target - upset.size();
+ degraded += num_objects * needed;
+ } else {
+ for (unsigned i = 0 ; i < num_shards; ++i) {
+ shard_id_t shard(i);
+
+ if (!find_shard(upset, shard)) {
+ pg_shard_t pgs = get_another_shard(actingset, pg_shard_t(), shard);
+
+ if (pgs != pg_shard_t()) {
+ int64_t missing;
+
+ if (pgs == pg_whoami)
+ missing = info.stats.stats.sum.num_objects_missing_on_primary;
+ else
+ missing = peer_info[pgs].stats.stats.sum.num_objects_missing;
+
+ degraded += missing;
+ misplaced += std::max((int64_t)0, num_objects - missing);
+ } else {
+ // No shard anywhere
+ degraded += num_objects;
+ }
+ }
+ }
+ }
+ goto out;
+ }
+
+ // Handle undersized case
+ if (pool.info.is_replicated()) {
+ // Add to missing_target_objects
+ ceph_assert(target >= missing_target_objects.size());
+ unsigned needed = target - missing_target_objects.size();
+ if (needed)
+ missing_target_objects.emplace(num_objects * needed, pg_shard_t(pg_shard_t::NO_OSD));
+ } else {
+ for (unsigned i = 0 ; i < num_shards; ++i) {
+ shard_id_t shard(i);
+ bool found = false;
+ for (const auto& t : missing_target_objects) {
+ if (std::get<1>(t).shard == shard) {
+ found = true;
+ break;
+ }
+ }
+ if (!found)
+ missing_target_objects.emplace(num_objects, pg_shard_t(pg_shard_t::NO_OSD,shard));
+ }
+ }
+
+ for (const auto& item : missing_target_objects)
+ psdout(20) << __func__ << " missing shard " << std::get<1>(item)
+ << " missing= " << std::get<0>(item) << dendl;
+ for (const auto& item : acting_source_objects)
+ psdout(20) << __func__ << " acting shard " << std::get<1>(item)
+ << " missing= " << std::get<0>(item) << dendl;
+
+ // Handle all objects not in missing for remapped
+ // or backfill
+ for (auto m = missing_target_objects.rbegin();
+ m != missing_target_objects.rend(); ++m) {
+
+ int64_t extra_missing = -1;
+
+ if (pool.info.is_replicated()) {
+ if (!acting_source_objects.empty()) {
+ auto extra_copy = acting_source_objects.begin();
+ extra_missing = std::get<0>(*extra_copy);
+ acting_source_objects.erase(extra_copy);
+ }
+ } else { // Erasure coded
+ // Use corresponding shard
+ for (const auto& a : acting_source_objects) {
+ if (std::get<1>(a).shard == std::get<1>(*m).shard) {
+ extra_missing = std::get<0>(a);
+ acting_source_objects.erase(a);
+ break;
+ }
+ }
+ }
+
+ if (extra_missing >= 0 && std::get<0>(*m) >= extra_missing) {
+ // We don't know which of the objects on the target
+ // are part of extra_missing, so assume they are all degraded.
+ misplaced += std::get<0>(*m) - extra_missing;
+ degraded += extra_missing;
+ } else {
+ // 1. extra_missing == -1: more targets than sources, so degraded
+ // 2. extra_missing > std::get<0>(*m): some of the previously degraded
+ // objects are now present on the target.
+ degraded += std::get<0>(*m);
+ }
+ }
+ // If there are still acting shards that haven't been accounted for,
+ // then they are misplaced
+ for (const auto& a : acting_source_objects) {
+ int64_t extra_misplaced = std::max((int64_t)0, num_objects - std::get<0>(a));
+ psdout(20) << __func__ << " extra acting misplaced " << extra_misplaced
+ << dendl;
+ misplaced += extra_misplaced;
+ }
+out:
+ // NOTE: Tests use these messages to verify this code
+ psdout(20) << __func__ << " degraded " << degraded
+ << (estimate ? " (est)": "") << dendl;
+ psdout(20) << __func__ << " misplaced " << misplaced
+ << (estimate ? " (est)": "")<< dendl;
+
+ info.stats.stats.sum.num_objects_degraded = degraded;
+ info.stats.stats.sum.num_objects_unfound = get_num_unfound();
+ info.stats.stats.sum.num_objects_misplaced = misplaced;
+ }
+}
+
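+// Build the pg_stat_t we would publish: refresh state flags and the
+// calculated stats, fold in unstable_stats, and return std::nullopt if
+// nothing has changed since the last publish within the reporting
+// interval.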
+std::optional<pg_stat_t> PeeringState::prepare_stats_for_publish(
+ bool pg_stats_publish_valid,
+ const pg_stat_t &pg_stats_publish,
+ const object_stat_collection_t &unstable_stats)
+{
+ if (info.stats.stats.sum.num_scrub_errors) {
+ psdout(10) << __func__ << " inconsistent due to " <<
+ info.stats.stats.sum.num_scrub_errors << " scrub errors" << dendl;
+ state_set(PG_STATE_INCONSISTENT);
+ } else {
+ state_clear(PG_STATE_INCONSISTENT);
+ state_clear(PG_STATE_FAILED_REPAIR);
+ }
+
+ utime_t now = ceph_clock_now();
+ if (info.stats.state != state) {
+ info.stats.last_change = now;
+ // Optimistic estimation: if we just found out a PG is inactive,
+ // assume it was active until now.
+ if (!(state & PG_STATE_ACTIVE) &&
+ (info.stats.state & PG_STATE_ACTIVE))
+ info.stats.last_active = now;
+
+ if ((state & PG_STATE_ACTIVE) &&
+ !(info.stats.state & PG_STATE_ACTIVE))
+ info.stats.last_became_active = now;
+ if ((state & (PG_STATE_ACTIVE|PG_STATE_PEERED)) &&
+ !(info.stats.state & (PG_STATE_ACTIVE|PG_STATE_PEERED)))
+ info.stats.last_became_peered = now;
+ info.stats.state = state;
+ }
+
+ update_calc_stats();
+ if (info.stats.stats.sum.num_objects_degraded) {
+ state_set(PG_STATE_DEGRADED);
+ } else {
+ state_clear(PG_STATE_DEGRADED);
+ }
+ update_blocked_by();
+
+ pg_stat_t pre_publish = info.stats;
+ pre_publish.stats.add(unstable_stats);
+ utime_t cutoff = now;
+ cutoff -= cct->_conf->osd_pg_stat_report_interval_max;
+
+ // share (some of) our purged_snaps via the pg_stats. limit # of intervals
+ // because we don't want to make the pg_stat_t structures too expensive.
+ unsigned max = cct->_conf->osd_max_snap_prune_intervals_per_epoch;
+ unsigned num = 0;
+ auto i = info.purged_snaps.begin();
+ while (num < max && i != info.purged_snaps.end()) {
+ pre_publish.purged_snaps.insert(i.get_start(), i.get_len());
+ ++num;
+ ++i;
+ }
+ psdout(20) << __func__ << " reporting purged_snaps "
+ << pre_publish.purged_snaps << dendl;
+
+ if (pg_stats_publish_valid && pre_publish == pg_stats_publish &&
+ info.stats.last_fresh > cutoff) {
+ psdout(15) << "publish_stats_to_osd " << pg_stats_publish.reported_epoch
+ << ": no change since " << info.stats.last_fresh << dendl;
+ return std::nullopt;
+ } else {
+ // update our stat summary and timestamps
+ info.stats.reported_epoch = get_osdmap_epoch();
+ ++info.stats.reported_seq;
+
+ info.stats.last_fresh = now;
+
+ if (info.stats.state & PG_STATE_CLEAN)
+ info.stats.last_clean = now;
+ if (info.stats.state & PG_STATE_ACTIVE)
+ info.stats.last_active = now;
+ if (info.stats.state & (PG_STATE_ACTIVE|PG_STATE_PEERED))
+ info.stats.last_peered = now;
+ info.stats.last_unstale = now;
+ if ((info.stats.state & PG_STATE_DEGRADED) == 0)
+ info.stats.last_undegraded = now;
+ if ((info.stats.state & PG_STATE_UNDERSIZED) == 0)
+ info.stats.last_fullsized = now;
+
+ psdout(15) << "publish_stats_to_osd " << pg_stats_publish.reported_epoch
+ << ":" << pg_stats_publish.reported_seq << dendl;
+ return std::make_optional(std::move(pre_publish));
+ }
+}
+
+void PeeringState::init(
+ int role,
+ const vector<int>& newup, int new_up_primary,
+ const vector<int>& newacting, int new_acting_primary,
+ const pg_history_t& history,
+ const PastIntervals& pi,
+ bool backfill,
+ ObjectStore::Transaction &t)
+{
+ psdout(10) << "init role " << role << " up "
+ << newup << " acting " << newacting
+ << " history " << history
+ << " past_intervals " << pi
+ << dendl;
+
+ set_role(role);
+ init_primary_up_acting(
+ newup,
+ newacting,
+ new_up_primary,
+ new_acting_primary);
+
+ info.history = history;
+ past_intervals = pi;
+
+ info.stats.up = up;
+ info.stats.up_primary = new_up_primary;
+ info.stats.acting = acting;
+ info.stats.acting_primary = new_acting_primary;
+ info.stats.mapping_epoch = info.history.same_interval_since;
+
+ if (!perform_deletes_during_peering()) {
+ pg_log.set_missing_may_contain_deletes();
+ }
+
+ if (backfill) {
+ psdout(10) << __func__ << ": Setting backfill" << dendl;
+ info.set_last_backfill(hobject_t());
+ info.last_complete = info.last_update;
+ pg_log.mark_log_for_rewrite();
+ }
+
+ on_new_interval();
+
+ dirty_info = true;
+ dirty_big_info = true;
+ write_if_dirty(t);
+}
+
+void PeeringState::dump_peering_state(Formatter *f)
+{
+ f->dump_string("state", get_pg_state_string());
+ f->dump_unsigned("epoch", get_osdmap_epoch());
+ f->open_array_section("up");
+ for (auto p = up.begin(); p != up.end(); ++p)
+ f->dump_unsigned("osd", *p);
+ f->close_section();
+ f->open_array_section("acting");
+ for (auto p = acting.begin(); p != acting.end(); ++p)
+ f->dump_unsigned("osd", *p);
+ f->close_section();
+ if (!backfill_targets.empty()) {
+ f->open_array_section("backfill_targets");
+ for (auto p = backfill_targets.begin(); p != backfill_targets.end(); ++p)
+ f->dump_stream("shard") << *p;
+ f->close_section();
+ }
+ if (!async_recovery_targets.empty()) {
+ f->open_array_section("async_recovery_targets");
+ for (auto p = async_recovery_targets.begin();
+ p != async_recovery_targets.end();
+ ++p)
+ f->dump_stream("shard") << *p;
+ f->close_section();
+ }
+ if (!acting_recovery_backfill.empty()) {
+ f->open_array_section("acting_recovery_backfill");
+ for (auto p = acting_recovery_backfill.begin();
+ p != acting_recovery_backfill.end();
+ ++p)
+ f->dump_stream("shard") << *p;
+ f->close_section();
+ }
+ f->open_object_section("info");
+ update_calc_stats();
+ info.dump(f);
+ f->close_section();
+
+ f->open_array_section("peer_info");
+ for (auto p = peer_info.begin(); p != peer_info.end(); ++p) {
+ f->open_object_section("info");
+ f->dump_stream("peer") << p->first;
+ p->second.dump(f);
+ f->close_section();
+ }
+ f->close_section();
+}
+
+void PeeringState::update_stats(
+ std::function<bool(pg_history_t &, pg_stat_t &)> f,
+ ObjectStore::Transaction *t) {
+ if (f(info.history, info.stats)) {
+ pl->publish_stats_to_osd();
+ }
+ pl->on_info_history_change();
+
+ if (t) {
+ dirty_info = true;
+ write_if_dirty(*t);
+ }
+}
+
+bool PeeringState::append_log_entries_update_missing(
+ const mempool::osd_pglog::list<pg_log_entry_t> &entries,
+ ObjectStore::Transaction &t, std::optional<eversion_t> trim_to,
+ std::optional<eversion_t> roll_forward_to)
+{
+ ceph_assert(!entries.empty());
+ ceph_assert(entries.begin()->version > info.last_update);
+
+ PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
+ bool invalidate_stats =
+ pg_log.append_new_log_entries(
+ info.last_backfill,
+ entries,
+ rollbacker.get());
+
+ if (roll_forward_to && entries.rbegin()->soid > info.last_backfill) {
+ pg_log.roll_forward(rollbacker.get());
+ }
+ if (roll_forward_to && *roll_forward_to > pg_log.get_can_rollback_to()) {
+ pg_log.roll_forward_to(*roll_forward_to, rollbacker.get());
+ last_rollback_info_trimmed_to_applied = *roll_forward_to;
+ }
+
+ info.last_update = pg_log.get_head();
+
+ if (pg_log.get_missing().num_missing() == 0) {
+ // advance last_complete since nothing else is missing!
+ info.last_complete = info.last_update;
+ }
+ info.stats.stats_invalid = info.stats.stats_invalid || invalidate_stats;
+
+ psdout(20) << __func__ << " trim_to bool = " << bool(trim_to)
+ << " trim_to = " << (trim_to ? *trim_to : eversion_t()) << dendl;
+ if (trim_to)
+ pg_log.trim(*trim_to, info);
+ dirty_info = true;
+ write_if_dirty(t);
+ return invalidate_stats;
+}
+
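+// Primary-side entry point for injecting new log entries: apply them locally
+// via append_log_entries_update_missing(), then mirror them into each
+// acting/recovery/backfill peer's info and missing set, rebuilding
+// missing_loc for the affected objects when needed.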
+void PeeringState::merge_new_log_entries(
+ const mempool::osd_pglog::list<pg_log_entry_t> &entries,
+ ObjectStore::Transaction &t,
+ std::optional<eversion_t> trim_to,
+ std::optional<eversion_t> roll_forward_to)
+{
+ psdout(10) << __func__ << " " << entries << dendl;
+ ceph_assert(is_primary());
+
+ bool rebuild_missing = append_log_entries_update_missing(entries, t, trim_to, roll_forward_to);
+ for (auto i = acting_recovery_backfill.begin();
+ i != acting_recovery_backfill.end();
+ ++i) {
+ pg_shard_t peer(*i);
+ if (peer == pg_whoami) continue;
+ ceph_assert(peer_missing.count(peer));
+ ceph_assert(peer_info.count(peer));
+ pg_missing_t& pmissing(peer_missing[peer]);
+ psdout(20) << __func__ << " peer_missing for " << peer
+ << " = " << pmissing << dendl;
+ pg_info_t& pinfo(peer_info[peer]);
+ bool invalidate_stats = PGLog::append_log_entries_update_missing(
+ pinfo.last_backfill,
+ entries,
+ true,
+ NULL,
+ pmissing,
+ NULL,
+ dpp);
+ pinfo.last_update = info.last_update;
+ pinfo.stats.stats_invalid = pinfo.stats.stats_invalid || invalidate_stats;
+ rebuild_missing = rebuild_missing || invalidate_stats;
+ }
+
+ if (!rebuild_missing) {
+ return;
+ }
+
+ for (auto &&i: entries) {
+ missing_loc.rebuild(
+ i.soid,
+ pg_whoami,
+ acting_recovery_backfill,
+ info,
+ pg_log.get_missing(),
+ peer_missing,
+ peer_info);
+ }
+}
+
+void PeeringState::add_log_entry(const pg_log_entry_t& e, bool applied)
+{
+ // raise last_complete only if we were previously up to date
+ if (info.last_complete == info.last_update)
+ info.last_complete = e.version;
+
+ // raise last_update.
+ ceph_assert(e.version > info.last_update);
+ info.last_update = e.version;
+
+  // raise user_version, if it increased (it may not have been bumped
+  // by all logged updates)
+ if (e.user_version > info.last_user_version)
+ info.last_user_version = e.user_version;
+
+ // log mutation
+ pg_log.add(e, applied);
+ psdout(10) << "add_log_entry " << e << dendl;
+}
+
+
+void PeeringState::append_log(
+ vector<pg_log_entry_t>&& logv,
+ eversion_t trim_to,
+ eversion_t roll_forward_to,
+ eversion_t mlcod,
+ ObjectStore::Transaction &t,
+ bool transaction_applied,
+ bool async)
+{
+ /* The primary has sent an info updating the history, but it may not
+ * have arrived yet. We want to make sure that we cannot remember this
+ * write without remembering that it happened in an interval which went
+ * active in epoch history.last_epoch_started.
+ */
+ if (info.last_epoch_started != info.history.last_epoch_started) {
+ info.history.last_epoch_started = info.last_epoch_started;
+ }
+ if (info.last_interval_started != info.history.last_interval_started) {
+ info.history.last_interval_started = info.last_interval_started;
+ }
+ psdout(10) << "append_log " << pg_log.get_log() << " " << logv << dendl;
+
+ PGLog::LogEntryHandlerRef handler{pl->get_log_handler(t)};
+ if (!transaction_applied) {
+ /* We must be a backfill or async recovery peer, so it's ok if we apply
+ * out-of-turn since we won't be considered when
+ * determining a min possible last_update.
+ *
+   * We skip_rollforward() here, which advances the crt without
+   * doing an actual rollforward. This avoids cleaning up entries
+   * from the backend, so we do not end up in a situation where the
+   * object is deleted before we can _merge_object_divergent_entries().
+ */
+ pg_log.skip_rollforward();
+ }
+
+ for (auto p = logv.begin(); p != logv.end(); ++p) {
+ add_log_entry(*p, transaction_applied);
+
+ /* We don't want to leave the rollforward artifacts around
+ * here past last_backfill. It's ok for the same reason as
+ * above */
+ if (transaction_applied &&
+ p->soid > info.last_backfill) {
+ pg_log.roll_forward(handler.get());
+ }
+ }
+ if (transaction_applied && roll_forward_to > pg_log.get_can_rollback_to()) {
+ pg_log.roll_forward_to(
+ roll_forward_to,
+ handler.get());
+ last_rollback_info_trimmed_to_applied = roll_forward_to;
+ }
+
+ psdout(10) << __func__ << " approx pg log length = "
+ << pg_log.get_log().approx_size() << dendl;
+ psdout(10) << __func__ << " dups pg log length = "
+ << pg_log.get_log().dups.size() << dendl;
+ psdout(10) << __func__ << " transaction_applied = "
+ << transaction_applied << dendl;
+ if (!transaction_applied || async)
+ psdout(10) << __func__ << " " << pg_whoami
+ << " is async_recovery or backfill target" << dendl;
+ pg_log.trim(trim_to, info, transaction_applied, async);
+
+ // update the local pg, pg log
+ dirty_info = true;
+ write_if_dirty(t);
+
+ if (!is_primary())
+ min_last_complete_ondisk = mlcod;
+}
+
+void PeeringState::recover_got(
+ const hobject_t &oid, eversion_t v,
+ bool is_delete,
+ ObjectStore::Transaction &t)
+{
+ if (v > pg_log.get_can_rollback_to()) {
+ /* This can only happen during a repair, and even then, it would
+ * be one heck of a race. If we are repairing the object, the
+ * write in question must be fully committed, so it's not valid
+ * to roll it back anyway (and we'll be rolled forward shortly
+ * anyway) */
+ PGLog::LogEntryHandlerRef handler{pl->get_log_handler(t)};
+ pg_log.roll_forward_to(v, handler.get());
+ }
+
+ psdout(10) << "got missing " << oid << " v " << v << dendl;
+ pg_log.recover_got(oid, v, info);
+ if (pg_log.get_log().log.empty()) {
+ psdout(10) << "last_complete now " << info.last_complete
+ << " while log is empty" << dendl;
+ } else if (pg_log.get_log().complete_to != pg_log.get_log().log.end()) {
+ psdout(10) << "last_complete now " << info.last_complete
+ << " log.complete_to " << pg_log.get_log().complete_to->version
+ << dendl;
+ } else {
+ psdout(10) << "last_complete now " << info.last_complete
+ << " log.complete_to at end" << dendl;
+ //below is not true in the repair case.
+ //assert(missing.num_missing() == 0); // otherwise, complete_to was wrong.
+ ceph_assert(info.last_complete == info.last_update);
+ }
+
+ if (is_primary()) {
+ ceph_assert(missing_loc.needs_recovery(oid));
+ if (!is_delete)
+ missing_loc.add_location(oid, pg_whoami);
+ }
+
+ // update pg
+ dirty_info = true;
+ write_if_dirty(t);
+}
+
+void PeeringState::update_backfill_progress(
+ const hobject_t &updated_backfill,
+ const pg_stat_t &updated_stats,
+ bool preserve_local_num_bytes,
+ ObjectStore::Transaction &t) {
+ info.set_last_backfill(updated_backfill);
+ if (preserve_local_num_bytes) {
+ psdout(25) << __func__ << " primary " << updated_stats.stats.sum.num_bytes
+ << " local " << info.stats.stats.sum.num_bytes << dendl;
+ int64_t bytes = info.stats.stats.sum.num_bytes;
+ info.stats = updated_stats;
+ info.stats.stats.sum.num_bytes = bytes;
+ } else {
+ psdout(20) << __func__ << " final " << updated_stats.stats.sum.num_bytes
+ << " replaces local " << info.stats.stats.sum.num_bytes << dendl;
+ info.stats = updated_stats;
+ }
+
+ dirty_info = true;
+ write_if_dirty(t);
+}
+
+void PeeringState::adjust_purged_snaps(
+ std::function<void(interval_set<snapid_t> &snaps)> f) {
+ f(info.purged_snaps);
+ dirty_info = true;
+ dirty_big_info = true;
+}
+
+void PeeringState::on_peer_recover(
+ pg_shard_t peer,
+ const hobject_t &soid,
+ const eversion_t &version)
+{
+ pl->publish_stats_to_osd();
+ // done!
+ peer_missing[peer].got(soid, version);
+ missing_loc.add_location(soid, peer);
+}
+
+void PeeringState::begin_peer_recover(
+ pg_shard_t peer,
+ const hobject_t soid)
+{
+ peer_missing[peer].revise_have(soid, eversion_t());
+}
+
+void PeeringState::force_object_missing(
+ const set<pg_shard_t> &peers,
+ const hobject_t &soid,
+ eversion_t version)
+{
+ for (auto &&peer : peers) {
+ if (peer != primary) {
+ peer_missing[peer].add(soid, version, eversion_t(), false);
+ } else {
+ pg_log.missing_add(soid, version, eversion_t());
+ pg_log.reset_complete_to(&info);
+ pg_log.set_last_requested(0);
+ }
+ }
+
+ missing_loc.rebuild(
+ soid,
+ pg_whoami,
+ acting_recovery_backfill,
+ info,
+ pg_log.get_missing(),
+ peer_missing,
+ peer_info);
+}
+
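+// Bookkeeping performed before an op is submitted: advance last_update and
+// last_complete in peer_info for the acting/recovery/backfill peers, and for
+// async recovery targets that are missing this object, record the new log
+// events in their missing sets and refresh missing_loc accordingly.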
+void PeeringState::pre_submit_op(
+ const hobject_t &hoid,
+ const vector<pg_log_entry_t>& logv,
+ eversion_t at_version)
+{
+ if (at_version > eversion_t()) {
+ for (auto &&i : get_acting_recovery_backfill()) {
+ if (i == primary) continue;
+ pg_info_t &pinfo = peer_info[i];
+ // keep peer_info up to date
+ if (pinfo.last_complete == pinfo.last_update)
+ pinfo.last_complete = at_version;
+ pinfo.last_update = at_version;
+ }
+ }
+
+ bool requires_missing_loc = false;
+ for (auto &&i : get_async_recovery_targets()) {
+ if (i == primary || !get_peer_missing(i).is_missing(hoid))
+ continue;
+ requires_missing_loc = true;
+ for (auto &&entry: logv) {
+ peer_missing[i].add_next_event(entry);
+ }
+ }
+
+ if (requires_missing_loc) {
+ for (auto &&entry: logv) {
+ psdout(30) << __func__ << " missing_loc before: "
+ << missing_loc.get_locations(entry.soid) << dendl;
+ missing_loc.add_missing(entry.soid, entry.version,
+ eversion_t(), entry.is_delete());
+ // clear out missing_loc
+ missing_loc.clear_location(entry.soid);
+ for (auto &i: get_actingset()) {
+ if (!get_peer_missing(i).is_missing(entry.soid))
+ missing_loc.add_location(entry.soid, i);
+ }
+ psdout(30) << __func__ << " missing_loc after: "
+ << missing_loc.get_locations(entry.soid) << dendl;
+ }
+ }
+}
+
+void PeeringState::recovery_committed_to(eversion_t version)
+{
+ psdout(10) << __func__ << " version " << version
+ << " now ondisk" << dendl;
+ last_complete_ondisk = version;
+
+ if (last_complete_ondisk == info.last_update) {
+ if (!is_primary()) {
+ // Either we are a replica or backfill target.
+ // we are fully up to date. tell the primary!
+ pl->send_cluster_message(
+ get_primary().osd,
+ make_message<MOSDPGTrim>(
+ get_osdmap_epoch(),
+ spg_t(info.pgid.pgid, primary.shard),
+ last_complete_ondisk),
+ get_osdmap_epoch());
+ } else {
+ calc_min_last_complete_ondisk();
+ }
+ }
+}
+
+void PeeringState::complete_write(eversion_t v, eversion_t lc)
+{
+ last_update_ondisk = v;
+ last_complete_ondisk = lc;
+ calc_min_last_complete_ondisk();
+}
+
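+// Compute pg_trim_to, the version up to which the PG log may be trimmed.
+// Trimming is bounded by min_last_complete_ondisk and can_rollback_to, and
+// respects the osd_pg_log_trim_min/osd_pg_log_trim_max settings.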
+void PeeringState::calc_trim_to()
+{
+ size_t target = pl->get_target_pg_log_entries();
+
+ eversion_t limit = std::min(
+ min_last_complete_ondisk,
+ pg_log.get_can_rollback_to());
+ if (limit != eversion_t() &&
+ limit != pg_trim_to &&
+ pg_log.get_log().approx_size() > target) {
+ size_t num_to_trim = std::min(pg_log.get_log().approx_size() - target,
+ cct->_conf->osd_pg_log_trim_max);
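+    // Don't trim fewer than osd_pg_log_trim_min entries at a time (unless
+    // trim_max is configured below trim_min).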
+ if (num_to_trim < cct->_conf->osd_pg_log_trim_min &&
+ cct->_conf->osd_pg_log_trim_max >= cct->_conf->osd_pg_log_trim_min) {
+ return;
+ }
+ auto it = pg_log.get_log().log.begin();
+ eversion_t new_trim_to;
+ for (size_t i = 0; i < num_to_trim; ++i) {
+ new_trim_to = it->version;
+ ++it;
+ if (new_trim_to > limit) {
+ new_trim_to = limit;
+ psdout(10) << "calc_trim_to trimming to min_last_complete_ondisk" << dendl;
+ break;
+ }
+ }
+ psdout(10) << "calc_trim_to " << pg_trim_to << " -> " << new_trim_to << dendl;
+ pg_trim_to = new_trim_to;
+ assert(pg_trim_to <= pg_log.get_head());
+ assert(pg_trim_to <= min_last_complete_ondisk);
+ }
+}
+
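+// A more aggressive variant of calc_trim_to(): trimming is bounded by the
+// log head, can_rollback_to and last_update_ondisk rather than by
+// min_last_complete_ondisk.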
+void PeeringState::calc_trim_to_aggressive()
+{
+ size_t target = pl->get_target_pg_log_entries();
+
+ // limit pg log trimming up to the can_rollback_to value
+ eversion_t limit = std::min({
+ pg_log.get_head(),
+ pg_log.get_can_rollback_to(),
+ last_update_ondisk});
+ psdout(10) << __func__ << " limit = " << limit << dendl;
+
+ if (limit != eversion_t() &&
+ limit != pg_trim_to &&
+ pg_log.get_log().approx_size() > target) {
+ psdout(10) << __func__ << " approx pg log length = "
+ << pg_log.get_log().approx_size() << dendl;
+ uint64_t num_to_trim = std::min<uint64_t>(pg_log.get_log().approx_size() - target,
+ cct->_conf->osd_pg_log_trim_max);
+ psdout(10) << __func__ << " num_to_trim = " << num_to_trim << dendl;
+ if (num_to_trim < cct->_conf->osd_pg_log_trim_min &&
+ cct->_conf->osd_pg_log_trim_max >= cct->_conf->osd_pg_log_trim_min) {
+ return;
+ }
+ auto it = pg_log.get_log().log.begin(); // oldest log entry
+ auto rit = pg_log.get_log().log.rbegin();
+ eversion_t by_n_to_keep; // start from tail
+ eversion_t by_n_to_trim = eversion_t::max(); // start from head
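+    // Scan from both ends of the log at once: by_n_to_keep is the version of
+    // the (target+1)-th newest entry (trimming to it keeps `target` entries),
+    // by_n_to_trim is the version of the num_to_trim-th oldest entry.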
+ for (size_t i = 0; it != pg_log.get_log().log.end(); ++it, ++rit) {
+ i++;
+ if (i > target && by_n_to_keep == eversion_t()) {
+ by_n_to_keep = rit->version;
+ }
+ if (i >= num_to_trim && by_n_to_trim == eversion_t::max()) {
+ by_n_to_trim = it->version;
+ }
+ if (by_n_to_keep != eversion_t() &&
+ by_n_to_trim != eversion_t::max()) {
+ break;
+ }
+ }
+
+ if (by_n_to_keep == eversion_t()) {
+ return;
+ }
+
+ pg_trim_to = std::min({by_n_to_keep, by_n_to_trim, limit});
+ psdout(10) << __func__ << " pg_trim_to now " << pg_trim_to << dendl;
+ ceph_assert(pg_trim_to <= pg_log.get_head());
+ }
+}
+
+void PeeringState::apply_op_stats(
+ const hobject_t &soid,
+ const object_stat_sum_t &delta_stats)
+{
+ info.stats.stats.add(delta_stats);
+ info.stats.stats.floor(0);
+
+ for (auto i = get_backfill_targets().begin();
+ i != get_backfill_targets().end();
+ ++i) {
+ pg_shard_t bt = *i;
+ pg_info_t& pinfo = peer_info[bt];
+ if (soid <= pinfo.last_backfill)
+ pinfo.stats.stats.add(delta_stats);
+ }
+}
+
+void PeeringState::update_complete_backfill_object_stats(
+ const hobject_t &hoid,
+ const pg_stat_t &stats)
+{
+ for (auto &&bt: get_backfill_targets()) {
+ pg_info_t& pinfo = peer_info[bt];
+    // Add stats to all peers that were missing the object
+ if (hoid > pinfo.last_backfill)
+ pinfo.stats.add(stats);
+ }
+}
+
+void PeeringState::update_peer_last_backfill(
+ pg_shard_t peer,
+ const hobject_t &new_last_backfill)
+{
+ pg_info_t &pinfo = peer_info[peer];
+ pinfo.last_backfill = new_last_backfill;
+ if (new_last_backfill.is_max()) {
+ /* pinfo.stats might be wrong if we did log-based recovery on the
+ * backfilled portion in addition to continuing backfill.
+ */
+ pinfo.stats = info.stats;
+ }
+}
+
+void PeeringState::set_revert_with_targets(
+ const hobject_t &soid,
+ const set<pg_shard_t> &good_peers)
+{
+ for (auto &&peer: good_peers) {
+ missing_loc.add_location(soid, peer);
+ }
+}
+
+void PeeringState::prepare_backfill_for_missing(
+ const hobject_t &soid,
+ const eversion_t &version,
+ const vector<pg_shard_t> &targets) {
+ for (auto &&peer: targets) {
+ peer_missing[peer].add(soid, version, eversion_t(), false);
+ }
+}
+
+void PeeringState::update_hset(const pg_hit_set_history_t &hset_history)
+{
+ info.hit_set = hset_history;
+}
+
+/*------------ Peering State Machine----------------*/
+#undef dout_prefix
+#define dout_prefix (context< PeeringMachine >().dpp->gen_prefix(*_dout) \
+ << "state<" << get_state_name() << ">: ")
+#undef psdout
+#define psdout(x) ldout(context< PeeringMachine >().cct, x)
+
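+// Bind the PeeringState and PeeringListener from the enclosing PeeringMachine
+// context for use inside state handlers; std::ignore suppresses
+// unused-variable warnings in handlers that only need one of them.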
+#define DECLARE_LOCALS \
+ PeeringState *ps = context< PeeringMachine >().state; \
+ std::ignore = ps; \
+ PeeringListener *pl = context< PeeringMachine >().pl; \
+ std::ignore = pl
+
+
+/*------Crashed-------*/
+PeeringState::Crashed::Crashed(my_context ctx)
+ : my_base(ctx),
+ NamedState(context< PeeringMachine >().state_history, "Crashed")
+{
+ context< PeeringMachine >().log_enter(state_name);
+ ceph_abort_msg("we got a bad state machine event");
+}
+
+
+/*------Initial-------*/
+PeeringState::Initial::Initial(my_context ctx)
+ : my_base(ctx),
+ NamedState(context< PeeringMachine >().state_history, "Initial")
+{
+ context< PeeringMachine >().log_enter(state_name);
+}
+
+boost::statechart::result PeeringState::Initial::react(const MNotifyRec& notify)
+{
+ DECLARE_LOCALS;
+ ps->proc_replica_info(
+ notify.from, notify.notify.info, notify.notify.epoch_sent);
+ ps->set_last_peering_reset();
+ return transit< Primary >();
+}
+
+boost::statechart::result PeeringState::Initial::react(const MInfoRec& i)
+{
+ DECLARE_LOCALS;
+ ceph_assert(!ps->is_primary());
+ post_event(i);
+ return transit< Stray >();
+}
+
+boost::statechart::result PeeringState::Initial::react(const MLogRec& i)
+{
+ DECLARE_LOCALS;
+ ceph_assert(!ps->is_primary());
+ post_event(i);
+ return transit< Stray >();
+}
+
+void PeeringState::Initial::exit()
+{
+ context< PeeringMachine >().log_exit(state_name, enter_time);
+ DECLARE_LOCALS;
+ utime_t dur = ceph_clock_now() - enter_time;
+ pl->get_peering_perf().tinc(rs_initial_latency, dur);
+}
+
+/*------Started-------*/
+PeeringState::Started::Started(my_context ctx)
+ : my_base(ctx),
+ NamedState(context< PeeringMachine >().state_history, "Started")
+{
+ context< PeeringMachine >().log_enter(state_name);
+}
+
+boost::statechart::result
+PeeringState::Started::react(const IntervalFlush&)
+{
+ psdout(10) << "Ending blocked outgoing recovery messages" << dendl;
+ context< PeeringMachine >().state->end_block_outgoing();
+ return discard_event();
+}
+
+boost::statechart::result PeeringState::Started::react(const AdvMap& advmap)
+{
+ DECLARE_LOCALS;
+ psdout(10) << "Started advmap" << dendl;
+ ps->check_full_transition(advmap.lastmap, advmap.osdmap);
+ if (ps->should_restart_peering(
+ advmap.up_primary,
+ advmap.acting_primary,
+ advmap.newup,
+ advmap.newacting,
+ advmap.lastmap,
+ advmap.osdmap)) {
+ psdout(10) << "should_restart_peering, transitioning to Reset"
+ << dendl;
+ post_event(advmap);
+ return transit< Reset >();
+ }
+ ps->remove_down_peer_info(advmap.osdmap);
+ return discard_event();
+}
+
+boost::statechart::result PeeringState::Started::react(const QueryState& q)
+{
+ q.f->open_object_section("state");
+ q.f->dump_string("name", state_name);
+ q.f->dump_stream("enter_time") << enter_time;
+ q.f->close_section();
+ return discard_event();
+}
+
+boost::statechart::result PeeringState::Started::react(const QueryUnfound& q)
+{
+ q.f->dump_string("state", "Started");
+ q.f->dump_bool("available_might_have_unfound", false);
+ return discard_event();
+}
+
+void PeeringState::Started::exit()
+{
+ context< PeeringMachine >().log_exit(state_name, enter_time);
+ DECLARE_LOCALS;
+ utime_t dur = ceph_clock_now() - enter_time;
+ pl->get_peering_perf().tinc(rs_started_latency, dur);
+ ps->state_clear(PG_STATE_WAIT | PG_STATE_LAGGY);
+}
+
+/*--------Reset---------*/
+PeeringState::Reset::Reset(my_context ctx)
+ : my_base(ctx),
+ NamedState(context< PeeringMachine >().state_history, "Reset")
+{
+ context< PeeringMachine >().log_enter(state_name);
+ DECLARE_LOCALS;
+
+ ps->flushes_in_progress = 0;
+ ps->set_last_peering_reset();
+ ps->log_weirdness();
+}
+
+boost::statechart::result
+PeeringState::Reset::react(const IntervalFlush&)
+{
+ psdout(10) << "Ending blocked outgoing recovery messages" << dendl;
+ context< PeeringMachine >().state->end_block_outgoing();
+ return discard_event();
+}
+
+boost::statechart::result PeeringState::Reset::react(const AdvMap& advmap)
+{
+ DECLARE_LOCALS;
+ psdout(10) << "Reset advmap" << dendl;
+
+ ps->check_full_transition(advmap.lastmap, advmap.osdmap);
+
+ if (ps->should_restart_peering(
+ advmap.up_primary,
+ advmap.acting_primary,
+ advmap.newup,
+ advmap.newacting,
+ advmap.lastmap,
+ advmap.osdmap)) {
+ psdout(10) << "should restart peering, calling start_peering_interval again"
+ << dendl;
+ ps->start_peering_interval(
+ advmap.lastmap,
+ advmap.newup, advmap.up_primary,
+ advmap.newacting, advmap.acting_primary,
+ context< PeeringMachine >().get_cur_transaction());
+ }
+ ps->remove_down_peer_info(advmap.osdmap);
+ ps->check_past_interval_bounds();
+ return discard_event();
+}
+
+boost::statechart::result PeeringState::Reset::react(const ActMap&)
+{
+ DECLARE_LOCALS;
+ if (ps->should_send_notify() && ps->get_primary().osd >= 0) {
+ ps->info.history.refresh_prior_readable_until_ub(
+ pl->get_mnow(),
+ ps->prior_readable_until_ub);
+ context< PeeringMachine >().send_notify(
+ ps->get_primary().osd,
+ pg_notify_t(
+ ps->get_primary().shard, ps->pg_whoami.shard,
+ ps->get_osdmap_epoch(),
+ ps->get_osdmap_epoch(),
+ ps->info,
+ ps->past_intervals));
+ }
+
+ ps->update_heartbeat_peers();
+
+ return transit< Started >();
+}
+
+boost::statechart::result PeeringState::Reset::react(const QueryState& q)
+{
+ q.f->open_object_section("state");
+ q.f->dump_string("name", state_name);
+ q.f->dump_stream("enter_time") << enter_time;
+ q.f->close_section();
+ return discard_event();
+}
+
+boost::statechart::result PeeringState::Reset::react(const QueryUnfound& q)
+{
+ q.f->dump_string("state", "Reset");
+ q.f->dump_bool("available_might_have_unfound", false);
+ return discard_event();
+}
+
+void PeeringState::Reset::exit()
+{
+ context< PeeringMachine >().log_exit(state_name, enter_time);
+ DECLARE_LOCALS;
+ utime_t dur = ceph_clock_now() - enter_time;
+ pl->get_peering_perf().tinc(rs_reset_latency, dur);
+}
+
+/*-------Start---------*/
+PeeringState::Start::Start(my_context ctx)
+ : my_base(ctx),
+ NamedState(context< PeeringMachine >().state_history, "Start")
+{
+ context< PeeringMachine >().log_enter(state_name);
+
+ DECLARE_LOCALS;
+ if (ps->is_primary()) {
+ psdout(1) << "transitioning to Primary" << dendl;
+ post_event(MakePrimary());
+ } else { //is_stray
+ psdout(1) << "transitioning to Stray" << dendl;
+ post_event(MakeStray());
+ }
+}
+
+void PeeringState::Start::exit()
+{
+ context< PeeringMachine >().log_exit(state_name, enter_time);
+ DECLARE_LOCALS;
+ utime_t dur = ceph_clock_now() - enter_time;
+ pl->get_peering_perf().tinc(rs_start_latency, dur);
+}
+
+/*---------Primary--------*/
+PeeringState::Primary::Primary(my_context ctx)
+ : my_base(ctx),
+ NamedState(context< PeeringMachine >().state_history, "Started/Primary")
+{
+ context< PeeringMachine >().log_enter(state_name);
+ DECLARE_LOCALS;
+ ceph_assert(ps->want_acting.empty());
+
+ // set CREATING bit until we have peered for the first time.
+ if (ps->info.history.last_epoch_started == 0) {
+ ps->state_set(PG_STATE_CREATING);
+ // use the history timestamp, which ultimately comes from the
+ // monitor in the create case.
+ utime_t t = ps->info.history.last_scrub_stamp;
+ ps->info.stats.last_fresh = t;
+ ps->info.stats.last_active = t;
+ ps->info.stats.last_change = t;
+ ps->info.stats.last_peered = t;
+ ps->info.stats.last_clean = t;
+ ps->info.stats.last_unstale = t;
+ ps->info.stats.last_undegraded = t;
+ ps->info.stats.last_fullsized = t;
+ ps->info.stats.last_scrub_stamp = t;
+ ps->info.stats.last_deep_scrub_stamp = t;
+ ps->info.stats.last_clean_scrub_stamp = t;
+ }
+}
+
+boost::statechart::result PeeringState::Primary::react(const MNotifyRec& notevt)
+{
+ DECLARE_LOCALS;
+ psdout(7) << "handle_pg_notify from osd." << notevt.from << dendl;
+ ps->proc_replica_info(
+ notevt.from, notevt.notify.info, notevt.notify.epoch_sent);
+ return discard_event();
+}
+
+boost::statechart::result PeeringState::Primary::react(const ActMap&)
+{
+ DECLARE_LOCALS;
+ psdout(7) << "handle ActMap primary" << dendl;
+ pl->publish_stats_to_osd();
+ return discard_event();
+}
+
+boost::statechart::result PeeringState::Primary::react(
+ const SetForceRecovery&)
+{
+ DECLARE_LOCALS;
+ ps->set_force_recovery(true);
+ return discard_event();
+}
+
+boost::statechart::result PeeringState::Primary::react(
+ const UnsetForceRecovery&)
+{
+ DECLARE_LOCALS;
+ ps->set_force_recovery(false);
+ return discard_event();
+}
+
+boost::statechart::result PeeringState::Primary::react(
+ const RequestScrub& evt)
+{
+ DECLARE_LOCALS;
+ if (ps->is_primary()) {
+ pl->scrub_requested(evt.deep, evt.repair);
+ psdout(10) << "marking for scrub" << dendl;
+ }
+ return discard_event();
+}
+
+boost::statechart::result PeeringState::Primary::react(
+ const SetForceBackfill&)
+{
+ DECLARE_LOCALS;
+ ps->set_force_backfill(true);
+ return discard_event();
+}
+
+boost::statechart::result PeeringState::Primary::react(
+ const UnsetForceBackfill&)
+{
+ DECLARE_LOCALS;
+ ps->set_force_backfill(false);
+ return discard_event();
+}
+
+void PeeringState::Primary::exit()
+{
+ context< PeeringMachine >().log_exit(state_name, enter_time);
+ DECLARE_LOCALS;
+ ps->want_acting.clear();
+ utime_t dur = ceph_clock_now() - enter_time;
+ pl->get_peering_perf().tinc(rs_primary_latency, dur);
+ pl->clear_primary_state();
+ ps->state_clear(PG_STATE_CREATING);
+}
+
+/*---------Peering--------*/
+PeeringState::Peering::Peering(my_context ctx)
+ : my_base(ctx),
+ NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering"),
+ history_les_bound(false)
+{
+ context< PeeringMachine >().log_enter(state_name);
+ DECLARE_LOCALS;
+
+ ceph_assert(!ps->is_peered());
+ ceph_assert(!ps->is_peering());
+ ceph_assert(ps->is_primary());
+ ps->state_set(PG_STATE_PEERING);
+}
+
+boost::statechart::result PeeringState::Peering::react(const AdvMap& advmap)
+{
+ DECLARE_LOCALS;
+ psdout(10) << "Peering advmap" << dendl;
+ if (prior_set.affected_by_map(*(advmap.osdmap), ps->dpp)) {
+ psdout(1) << "Peering, affected_by_map, going to Reset" << dendl;
+ post_event(advmap);
+ return transit< Reset >();
+ }
+
+ ps->adjust_need_up_thru(advmap.osdmap);
+ ps->check_prior_readable_down_osds(advmap.osdmap);
+
+ return forward_event();
+}
+
+boost::statechart::result PeeringState::Peering::react(const QueryState& q)
+{
+ DECLARE_LOCALS;
+
+ q.f->open_object_section("state");
+ q.f->dump_string("name", state_name);
+ q.f->dump_stream("enter_time") << enter_time;
+
+ q.f->open_array_section("past_intervals");
+ ps->past_intervals.dump(q.f);
+ q.f->close_section();
+
+ q.f->open_array_section("probing_osds");
+ for (auto p = prior_set.probe.begin(); p != prior_set.probe.end(); ++p)
+ q.f->dump_stream("osd") << *p;
+ q.f->close_section();
+
+ if (prior_set.pg_down)
+ q.f->dump_string("blocked", "peering is blocked due to down osds");
+
+ q.f->open_array_section("down_osds_we_would_probe");
+ for (auto p = prior_set.down.begin(); p != prior_set.down.end(); ++p)
+ q.f->dump_int("osd", *p);
+ q.f->close_section();
+
+ q.f->open_array_section("peering_blocked_by");
+ for (auto p = prior_set.blocked_by.begin();
+ p != prior_set.blocked_by.end();
+ ++p) {
+ q.f->open_object_section("osd");
+ q.f->dump_int("osd", p->first);
+ q.f->dump_int("current_lost_at", p->second);
+ q.f->dump_string("comment", "starting or marking this osd lost may let us proceed");
+ q.f->close_section();
+ }
+ q.f->close_section();
+
+ if (history_les_bound) {
+ q.f->open_array_section("peering_blocked_by_detail");
+ q.f->open_object_section("item");
+ q.f->dump_string("detail","peering_blocked_by_history_les_bound");
+ q.f->close_section();
+ q.f->close_section();
+ }
+
+ q.f->close_section();
+ return forward_event();
+}
+
+boost::statechart::result PeeringState::Peering::react(const QueryUnfound& q)
+{
+ q.f->dump_string("state", "Peering");
+ q.f->dump_bool("available_might_have_unfound", false);
+ return discard_event();
+}
+
+void PeeringState::Peering::exit()
+{
+
+ DECLARE_LOCALS;
+ psdout(10) << "Leaving Peering" << dendl;
+ context< PeeringMachine >().log_exit(state_name, enter_time);
+ ps->state_clear(PG_STATE_PEERING);
+ pl->clear_probe_targets();
+
+ utime_t dur = ceph_clock_now() - enter_time;
+ pl->get_peering_perf().tinc(rs_peering_latency, dur);
+}
+
+
+/*------Backfilling-------*/
+PeeringState::Backfilling::Backfilling(my_context ctx)
+ : my_base(ctx),
+ NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/Backfilling")
+{
+ context< PeeringMachine >().log_enter(state_name);
+
+
+ DECLARE_LOCALS;
+ ps->backfill_reserved = true;
+ pl->on_backfill_reserved();
+ ps->state_clear(PG_STATE_BACKFILL_TOOFULL);
+ ps->state_clear(PG_STATE_BACKFILL_WAIT);
+ ps->state_set(PG_STATE_BACKFILLING);
+ pl->publish_stats_to_osd();
+}
+
+void PeeringState::Backfilling::backfill_release_reservations()
+{
+ DECLARE_LOCALS;
+ pl->cancel_local_background_io_reservation();
+ for (auto it = ps->backfill_targets.begin();
+ it != ps->backfill_targets.end();
+ ++it) {
+ ceph_assert(*it != ps->pg_whoami);
+ pl->send_cluster_message(
+ it->osd,
+ make_message<MBackfillReserve>(
+ MBackfillReserve::RELEASE,
+ spg_t(ps->info.pgid.pgid, it->shard),
+ ps->get_osdmap_epoch()),
+ ps->get_osdmap_epoch());
+ }
+}
+
+void PeeringState::Backfilling::cancel_backfill()
+{
+ DECLARE_LOCALS;
+ backfill_release_reservations();
+ pl->on_backfill_canceled();
+}
+
+boost::statechart::result
+PeeringState::Backfilling::react(const Backfilled &c)
+{
+ backfill_release_reservations();
+ return transit<Recovered>();
+}
+
+boost::statechart::result
+PeeringState::Backfilling::react(const DeferBackfill &c)
+{
+ DECLARE_LOCALS;
+
+ psdout(10) << "defer backfill, retry delay " << c.delay << dendl;
+ ps->state_set(PG_STATE_BACKFILL_WAIT);
+ ps->state_clear(PG_STATE_BACKFILLING);
+ cancel_backfill();
+
+ pl->schedule_event_after(
+ std::make_shared<PGPeeringEvent>(
+ ps->get_osdmap_epoch(),
+ ps->get_osdmap_epoch(),
+ RequestBackfill()),
+ c.delay);
+ return transit<NotBackfilling>();
+}
+
+boost::statechart::result
+PeeringState::Backfilling::react(const UnfoundBackfill &c)
+{
+ DECLARE_LOCALS;
+ psdout(10) << "backfill has unfound, can't continue" << dendl;
+ ps->state_set(PG_STATE_BACKFILL_UNFOUND);
+ ps->state_clear(PG_STATE_BACKFILLING);
+ cancel_backfill();
+ return transit<NotBackfilling>();
+}
+
+boost::statechart::result
+PeeringState::Backfilling::react(const RemoteReservationRevokedTooFull &)
+{
+ DECLARE_LOCALS;
+
+ ps->state_set(PG_STATE_BACKFILL_TOOFULL);
+ ps->state_clear(PG_STATE_BACKFILLING);
+ cancel_backfill();
+
+ pl->schedule_event_after(
+ std::make_shared<PGPeeringEvent>(
+ ps->get_osdmap_epoch(),
+ ps->get_osdmap_epoch(),
+ RequestBackfill()),
+ ps->cct->_conf->osd_backfill_retry_interval);
+
+ return transit<NotBackfilling>();
+}
+
+boost::statechart::result
+PeeringState::Backfilling::react(const RemoteReservationRevoked &)
+{
+ DECLARE_LOCALS;
+ ps->state_set(PG_STATE_BACKFILL_WAIT);
+ cancel_backfill();
+ if (ps->needs_backfill()) {
+ return transit<WaitLocalBackfillReserved>();
+ } else {
+ // raced with MOSDPGBackfill::OP_BACKFILL_FINISH, ignore
+ return discard_event();
+ }
+}
+
+void PeeringState::Backfilling::exit()
+{
+ context< PeeringMachine >().log_exit(state_name, enter_time);
+ DECLARE_LOCALS;
+ ps->backfill_reserved = false;
+ ps->state_clear(PG_STATE_BACKFILLING);
+ ps->state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
+ utime_t dur = ceph_clock_now() - enter_time;
+ pl->get_peering_perf().tinc(rs_backfilling_latency, dur);
+}
+
+/*--WaitRemoteBackfillReserved--*/
+
+PeeringState::WaitRemoteBackfillReserved::WaitRemoteBackfillReserved(my_context ctx)
+ : my_base(ctx),
+ NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/WaitRemoteBackfillReserved"),
+ backfill_osd_it(context< Active >().remote_shards_to_reserve_backfill.begin())
+{
+ context< PeeringMachine >().log_enter(state_name);
+ DECLARE_LOCALS;
+
+ ps->state_set(PG_STATE_BACKFILL_WAIT);
+ pl->publish_stats_to_osd();
+ post_event(RemoteBackfillReserved());
+}
+
+boost::statechart::result
+PeeringState::WaitRemoteBackfillReserved::react(const RemoteBackfillReserved &evt)
+{
+ DECLARE_LOCALS;
+
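+  // Remote backfill reservations are acquired one target at a time: each
+  // grant advances backfill_osd_it and sends the next REQUEST; once every
+  // target is reserved, AllBackfillsReserved is posted.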
+ int64_t num_bytes = ps->info.stats.stats.sum.num_bytes;
+ psdout(10) << __func__ << " num_bytes " << num_bytes << dendl;
+ if (backfill_osd_it !=
+ context< Active >().remote_shards_to_reserve_backfill.end()) {
+ // The primary never backfills itself
+ ceph_assert(*backfill_osd_it != ps->pg_whoami);
+ pl->send_cluster_message(
+ backfill_osd_it->osd,
+ make_message<MBackfillReserve>(
+ MBackfillReserve::REQUEST,
+ spg_t(context< PeeringMachine >().spgid.pgid, backfill_osd_it->shard),
+ ps->get_osdmap_epoch(),
+ ps->get_backfill_priority(),
+ num_bytes,
+ ps->peer_bytes[*backfill_osd_it]),
+ ps->get_osdmap_epoch());
+ ++backfill_osd_it;
+ } else {
+ ps->peer_bytes.clear();
+ post_event(AllBackfillsReserved());
+ }
+ return discard_event();
+}
+
+void PeeringState::WaitRemoteBackfillReserved::exit()
+{
+ context< PeeringMachine >().log_exit(state_name, enter_time);
+ DECLARE_LOCALS;
+
+ utime_t dur = ceph_clock_now() - enter_time;
+ pl->get_peering_perf().tinc(rs_waitremotebackfillreserved_latency, dur);
+}
+
+void PeeringState::WaitRemoteBackfillReserved::retry()
+{
+ DECLARE_LOCALS;
+ pl->cancel_local_background_io_reservation();
+
+ // Send CANCEL to all previously acquired reservations
+ set<pg_shard_t>::const_iterator it, begin, end;
+ begin = context< Active >().remote_shards_to_reserve_backfill.begin();
+ end = context< Active >().remote_shards_to_reserve_backfill.end();
+ ceph_assert(begin != end);
+ for (it = begin; it != backfill_osd_it; ++it) {
+ // The primary never backfills itself
+ ceph_assert(*it != ps->pg_whoami);
+ pl->send_cluster_message(
+ it->osd,
+ make_message<MBackfillReserve>(
+ MBackfillReserve::RELEASE,
+ spg_t(context< PeeringMachine >().spgid.pgid, it->shard),
+ ps->get_osdmap_epoch()),
+ ps->get_osdmap_epoch());
+ }
+
+ ps->state_clear(PG_STATE_BACKFILL_WAIT);
+ pl->publish_stats_to_osd();
+
+ pl->schedule_event_after(
+ std::make_shared<PGPeeringEvent>(
+ ps->get_osdmap_epoch(),
+ ps->get_osdmap_epoch(),
+ RequestBackfill()),
+ ps->cct->_conf->osd_backfill_retry_interval);
+}
+
+boost::statechart::result
+PeeringState::WaitRemoteBackfillReserved::react(const RemoteReservationRejectedTooFull &evt)
+{
+ DECLARE_LOCALS;
+ ps->state_set(PG_STATE_BACKFILL_TOOFULL);
+ retry();
+ return transit<NotBackfilling>();
+}
+
+boost::statechart::result
+PeeringState::WaitRemoteBackfillReserved::react(const RemoteReservationRevoked &evt)
+{
+ retry();
+ return transit<NotBackfilling>();
+}
+
+/*--WaitLocalBackfillReserved--*/
+PeeringState::WaitLocalBackfillReserved::WaitLocalBackfillReserved(my_context ctx)
+ : my_base(ctx),
+ NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/WaitLocalBackfillReserved")
+{
+ context< PeeringMachine >().log_enter(state_name);
+ DECLARE_LOCALS;
+
+ ps->state_set(PG_STATE_BACKFILL_WAIT);
+ pl->request_local_background_io_reservation(
+ ps->get_backfill_priority(),
+ std::make_unique<PGPeeringEvent>(
+ ps->get_osdmap_epoch(),
+ ps->get_osdmap_epoch(),
+ LocalBackfillReserved()),
+ std::make_unique<PGPeeringEvent>(
+ ps->get_osdmap_epoch(),
+ ps->get_osdmap_epoch(),
+ DeferBackfill(0.0)));
+ pl->publish_stats_to_osd();
+}
+
+void PeeringState::WaitLocalBackfillReserved::exit()
+{
+ context< PeeringMachine >().log_exit(state_name, enter_time);
+ DECLARE_LOCALS;
+ utime_t dur = ceph_clock_now() - enter_time;
+ pl->get_peering_perf().tinc(rs_waitlocalbackfillreserved_latency, dur);
+}
+
+/*----NotBackfilling------*/
+PeeringState::NotBackfilling::NotBackfilling(my_context ctx)
+ : my_base(ctx),
+ NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/NotBackfilling")
+{
+ context< PeeringMachine >().log_enter(state_name);
+ DECLARE_LOCALS;
+ ps->state_clear(PG_STATE_REPAIR);
+ pl->publish_stats_to_osd();
+}
+
+boost::statechart::result PeeringState::NotBackfilling::react(const QueryUnfound& q)
+{
+ DECLARE_LOCALS;
+
+ ps->query_unfound(q.f, "NotBackfilling");
+ return discard_event();
+}
+
+boost::statechart::result
+PeeringState::NotBackfilling::react(const RemoteBackfillReserved &evt)
+{
+ return discard_event();
+}
+
+boost::statechart::result
+PeeringState::NotBackfilling::react(const RemoteReservationRejectedTooFull &evt)
+{
+ return discard_event();
+}
+
+void PeeringState::NotBackfilling::exit()
+{
+ context< PeeringMachine >().log_exit(state_name, enter_time);
+
+ DECLARE_LOCALS;
+ ps->state_clear(PG_STATE_BACKFILL_UNFOUND);
+ utime_t dur = ceph_clock_now() - enter_time;
+ pl->get_peering_perf().tinc(rs_notbackfilling_latency, dur);
+}
+
+/*----NotRecovering------*/
+PeeringState::NotRecovering::NotRecovering(my_context ctx)
+ : my_base(ctx),
+ NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/NotRecovering")
+{
+ context< PeeringMachine >().log_enter(state_name);
+ DECLARE_LOCALS;
+ ps->state_clear(PG_STATE_REPAIR);
+ pl->publish_stats_to_osd();
+}
+
+boost::statechart::result PeeringState::NotRecovering::react(const QueryUnfound& q)
+{
+ DECLARE_LOCALS;
+
+ ps->query_unfound(q.f, "NotRecovering");
+ return discard_event();
+}
+
+void PeeringState::NotRecovering::exit()
+{
+ context< PeeringMachine >().log_exit(state_name, enter_time);
+
+ DECLARE_LOCALS;
+ ps->state_clear(PG_STATE_RECOVERY_UNFOUND);
+ utime_t dur = ceph_clock_now() - enter_time;
+ pl->get_peering_perf().tinc(rs_notrecovering_latency, dur);
+}
+
+/*---RepNotRecovering----*/
+PeeringState::RepNotRecovering::RepNotRecovering(my_context ctx)
+ : my_base(ctx),
+ NamedState(context< PeeringMachine >().state_history, "Started/ReplicaActive/RepNotRecovering")
+{
+ context< PeeringMachine >().log_enter(state_name);
+}
+
+boost::statechart::result
+PeeringState::RepNotRecovering::react(const RejectTooFullRemoteReservation &evt)
+{
+ DECLARE_LOCALS;
+ ps->reject_reservation();
+ post_event(RemoteReservationRejectedTooFull());
+ return discard_event();
+}
+
+void PeeringState::RepNotRecovering::exit()
+{
+ context< PeeringMachine >().log_exit(state_name, enter_time);
+ DECLARE_LOCALS;
+ utime_t dur = ceph_clock_now() - enter_time;
+ pl->get_peering_perf().tinc(rs_repnotrecovering_latency, dur);
+}
+
+/*---RepWaitRecoveryReserved--*/
+PeeringState::RepWaitRecoveryReserved::RepWaitRecoveryReserved(my_context ctx)
+ : my_base(ctx),
+ NamedState(context< PeeringMachine >().state_history, "Started/ReplicaActive/RepWaitRecoveryReserved")
+{
+ context< PeeringMachine >().log_enter(state_name);
+}
+
+boost::statechart::result
+PeeringState::RepWaitRecoveryReserved::react(const RemoteRecoveryReserved &evt)
+{
+ DECLARE_LOCALS;
+ pl->send_cluster_message(
+ ps->primary.osd,
+ make_message<MRecoveryReserve>(
+ MRecoveryReserve::GRANT,
+ spg_t(ps->info.pgid.pgid, ps->primary.shard),
+ ps->get_osdmap_epoch()),
+ ps->get_osdmap_epoch());
+ return transit<RepRecovering>();
+}
+
+boost::statechart::result
+PeeringState::RepWaitRecoveryReserved::react(
+ const RemoteReservationCanceled &evt)
+{
+ DECLARE_LOCALS;
+ pl->unreserve_recovery_space();
+
+ pl->cancel_remote_recovery_reservation();
+ return transit<RepNotRecovering>();
+}
+
+void PeeringState::RepWaitRecoveryReserved::exit()
+{
+ context< PeeringMachine >().log_exit(state_name, enter_time);
+ DECLARE_LOCALS;
+ utime_t dur = ceph_clock_now() - enter_time;
+ pl->get_peering_perf().tinc(rs_repwaitrecoveryreserved_latency, dur);
+}
+
+/*-RepWaitBackfillReserved*/
+PeeringState::RepWaitBackfillReserved::RepWaitBackfillReserved(my_context ctx)
+ : my_base(ctx),
+ NamedState(context< PeeringMachine >().state_history, "Started/ReplicaActive/RepWaitBackfillReserved")
+{
+ context< PeeringMachine >().log_enter(state_name);
+}
+
+boost::statechart::result
+PeeringState::RepNotRecovering::react(const RequestBackfillPrio &evt)
+{
+
+ DECLARE_LOCALS;
+
+ if (!pl->try_reserve_recovery_space(
+ evt.primary_num_bytes, evt.local_num_bytes)) {
+ post_event(RejectTooFullRemoteReservation());
+ } else {
+ PGPeeringEventURef preempt;
+ if (HAVE_FEATURE(ps->upacting_features, RECOVERY_RESERVATION_2)) {
+ // older peers will interpret preemption as TOOFULL
+ preempt = std::make_unique<PGPeeringEvent>(
+ pl->get_osdmap_epoch(),
+ pl->get_osdmap_epoch(),
+ RemoteBackfillPreempted());
+ }
+ pl->request_remote_recovery_reservation(
+ evt.priority,
+ std::make_unique<PGPeeringEvent>(
+ pl->get_osdmap_epoch(),
+ pl->get_osdmap_epoch(),
+ RemoteBackfillReserved()),
+ std::move(preempt));
+ }
+ return transit<RepWaitBackfillReserved>();
+}
+
+boost::statechart::result
+PeeringState::RepNotRecovering::react(const RequestRecoveryPrio &evt)
+{
+ DECLARE_LOCALS;
+
+  // fall back to a local reckoning of priority if the primary doesn't pass one
+ // (pre-mimic compat)
+ int prio = evt.priority ? evt.priority : ps->get_recovery_priority();
+
+ PGPeeringEventURef preempt;
+ if (HAVE_FEATURE(ps->upacting_features, RECOVERY_RESERVATION_2)) {
+ // older peers can't handle this
+ preempt = std::make_unique<PGPeeringEvent>(
+ ps->get_osdmap_epoch(),
+ ps->get_osdmap_epoch(),
+ RemoteRecoveryPreempted());
+ }
+
+ pl->request_remote_recovery_reservation(
+ prio,
+ std::make_unique<PGPeeringEvent>(
+ ps->get_osdmap_epoch(),
+ ps->get_osdmap_epoch(),
+ RemoteRecoveryReserved()),
+ std::move(preempt));
+ return transit<RepWaitRecoveryReserved>();
+}
+
+void PeeringState::RepWaitBackfillReserved::exit()
+{
+ context< PeeringMachine >().log_exit(state_name, enter_time);
+ DECLARE_LOCALS;
+ utime_t dur = ceph_clock_now() - enter_time;
+ pl->get_peering_perf().tinc(rs_repwaitbackfillreserved_latency, dur);
+}
+
+boost::statechart::result
+PeeringState::RepWaitBackfillReserved::react(const RemoteBackfillReserved &evt)
+{
+ DECLARE_LOCALS;
+
+
+ pl->send_cluster_message(
+ ps->primary.osd,
+ make_message<MBackfillReserve>(
+ MBackfillReserve::GRANT,
+ spg_t(ps->info.pgid.pgid, ps->primary.shard),
+ ps->get_osdmap_epoch()),
+ ps->get_osdmap_epoch());
+ return transit<RepRecovering>();
+}
+
+boost::statechart::result
+PeeringState::RepWaitBackfillReserved::react(
+ const RejectTooFullRemoteReservation &evt)
+{
+ DECLARE_LOCALS;
+ ps->reject_reservation();
+ post_event(RemoteReservationRejectedTooFull());
+ return discard_event();
+}
+
+boost::statechart::result
+PeeringState::RepWaitBackfillReserved::react(
+ const RemoteReservationRejectedTooFull &evt)
+{
+ DECLARE_LOCALS;
+ pl->unreserve_recovery_space();
+
+ pl->cancel_remote_recovery_reservation();
+ return transit<RepNotRecovering>();
+}
+
+boost::statechart::result
+PeeringState::RepWaitBackfillReserved::react(
+ const RemoteReservationCanceled &evt)
+{
+ DECLARE_LOCALS;
+ pl->unreserve_recovery_space();
+
+ pl->cancel_remote_recovery_reservation();
+ return transit<RepNotRecovering>();
+}
+
+/*---RepRecovering-------*/
+PeeringState::RepRecovering::RepRecovering(my_context ctx)
+ : my_base(ctx),
+ NamedState(context< PeeringMachine >().state_history, "Started/ReplicaActive/RepRecovering")
+{
+ context< PeeringMachine >().log_enter(state_name);
+}
+
+boost::statechart::result
+PeeringState::RepRecovering::react(const RemoteRecoveryPreempted &)
+{
+ DECLARE_LOCALS;
+
+
+ pl->unreserve_recovery_space();
+ pl->send_cluster_message(
+ ps->primary.osd,
+ make_message<MRecoveryReserve>(
+ MRecoveryReserve::REVOKE,
+ spg_t(ps->info.pgid.pgid, ps->primary.shard),
+ ps->get_osdmap_epoch()),
+ ps->get_osdmap_epoch());
+ return discard_event();
+}
+
+boost::statechart::result
+PeeringState::RepRecovering::react(const BackfillTooFull &)
+{
+ DECLARE_LOCALS;
+
+
+ pl->unreserve_recovery_space();
+ pl->send_cluster_message(
+ ps->primary.osd,
+ make_message<MBackfillReserve>(
+ MBackfillReserve::REVOKE_TOOFULL,
+ spg_t(ps->info.pgid.pgid, ps->primary.shard),
+ ps->get_osdmap_epoch()),
+ ps->get_osdmap_epoch());
+ return discard_event();
+}
+
+boost::statechart::result
+PeeringState::RepRecovering::react(const RemoteBackfillPreempted &)
+{
+ DECLARE_LOCALS;
+
+
+ pl->unreserve_recovery_space();
+ pl->send_cluster_message(
+ ps->primary.osd,
+ make_message<MBackfillReserve>(
+ MBackfillReserve::REVOKE,
+ spg_t(ps->info.pgid.pgid, ps->primary.shard),
+ ps->get_osdmap_epoch()),
+ ps->get_osdmap_epoch());
+ return discard_event();
+}
+
+void PeeringState::RepRecovering::exit()
+{
+ context< PeeringMachine >().log_exit(state_name, enter_time);
+ DECLARE_LOCALS;
+ pl->unreserve_recovery_space();
+
+ pl->cancel_remote_recovery_reservation();
+ utime_t dur = ceph_clock_now() - enter_time;
+ pl->get_peering_perf().tinc(rs_reprecovering_latency, dur);
+}
+
+/*------Activating--------*/
+PeeringState::Activating::Activating(my_context ctx)
+ : my_base(ctx),
+ NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/Activating")
+{
+ context< PeeringMachine >().log_enter(state_name);
+}
+
+void PeeringState::Activating::exit()
+{
+ context< PeeringMachine >().log_exit(state_name, enter_time);
+ DECLARE_LOCALS;
+ utime_t dur = ceph_clock_now() - enter_time;
+ pl->get_peering_perf().tinc(rs_activating_latency, dur);
+}
+
+PeeringState::WaitLocalRecoveryReserved::WaitLocalRecoveryReserved(my_context ctx)
+ : my_base(ctx),
+ NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/WaitLocalRecoveryReserved")
+{
+ context< PeeringMachine >().log_enter(state_name);
+ DECLARE_LOCALS;
+
+  // Make sure all nodes that are part of the recovery aren't full
+ if (!ps->cct->_conf->osd_debug_skip_full_check_in_recovery &&
+ ps->get_osdmap()->check_full(ps->acting_recovery_backfill)) {
+ post_event(RecoveryTooFull());
+ return;
+ }
+
+ ps->state_clear(PG_STATE_RECOVERY_TOOFULL);
+ ps->state_set(PG_STATE_RECOVERY_WAIT);
+ pl->request_local_background_io_reservation(
+ ps->get_recovery_priority(),
+ std::make_unique<PGPeeringEvent>(
+ ps->get_osdmap_epoch(),
+ ps->get_osdmap_epoch(),
+ LocalRecoveryReserved()),
+ std::make_unique<PGPeeringEvent>(
+ ps->get_osdmap_epoch(),
+ ps->get_osdmap_epoch(),
+ DeferRecovery(0.0)));
+ pl->publish_stats_to_osd();
+}
+
+boost::statechart::result
+PeeringState::WaitLocalRecoveryReserved::react(const RecoveryTooFull &evt)
+{
+ DECLARE_LOCALS;
+ ps->state_set(PG_STATE_RECOVERY_TOOFULL);
+ pl->schedule_event_after(
+ std::make_shared<PGPeeringEvent>(
+ ps->get_osdmap_epoch(),
+ ps->get_osdmap_epoch(),
+ DoRecovery()),
+ ps->cct->_conf->osd_recovery_retry_interval);
+ return transit<NotRecovering>();
+}
+
+void PeeringState::WaitLocalRecoveryReserved::exit()
+{
+ context< PeeringMachine >().log_exit(state_name, enter_time);
+ DECLARE_LOCALS;
+ utime_t dur = ceph_clock_now() - enter_time;
+ pl->get_peering_perf().tinc(rs_waitlocalrecoveryreserved_latency, dur);
+}
+
+PeeringState::WaitRemoteRecoveryReserved::WaitRemoteRecoveryReserved(my_context ctx)
+ : my_base(ctx),
+ NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/WaitRemoteRecoveryReserved"),
+ remote_recovery_reservation_it(context< Active >().remote_shards_to_reserve_recovery.begin())
+{
+ context< PeeringMachine >().log_enter(state_name);
+ post_event(RemoteRecoveryReserved());
+}
+
+boost::statechart::result
+PeeringState::WaitRemoteRecoveryReserved::react(const RemoteRecoveryReserved &evt) {
+ DECLARE_LOCALS;
+
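+  // Same one-at-a-time pattern as remote backfill reservations: each grant
+  // sends a REQUEST to the next shard; when the iterator is exhausted,
+  // AllRemotesReserved is posted.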
+ if (remote_recovery_reservation_it !=
+ context< Active >().remote_shards_to_reserve_recovery.end()) {
+ ceph_assert(*remote_recovery_reservation_it != ps->pg_whoami);
+ pl->send_cluster_message(
+ remote_recovery_reservation_it->osd,
+ make_message<MRecoveryReserve>(
+ MRecoveryReserve::REQUEST,
+ spg_t(context< PeeringMachine >().spgid.pgid,
+ remote_recovery_reservation_it->shard),
+ ps->get_osdmap_epoch(),
+ ps->get_recovery_priority()),
+ ps->get_osdmap_epoch());
+ ++remote_recovery_reservation_it;
+ } else {
+ post_event(AllRemotesReserved());
+ }
+ return discard_event();
+}
+
+void PeeringState::WaitRemoteRecoveryReserved::exit()
+{
+ context< PeeringMachine >().log_exit(state_name, enter_time);
+ DECLARE_LOCALS;
+ utime_t dur = ceph_clock_now() - enter_time;
+ pl->get_peering_perf().tinc(rs_waitremoterecoveryreserved_latency, dur);
+}
+
+PeeringState::Recovering::Recovering(my_context ctx)
+ : my_base(ctx),
+ NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/Recovering")
+{
+ context< PeeringMachine >().log_enter(state_name);
+
+ DECLARE_LOCALS;
+ ps->state_clear(PG_STATE_RECOVERY_WAIT);
+ ps->state_clear(PG_STATE_RECOVERY_TOOFULL);
+ ps->state_set(PG_STATE_RECOVERING);
+ pl->on_recovery_reserved();
+ ceph_assert(!ps->state_test(PG_STATE_ACTIVATING));
+ pl->publish_stats_to_osd();
+}
+
+void PeeringState::Recovering::release_reservations(bool cancel)
+{
+ DECLARE_LOCALS;
+ ceph_assert(cancel || !ps->pg_log.get_missing().have_missing());
+
+ // release remote reservations
+ for (auto i = context< Active >().remote_shards_to_reserve_recovery.begin();
+ i != context< Active >().remote_shards_to_reserve_recovery.end();
+ ++i) {
+ if (*i == ps->pg_whoami) // skip myself
+ continue;
+ pl->send_cluster_message(
+ i->osd,
+ make_message<MRecoveryReserve>(
+ MRecoveryReserve::RELEASE,
+ spg_t(ps->info.pgid.pgid, i->shard),
+ ps->get_osdmap_epoch()),
+ ps->get_osdmap_epoch());
+ }
+}
+
+boost::statechart::result
+PeeringState::Recovering::react(const AllReplicasRecovered &evt)
+{
+ DECLARE_LOCALS;
+ ps->state_clear(PG_STATE_FORCED_RECOVERY);
+ release_reservations();
+ pl->cancel_local_background_io_reservation();
+ return transit<Recovered>();
+}
+
+boost::statechart::result
+PeeringState::Recovering::react(const RequestBackfill &evt)
+{
+ DECLARE_LOCALS;
+
+ release_reservations();
+
+ ps->state_clear(PG_STATE_FORCED_RECOVERY);
+ pl->cancel_local_background_io_reservation();
+ pl->publish_stats_to_osd();
+  // transition any async_recovery_targets back into the acting set
+  // so the pg won't have to stay undersized for long,
+  // as backfill might take a long time to complete.
+ if (!ps->async_recovery_targets.empty()) {
+ pg_shard_t auth_log_shard;
+ bool history_les_bound = false;
+ // FIXME: Uh-oh we have to check this return value; choose_acting can fail!
+ ps->choose_acting(auth_log_shard, true, &history_les_bound);
+ }
+ return transit<WaitLocalBackfillReserved>();
+}
+
+boost::statechart::result
+PeeringState::Recovering::react(const DeferRecovery &evt)
+{
+ DECLARE_LOCALS;
+ if (!ps->state_test(PG_STATE_RECOVERING)) {
+ // we may have finished recovery and have an AllReplicasRecovered
+ // event queued to move us to the next state.
+ psdout(10) << "got defer recovery but not recovering" << dendl;
+ return discard_event();
+ }
+ psdout(10) << "defer recovery, retry delay " << evt.delay << dendl;
+ ps->state_set(PG_STATE_RECOVERY_WAIT);
+ pl->cancel_local_background_io_reservation();
+ release_reservations(true);
+ pl->schedule_event_after(
+ std::make_shared<PGPeeringEvent>(
+ ps->get_osdmap_epoch(),
+ ps->get_osdmap_epoch(),
+ DoRecovery()),
+ evt.delay);
+ return transit<NotRecovering>();
+}
+
+boost::statechart::result
+PeeringState::Recovering::react(const UnfoundRecovery &evt)
+{
+ DECLARE_LOCALS;
+ psdout(10) << "recovery has unfound, can't continue" << dendl;
+ ps->state_set(PG_STATE_RECOVERY_UNFOUND);
+ pl->cancel_local_background_io_reservation();
+ release_reservations(true);
+ return transit<NotRecovering>();
+}
+
+void PeeringState::Recovering::exit()
+{
+ context< PeeringMachine >().log_exit(state_name, enter_time);
+
+ DECLARE_LOCALS;
+ utime_t dur = ceph_clock_now() - enter_time;
+ ps->state_clear(PG_STATE_RECOVERING);
+ pl->get_peering_perf().tinc(rs_recovering_latency, dur);
+}
+
+PeeringState::Recovered::Recovered(my_context ctx)
+ : my_base(ctx),
+ NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/Recovered")
+{
+ pg_shard_t auth_log_shard;
+
+ context< PeeringMachine >().log_enter(state_name);
+
+ DECLARE_LOCALS;
+
+ ceph_assert(!ps->needs_recovery());
+
+ // if we finished backfill, all acting are active; recheck if
+ // DEGRADED | UNDERSIZED is appropriate.
+ ceph_assert(!ps->acting_recovery_backfill.empty());
+ if (ps->get_osdmap()->get_pg_size(context< PeeringMachine >().spgid.pgid) <=
+ ps->acting_recovery_backfill.size()) {
+ ps->state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
+ pl->publish_stats_to_osd();
+ }
+
+ // adjust acting set? (e.g. because backfill completed...)
+ bool history_les_bound = false;
+ if (ps->acting != ps->up && !ps->choose_acting(auth_log_shard,
+ true, &history_les_bound)) {
+ ceph_assert(ps->want_acting.size());
+ } else if (!ps->async_recovery_targets.empty()) {
+ // FIXME: Uh-oh we have to check this return value; choose_acting can fail!
+ ps->choose_acting(auth_log_shard, true, &history_les_bound);
+ }
+
+ if (context< Active >().all_replicas_activated &&
+ ps->async_recovery_targets.empty())
+ post_event(GoClean());
+}
+
+void PeeringState::Recovered::exit()
+{
+ context< PeeringMachine >().log_exit(state_name, enter_time);
+ DECLARE_LOCALS;
+
+ utime_t dur = ceph_clock_now() - enter_time;
+ pl->get_peering_perf().tinc(rs_recovered_latency, dur);
+}
+
+PeeringState::Clean::Clean(my_context ctx)
+ : my_base(ctx),
+ NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/Clean")
+{
+ context< PeeringMachine >().log_enter(state_name);
+
+ DECLARE_LOCALS;
+
+ if (ps->info.last_complete != ps->info.last_update) {
+ ceph_abort();
+ }
+
+
+ ps->try_mark_clean();
+
+ context< PeeringMachine >().get_cur_transaction().register_on_commit(
+ pl->on_clean());
+}
+
+void PeeringState::Clean::exit()
+{
+ context< PeeringMachine >().log_exit(state_name, enter_time);
+
+ DECLARE_LOCALS;
+ ps->state_clear(PG_STATE_CLEAN);
+ utime_t dur = ceph_clock_now() - enter_time;
+ pl->get_peering_perf().tinc(rs_clean_latency, dur);
+}
+
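+// Collect one pg_shard_t per distinct OSD in `in`, skipping `skip`. Used by
+// Active below to build the sets of remote shards on which to reserve
+// recovery and backfill, so an OSD hosting several shards of this PG is not
+// asked twice.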
+template <typename T>
+set<pg_shard_t> unique_osd_shard_set(const pg_shard_t & skip, const T &in)
+{
+ set<int> osds_found;
+ set<pg_shard_t> out;
+ for (auto i = in.begin(); i != in.end(); ++i) {
+ if (*i != skip && !osds_found.count(i->osd)) {
+ osds_found.insert(i->osd);
+ out.insert(*i);
+ }
+ }
+ return out;
+}
+
+/*---------Active---------*/
+PeeringState::Active::Active(my_context ctx)
+ : my_base(ctx),
+ NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active"),
+ remote_shards_to_reserve_recovery(
+ unique_osd_shard_set(
+ context< PeeringMachine >().state->pg_whoami,
+ context< PeeringMachine >().state->acting_recovery_backfill)),
+ remote_shards_to_reserve_backfill(
+ unique_osd_shard_set(
+ context< PeeringMachine >().state->pg_whoami,
+ context< PeeringMachine >().state->backfill_targets)),
+ all_replicas_activated(false)
+{
+ context< PeeringMachine >().log_enter(state_name);
+
+
+ DECLARE_LOCALS;
+
+ ceph_assert(!ps->backfill_reserved);
+ ceph_assert(ps->is_primary());
+ psdout(10) << "In Active, about to call activate" << dendl;
+ ps->start_flush(context< PeeringMachine >().get_cur_transaction());
+ ps->activate(context< PeeringMachine >().get_cur_transaction(),
+ ps->get_osdmap_epoch(),
+ context< PeeringMachine >().get_recovery_ctx());
+
+ // everyone has to commit/ack before we are truly active
+ ps->blocked_by.clear();
+ for (auto p = ps->acting_recovery_backfill.begin();
+ p != ps->acting_recovery_backfill.end();
+ ++p) {
+ if (p->shard != ps->pg_whoami.shard) {
+ ps->blocked_by.insert(p->shard);
+ }
+ }
+ pl->publish_stats_to_osd();
+ psdout(10) << "Activate Finished" << dendl;
+}
+
+boost::statechart::result PeeringState::Active::react(const AdvMap& advmap)
+{
+ DECLARE_LOCALS;
+
+ if (ps->should_restart_peering(
+ advmap.up_primary,
+ advmap.acting_primary,
+ advmap.newup,
+ advmap.newacting,
+ advmap.lastmap,
+ advmap.osdmap)) {
+ psdout(10) << "Active advmap interval change, fast return" << dendl;
+ return forward_event();
+ }
+ psdout(10) << "Active advmap" << dendl;
+ bool need_publish = false;
+
+ pl->on_active_advmap(advmap.osdmap);
+ if (ps->dirty_big_info) {
+ // share updated purged_snaps to mgr/mon so that we (a) stop reporting
+ // purged snaps and (b) perhaps share more snaps that we have purged
+ // but didn't fit in pg_stat_t.
+ need_publish = true;
+ ps->share_pg_info();
+ }
+
+ bool need_acting_change = false;
+ for (size_t i = 0; i < ps->want_acting.size(); i++) {
+ int osd = ps->want_acting[i];
+ if (!advmap.osdmap->is_up(osd)) {
+ pg_shard_t osd_with_shard(osd, shard_id_t(i));
+ if (!ps->is_acting(osd_with_shard) && !ps->is_up(osd_with_shard)) {
+ psdout(10) << "Active stray osd." << osd << " in want_acting is down"
+ << dendl;
+ need_acting_change = true;
+ }
+ }
+ }
+ if (need_acting_change) {
+ psdout(10) << "Active need acting change, call choose_acting again"
+ << dendl;
+ // possibly because we re-add some strays into the acting set and
+ // some of them then go down in a subsequent map before we could see
+ // the map changing the pg temp.
+ // call choose_acting again to clear them out.
+ // note that we leave restrict_to_up_acting false so that we do not
+ // needlessly drop any chosen stray that is still alive.
+ pg_shard_t auth_log_shard;
+ bool history_les_bound = false;
+ ps->remove_down_peer_info(advmap.osdmap);
+ ps->choose_acting(auth_log_shard, false, &history_les_bound, true);
+ }
+
+ /* Check for changes in pool size (if the acting set changed as a result,
+ * this does not matter) */
+ if (advmap.lastmap->get_pg_size(ps->info.pgid.pgid) !=
+ ps->get_osdmap()->get_pg_size(ps->info.pgid.pgid)) {
+ if (ps->get_osdmap()->get_pg_size(ps->info.pgid.pgid) <=
+ ps->actingset.size()) {
+ ps->state_clear(PG_STATE_UNDERSIZED);
+ } else {
+ ps->state_set(PG_STATE_UNDERSIZED);
+ }
+ // degraded changes will be detected by the call to publish_stats_to_osd()
+ need_publish = true;
+ }
+
+ // if we haven't reported our PG stats in a long time, do so now.
+ if (ps->info.stats.reported_epoch + ps->cct->_conf->osd_pg_stat_report_interval_max < advmap.osdmap->get_epoch()) {
+ psdout(20) << "reporting stats to osd after " << (advmap.osdmap->get_epoch() - ps->info.stats.reported_epoch)
+ << " epochs" << dendl;
+ need_publish = true;
+ }
+
+ if (need_publish)
+ pl->publish_stats_to_osd();
+
+ if (ps->check_prior_readable_down_osds(advmap.osdmap)) {
+ pl->recheck_readable();
+ }
+
+ return forward_event();
+}
+
+boost::statechart::result PeeringState::Active::react(const ActMap&)
+{
+ DECLARE_LOCALS;
+ psdout(10) << "Active: handling ActMap" << dendl;
+ ceph_assert(ps->is_primary());
+
+ pl->on_active_actmap();
+
+ if (ps->have_unfound()) {
+ // object may have become unfound
+ ps->discover_all_missing(context<PeeringMachine>().get_recovery_ctx().msgs);
+ }
+
+ uint64_t unfound = ps->missing_loc.num_unfound();
+ if (unfound > 0 &&
+ ps->all_unfound_are_queried_or_lost(ps->get_osdmap())) {
+ if (ps->cct->_conf->osd_auto_mark_unfound_lost) {
+ pl->get_clog_error() << context< PeeringMachine >().spgid.pgid << " has " << unfound
+ << " objects unfound and apparently lost, would automatically "
+ << "mark these objects lost but this feature is not yet implemented "
+ << "(osd_auto_mark_unfound_lost)";
+ } else
+ pl->get_clog_error() << context< PeeringMachine >().spgid.pgid << " has "
+ << unfound << " objects unfound and apparently lost";
+ }
+
+ return forward_event();
+}
+
+boost::statechart::result PeeringState::Active::react(const MNotifyRec& notevt)
+{
+
+ DECLARE_LOCALS;
+ ceph_assert(ps->is_primary());
+ if (ps->peer_info.count(notevt.from)) {
+ psdout(10) << "Active: got notify from " << notevt.from
+ << ", already have info from that osd, ignoring"
+ << dendl;
+ } else if (ps->peer_purged.count(notevt.from)) {
+ psdout(10) << "Active: got notify from " << notevt.from
+ << ", already purged that peer, ignoring"
+ << dendl;
+ } else {
+ psdout(10) << "Active: got notify from " << notevt.from
+ << ", calling proc_replica_info and discover_all_missing"
+ << dendl;
+ ps->proc_replica_info(
+ notevt.from, notevt.notify.info, notevt.notify.epoch_sent);
+ if (ps->have_unfound() || (ps->is_degraded() && ps->might_have_unfound.count(notevt.from))) {
+ ps->discover_all_missing(
+ context<PeeringMachine>().get_recovery_ctx().msgs);
+ }
+ // check if it is a previous down acting member that's coming back.
+ // if so, request pg_temp change to trigger a new interval transition
+ pg_shard_t auth_log_shard;
+ bool history_les_bound = false;
+ // FIXME: Uh-oh we have to check this return value; choose_acting can fail!
+ ps->choose_acting(auth_log_shard, false, &history_les_bound, true);
+ if (!ps->want_acting.empty() && ps->want_acting != ps->acting) {
+ psdout(10) << "Active: got notify from previous acting member "
+ << notevt.from << ", requesting pg_temp change"
+ << dendl;
+ }
+ }
+ return discard_event();
+}
+
+boost::statechart::result PeeringState::Active::react(const MTrim& trim)
+{
+ DECLARE_LOCALS;
+ ceph_assert(ps->is_primary());
+
+ // peer is informing us of their last_complete_ondisk
+ ldout(ps->cct,10) << " replica osd." << trim.from << " lcod " << trim.trim_to << dendl;
+ ps->update_peer_last_complete_ondisk(pg_shard_t{trim.from, trim.shard},
+ trim.trim_to);
+ // trim log when the pg is recovered
+ ps->calc_min_last_complete_ondisk();
+ return discard_event();
+}
+
+boost::statechart::result PeeringState::Active::react(const MInfoRec& infoevt)
+{
+ DECLARE_LOCALS;
+ ceph_assert(ps->is_primary());
+
+ ceph_assert(!ps->acting_recovery_backfill.empty());
+ if (infoevt.lease_ack) {
+ ps->proc_lease_ack(infoevt.from.osd, *infoevt.lease_ack);
+ }
+ // don't update history (yet) if we are active and primary; the replica
+ // may be telling us they have activated (and committed) but we can't
+ // share that until _everyone_ does the same.
+ if (ps->is_acting_recovery_backfill(infoevt.from) &&
+ ps->peer_activated.count(infoevt.from) == 0) {
+ psdout(10) << " peer osd." << infoevt.from
+ << " activated and committed" << dendl;
+ ps->peer_activated.insert(infoevt.from);
+ ps->blocked_by.erase(infoevt.from.shard);
+ pl->publish_stats_to_osd();
+ if (ps->peer_activated.size() == ps->acting_recovery_backfill.size()) {
+ all_activated_and_committed();
+ }
+ }
+ return discard_event();
+}
+
+boost::statechart::result PeeringState::Active::react(const MLogRec& logevt)
+{
+ DECLARE_LOCALS;
+ psdout(10) << "searching osd." << logevt.from
+ << " log for unfound items" << dendl;
+ ps->proc_replica_log(
+ logevt.msg->info, logevt.msg->log, std::move(logevt.msg->missing), logevt.from);
+ bool got_missing = ps->search_for_missing(
+ ps->peer_info[logevt.from],
+ ps->peer_missing[logevt.from],
+ logevt.from,
+ context< PeeringMachine >().get_recovery_ctx());
+ // If there are missing AND we are "fully" active then start recovery now
+ if (got_missing && ps->state_test(PG_STATE_ACTIVE)) {
+ post_event(DoRecovery());
+ }
+ return discard_event();
+}
+
+boost::statechart::result PeeringState::Active::react(const QueryState& q)
+{
+ DECLARE_LOCALS;
+
+ q.f->open_object_section("state");
+ q.f->dump_string("name", state_name);
+ q.f->dump_stream("enter_time") << enter_time;
+
+ {
+ q.f->open_array_section("might_have_unfound");
+ for (auto p = ps->might_have_unfound.begin();
+ p != ps->might_have_unfound.end();
+ ++p) {
+ q.f->open_object_section("osd");
+ q.f->dump_stream("osd") << *p;
+ if (ps->peer_missing.count(*p)) {
+ q.f->dump_string("status", "already probed");
+ } else if (ps->peer_missing_requested.count(*p)) {
+ q.f->dump_string("status", "querying");
+ } else if (!ps->get_osdmap()->is_up(p->osd)) {
+ q.f->dump_string("status", "osd is down");
+ } else {
+ q.f->dump_string("status", "not queried");
+ }
+ q.f->close_section();
+ }
+ q.f->close_section();
+ }
+ {
+ q.f->open_object_section("recovery_progress");
+ q.f->open_array_section("backfill_targets");
+ for (auto p = ps->backfill_targets.begin();
+ p != ps->backfill_targets.end(); ++p)
+ q.f->dump_stream("replica") << *p;
+ q.f->close_section();
+ pl->dump_recovery_info(q.f);
+ q.f->close_section();
+ }
+
+ q.f->close_section();
+ return forward_event();
+}
+
+boost::statechart::result PeeringState::Active::react(const QueryUnfound& q)
+{
+ DECLARE_LOCALS;
+
+ ps->query_unfound(q.f, "Active");
+ return discard_event();
+}
+
+boost::statechart::result PeeringState::Active::react(
+ const ActivateCommitted &evt)
+{
+ DECLARE_LOCALS;
+ ceph_assert(!ps->peer_activated.count(ps->pg_whoami));
+ ps->peer_activated.insert(ps->pg_whoami);
+ psdout(10) << "_activate_committed " << evt.epoch
+ << " peer_activated now " << ps->peer_activated
+ << " last_interval_started "
+ << ps->info.history.last_interval_started
+ << " last_epoch_started "
+ << ps->info.history.last_epoch_started
+ << " same_interval_since "
+ << ps->info.history.same_interval_since
+ << dendl;
+ ceph_assert(!ps->acting_recovery_backfill.empty());
+ if (ps->peer_activated.size() == ps->acting_recovery_backfill.size())
+ all_activated_and_committed();
+ return discard_event();
+}
+
+boost::statechart::result PeeringState::Active::react(const AllReplicasActivated &evt)
+{
+
+ DECLARE_LOCALS;
+ pg_t pgid = context< PeeringMachine >().spgid.pgid;
+
+ all_replicas_activated = true;
+
+ ps->state_clear(PG_STATE_ACTIVATING);
+ ps->state_clear(PG_STATE_CREATING);
+ ps->state_clear(PG_STATE_PREMERGE);
+
+ bool merge_target;
+ if (ps->pool.info.is_pending_merge(pgid, &merge_target)) {
+ ps->state_set(PG_STATE_PEERED);
+ ps->state_set(PG_STATE_PREMERGE);
+
+ if (ps->actingset.size() != ps->get_osdmap()->get_pg_size(pgid)) {
+ if (merge_target) {
+ pg_t src = pgid;
+ src.set_ps(ps->pool.info.get_pg_num_pending());
+ assert(src.get_parent() == pgid);
+ pl->set_not_ready_to_merge_target(pgid, src);
+ } else {
+ pl->set_not_ready_to_merge_source(pgid);
+ }
+ }
+ } else if (!ps->acting_set_writeable()) {
+ ps->state_set(PG_STATE_PEERED);
+ } else {
+ ps->state_set(PG_STATE_ACTIVE);
+ }
+
+ auto mnow = pl->get_mnow();
+ if (ps->prior_readable_until_ub > mnow) {
+ psdout(10) << " waiting for prior_readable_until_ub "
+ << ps->prior_readable_until_ub << " > mnow " << mnow << dendl;
+ ps->state_set(PG_STATE_WAIT);
+ pl->queue_check_readable(
+ ps->last_peering_reset,
+ ps->prior_readable_until_ub - mnow);
+ } else {
+ psdout(10) << " mnow " << mnow << " >= prior_readable_until_ub "
+ << ps->prior_readable_until_ub << dendl;
+ }
+
+ if (ps->pool.info.has_flag(pg_pool_t::FLAG_CREATING)) {
+ pl->send_pg_created(pgid);
+ }
+
+ ps->info.history.last_epoch_started = ps->info.last_epoch_started;
+ ps->info.history.last_interval_started = ps->info.last_interval_started;
+ ps->dirty_info = true;
+
+ ps->share_pg_info();
+ pl->publish_stats_to_osd();
+
+ pl->on_activate_complete();
+
+ return discard_event();
+}
+
+boost::statechart::result PeeringState::Active::react(const RenewLease& rl)
+{
+ DECLARE_LOCALS;
+ ps->proc_renew_lease();
+ return discard_event();
+}
+
+boost::statechart::result PeeringState::Active::react(const MLeaseAck& la)
+{
+ DECLARE_LOCALS;
+ ps->proc_lease_ack(la.from, la.lease_ack);
+ return discard_event();
+}
+
+
+boost::statechart::result PeeringState::Active::react(const CheckReadable &evt)
+{
+ DECLARE_LOCALS;
+ pl->recheck_readable();
+ return discard_event();
+}
+
+/*
+ * update info.history.last_epoch_started ONLY after we and all
+ * replicas have activated AND committed the activate transaction
+ * (i.e. the peering results are stable on disk).
+ */
+void PeeringState::Active::all_activated_and_committed()
+{
+ DECLARE_LOCALS;
+ psdout(10) << "all_activated_and_committed" << dendl;
+ ceph_assert(ps->is_primary());
+ ceph_assert(ps->peer_activated.size() == ps->acting_recovery_backfill.size());
+ ceph_assert(!ps->acting_recovery_backfill.empty());
+ ceph_assert(ps->blocked_by.empty());
+
+ if (HAVE_FEATURE(ps->upacting_features, SERVER_OCTOPUS)) {
+ // this is overkill when the activation is quick, but when it is slow it
+ // is important: the lease was renewed by the activation itself, but we
+ // don't know how long ago that was, and simply scheduling now may leave
+ // a gap in lease coverage. keep it simple and aggressively renew.
+ ps->renew_lease(pl->get_mnow());
+ ps->send_lease();
+ ps->schedule_renew_lease();
+ }
+
+ // Degraded?
+ ps->update_calc_stats();
+ if (ps->info.stats.stats.sum.num_objects_degraded) {
+ ps->state_set(PG_STATE_DEGRADED);
+ } else {
+ ps->state_clear(PG_STATE_DEGRADED);
+ }
+
+ post_event(PeeringState::AllReplicasActivated());
+}
+
+
+void PeeringState::Active::exit()
+{
+ context< PeeringMachine >().log_exit(state_name, enter_time);
+
+
+ DECLARE_LOCALS;
+ pl->cancel_local_background_io_reservation();
+
+ ps->blocked_by.clear();
+ ps->backfill_reserved = false;
+ ps->state_clear(PG_STATE_ACTIVATING);
+ ps->state_clear(PG_STATE_DEGRADED);
+ ps->state_clear(PG_STATE_UNDERSIZED);
+ ps->state_clear(PG_STATE_BACKFILL_TOOFULL);
+ ps->state_clear(PG_STATE_BACKFILL_WAIT);
+ ps->state_clear(PG_STATE_RECOVERY_WAIT);
+ ps->state_clear(PG_STATE_RECOVERY_TOOFULL);
+ utime_t dur = ceph_clock_now() - enter_time;
+ pl->get_peering_perf().tinc(rs_active_latency, dur);
+ pl->on_active_exit();
+}
+
+/*------ReplicaActive-----*/
+PeeringState::ReplicaActive::ReplicaActive(my_context ctx)
+ : my_base(ctx),
+ NamedState(context< PeeringMachine >().state_history, "Started/ReplicaActive")
+{
+ context< PeeringMachine >().log_enter(state_name);
+
+ DECLARE_LOCALS;
+ ps->start_flush(context< PeeringMachine >().get_cur_transaction());
+}
+
+
+boost::statechart::result PeeringState::ReplicaActive::react(
+ const Activate& actevt) {
+ DECLARE_LOCALS;
+ psdout(10) << "In ReplicaActive, about to call activate" << dendl;
+ ps->activate(
+ context< PeeringMachine >().get_cur_transaction(),
+ actevt.activation_epoch,
+ context< PeeringMachine >().get_recovery_ctx());
+ psdout(10) << "Activate Finished" << dendl;
+ return discard_event();
+}
+
+boost::statechart::result PeeringState::ReplicaActive::react(
+ const ActivateCommitted &evt)
+{
+ DECLARE_LOCALS;
+ psdout(10) << __func__ << " " << evt.epoch << " telling primary" << dendl;
+
+ auto &rctx = context<PeeringMachine>().get_recovery_ctx();
+ auto epoch = ps->get_osdmap_epoch();
+ pg_info_t i = ps->info;
+ i.history.last_epoch_started = evt.activation_epoch;
+ i.history.last_interval_started = i.history.same_interval_since;
+ rctx.send_info(
+ ps->get_primary().osd,
+ spg_t(ps->info.pgid.pgid, ps->get_primary().shard),
+ epoch,
+ epoch,
+ i,
+ {}, /* lease */
+ ps->get_lease_ack());
+
+ if (ps->acting_set_writeable()) {
+ ps->state_set(PG_STATE_ACTIVE);
+ } else {
+ ps->state_set(PG_STATE_PEERED);
+ }
+ pl->on_activate_committed();
+
+ return discard_event();
+}
+
+boost::statechart::result PeeringState::ReplicaActive::react(const MLease& l)
+{
+ DECLARE_LOCALS;
+ spg_t spgid = context< PeeringMachine >().spgid;
+ epoch_t epoch = pl->get_osdmap_epoch();
+
+ ps->proc_lease(l.lease);
+ pl->send_cluster_message(
+ ps->get_primary().osd,
+ make_message<MOSDPGLeaseAck>(epoch,
+ spg_t(spgid.pgid, ps->get_primary().shard),
+ ps->get_lease_ack()),
+ epoch);
+ return discard_event();
+}
+
+boost::statechart::result PeeringState::ReplicaActive::react(const MInfoRec& infoevt)
+{
+ DECLARE_LOCALS;
+ ps->proc_primary_info(context<PeeringMachine>().get_cur_transaction(),
+ infoevt.info);
+ return discard_event();
+}
+
+boost::statechart::result PeeringState::ReplicaActive::react(const MLogRec& logevt)
+{
+ DECLARE_LOCALS;
+ psdout(10) << "received log from " << logevt.from << dendl;
+ ObjectStore::Transaction &t = context<PeeringMachine>().get_cur_transaction();
+ ps->merge_log(t, logevt.msg->info, std::move(logevt.msg->log), logevt.from);
+ ceph_assert(ps->pg_log.get_head() == ps->info.last_update);
+ if (logevt.msg->lease) {
+ ps->proc_lease(*logevt.msg->lease);
+ }
+
+ return discard_event();
+}
+
+boost::statechart::result PeeringState::ReplicaActive::react(const MTrim& trim)
+{
+ DECLARE_LOCALS;
+ // primary is instructing us to trim
+ ps->pg_log.trim(trim.trim_to, ps->info);
+ ps->dirty_info = true;
+ return discard_event();
+}
+
+boost::statechart::result PeeringState::ReplicaActive::react(const ActMap&)
+{
+ DECLARE_LOCALS;
+ if (ps->should_send_notify() && ps->get_primary().osd >= 0) {
+ ps->info.history.refresh_prior_readable_until_ub(
+ pl->get_mnow(), ps->prior_readable_until_ub);
+ context< PeeringMachine >().send_notify(
+ ps->get_primary().osd,
+ pg_notify_t(
+ ps->get_primary().shard, ps->pg_whoami.shard,
+ ps->get_osdmap_epoch(),
+ ps->get_osdmap_epoch(),
+ ps->info,
+ ps->past_intervals));
+ }
+ return discard_event();
+}
+
+boost::statechart::result PeeringState::ReplicaActive::react(
+ const MQuery& query)
+{
+ DECLARE_LOCALS;
+ ps->fulfill_query(query, context<PeeringMachine>().get_recovery_ctx());
+ return discard_event();
+}
+
+boost::statechart::result PeeringState::ReplicaActive::react(const QueryState& q)
+{
+ q.f->open_object_section("state");
+ q.f->dump_string("name", state_name);
+ q.f->dump_stream("enter_time") << enter_time;
+ q.f->close_section();
+ return forward_event();
+}
+
+boost::statechart::result PeeringState::ReplicaActive::react(const QueryUnfound& q)
+{
+ q.f->dump_string("state", "ReplicaActive");
+ q.f->dump_bool("available_might_have_unfound", false);
+ return discard_event();
+}
+
+void PeeringState::ReplicaActive::exit()
+{
+ context< PeeringMachine >().log_exit(state_name, enter_time);
+ DECLARE_LOCALS;
+ pl->unreserve_recovery_space();
+
+ pl->cancel_remote_recovery_reservation();
+ utime_t dur = ceph_clock_now() - enter_time;
+ pl->get_peering_perf().tinc(rs_replicaactive_latency, dur);
+
+ ps->min_last_complete_ondisk = eversion_t();
+}
+
+/*-------Stray---*/
+PeeringState::Stray::Stray(my_context ctx)
+ : my_base(ctx),
+ NamedState(context< PeeringMachine >().state_history, "Started/Stray")
+{
+ context< PeeringMachine >().log_enter(state_name);
+
+
+ DECLARE_LOCALS;
+ ceph_assert(!ps->is_peered());
+ ceph_assert(!ps->is_peering());
+ ceph_assert(!ps->is_primary());
+
+ if (!ps->get_osdmap()->have_pg_pool(ps->info.pgid.pgid.pool())) {
+ ldout(ps->cct,10) << __func__ << " pool is deleted" << dendl;
+ post_event(DeleteStart());
+ } else {
+ ps->start_flush(context< PeeringMachine >().get_cur_transaction());
+ }
+}
+
+boost::statechart::result PeeringState::Stray::react(const MLogRec& logevt)
+{
+ DECLARE_LOCALS;
+ MOSDPGLog *msg = logevt.msg.get();
+ psdout(10) << "got info+log from osd." << logevt.from << " " << msg->info << " " << msg->log << dendl;
+
+ ObjectStore::Transaction &t = context<PeeringMachine>().get_cur_transaction();
+ if (msg->info.last_backfill == hobject_t()) {
+ // restart backfill
+ ps->info = msg->info;
+ pl->on_info_history_change();
+ ps->dirty_info = true;
+ ps->dirty_big_info = true; // maybe.
+
+ PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
+ ps->pg_log.reset_backfill_claim_log(msg->log, rollbacker.get());
+
+ ps->pg_log.reset_backfill();
+ } else {
+ ps->merge_log(t, msg->info, std::move(msg->log), logevt.from);
+ }
+ if (logevt.msg->lease) {
+ ps->proc_lease(*logevt.msg->lease);
+ }
+
+ ceph_assert(ps->pg_log.get_head() == ps->info.last_update);
+
+ post_event(Activate(logevt.msg->info.last_epoch_started));
+ return transit<ReplicaActive>();
+}
+
+boost::statechart::result PeeringState::Stray::react(const MInfoRec& infoevt)
+{
+ DECLARE_LOCALS;
+ psdout(10) << "got info from osd." << infoevt.from << " " << infoevt.info << dendl;
+
+ if (ps->info.last_update > infoevt.info.last_update) {
+ // rewind divergent log entries
+ ObjectStore::Transaction &t = context<PeeringMachine>().get_cur_transaction();
+ ps->rewind_divergent_log(t, infoevt.info.last_update);
+ ps->info.stats = infoevt.info.stats;
+ ps->info.hit_set = infoevt.info.hit_set;
+ }
+
+ if (infoevt.lease) {
+ ps->proc_lease(*infoevt.lease);
+ }
+
+ ceph_assert(infoevt.info.last_update == ps->info.last_update);
+ ceph_assert(ps->pg_log.get_head() == ps->info.last_update);
+
+ post_event(Activate(infoevt.info.last_epoch_started));
+ return transit<ReplicaActive>();
+}
+
+boost::statechart::result PeeringState::Stray::react(const MQuery& query)
+{
+ DECLARE_LOCALS;
+ ps->fulfill_query(query, context<PeeringMachine>().get_recovery_ctx());
+ return discard_event();
+}
+
+boost::statechart::result PeeringState::Stray::react(const ActMap&)
+{
+ DECLARE_LOCALS;
+ if (ps->should_send_notify() && ps->get_primary().osd >= 0) {
+ ps->info.history.refresh_prior_readable_until_ub(
+ pl->get_mnow(), ps->prior_readable_until_ub);
+ context< PeeringMachine >().send_notify(
+ ps->get_primary().osd,
+ pg_notify_t(
+ ps->get_primary().shard, ps->pg_whoami.shard,
+ ps->get_osdmap_epoch(),
+ ps->get_osdmap_epoch(),
+ ps->info,
+ ps->past_intervals));
+ }
+ return discard_event();
+}
+
+void PeeringState::Stray::exit()
+{
+ context< PeeringMachine >().log_exit(state_name, enter_time);
+ DECLARE_LOCALS;
+ utime_t dur = ceph_clock_now() - enter_time;
+ pl->get_peering_perf().tinc(rs_stray_latency, dur);
+}
+
+
+/*--------ToDelete----------*/
+PeeringState::ToDelete::ToDelete(my_context ctx)
+ : my_base(ctx),
+ NamedState(context< PeeringMachine >().state_history, "Started/ToDelete")
+{
+ context< PeeringMachine >().log_enter(state_name);
+ DECLARE_LOCALS;
+ pl->get_perf_logger().inc(l_osd_pg_removing);
+}
+
+void PeeringState::ToDelete::exit()
+{
+ context< PeeringMachine >().log_exit(state_name, enter_time);
+ DECLARE_LOCALS;
+ // note: on a successful removal, this path doesn't execute. see
+ // _delete_some().
+ pl->get_perf_logger().dec(l_osd_pg_removing);
+
+ pl->cancel_local_background_io_reservation();
+}
+
+/*----WaitDeleteReserved----*/
+PeeringState::WaitDeleteReserved::WaitDeleteReserved(my_context ctx)
+ : my_base(ctx),
+ NamedState(context< PeeringMachine >().state_history,
+ "Started/ToDelete/WaitDeleteReseved")
+{
+ context< PeeringMachine >().log_enter(state_name);
+ DECLARE_LOCALS;
+ context< ToDelete >().priority = ps->get_delete_priority();
+
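+ // (re)request the local background reservation at the current delete
+ // priority; DeleteReserved is queued when the reservation is granted,
+ // DeleteInterrupted if it is preempted.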
+ pl->cancel_local_background_io_reservation();
+ pl->request_local_background_io_reservation(
+ context<ToDelete>().priority,
+ std::make_unique<PGPeeringEvent>(
+ ps->get_osdmap_epoch(),
+ ps->get_osdmap_epoch(),
+ DeleteReserved()),
+ std::make_unique<PGPeeringEvent>(
+ ps->get_osdmap_epoch(),
+ ps->get_osdmap_epoch(),
+ DeleteInterrupted()));
+}
+
+boost::statechart::result PeeringState::ToDelete::react(
+ const ActMap& evt)
+{
+ DECLARE_LOCALS;
+ if (ps->get_delete_priority() != priority) {
+ psdout(10) << __func__ << " delete priority changed, resetting"
+ << dendl;
+ return transit<ToDelete>();
+ }
+ return discard_event();
+}
+
+void PeeringState::WaitDeleteReserved::exit()
+{
+ context< PeeringMachine >().log_exit(state_name, enter_time);
+}
+
+/*----Deleting-----*/
+PeeringState::Deleting::Deleting(my_context ctx)
+ : my_base(ctx),
+ NamedState(context< PeeringMachine >().state_history, "Started/ToDelete/Deleting")
+{
+ context< PeeringMachine >().log_enter(state_name);
+
+ DECLARE_LOCALS;
+ ps->deleting = true;
+ ObjectStore::Transaction &t = context<PeeringMachine>().get_cur_transaction();
+
+ // clear log
+ PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
+ ps->pg_log.roll_forward(rollbacker.get());
+
+ // adjust info to backfill
+ ps->info.set_last_backfill(hobject_t());
+ ps->pg_log.reset_backfill();
+ ps->dirty_info = true;
+
+ pl->on_removal(t);
+}
+
+boost::statechart::result PeeringState::Deleting::react(
+ const DeleteSome& evt)
+{
+ DECLARE_LOCALS;
+ std::pair<ghobject_t, bool> p;
+ p = pl->do_delete_work(context<PeeringMachine>().get_cur_transaction(),
+ next);
+ next = p.first;
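+ // do_delete_work returns where to resume next time and whether objects
+ // remain: stay in Deleting while there is more to delete, terminate the
+ // state machine once the removal is complete.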
+ return p.second ? discard_event() : terminate();
+}
+
+void PeeringState::Deleting::exit()
+{
+ context< PeeringMachine >().log_exit(state_name, enter_time);
+ DECLARE_LOCALS;
+ ps->deleting = false;
+ pl->cancel_local_background_io_reservation();
+}
+
+/*--------GetInfo---------*/
+PeeringState::GetInfo::GetInfo(my_context ctx)
+ : my_base(ctx),
+ NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering/GetInfo")
+{
+ context< PeeringMachine >().log_enter(state_name);
+
+
+ DECLARE_LOCALS;
+ ps->check_past_interval_bounds();
+ ps->log_weirdness();
+ PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
+
+ ceph_assert(ps->blocked_by.empty());
+
+ prior_set = ps->build_prior();
+ ps->prior_readable_down_osds = prior_set.down;
+
+ if (ps->prior_readable_down_osds.empty()) {
+ psdout(10) << " no prior_set down osds, will clear prior_readable_until_ub before activating"
+ << dendl;
+ }
+
+ ps->reset_min_peer_features();
+ get_infos();
+ if (prior_set.pg_down) {
+ post_event(IsDown());
+ } else if (peer_info_requested.empty()) {
+ post_event(GotInfo());
+ }
+}
+
+void PeeringState::GetInfo::get_infos()
+{
+ DECLARE_LOCALS;
+ PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
+
+ ps->blocked_by.clear();
+ for (auto it = prior_set.probe.begin(); it != prior_set.probe.end(); ++it) {
+ pg_shard_t peer = *it;
+ if (peer == ps->pg_whoami) {
+ continue;
+ }
+ if (ps->peer_info.count(peer)) {
+ psdout(10) << " have osd." << peer << " info " << ps->peer_info[peer] << dendl;
+ continue;
+ }
+ if (peer_info_requested.count(peer)) {
+ psdout(10) << " already requested info from osd." << peer << dendl;
+ ps->blocked_by.insert(peer.osd);
+ } else if (!ps->get_osdmap()->is_up(peer.osd)) {
+ psdout(10) << " not querying info from down osd." << peer << dendl;
+ } else {
+ psdout(10) << " querying info from osd." << peer << dendl;
+ context< PeeringMachine >().send_query(
+ peer.osd,
+ pg_query_t(pg_query_t::INFO,
+ it->shard, ps->pg_whoami.shard,
+ ps->info.history,
+ ps->get_osdmap_epoch()));
+ peer_info_requested.insert(peer);
+ ps->blocked_by.insert(peer.osd);
+ }
+ }
+
+ ps->check_prior_readable_down_osds(ps->get_osdmap());
+
+ pl->publish_stats_to_osd();
+}
+
+boost::statechart::result PeeringState::GetInfo::react(const MNotifyRec& infoevt)
+{
+
+ DECLARE_LOCALS;
+
+ auto p = peer_info_requested.find(infoevt.from);
+ if (p != peer_info_requested.end()) {
+ peer_info_requested.erase(p);
+ ps->blocked_by.erase(infoevt.from.osd);
+ }
+
+ epoch_t old_start = ps->info.history.last_epoch_started;
+ if (ps->proc_replica_info(
+ infoevt.from, infoevt.notify.info, infoevt.notify.epoch_sent)) {
+ // we got something new ...
+ PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
+ if (old_start < ps->info.history.last_epoch_started) {
+ psdout(10) << " last_epoch_started moved forward, rebuilding prior" << dendl;
+ prior_set = ps->build_prior();
+ ps->prior_readable_down_osds = prior_set.down;
+
+ // filter out any osds that got dropped from the probe set from
+ // peer_info_requested. this is less expensive than restarting
+ // peering (which would re-probe everyone).
+ auto p = peer_info_requested.begin();
+ while (p != peer_info_requested.end()) {
+ if (prior_set.probe.count(*p) == 0) {
+ psdout(20) << " dropping osd." << *p << " from info_requested, no longer in probe set" << dendl;
+ peer_info_requested.erase(p++);
+ } else {
+ ++p;
+ }
+ }
+ get_infos();
+ }
+ psdout(20) << "Adding osd: " << infoevt.from.osd << " peer features: "
+ << hex << infoevt.features << dec << dendl;
+ ps->apply_peer_features(infoevt.features);
+
+ // are we done getting everything?
+ if (peer_info_requested.empty() && !prior_set.pg_down) {
+ psdout(20) << "Common peer features: " << hex << ps->get_min_peer_features() << dec << dendl;
+ psdout(20) << "Common acting features: " << hex << ps->get_min_acting_features() << dec << dendl;
+ psdout(20) << "Common upacting features: " << hex << ps->get_min_upacting_features() << dec << dendl;
+ post_event(GotInfo());
+ }
+ }
+ return discard_event();
+}
+
+boost::statechart::result PeeringState::GetInfo::react(const QueryState& q)
+{
+ DECLARE_LOCALS;
+ q.f->open_object_section("state");
+ q.f->dump_string("name", state_name);
+ q.f->dump_stream("enter_time") << enter_time;
+
+ q.f->open_array_section("requested_info_from");
+ for (auto p = peer_info_requested.begin();
+ p != peer_info_requested.end();
+ ++p) {
+ q.f->open_object_section("osd");
+ q.f->dump_stream("osd") << *p;
+ if (ps->peer_info.count(*p)) {
+ q.f->open_object_section("got_info");
+ ps->peer_info[*p].dump(q.f);
+ q.f->close_section();
+ }
+ q.f->close_section();
+ }
+ q.f->close_section();
+
+ q.f->close_section();
+ return forward_event();
+}
+
+boost::statechart::result PeeringState::GetInfo::react(const QueryUnfound& q)
+{
+ q.f->dump_string("state", "GetInfo");
+ q.f->dump_bool("available_might_have_unfound", false);
+ return discard_event();
+}
+
+void PeeringState::GetInfo::exit()
+{
+ context< PeeringMachine >().log_exit(state_name, enter_time);
+
+ DECLARE_LOCALS;
+ utime_t dur = ceph_clock_now() - enter_time;
+ pl->get_peering_perf().tinc(rs_getinfo_latency, dur);
+ ps->blocked_by.clear();
+}
+
+/*------GetLog------------*/
+PeeringState::GetLog::GetLog(my_context ctx)
+ : my_base(ctx),
+ NamedState(
+ context< PeeringMachine >().state_history,
+ "Started/Primary/Peering/GetLog"),
+ msg(0)
+{
+ context< PeeringMachine >().log_enter(state_name);
+
+ DECLARE_LOCALS;
+
+ ps->log_weirdness();
+
+ // adjust acting?
+ if (!ps->choose_acting(auth_log_shard, false,
+ &context< Peering >().history_les_bound)) {
+ if (!ps->want_acting.empty()) {
+ post_event(NeedActingChange());
+ } else {
+ post_event(IsIncomplete());
+ }
+ return;
+ }
+
+ // am i the best?
+ if (auth_log_shard == ps->pg_whoami) {
+ post_event(GotLog());
+ return;
+ }
+
+ const pg_info_t& best = ps->peer_info[auth_log_shard];
+
+ // am i broken?
+ if (ps->info.last_update < best.log_tail) {
+ psdout(10) << " not contiguous with osd." << auth_log_shard << ", down" << dendl;
+ post_event(IsIncomplete());
+ return;
+ }
+
+ // how much log to request?
+ eversion_t request_log_from = ps->info.last_update;
+ ceph_assert(!ps->acting_recovery_backfill.empty());
+ for (auto p = ps->acting_recovery_backfill.begin();
+ p != ps->acting_recovery_backfill.end();
+ ++p) {
+ if (*p == ps->pg_whoami) continue;
+ pg_info_t& ri = ps->peer_info[*p];
+ if (ri.last_update < ps->info.log_tail && ri.last_update >= best.log_tail &&
+ ri.last_update < request_log_from)
+ request_log_from = ri.last_update;
+ }
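+ // request_log_from is now the oldest last_update among peers that fall
+ // behind our own log_tail but are still covered by the auth shard's log
+ // (defaulting to our own last_update).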
+
+ // now request that range from the auth shard
+ psdout(10) << " requesting log from osd." << auth_log_shard << dendl;
+ context<PeeringMachine>().send_query(
+ auth_log_shard.osd,
+ pg_query_t(
+ pg_query_t::LOG,
+ auth_log_shard.shard, ps->pg_whoami.shard,
+ request_log_from, ps->info.history,
+ ps->get_osdmap_epoch()));
+
+ ceph_assert(ps->blocked_by.empty());
+ ps->blocked_by.insert(auth_log_shard.osd);
+ pl->publish_stats_to_osd();
+}
+
+boost::statechart::result PeeringState::GetLog::react(const AdvMap& advmap)
+{
+ // make sure our log source didn't go down. we need to check
+ // explicitly because it may not be part of the prior set, which
+ // means the Peering state check won't catch it going down.
+ if (!advmap.osdmap->is_up(auth_log_shard.osd)) {
+ psdout(10) << "GetLog: auth_log_shard osd."
+ << auth_log_shard.osd << " went down" << dendl;
+ post_event(advmap);
+ return transit< Reset >();
+ }
+
+ // let the Peering state do its checks.
+ return forward_event();
+}
+
+boost::statechart::result PeeringState::GetLog::react(const MLogRec& logevt)
+{
+ ceph_assert(!msg);
+ if (logevt.from != auth_log_shard) {
+ psdout(10) << "GetLog: discarding log from "
+ << "non-auth_log_shard osd." << logevt.from << dendl;
+ return discard_event();
+ }
+ psdout(10) << "GetLog: received master log from osd."
+ << logevt.from << dendl;
+ msg = logevt.msg;
+ post_event(GotLog());
+ return discard_event();
+}
+
+boost::statechart::result PeeringState::GetLog::react(const GotLog&)
+{
+
+ DECLARE_LOCALS;
+ psdout(10) << "leaving GetLog" << dendl;
+ if (msg) {
+ psdout(10) << "processing master log" << dendl;
+ ps->proc_master_log(context<PeeringMachine>().get_cur_transaction(),
+ msg->info, std::move(msg->log), std::move(msg->missing),
+ auth_log_shard);
+ }
+ ps->start_flush(context< PeeringMachine >().get_cur_transaction());
+ return transit< GetMissing >();
+}
+
+boost::statechart::result PeeringState::GetLog::react(const QueryState& q)
+{
+ q.f->open_object_section("state");
+ q.f->dump_string("name", state_name);
+ q.f->dump_stream("enter_time") << enter_time;
+ q.f->dump_stream("auth_log_shard") << auth_log_shard;
+ q.f->close_section();
+ return forward_event();
+}
+
+boost::statechart::result PeeringState::GetLog::react(const QueryUnfound& q)
+{
+ q.f->dump_string("state", "GetLog");
+ q.f->dump_bool("available_might_have_unfound", false);
+ return discard_event();
+}
+
+void PeeringState::GetLog::exit()
+{
+ context< PeeringMachine >().log_exit(state_name, enter_time);
+
+ DECLARE_LOCALS;
+ utime_t dur = ceph_clock_now() - enter_time;
+ pl->get_peering_perf().tinc(rs_getlog_latency, dur);
+ ps->blocked_by.clear();
+}
+
+/*------WaitActingChange--------*/
+PeeringState::WaitActingChange::WaitActingChange(my_context ctx)
+ : my_base(ctx),
+ NamedState(context< PeeringMachine >().state_history, "Started/Primary/WaitActingChange")
+{
+ context< PeeringMachine >().log_enter(state_name);
+}
+
+boost::statechart::result PeeringState::WaitActingChange::react(const AdvMap& advmap)
+{
+ DECLARE_LOCALS;
+ OSDMapRef osdmap = advmap.osdmap;
+
+ psdout(10) << "verifying no want_acting " << ps->want_acting << " targets didn't go down" << dendl;
+ for (auto p = ps->want_acting.begin(); p != ps->want_acting.end(); ++p) {
+ if (!osdmap->is_up(*p)) {
+ psdout(10) << " want_acting target osd." << *p << " went down, resetting" << dendl;
+ post_event(advmap);
+ return transit< Reset >();
+ }
+ }
+ return forward_event();
+}
+
+boost::statechart::result PeeringState::WaitActingChange::react(const MLogRec& logevt)
+{
+ psdout(10) << "In WaitActingChange, ignoring MLocRec" << dendl;
+ return discard_event();
+}
+
+boost::statechart::result PeeringState::WaitActingChange::react(const MInfoRec& evt)
+{
+ psdout(10) << "In WaitActingChange, ignoring MInfoRec" << dendl;
+ return discard_event();
+}
+
+boost::statechart::result PeeringState::WaitActingChange::react(const MNotifyRec& evt)
+{
+ psdout(10) << "In WaitActingChange, ignoring MNotifyRec" << dendl;
+ return discard_event();
+}
+
+boost::statechart::result PeeringState::WaitActingChange::react(const QueryState& q)
+{
+ q.f->open_object_section("state");
+ q.f->dump_string("name", state_name);
+ q.f->dump_stream("enter_time") << enter_time;
+ q.f->dump_string("comment", "waiting for pg acting set to change");
+ q.f->close_section();
+ return forward_event();
+}
+
+boost::statechart::result PeeringState::WaitActingChange::react(const QueryUnfound& q)
+{
+ q.f->dump_string("state", "WaitActingChange");
+ q.f->dump_bool("available_might_have_unfound", false);
+ return discard_event();
+}
+
+void PeeringState::WaitActingChange::exit()
+{
+ context< PeeringMachine >().log_exit(state_name, enter_time);
+ DECLARE_LOCALS;
+ utime_t dur = ceph_clock_now() - enter_time;
+ pl->get_peering_perf().tinc(rs_waitactingchange_latency, dur);
+}
+
+/*------Down--------*/
+PeeringState::Down::Down(my_context ctx)
+ : my_base(ctx),
+ NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering/Down")
+{
+ context< PeeringMachine >().log_enter(state_name);
+ DECLARE_LOCALS;
+
+ ps->state_clear(PG_STATE_PEERING);
+ ps->state_set(PG_STATE_DOWN);
+
+ auto &prior_set = context< Peering >().prior_set;
+ ceph_assert(ps->blocked_by.empty());
+ ps->blocked_by.insert(prior_set.down.begin(), prior_set.down.end());
+ pl->publish_stats_to_osd();
+}
+
+void PeeringState::Down::exit()
+{
+ context< PeeringMachine >().log_exit(state_name, enter_time);
+
+ DECLARE_LOCALS;
+
+ ps->state_clear(PG_STATE_DOWN);
+ utime_t dur = ceph_clock_now() - enter_time;
+ pl->get_peering_perf().tinc(rs_down_latency, dur);
+
+ ps->blocked_by.clear();
+}
+
+boost::statechart::result PeeringState::Down::react(const QueryState& q)
+{
+ q.f->open_object_section("state");
+ q.f->dump_string("name", state_name);
+ q.f->dump_stream("enter_time") << enter_time;
+ q.f->dump_string("comment",
+ "not enough up instances of this PG to go active");
+ q.f->close_section();
+ return forward_event();
+}
+
+boost::statechart::result PeeringState::Down::react(const QueryUnfound& q)
+{
+ q.f->dump_string("state", "Down");
+ q.f->dump_bool("available_might_have_unfound", false);
+ return discard_event();
+}
+
+boost::statechart::result PeeringState::Down::react(const MNotifyRec& infoevt)
+{
+ DECLARE_LOCALS;
+
+ ceph_assert(ps->is_primary());
+ epoch_t old_start = ps->info.history.last_epoch_started;
+ if (!ps->peer_info.count(infoevt.from) &&
+ ps->get_osdmap()->has_been_up_since(infoevt.from.osd, infoevt.notify.epoch_sent)) {
+ ps->update_history(infoevt.notify.info.history);
+ }
+ // if we got something new that lets the pg escape the down state
+ if (ps->info.history.last_epoch_started > old_start) {
+ psdout(10) << " last_epoch_started moved forward, re-enter getinfo" << dendl;
+ ps->state_clear(PG_STATE_DOWN);
+ ps->state_set(PG_STATE_PEERING);
+ return transit< GetInfo >();
+ }
+
+ return discard_event();
+}
+
+
+/*------Incomplete--------*/
+PeeringState::Incomplete::Incomplete(my_context ctx)
+ : my_base(ctx),
+ NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering/Incomplete")
+{
+ context< PeeringMachine >().log_enter(state_name);
+ DECLARE_LOCALS;
+
+ ps->state_clear(PG_STATE_PEERING);
+ ps->state_set(PG_STATE_INCOMPLETE);
+
+ PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
+ ceph_assert(ps->blocked_by.empty());
+ ps->blocked_by.insert(prior_set.down.begin(), prior_set.down.end());
+ pl->publish_stats_to_osd();
+}
+
+boost::statechart::result PeeringState::Incomplete::react(const AdvMap &advmap) {
+ DECLARE_LOCALS;
+ int64_t poolnum = ps->info.pgid.pool();
+
+ // Reset if min_size turned smaller than its previous value; the pg might now be able to go active
+ if (!advmap.osdmap->have_pg_pool(poolnum) ||
+ advmap.lastmap->get_pools().find(poolnum)->second.min_size >
+ advmap.osdmap->get_pools().find(poolnum)->second.min_size) {
+ post_event(advmap);
+ return transit< Reset >();
+ }
+
+ return forward_event();
+}
+
+boost::statechart::result PeeringState::Incomplete::react(const MNotifyRec& notevt) {
+ DECLARE_LOCALS;
+ psdout(7) << "handle_pg_notify from osd." << notevt.from << dendl;
+ if (ps->proc_replica_info(
+ notevt.from, notevt.notify.info, notevt.notify.epoch_sent)) {
+ // We got something new, try again!
+ return transit< GetLog >();
+ } else {
+ return discard_event();
+ }
+}
+
+boost::statechart::result PeeringState::Incomplete::react(
+ const QueryState& q)
+{
+ q.f->open_object_section("state");
+ q.f->dump_string("name", state_name);
+ q.f->dump_stream("enter_time") << enter_time;
+ q.f->dump_string("comment", "not enough complete instances of this PG");
+ q.f->close_section();
+ return forward_event();
+}
+
+boost::statechart::result PeeringState::Incomplete::react(const QueryUnfound& q)
+{
+ q.f->dump_string("state", "Incomplete");
+ q.f->dump_bool("available_might_have_unfound", false);
+ return discard_event();
+}
+
+void PeeringState::Incomplete::exit()
+{
+ context< PeeringMachine >().log_exit(state_name, enter_time);
+
+ DECLARE_LOCALS;
+
+ ps->state_clear(PG_STATE_INCOMPLETE);
+ utime_t dur = ceph_clock_now() - enter_time;
+ pl->get_peering_perf().tinc(rs_incomplete_latency, dur);
+
+ ps->blocked_by.clear();
+}
+
+/*------GetMissing--------*/
+PeeringState::GetMissing::GetMissing(my_context ctx)
+ : my_base(ctx),
+ NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering/GetMissing")
+{
+ context< PeeringMachine >().log_enter(state_name);
+
+ DECLARE_LOCALS;
+ ps->log_weirdness();
+ ceph_assert(!ps->acting_recovery_backfill.empty());
+ eversion_t since;
+ for (auto i = ps->acting_recovery_backfill.begin();
+ i != ps->acting_recovery_backfill.end();
+ ++i) {
+ if (*i == ps->get_primary()) continue;
+ const pg_info_t& pi = ps->peer_info[*i];
+ // reset this to make sure the pg_missing_t is initialized and
+ // has the correct semantics even if we don't need to get a
+ // missing set from a shard. This way later additions due to
+ // lost+unfound delete work properly.
+ ps->peer_missing[*i].may_include_deletes = !ps->perform_deletes_during_peering();
+
+ if (pi.is_empty())
+ continue; // no pg data, nothing divergent
+
+ if (pi.last_update < ps->pg_log.get_tail()) {
+ psdout(10) << " osd." << *i << " is not contiguous, will restart backfill" << dendl;
+ ps->peer_missing[*i].clear();
+ continue;
+ }
+ if (pi.last_backfill == hobject_t()) {
+ psdout(10) << " osd." << *i << " will fully backfill; can infer empty missing set" << dendl;
+ ps->peer_missing[*i].clear();
+ continue;
+ }
+
+ if (pi.last_update == pi.last_complete && // peer has no missing
+ pi.last_update == ps->info.last_update) { // peer is up to date
+ // replica has no missing and identical log as us. no need to
+ // pull anything.
+ // FIXME: we can do better here. if last_update==last_complete we
+ // can infer the rest!
+ psdout(10) << " osd." << *i << " has no missing, identical log" << dendl;
+ ps->peer_missing[*i].clear();
+ continue;
+ }
+
+ // We pull the log from the peer's last_epoch_started to ensure we
+ // get enough log to detect divergent updates.
+ since.epoch = pi.last_epoch_started;
+ ceph_assert(pi.last_update >= ps->info.log_tail); // or else choose_acting() did a bad thing
+ if (pi.log_tail <= since) {
+ psdout(10) << " requesting log+missing since " << since << " from osd." << *i << dendl;
+ context< PeeringMachine >().send_query(
+ i->osd,
+ pg_query_t(
+ pg_query_t::LOG,
+ i->shard, ps->pg_whoami.shard,
+ since, ps->info.history,
+ ps->get_osdmap_epoch()));
+ } else {
+ psdout(10) << " requesting fulllog+missing from osd." << *i
+ << " (want since " << since << " < log.tail "
+ << pi.log_tail << ")" << dendl;
+ context< PeeringMachine >().send_query(
+ i->osd, pg_query_t(
+ pg_query_t::FULLLOG,
+ i->shard, ps->pg_whoami.shard,
+ ps->info.history, ps->get_osdmap_epoch()));
+ }
+ peer_missing_requested.insert(*i);
+ ps->blocked_by.insert(i->osd);
+ }
+
+ if (peer_missing_requested.empty()) {
+ if (ps->need_up_thru) {
+ psdout(10) << " still need up_thru update before going active"
+ << dendl;
+ post_event(NeedUpThru());
+ return;
+ }
+
+ // all good!
+ post_event(Activate(ps->get_osdmap_epoch()));
+ } else {
+ pl->publish_stats_to_osd();
+ }
+}
+
+boost::statechart::result PeeringState::GetMissing::react(const MLogRec& logevt)
+{
+ DECLARE_LOCALS;
+
+ peer_missing_requested.erase(logevt.from);
+ ps->proc_replica_log(logevt.msg->info,
+ logevt.msg->log,
+ std::move(logevt.msg->missing),
+ logevt.from);
+
+ if (peer_missing_requested.empty()) {
+ if (ps->need_up_thru) {
+ psdout(10) << " still need up_thru update before going active"
+ << dendl;
+ post_event(NeedUpThru());
+ } else {
+ psdout(10) << "Got last missing, don't need missing "
+ << "posting Activate" << dendl;
+ post_event(Activate(ps->get_osdmap_epoch()));
+ }
+ }
+ return discard_event();
+}
+
+boost::statechart::result PeeringState::GetMissing::react(const QueryState& q)
+{
+ DECLARE_LOCALS;
+ q.f->open_object_section("state");
+ q.f->dump_string("name", state_name);
+ q.f->dump_stream("enter_time") << enter_time;
+
+ q.f->open_array_section("peer_missing_requested");
+ for (auto p = peer_missing_requested.begin();
+ p != peer_missing_requested.end();
+ ++p) {
+ q.f->open_object_section("osd");
+ q.f->dump_stream("osd") << *p;
+ if (ps->peer_missing.count(*p)) {
+ q.f->open_object_section("got_missing");
+ ps->peer_missing[*p].dump(q.f);
+ q.f->close_section();
+ }
+ q.f->close_section();
+ }
+ q.f->close_section();
+
+ q.f->close_section();
+ return forward_event();
+}
+
+boost::statechart::result PeeringState::GetMissing::react(const QueryUnfound& q)
+{
+ q.f->dump_string("state", "GetMising");
+ q.f->dump_bool("available_might_have_unfound", false);
+ return discard_event();
+}
+
+void PeeringState::GetMissing::exit()
+{
+ context< PeeringMachine >().log_exit(state_name, enter_time);
+
+ DECLARE_LOCALS;
+ utime_t dur = ceph_clock_now() - enter_time;
+ pl->get_peering_perf().tinc(rs_getmissing_latency, dur);
+ ps->blocked_by.clear();
+}
+
+/*------WaitUpThru--------*/
+PeeringState::WaitUpThru::WaitUpThru(my_context ctx)
+ : my_base(ctx),
+ NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering/WaitUpThru")
+{
+ context< PeeringMachine >().log_enter(state_name);
+}
+
+boost::statechart::result PeeringState::WaitUpThru::react(const ActMap& am)
+{
+ DECLARE_LOCALS;
+ if (!ps->need_up_thru) {
+ post_event(Activate(ps->get_osdmap_epoch()));
+ }
+ return forward_event();
+}
+
+boost::statechart::result PeeringState::WaitUpThru::react(const MLogRec& logevt)
+{
+ DECLARE_LOCALS;
+ psdout(10) << "Noting missing from osd." << logevt.from << dendl;
+ ps->peer_missing[logevt.from].claim(std::move(logevt.msg->missing));
+ ps->peer_info[logevt.from] = logevt.msg->info;
+ return discard_event();
+}
+
+boost::statechart::result PeeringState::WaitUpThru::react(const QueryState& q)
+{
+ q.f->open_object_section("state");
+ q.f->dump_string("name", state_name);
+ q.f->dump_stream("enter_time") << enter_time;
+ q.f->dump_string("comment", "waiting for osdmap to reflect a new up_thru for this osd");
+ q.f->close_section();
+ return forward_event();
+}
+
+boost::statechart::result PeeringState::WaitUpThru::react(const QueryUnfound& q)
+{
+ q.f->dump_string("state", "WaitUpThru");
+ q.f->dump_bool("available_might_have_unfound", false);
+ return discard_event();
+}
+
+void PeeringState::WaitUpThru::exit()
+{
+ context< PeeringMachine >().log_exit(state_name, enter_time);
+ DECLARE_LOCALS;
+ utime_t dur = ceph_clock_now() - enter_time;
+ pl->get_peering_perf().tinc(rs_waitupthru_latency, dur);
+}
+
+/*----PeeringState::PeeringMachine Methods-----*/
+#undef dout_prefix
+#define dout_prefix dpp->gen_prefix(*_dout)
+
+void PeeringState::PeeringMachine::log_enter(const char *state_name)
+{
+ DECLARE_LOCALS;
+ psdout(5) << "enter " << state_name << dendl;
+ pl->log_state_enter(state_name);
+}
+
+void PeeringState::PeeringMachine::log_exit(const char *state_name, utime_t enter_time)
+{
+ DECLARE_LOCALS;
+ utime_t dur = ceph_clock_now() - enter_time;
+ psdout(5) << "exit " << state_name << " " << dur << " " << event_count << " " << event_time << dendl;
+ pl->log_state_exit(state_name, enter_time, event_count, event_time);
+ event_count = 0;
+ event_time = utime_t();
+}
+
+ostream &operator<<(ostream &out, const PeeringState &ps) {
+ out << "pg[" << ps.info
+ << " " << pg_vector_string(ps.up);
+ if (ps.acting != ps.up)
+ out << "/" << pg_vector_string(ps.acting);
+ if (ps.is_ec_pg())
+ out << "p" << ps.get_primary();
+ if (!ps.async_recovery_targets.empty())
+ out << " async=[" << ps.async_recovery_targets << "]";
+ if (!ps.backfill_targets.empty())
+ out << " backfill=[" << ps.backfill_targets << "]";
+ out << " r=" << ps.get_role();
+ out << " lpr=" << ps.get_last_peering_reset();
+
+ if (ps.deleting)
+ out << " DELETING";
+
+ if (!ps.past_intervals.empty()) {
+ out << " pi=[" << ps.past_intervals.get_bounds()
+ << ")/" << ps.past_intervals.size();
+ }
+
+ if (ps.is_peered()) {
+ if (ps.last_update_ondisk != ps.info.last_update)
+ out << " luod=" << ps.last_update_ondisk;
+ if (ps.last_update_applied != ps.info.last_update)
+ out << " lua=" << ps.last_update_applied;
+ }
+
+ if (ps.pg_log.get_tail() != ps.info.log_tail ||
+ ps.pg_log.get_head() != ps.info.last_update)
+ out << " (info mismatch, " << ps.pg_log.get_log() << ")";
+
+ if (!ps.pg_log.get_log().empty()) {
+ if ((ps.pg_log.get_log().log.begin()->version <= ps.pg_log.get_tail())) {
+ out << " (log bound mismatch, actual=["
+ << ps.pg_log.get_log().log.begin()->version << ","
+ << ps.pg_log.get_log().log.rbegin()->version << "]";
+ out << ")";
+ }
+ }
+
+ out << " crt=" << ps.pg_log.get_can_rollback_to();
+
+ if (ps.last_complete_ondisk != ps.info.last_complete)
+ out << " lcod " << ps.last_complete_ondisk;
+
+ out << " mlcod " << ps.min_last_complete_ondisk;
+
+ out << " " << pg_state_string(ps.get_state());
+ if (ps.should_send_notify())
+ out << " NOTIFY";
+
+ if (ps.prior_readable_until_ub != ceph::signedspan::zero()) {
+ out << " pruub " << ps.prior_readable_until_ub
+ << "@" << ps.get_prior_readable_down_osds();
+ }
+ return out;
+}
+
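+// Build the replica recovery order: acting replicas first, then async
+// recovery targets, each group sorted by ascending number of missing
+// objects; the primary and shards with nothing missing are omitted.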
+std::vector<pg_shard_t> PeeringState::get_replica_recovery_order() const
+{
+ std::vector<std::pair<unsigned int, pg_shard_t>> replicas_by_num_missing,
+ async_by_num_missing;
+ replicas_by_num_missing.reserve(get_acting_recovery_backfill().size() - 1);
+ for (auto &p : get_acting_recovery_backfill()) {
+ if (p == get_primary()) {
+ continue;
+ }
+ auto pm = get_peer_missing().find(p);
+ assert(pm != get_peer_missing().end());
+ auto nm = pm->second.num_missing();
+ if (nm != 0) {
+ if (is_async_recovery_target(p)) {
+ async_by_num_missing.push_back(make_pair(nm, p));
+ } else {
+ replicas_by_num_missing.push_back(make_pair(nm, p));
+ }
+ }
+ }
+ // sort by number of missing objects, in ascending order.
+ auto func = [](const std::pair<unsigned int, pg_shard_t> &lhs,
+ const std::pair<unsigned int, pg_shard_t> &rhs) {
+ return lhs.first < rhs.first;
+ };
+ // acting goes first
+ std::sort(replicas_by_num_missing.begin(), replicas_by_num_missing.end(), func);
+ // then async_recovery_targets
+ std::sort(async_by_num_missing.begin(), async_by_num_missing.end(), func);
+ replicas_by_num_missing.insert(replicas_by_num_missing.end(),
+ async_by_num_missing.begin(), async_by_num_missing.end());
+
+ std::vector<pg_shard_t> ret;
+ ret.reserve(replicas_by_num_missing.size());
+ for (auto p : replicas_by_num_missing) {
+ ret.push_back(p.second);
+ }
+ return ret;
+}
+
+
diff --git a/src/osd/PeeringState.h b/src/osd/PeeringState.h
new file mode 100644
index 000000000..2cc340cb9
--- /dev/null
+++ b/src/osd/PeeringState.h
@@ -0,0 +1,2442 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <boost/statechart/custom_reaction.hpp>
+#include <boost/statechart/event.hpp>
+#include <boost/statechart/simple_state.hpp>
+#include <boost/statechart/state.hpp>
+#include <boost/statechart/state_machine.hpp>
+#include <boost/statechart/transition.hpp>
+#include <boost/statechart/event_base.hpp>
+#include <string>
+#include <atomic>
+
+#include "include/ceph_assert.h"
+#include "include/common_fwd.h"
+
+#include "PGLog.h"
+#include "PGStateUtils.h"
+#include "PGPeeringEvent.h"
+#include "osd_types.h"
+#include "os/ObjectStore.h"
+#include "OSDMap.h"
+#include "MissingLoc.h"
+#include "osd/osd_perf_counters.h"
+#include "common/ostream_temp.h"
+
+struct PGPool {
+ epoch_t cached_epoch;
+ int64_t id;
+ std::string name;
+
+ pg_pool_t info;
+ SnapContext snapc; // the default pool snapc, ready to go.
+
+ PGPool(OSDMapRef map, int64_t i, const pg_pool_t& info,
+ const std::string& name)
+ : cached_epoch(map->get_epoch()),
+ id(i),
+ name(name),
+ info(info) {
+ snapc = info.get_snap_context();
+ }
+
+ void update(OSDMapRef map);
+
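+ /// Read lease interval for this pool: the pool's READ_LEASE_INTERVAL
+ /// option if set, otherwise osd_heartbeat_grace scaled by
+ /// osd_pool_default_read_lease_ratio (for example, a 20s grace with a
+ /// 0.8 ratio yields a 16s lease; illustrative numbers only).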
+ ceph::timespan get_readable_interval(ConfigProxy &conf) const {
+ double v = 0;
+ if (info.opts.get(pool_opts_t::READ_LEASE_INTERVAL, &v)) {
+ return ceph::make_timespan(v);
+ } else {
+ auto hbi = conf->osd_heartbeat_grace;
+ auto fac = conf->osd_pool_default_read_lease_ratio;
+ return ceph::make_timespan(hbi * fac);
+ }
+ }
+};
+
+struct PeeringCtx;
+
+// [primary only] content recovery state
+struct BufferedRecoveryMessages {
+ ceph_release_t require_osd_release;
+ std::map<int, std::vector<MessageRef>> message_map;
+
+ BufferedRecoveryMessages(ceph_release_t r)
+ : require_osd_release(r) {
+ }
+ BufferedRecoveryMessages(ceph_release_t r, PeeringCtx &ctx);
+
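+ /// Merge another buffer's messages into this one, placing the incoming
+ /// buffered messages ahead of anything already queued for the same
+ /// target and draining the source buffer.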
+ void accept_buffered_messages(BufferedRecoveryMessages &m) {
+ for (auto &[target, ls] : m.message_map) {
+ auto &ovec = message_map[target];
+ // put buffered messages in front
+ ls.reserve(ls.size() + ovec.size());
+ ls.insert(ls.end(), ovec.begin(), ovec.end());
+ ovec.clear();
+ ovec.swap(ls);
+ }
+ }
+
+ void send_osd_message(int target, MessageRef m) {
+ message_map[target].push_back(std::move(m));
+ }
+ void send_notify(int to, const pg_notify_t &n);
+ void send_query(int to, spg_t spgid, const pg_query_t &q);
+ void send_info(int to, spg_t to_spgid,
+ epoch_t min_epoch, epoch_t cur_epoch,
+ const pg_info_t &info,
+ std::optional<pg_lease_t> lease = {},
+ std::optional<pg_lease_ack_t> lease_ack = {});
+};
+
+struct HeartbeatStamps : public RefCountedObject {
+ mutable ceph::mutex lock = ceph::make_mutex("HeartbeatStamps::lock");
+
+ const int osd;
+
+ // we maintain an upper and lower bound on the delta between our local
+ // mono_clock time (minus the startup_time) to the peer OSD's mono_clock
+ // time (minus its startup_time).
+ //
+ // delta is (remote_clock_time - local_clock_time), so that
+ // local_time + delta -> peer_time, and peer_time - delta -> local_time.
+ //
+ // we have an upper and lower bound value on this delta, meaning the
+ // value of the remote clock is somewhere between [my_time + lb, my_time + ub]
+ //
+ // conversely, if we have a remote timestamp T, then that is
+ // [T - ub, T - lb] in terms of the local clock. i.e., if you are
+ // subtracting the delta, then take care that you swap the role of the
+ // lb and ub values.
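+ //
+ // illustrative example (not from the original source): with lb = +2s and
+ // ub = +5s, a local reading of 100s places the peer clock somewhere in
+ // [102s, 105s], while a remote stamp of 200s maps back to a local time
+ // in [195s, 198s].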
+
+ /// lower bound on peer clock - local clock
+ std::optional<ceph::signedspan> peer_clock_delta_lb;
+
+ /// upper bound on peer clock - local clock
+ std::optional<ceph::signedspan> peer_clock_delta_ub;
+
+ /// highest up_from we've seen from this rank
+ epoch_t up_from = 0;
+
+ void print(std::ostream& out) const {
+ std::lock_guard l(lock);
+ out << "hbstamp(osd." << osd << " up_from " << up_from
+ << " peer_clock_delta [";
+ if (peer_clock_delta_lb) {
+ out << *peer_clock_delta_lb;
+ }
+ out << ",";
+ if (peer_clock_delta_ub) {
+ out << *peer_clock_delta_ub;
+ }
+ out << "])";
+ }
+
+ void sent_ping(std::optional<ceph::signedspan> *delta_ub) {
+ std::lock_guard l(lock);
+ // the non-primaries need a lower bound on remote clock - local clock. if
+ // we assume the transit for the last ping_reply was
+ // instantaneous, that would be (the negative of) our last
+ // peer_clock_delta_lb value.
+ if (peer_clock_delta_lb) {
+ *delta_ub = - *peer_clock_delta_lb;
+ }
+ }
+
+ void got_ping(epoch_t this_up_from,
+ ceph::signedspan now,
+ ceph::signedspan peer_send_stamp,
+ std::optional<ceph::signedspan> delta_ub,
+ ceph::signedspan *out_delta_ub) {
+ std::lock_guard l(lock);
+ if (this_up_from < up_from) {
+ return;
+ }
+ if (this_up_from > up_from) {
+ up_from = this_up_from;
+ }
+ peer_clock_delta_lb = peer_send_stamp - now;
+ peer_clock_delta_ub = delta_ub;
+ *out_delta_ub = - *peer_clock_delta_lb;
+ }
+
+ void got_ping_reply(ceph::signedspan now,
+ ceph::signedspan peer_send_stamp,
+ std::optional<ceph::signedspan> delta_ub) {
+ std::lock_guard l(lock);
+ peer_clock_delta_lb = peer_send_stamp - now;
+ peer_clock_delta_ub = delta_ub;
+ }
+
+private:
+ FRIEND_MAKE_REF(HeartbeatStamps);
+ HeartbeatStamps(int o)
+ : RefCountedObject(NULL),
+ osd(o) {}
+};
+using HeartbeatStampsRef = ceph::ref_t<HeartbeatStamps>;
+
+inline std::ostream& operator<<(std::ostream& out, const HeartbeatStamps& hb)
+{
+ hb.print(out);
+ return out;
+}
+
+
+struct PeeringCtx : BufferedRecoveryMessages {
+ ObjectStore::Transaction transaction;
+ HBHandle* handle = nullptr;
+
+ PeeringCtx(ceph_release_t r)
+ : BufferedRecoveryMessages(r) {}
+
+ PeeringCtx(const PeeringCtx &) = delete;
+ PeeringCtx &operator=(const PeeringCtx &) = delete;
+
+ PeeringCtx(PeeringCtx &&) = default;
+ PeeringCtx &operator=(PeeringCtx &&) = default;
+
+ void reset_transaction() {
+ transaction = ObjectStore::Transaction();
+ }
+};
+
+/**
+ * Wraps PeeringCtx to hide the difference between buffering messages to
+ * be sent after flush or immediately.
+ */
+struct PeeringCtxWrapper {
+ utime_t start_time;
+ BufferedRecoveryMessages &msgs;
+ ObjectStore::Transaction &transaction;
+ HBHandle * const handle = nullptr;
+
+ PeeringCtxWrapper(PeeringCtx &wrapped) :
+ msgs(wrapped),
+ transaction(wrapped.transaction),
+ handle(wrapped.handle) {}
+
+ PeeringCtxWrapper(BufferedRecoveryMessages &buf, PeeringCtx &wrapped)
+ : msgs(buf),
+ transaction(wrapped.transaction),
+ handle(wrapped.handle) {}
+
+ PeeringCtxWrapper(PeeringCtxWrapper &&ctx) = default;
+
+ void send_osd_message(int target, MessageRef m) {
+ msgs.send_osd_message(target, std::move(m));
+ }
+ void send_notify(int to, const pg_notify_t &n) {
+ msgs.send_notify(to, n);
+ }
+ void send_query(int to, spg_t spgid, const pg_query_t &q) {
+ msgs.send_query(to, spgid, q);
+ }
+ void send_info(int to, spg_t to_spgid,
+ epoch_t min_epoch, epoch_t cur_epoch,
+ const pg_info_t &info,
+ std::optional<pg_lease_t> lease = {},
+ std::optional<pg_lease_ack_t> lease_ack = {}) {
+ msgs.send_info(to, to_spgid, min_epoch, cur_epoch, info,
+ lease, lease_ack);
+ }
+};
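+
+// Usage sketch (illustrative; 'release', 'osd' and 'notify' are placeholders):
+//   PeeringCtx ctx(release);               // 'release' is the ceph_release_t in use
+//   PeeringCtxWrapper direct(ctx);         // messages go straight into ctx
+//   direct.send_notify(osd, notify);
+//   BufferedRecoveryMessages buf(release);
+//   PeeringCtxWrapper buffered(buf, ctx);  // messages buffered in buf until flushed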
+
+/* Encapsulates PG recovery process */
+class PeeringState : public MissingLoc::MappingInfo {
+public:
+ struct PeeringListener : public EpochSource {
+ /// Prepare t with written information
+ virtual void prepare_write(
+ pg_info_t &info,
+ pg_info_t &last_written_info,
+ PastIntervals &past_intervals,
+ PGLog &pglog,
+ bool dirty_info,
+ bool dirty_big_info,
+ bool need_write_epoch,
+ ObjectStore::Transaction &t) = 0;
+
+ /// Notify that info/history changed (generally to update scrub registration)
+ virtual void on_info_history_change() = 0;
+ /// Notify that a scrub has been requested
+ virtual void scrub_requested(scrub_level_t scrub_level, scrub_type_t scrub_type) = 0;
+
+ /// Return current snap_trimq size
+ virtual uint64_t get_snap_trimq_size() const = 0;
+
+ /// Send cluster message to osd
+ virtual void send_cluster_message(
+ int osd, MessageRef m, epoch_t epoch, bool share_map_update=false) = 0;
+ /// Send pg_created to mon
+ virtual void send_pg_created(pg_t pgid) = 0;
+
+ virtual ceph::signedspan get_mnow() = 0;
+ virtual HeartbeatStampsRef get_hb_stamps(int peer) = 0;
+ virtual void schedule_renew_lease(epoch_t plr, ceph::timespan delay) = 0;
+ virtual void queue_check_readable(epoch_t lpr, ceph::timespan delay) = 0;
+ virtual void recheck_readable() = 0;
+
+ virtual unsigned get_target_pg_log_entries() const = 0;
+
+ // ============ Flush state ==================
+ /**
+ * try_flush_or_schedule_async()
+ *
+ * If true, caller may assume all past operations on this pg
+ * have been flushed. Else, caller will receive an on_flushed()
+ * call once the flush has completed.
+ */
+ virtual bool try_flush_or_schedule_async() = 0;
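+ // (Illustrative caller pattern for try_flush_or_schedule_async(): if it
+ // returns true the caller may proceed immediately; otherwise it waits for
+ // the later on_flushed() notification.)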
+ /// Arranges for a commit on t to call on_flushed() once flushed.
+ virtual void start_flush_on_transaction(
+ ObjectStore::Transaction &t) = 0;
+ /// Notification that all outstanding flushes for interval have completed
+ virtual void on_flushed() = 0;
+
+ //============= Recovery ====================
+ /// Arrange for event to be queued after delay
+ virtual void schedule_event_after(
+ PGPeeringEventRef event,
+ float delay) = 0;
+ /**
+ * request_local_background_io_reservation
+ *
+ * Request reservation at priority with on_grant queued on grant
+ * and on_preempt on preempt
+ */
+ virtual void request_local_background_io_reservation(
+ unsigned priority,
+ PGPeeringEventURef on_grant,
+ PGPeeringEventURef on_preempt) = 0;
+ /// Modify pending local background reservation request priority
+ virtual void update_local_background_io_priority(
+ unsigned priority) = 0;
+ /// Cancel pending local background reservation request
+ virtual void cancel_local_background_io_reservation() = 0;
+
+ /**
+ * request_remote_recovery_reservation
+ *
+ * Request reservation at priority with on_grant queued on grant
+ * and on_preempt on preempt
+ */
+ virtual void request_remote_recovery_reservation(
+ unsigned priority,
+ PGPeeringEventURef on_grant,
+ PGPeeringEventURef on_preempt) = 0;
+ /// Cancel pending remote background reservation request
+ virtual void cancel_remote_recovery_reservation() = 0;
+
+ /// Arrange for on_commit to be queued upon commit of t
+ virtual void schedule_event_on_commit(
+ ObjectStore::Transaction &t,
+ PGPeeringEventRef on_commit) = 0;
+
+ //============================ HB =============================
+ /// Update hb set to peers
+ virtual void update_heartbeat_peers(std::set<int> peers) = 0;
+
+ /// Set targets being probed in this interval
+ virtual void set_probe_targets(const std::set<pg_shard_t> &probe_set) = 0;
+ /// Clear targets being probed in this interval
+ virtual void clear_probe_targets() = 0;
+
+ /// Queue a request for pg_temp to be set to wanted
+ virtual void queue_want_pg_temp(const std::vector<int> &wanted) = 0;
+ /// Clear any queued pg_temp request
+ virtual void clear_want_pg_temp() = 0;
+
+ /// Arrange for this pg's stats to be updated and shipped to the mon
+ virtual void publish_stats_to_osd() = 0;
+ /// Clear stats to be shipped to mon for this pg
+ virtual void clear_publish_stats() = 0;
+
+ /// Notification to check outstanding operation targets
+ virtual void check_recovery_sources(const OSDMapRef& newmap) = 0;
+ /// Notification to check outstanding blocklist
+ virtual void check_blocklisted_watchers() = 0;
+ /// Notification to clear state associated with primary
+ virtual void clear_primary_state() = 0;
+
+ // =================== Event notification ====================
+ virtual void on_pool_change() = 0;
+ virtual void on_role_change() = 0;
+ virtual void on_change(ObjectStore::Transaction &t) = 0;
+ virtual void on_activate(interval_set<snapid_t> to_trim) = 0;
+ virtual void on_activate_complete() = 0;
+ virtual void on_new_interval() = 0;
+ virtual Context *on_clean() = 0;
+ virtual void on_activate_committed() = 0;
+ virtual void on_active_exit() = 0;
+
+ // ====================== PG deletion =======================
+ /// Notification of removal complete; t must be populated to complete removal
+ virtual void on_removal(ObjectStore::Transaction &t) = 0;
+ /// Perform incremental removal work
+ virtual std::pair<ghobject_t, bool> do_delete_work(
+ ObjectStore::Transaction &t, ghobject_t _next) = 0;
+
+ // ======================= PG Merge =========================
+ virtual void clear_ready_to_merge() = 0;
+ virtual void set_not_ready_to_merge_target(pg_t pgid, pg_t src) = 0;
+ virtual void set_not_ready_to_merge_source(pg_t pgid) = 0;
+ virtual void set_ready_to_merge_target(eversion_t lu, epoch_t les, epoch_t lec) = 0;
+ virtual void set_ready_to_merge_source(eversion_t lu) = 0;
+
+ // ====================== Map notifications =====================
+ virtual void on_active_actmap() = 0;
+ virtual void on_active_advmap(const OSDMapRef &osdmap) = 0;
+ virtual epoch_t oldest_stored_osdmap() = 0;
+
+ // ============ recovery reservation notifications ==========
+ virtual void on_backfill_reserved() = 0;
+ virtual void on_backfill_canceled() = 0;
+ virtual void on_recovery_reserved() = 0;
+
+ // ================recovery space accounting ================
+ virtual bool try_reserve_recovery_space(
+ int64_t primary_num_bytes, int64_t local_num_bytes) = 0;
+ virtual void unreserve_recovery_space() = 0;
+
+ // ================== Peering log events ====================
+ /// Get handler for rolling forward/back log entries
+ virtual PGLog::LogEntryHandlerRef get_log_handler(
+ ObjectStore::Transaction &t) = 0;
+
+ // ============ On disk representation changes ==============
+ virtual void rebuild_missing_set_with_deletes(PGLog &pglog) = 0;
+
+ // ======================= Logging ==========================
+ virtual PerfCounters &get_peering_perf() = 0;
+ virtual PerfCounters &get_perf_logger() = 0;
+ virtual void log_state_enter(const char *state) = 0;
+ virtual void log_state_exit(
+ const char *state_name, utime_t enter_time,
+ uint64_t events, utime_t event_dur) = 0;
+ virtual void dump_recovery_info(ceph::Formatter *f) const = 0;
+
+ virtual OstreamTemp get_clog_info() = 0;
+ virtual OstreamTemp get_clog_error() = 0;
+ virtual OstreamTemp get_clog_debug() = 0;
+
+ virtual ~PeeringListener() {}
+ };
+
+ struct QueryState : boost::statechart::event< QueryState > {
+ ceph::Formatter *f;
+ explicit QueryState(ceph::Formatter *f) : f(f) {}
+ void print(std::ostream *out) const {
+ *out << "Query";
+ }
+ };
+
+ struct QueryUnfound : boost::statechart::event< QueryUnfound > {
+ ceph::Formatter *f;
+ explicit QueryUnfound(ceph::Formatter *f) : f(f) {}
+ void print(std::ostream *out) const {
+ *out << "QueryUnfound";
+ }
+ };
+
+ struct AdvMap : boost::statechart::event< AdvMap > {
+ OSDMapRef osdmap;
+ OSDMapRef lastmap;
+ std::vector<int> newup, newacting;
+ int up_primary, acting_primary;
+ AdvMap(
+ OSDMapRef osdmap, OSDMapRef lastmap,
+ std::vector<int>& newup, int up_primary,
+ std::vector<int>& newacting, int acting_primary):
+ osdmap(osdmap), lastmap(lastmap),
+ newup(newup),
+ newacting(newacting),
+ up_primary(up_primary),
+ acting_primary(acting_primary) {}
+ void print(std::ostream *out) const {
+ *out << "AdvMap";
+ }
+ };
+
+ struct ActMap : boost::statechart::event< ActMap > {
+ ActMap() : boost::statechart::event< ActMap >() {}
+ void print(std::ostream *out) const {
+ *out << "ActMap";
+ }
+ };
+ struct Activate : boost::statechart::event< Activate > {
+ epoch_t activation_epoch;
+ explicit Activate(epoch_t q) : boost::statechart::event< Activate >(),
+ activation_epoch(q) {}
+ void print(std::ostream *out) const {
+ *out << "Activate from " << activation_epoch;
+ }
+ };
+ struct ActivateCommitted : boost::statechart::event< ActivateCommitted > {
+ epoch_t epoch;
+ epoch_t activation_epoch;
+ explicit ActivateCommitted(epoch_t e, epoch_t ae)
+ : boost::statechart::event< ActivateCommitted >(),
+ epoch(e),
+ activation_epoch(ae) {}
+ void print(std::ostream *out) const {
+ *out << "ActivateCommitted from " << activation_epoch
+ << " processed at " << epoch;
+ }
+ };
+public:
+ struct UnfoundBackfill : boost::statechart::event<UnfoundBackfill> {
+ explicit UnfoundBackfill() {}
+ void print(std::ostream *out) const {
+ *out << "UnfoundBackfill";
+ }
+ };
+ struct UnfoundRecovery : boost::statechart::event<UnfoundRecovery> {
+ explicit UnfoundRecovery() {}
+ void print(std::ostream *out) const {
+ *out << "UnfoundRecovery";
+ }
+ };
+
+ struct RequestScrub : boost::statechart::event<RequestScrub> {
+ scrub_level_t deep;
+ scrub_type_t repair;
+ explicit RequestScrub(bool d, bool r) : deep(scrub_level_t(d)), repair(scrub_type_t(r)) {}
+ void print(std::ostream *out) const {
+ *out << "RequestScrub(" << ((deep==scrub_level_t::deep) ? "deep" : "shallow")
+ << ((repair==scrub_type_t::do_repair) ? " repair)" : ")");
+ }
+ };
+
+ TrivialEvent(Initialize)
+ TrivialEvent(GotInfo)
+ TrivialEvent(NeedUpThru)
+ TrivialEvent(Backfilled)
+ TrivialEvent(LocalBackfillReserved)
+ TrivialEvent(RejectTooFullRemoteReservation)
+ TrivialEvent(RequestBackfill)
+ TrivialEvent(RemoteRecoveryPreempted)
+ TrivialEvent(RemoteBackfillPreempted)
+ TrivialEvent(BackfillTooFull)
+ TrivialEvent(RecoveryTooFull)
+
+ TrivialEvent(MakePrimary)
+ TrivialEvent(MakeStray)
+ TrivialEvent(NeedActingChange)
+ TrivialEvent(IsIncomplete)
+ TrivialEvent(IsDown)
+
+ TrivialEvent(AllReplicasRecovered)
+ TrivialEvent(DoRecovery)
+ TrivialEvent(LocalRecoveryReserved)
+ TrivialEvent(AllRemotesReserved)
+ TrivialEvent(AllBackfillsReserved)
+ TrivialEvent(GoClean)
+
+ TrivialEvent(AllReplicasActivated)
+
+ TrivialEvent(IntervalFlush)
+
+ TrivialEvent(DeleteStart)
+ TrivialEvent(DeleteSome)
+
+ TrivialEvent(SetForceRecovery)
+ TrivialEvent(UnsetForceRecovery)
+ TrivialEvent(SetForceBackfill)
+ TrivialEvent(UnsetForceBackfill)
+
+ TrivialEvent(DeleteReserved)
+ TrivialEvent(DeleteInterrupted)
+
+ TrivialEvent(CheckReadable)
+
+ void start_handle(PeeringCtx *new_ctx);
+ void end_handle();
+ void begin_block_outgoing();
+ void end_block_outgoing();
+ void clear_blocked_outgoing();
+ private:
+
+ /* States */
+ struct Initial;
+ class PeeringMachine : public boost::statechart::state_machine< PeeringMachine, Initial > {
+ public:
+ PeeringState *state;
+ PGStateHistory *state_history;
+ CephContext *cct;
+ spg_t spgid;
+ DoutPrefixProvider *dpp;
+ PeeringListener *pl;
+
+ utime_t event_time;
+ uint64_t event_count;
+
+ void clear_event_counters() {
+ event_time = utime_t();
+ event_count = 0;
+ }
+
+ void log_enter(const char *state_name);
+ void log_exit(const char *state_name, utime_t duration);
+
+ PeeringMachine(
+ PeeringState *state, CephContext *cct,
+ spg_t spgid,
+ DoutPrefixProvider *dpp,
+ PeeringListener *pl,
+ PGStateHistory *state_history) :
+ state(state),
+ state_history(state_history),
+ cct(cct), spgid(spgid),
+ dpp(dpp), pl(pl),
+ event_count(0) {}
+
+ /* Accessor functions for state methods */
+ ObjectStore::Transaction& get_cur_transaction() {
+ ceph_assert(state->rctx);
+ return state->rctx->transaction;
+ }
+
+ PeeringCtxWrapper &get_recovery_ctx() {
+ assert(state->rctx);
+ return *(state->rctx);
+ }
+
+ void send_notify(int to, const pg_notify_t &n) {
+ ceph_assert(state->rctx);
+ state->rctx->send_notify(to, n);
+ }
+ void send_query(int to, const pg_query_t &query) {
+ state->rctx->send_query(
+ to,
+ spg_t(spgid.pgid, query.to),
+ query);
+ }
+ };
+ friend class PeeringMachine;
+
+ /* States */
+ // Initial
+ // Reset
+ // Start
+ // Started
+ // Primary
+ // WaitActingChange
+ // Peering
+ // GetInfo
+ // GetLog
+ // GetMissing
+ // WaitUpThru
+ // Incomplete
+ // Active
+ // Activating
+ // Clean
+ // Recovered
+ // Backfilling
+ // WaitRemoteBackfillReserved
+ // WaitLocalBackfillReserved
+ // NotBackfilling
+ // NotRecovering
+ // Recovering
+ // WaitRemoteRecoveryReserved
+ // WaitLocalRecoveryReserved
+ // ReplicaActive
+ // RepNotRecovering
+ // RepRecovering
+ // RepWaitBackfillReserved
+ // RepWaitRecoveryReserved
+ // Stray
+ // ToDelete
+ // WaitDeleteReserved
+ // Deleting
+ // Crashed
+
+ struct Crashed : boost::statechart::state< Crashed, PeeringMachine >, NamedState {
+ explicit Crashed(my_context ctx);
+ };
+
+ struct Reset;
+
+ struct Initial : boost::statechart::state< Initial, PeeringMachine >, NamedState {
+ explicit Initial(my_context ctx);
+ void exit();
+
+ typedef boost::mpl::list <
+ boost::statechart::transition< Initialize, Reset >,
+ boost::statechart::custom_reaction< NullEvt >,
+ boost::statechart::transition< boost::statechart::event_base, Crashed >
+ > reactions;
+
+ boost::statechart::result react(const MNotifyRec&);
+ boost::statechart::result react(const MInfoRec&);
+ boost::statechart::result react(const MLogRec&);
+ boost::statechart::result react(const boost::statechart::event_base&) {
+ return discard_event();
+ }
+ };
+
+ struct Reset : boost::statechart::state< Reset, PeeringMachine >, NamedState {
+ explicit Reset(my_context ctx);
+ void exit();
+
+ typedef boost::mpl::list <
+ boost::statechart::custom_reaction< QueryState >,
+ boost::statechart::custom_reaction< QueryUnfound >,
+ boost::statechart::custom_reaction< AdvMap >,
+ boost::statechart::custom_reaction< ActMap >,
+ boost::statechart::custom_reaction< NullEvt >,
+ boost::statechart::custom_reaction< IntervalFlush >,
+ boost::statechart::transition< boost::statechart::event_base, Crashed >
+ > reactions;
+ boost::statechart::result react(const QueryState& q);
+ boost::statechart::result react(const QueryUnfound& q);
+ boost::statechart::result react(const AdvMap&);
+ boost::statechart::result react(const ActMap&);
+ boost::statechart::result react(const IntervalFlush&);
+ boost::statechart::result react(const boost::statechart::event_base&) {
+ return discard_event();
+ }
+ };
+
+ struct Start;
+
+ struct Started : boost::statechart::state< Started, PeeringMachine, Start >, NamedState {
+ explicit Started(my_context ctx);
+ void exit();
+
+ typedef boost::mpl::list <
+ boost::statechart::custom_reaction< QueryState >,
+ boost::statechart::custom_reaction< QueryUnfound >,
+ boost::statechart::custom_reaction< AdvMap >,
+ boost::statechart::custom_reaction< IntervalFlush >,
+ // ignored
+ boost::statechart::custom_reaction< NullEvt >,
+ boost::statechart::custom_reaction<SetForceRecovery>,
+ boost::statechart::custom_reaction<UnsetForceRecovery>,
+ boost::statechart::custom_reaction<SetForceBackfill>,
+ boost::statechart::custom_reaction<UnsetForceBackfill>,
+ boost::statechart::custom_reaction<RequestScrub>,
+ boost::statechart::custom_reaction<CheckReadable>,
+ // crash
+ boost::statechart::transition< boost::statechart::event_base, Crashed >
+ > reactions;
+ boost::statechart::result react(const QueryState& q);
+ boost::statechart::result react(const QueryUnfound& q);
+ boost::statechart::result react(const AdvMap&);
+ boost::statechart::result react(const IntervalFlush&);
+ boost::statechart::result react(const boost::statechart::event_base&) {
+ return discard_event();
+ }
+ };
+
+ struct Primary;
+ struct Stray;
+
+ struct Start : boost::statechart::state< Start, Started >, NamedState {
+ explicit Start(my_context ctx);
+ void exit();
+
+ typedef boost::mpl::list <
+ boost::statechart::transition< MakePrimary, Primary >,
+ boost::statechart::transition< MakeStray, Stray >
+ > reactions;
+ };
+
+ struct Peering;
+ struct WaitActingChange;
+ struct Incomplete;
+ struct Down;
+
+ struct Primary : boost::statechart::state< Primary, Started, Peering >, NamedState {
+ explicit Primary(my_context ctx);
+ void exit();
+
+ typedef boost::mpl::list <
+ boost::statechart::custom_reaction< ActMap >,
+ boost::statechart::custom_reaction< MNotifyRec >,
+ boost::statechart::custom_reaction<SetForceRecovery>,
+ boost::statechart::custom_reaction<UnsetForceRecovery>,
+ boost::statechart::custom_reaction<SetForceBackfill>,
+ boost::statechart::custom_reaction<UnsetForceBackfill>,
+ boost::statechart::custom_reaction<RequestScrub>
+ > reactions;
+ boost::statechart::result react(const ActMap&);
+ boost::statechart::result react(const MNotifyRec&);
+ boost::statechart::result react(const SetForceRecovery&);
+ boost::statechart::result react(const UnsetForceRecovery&);
+ boost::statechart::result react(const SetForceBackfill&);
+ boost::statechart::result react(const UnsetForceBackfill&);
+ boost::statechart::result react(const RequestScrub&);
+ };
+
+ struct WaitActingChange : boost::statechart::state< WaitActingChange, Primary>,
+ NamedState {
+ typedef boost::mpl::list <
+ boost::statechart::custom_reaction< QueryState >,
+ boost::statechart::custom_reaction< QueryUnfound >,
+ boost::statechart::custom_reaction< AdvMap >,
+ boost::statechart::custom_reaction< MLogRec >,
+ boost::statechart::custom_reaction< MInfoRec >,
+ boost::statechart::custom_reaction< MNotifyRec >
+ > reactions;
+ explicit WaitActingChange(my_context ctx);
+ boost::statechart::result react(const QueryState& q);
+ boost::statechart::result react(const QueryUnfound& q);
+ boost::statechart::result react(const AdvMap&);
+ boost::statechart::result react(const MLogRec&);
+ boost::statechart::result react(const MInfoRec&);
+ boost::statechart::result react(const MNotifyRec&);
+ void exit();
+ };
+
+ struct GetInfo;
+ struct Active;
+
+ struct Peering : boost::statechart::state< Peering, Primary, GetInfo >, NamedState {
+ PastIntervals::PriorSet prior_set;
+ bool history_les_bound; ///< need osd_find_best_info_ignore_history_les
+
+ explicit Peering(my_context ctx);
+ void exit();
+
+ typedef boost::mpl::list <
+ boost::statechart::custom_reaction< QueryState >,
+ boost::statechart::custom_reaction< QueryUnfound >,
+ boost::statechart::transition< Activate, Active >,
+ boost::statechart::custom_reaction< AdvMap >
+ > reactions;
+ boost::statechart::result react(const QueryState& q);
+ boost::statechart::result react(const QueryUnfound& q);
+ boost::statechart::result react(const AdvMap &advmap);
+ };
+
+ struct WaitLocalRecoveryReserved;
+ struct Activating;
+ struct Active : boost::statechart::state< Active, Primary, Activating >, NamedState {
+ explicit Active(my_context ctx);
+ void exit();
+
+ const std::set<pg_shard_t> remote_shards_to_reserve_recovery;
+ const std::set<pg_shard_t> remote_shards_to_reserve_backfill;
+ bool all_replicas_activated;
+
+ typedef boost::mpl::list <
+ boost::statechart::custom_reaction< QueryState >,
+ boost::statechart::custom_reaction< QueryUnfound >,
+ boost::statechart::custom_reaction< ActMap >,
+ boost::statechart::custom_reaction< AdvMap >,
+ boost::statechart::custom_reaction< MInfoRec >,
+ boost::statechart::custom_reaction< MNotifyRec >,
+ boost::statechart::custom_reaction< MLogRec >,
+ boost::statechart::custom_reaction< MTrim >,
+ boost::statechart::custom_reaction< Backfilled >,
+ boost::statechart::custom_reaction< ActivateCommitted >,
+ boost::statechart::custom_reaction< AllReplicasActivated >,
+ boost::statechart::custom_reaction< DeferRecovery >,
+ boost::statechart::custom_reaction< DeferBackfill >,
+ boost::statechart::custom_reaction< UnfoundRecovery >,
+ boost::statechart::custom_reaction< UnfoundBackfill >,
+ boost::statechart::custom_reaction< RemoteReservationRevokedTooFull>,
+ boost::statechart::custom_reaction< RemoteReservationRevoked>,
+ boost::statechart::custom_reaction< DoRecovery>,
+ boost::statechart::custom_reaction< RenewLease>,
+ boost::statechart::custom_reaction< MLeaseAck>,
+ boost::statechart::custom_reaction< CheckReadable>
+ > reactions;
+ boost::statechart::result react(const QueryState& q);
+ boost::statechart::result react(const QueryUnfound& q);
+ boost::statechart::result react(const ActMap&);
+ boost::statechart::result react(const AdvMap&);
+ boost::statechart::result react(const MInfoRec& infoevt);
+ boost::statechart::result react(const MNotifyRec& notevt);
+ boost::statechart::result react(const MLogRec& logevt);
+ boost::statechart::result react(const MTrim& trimevt);
+ boost::statechart::result react(const Backfilled&) {
+ return discard_event();
+ }
+ boost::statechart::result react(const ActivateCommitted&);
+ boost::statechart::result react(const AllReplicasActivated&);
+ boost::statechart::result react(const RenewLease&);
+ boost::statechart::result react(const MLeaseAck&);
+ boost::statechart::result react(const DeferRecovery& evt) {
+ return discard_event();
+ }
+ boost::statechart::result react(const DeferBackfill& evt) {
+ return discard_event();
+ }
+ boost::statechart::result react(const UnfoundRecovery& evt) {
+ return discard_event();
+ }
+ boost::statechart::result react(const UnfoundBackfill& evt) {
+ return discard_event();
+ }
+ boost::statechart::result react(const RemoteReservationRevokedTooFull&) {
+ return discard_event();
+ }
+ boost::statechart::result react(const RemoteReservationRevoked&) {
+ return discard_event();
+ }
+ boost::statechart::result react(const DoRecovery&) {
+ return discard_event();
+ }
+ boost::statechart::result react(const CheckReadable&);
+ void all_activated_and_committed();
+ };
+
+ struct Clean : boost::statechart::state< Clean, Active >, NamedState {
+ typedef boost::mpl::list<
+ boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >,
+ boost::statechart::custom_reaction<SetForceRecovery>,
+ boost::statechart::custom_reaction<SetForceBackfill>
+ > reactions;
+ explicit Clean(my_context ctx);
+ void exit();
+ boost::statechart::result react(const boost::statechart::event_base&) {
+ return discard_event();
+ }
+ };
+
+ struct Recovered : boost::statechart::state< Recovered, Active >, NamedState {
+ typedef boost::mpl::list<
+ boost::statechart::transition< GoClean, Clean >,
+ boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >,
+ boost::statechart::custom_reaction< AllReplicasActivated >
+ > reactions;
+ explicit Recovered(my_context ctx);
+ void exit();
+ boost::statechart::result react(const AllReplicasActivated&) {
+ post_event(GoClean());
+ return forward_event();
+ }
+ };
+
+ struct Backfilling : boost::statechart::state< Backfilling, Active >, NamedState {
+ typedef boost::mpl::list<
+ boost::statechart::custom_reaction< Backfilled >,
+ boost::statechart::custom_reaction< DeferBackfill >,
+ boost::statechart::custom_reaction< UnfoundBackfill >,
+ boost::statechart::custom_reaction< RemoteReservationRejectedTooFull >,
+ boost::statechart::custom_reaction< RemoteReservationRevokedTooFull>,
+ boost::statechart::custom_reaction< RemoteReservationRevoked>
+ > reactions;
+ explicit Backfilling(my_context ctx);
+ boost::statechart::result react(const RemoteReservationRejectedTooFull& evt) {
+ // for compat with old peers
+ post_event(RemoteReservationRevokedTooFull());
+ return discard_event();
+ }
+ void backfill_release_reservations();
+ boost::statechart::result react(const Backfilled& evt);
+ boost::statechart::result react(const RemoteReservationRevokedTooFull& evt);
+ boost::statechart::result react(const RemoteReservationRevoked& evt);
+ boost::statechart::result react(const DeferBackfill& evt);
+ boost::statechart::result react(const UnfoundBackfill& evt);
+ void cancel_backfill();
+ void exit();
+ };
+
+ struct WaitRemoteBackfillReserved : boost::statechart::state< WaitRemoteBackfillReserved, Active >, NamedState {
+ typedef boost::mpl::list<
+ boost::statechart::custom_reaction< RemoteBackfillReserved >,
+ boost::statechart::custom_reaction< RemoteReservationRejectedTooFull >,
+ boost::statechart::custom_reaction< RemoteReservationRevoked >,
+ boost::statechart::transition< AllBackfillsReserved, Backfilling >
+ > reactions;
+ std::set<pg_shard_t>::const_iterator backfill_osd_it;
+ explicit WaitRemoteBackfillReserved(my_context ctx);
+ void retry();
+ void exit();
+ boost::statechart::result react(const RemoteBackfillReserved& evt);
+ boost::statechart::result react(const RemoteReservationRejectedTooFull& evt);
+ boost::statechart::result react(const RemoteReservationRevoked& evt);
+ };
+
+ struct WaitLocalBackfillReserved : boost::statechart::state< WaitLocalBackfillReserved, Active >, NamedState {
+ typedef boost::mpl::list<
+ boost::statechart::transition< LocalBackfillReserved, WaitRemoteBackfillReserved >,
+ boost::statechart::custom_reaction< RemoteBackfillReserved >
+ > reactions;
+ explicit WaitLocalBackfillReserved(my_context ctx);
+ boost::statechart::result react(const RemoteBackfillReserved& evt) {
+ /* no-op */
+ return discard_event();
+ }
+ void exit();
+ };
+
+ struct NotBackfilling : boost::statechart::state< NotBackfilling, Active>, NamedState {
+ typedef boost::mpl::list<
+ boost::statechart::custom_reaction< QueryUnfound >,
+ boost::statechart::transition< RequestBackfill, WaitLocalBackfillReserved>,
+ boost::statechart::custom_reaction< RemoteBackfillReserved >,
+ boost::statechart::custom_reaction< RemoteReservationRejectedTooFull >
+ > reactions;
+ explicit NotBackfilling(my_context ctx);
+ void exit();
+ boost::statechart::result react(const QueryUnfound& q);
+ boost::statechart::result react(const RemoteBackfillReserved& evt);
+ boost::statechart::result react(const RemoteReservationRejectedTooFull& evt);
+ };
+
+ struct NotRecovering : boost::statechart::state< NotRecovering, Active>, NamedState {
+ typedef boost::mpl::list<
+ boost::statechart::custom_reaction< QueryUnfound >,
+ boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >,
+ boost::statechart::custom_reaction< DeferRecovery >,
+ boost::statechart::custom_reaction< UnfoundRecovery >
+ > reactions;
+ explicit NotRecovering(my_context ctx);
+ boost::statechart::result react(const QueryUnfound& q);
+ boost::statechart::result react(const DeferRecovery& evt) {
+ /* no-op */
+ return discard_event();
+ }
+ boost::statechart::result react(const UnfoundRecovery& evt) {
+ /* no-op */
+ return discard_event();
+ }
+ void exit();
+ };
+
+ struct ToDelete;
+ struct RepNotRecovering;
+ struct ReplicaActive : boost::statechart::state< ReplicaActive, Started, RepNotRecovering >, NamedState {
+ explicit ReplicaActive(my_context ctx);
+ void exit();
+
+ typedef boost::mpl::list <
+ boost::statechart::custom_reaction< QueryState >,
+ boost::statechart::custom_reaction< QueryUnfound >,
+ boost::statechart::custom_reaction< ActMap >,
+ boost::statechart::custom_reaction< MQuery >,
+ boost::statechart::custom_reaction< MInfoRec >,
+ boost::statechart::custom_reaction< MLogRec >,
+ boost::statechart::custom_reaction< MTrim >,
+ boost::statechart::custom_reaction< Activate >,
+ boost::statechart::custom_reaction< ActivateCommitted >,
+ boost::statechart::custom_reaction< DeferRecovery >,
+ boost::statechart::custom_reaction< DeferBackfill >,
+ boost::statechart::custom_reaction< UnfoundRecovery >,
+ boost::statechart::custom_reaction< UnfoundBackfill >,
+ boost::statechart::custom_reaction< RemoteBackfillPreempted >,
+ boost::statechart::custom_reaction< RemoteRecoveryPreempted >,
+ boost::statechart::custom_reaction< RecoveryDone >,
+ boost::statechart::transition<DeleteStart, ToDelete>,
+ boost::statechart::custom_reaction< MLease >
+ > reactions;
+ boost::statechart::result react(const QueryState& q);
+ boost::statechart::result react(const QueryUnfound& q);
+ boost::statechart::result react(const MInfoRec& infoevt);
+ boost::statechart::result react(const MLogRec& logevt);
+ boost::statechart::result react(const MTrim& trimevt);
+ boost::statechart::result react(const ActMap&);
+ boost::statechart::result react(const MQuery&);
+ boost::statechart::result react(const Activate&);
+ boost::statechart::result react(const ActivateCommitted&);
+ boost::statechart::result react(const MLease&);
+ boost::statechart::result react(const RecoveryDone&) {
+ return discard_event();
+ }
+ boost::statechart::result react(const DeferRecovery& evt) {
+ return discard_event();
+ }
+ boost::statechart::result react(const DeferBackfill& evt) {
+ return discard_event();
+ }
+ boost::statechart::result react(const UnfoundRecovery& evt) {
+ return discard_event();
+ }
+ boost::statechart::result react(const UnfoundBackfill& evt) {
+ return discard_event();
+ }
+ boost::statechart::result react(const RemoteBackfillPreempted& evt) {
+ return discard_event();
+ }
+ boost::statechart::result react(const RemoteRecoveryPreempted& evt) {
+ return discard_event();
+ }
+ };
+
+ struct RepRecovering : boost::statechart::state< RepRecovering, ReplicaActive >, NamedState {
+ typedef boost::mpl::list<
+ boost::statechart::transition< RecoveryDone, RepNotRecovering >,
+ // for compat with old peers
+ boost::statechart::transition< RemoteReservationRejectedTooFull, RepNotRecovering >,
+ boost::statechart::transition< RemoteReservationCanceled, RepNotRecovering >,
+ boost::statechart::custom_reaction< BackfillTooFull >,
+ boost::statechart::custom_reaction< RemoteRecoveryPreempted >,
+ boost::statechart::custom_reaction< RemoteBackfillPreempted >
+ > reactions;
+ explicit RepRecovering(my_context ctx);
+ boost::statechart::result react(const RemoteRecoveryPreempted &evt);
+ boost::statechart::result react(const BackfillTooFull &evt);
+ boost::statechart::result react(const RemoteBackfillPreempted &evt);
+ void exit();
+ };
+
+ struct RepWaitBackfillReserved : boost::statechart::state< RepWaitBackfillReserved, ReplicaActive >, NamedState {
+ typedef boost::mpl::list<
+ boost::statechart::custom_reaction< RemoteBackfillReserved >,
+ boost::statechart::custom_reaction< RejectTooFullRemoteReservation >,
+ boost::statechart::custom_reaction< RemoteReservationRejectedTooFull >,
+ boost::statechart::custom_reaction< RemoteReservationCanceled >
+ > reactions;
+ explicit RepWaitBackfillReserved(my_context ctx);
+ void exit();
+ boost::statechart::result react(const RemoteBackfillReserved &evt);
+ boost::statechart::result react(const RejectTooFullRemoteReservation &evt);
+ boost::statechart::result react(const RemoteReservationRejectedTooFull &evt);
+ boost::statechart::result react(const RemoteReservationCanceled &evt);
+ };
+
+ struct RepWaitRecoveryReserved : boost::statechart::state< RepWaitRecoveryReserved, ReplicaActive >, NamedState {
+ typedef boost::mpl::list<
+ boost::statechart::custom_reaction< RemoteRecoveryReserved >,
+ // for compat with old peers
+ boost::statechart::custom_reaction< RemoteReservationRejectedTooFull >,
+ boost::statechart::custom_reaction< RemoteReservationCanceled >
+ > reactions;
+ explicit RepWaitRecoveryReserved(my_context ctx);
+ void exit();
+ boost::statechart::result react(const RemoteRecoveryReserved &evt);
+ boost::statechart::result react(const RemoteReservationRejectedTooFull &evt) {
+ // for compat with old peers
+ post_event(RemoteReservationCanceled());
+ return discard_event();
+ }
+ boost::statechart::result react(const RemoteReservationCanceled &evt);
+ };
+
+ struct RepNotRecovering : boost::statechart::state< RepNotRecovering, ReplicaActive>, NamedState {
+ typedef boost::mpl::list<
+ boost::statechart::custom_reaction< RequestRecoveryPrio >,
+ boost::statechart::custom_reaction< RequestBackfillPrio >,
+ boost::statechart::custom_reaction< RejectTooFullRemoteReservation >,
+ boost::statechart::transition< RemoteReservationRejectedTooFull, RepNotRecovering >,
+ boost::statechart::transition< RemoteReservationCanceled, RepNotRecovering >,
+ boost::statechart::custom_reaction< RemoteRecoveryReserved >,
+ boost::statechart::custom_reaction< RemoteBackfillReserved >,
+ boost::statechart::transition< RecoveryDone, RepNotRecovering > // for compat with pre-reservation peers
+ > reactions;
+ explicit RepNotRecovering(my_context ctx);
+ boost::statechart::result react(const RequestRecoveryPrio &evt);
+ boost::statechart::result react(const RequestBackfillPrio &evt);
+ boost::statechart::result react(const RemoteBackfillReserved &evt) {
+ // my reservation completion raced with a RELEASE from primary
+ return discard_event();
+ }
+ boost::statechart::result react(const RemoteRecoveryReserved &evt) {
+ // my reservation completion raced with a RELEASE from primary
+ return discard_event();
+ }
+ boost::statechart::result react(const RejectTooFullRemoteReservation &evt);
+ void exit();
+ };
+
+ struct Recovering : boost::statechart::state< Recovering, Active >, NamedState {
+ typedef boost::mpl::list <
+ boost::statechart::custom_reaction< AllReplicasRecovered >,
+ boost::statechart::custom_reaction< DeferRecovery >,
+ boost::statechart::custom_reaction< UnfoundRecovery >,
+ boost::statechart::custom_reaction< RequestBackfill >
+ > reactions;
+ explicit Recovering(my_context ctx);
+ void exit();
+ void release_reservations(bool cancel = false);
+ boost::statechart::result react(const AllReplicasRecovered &evt);
+ boost::statechart::result react(const DeferRecovery& evt);
+ boost::statechart::result react(const UnfoundRecovery& evt);
+ boost::statechart::result react(const RequestBackfill &evt);
+ };
+
+ struct WaitRemoteRecoveryReserved : boost::statechart::state< WaitRemoteRecoveryReserved, Active >, NamedState {
+ typedef boost::mpl::list <
+ boost::statechart::custom_reaction< RemoteRecoveryReserved >,
+ boost::statechart::transition< AllRemotesReserved, Recovering >
+ > reactions;
+ std::set<pg_shard_t>::const_iterator remote_recovery_reservation_it;
+ explicit WaitRemoteRecoveryReserved(my_context ctx);
+ boost::statechart::result react(const RemoteRecoveryReserved &evt);
+ void exit();
+ };
+
+ struct WaitLocalRecoveryReserved : boost::statechart::state< WaitLocalRecoveryReserved, Active >, NamedState {
+ typedef boost::mpl::list <
+ boost::statechart::transition< LocalRecoveryReserved, WaitRemoteRecoveryReserved >,
+ boost::statechart::custom_reaction< RecoveryTooFull >
+ > reactions;
+ explicit WaitLocalRecoveryReserved(my_context ctx);
+ void exit();
+ boost::statechart::result react(const RecoveryTooFull &evt);
+ };
+
+ struct Activating : boost::statechart::state< Activating, Active >, NamedState {
+ typedef boost::mpl::list <
+ boost::statechart::transition< AllReplicasRecovered, Recovered >,
+ boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >,
+ boost::statechart::transition< RequestBackfill, WaitLocalBackfillReserved >
+ > reactions;
+ explicit Activating(my_context ctx);
+ void exit();
+ };
+
+ struct Stray : boost::statechart::state< Stray, Started >,
+ NamedState {
+ explicit Stray(my_context ctx);
+ void exit();
+
+ typedef boost::mpl::list <
+ boost::statechart::custom_reaction< MQuery >,
+ boost::statechart::custom_reaction< MLogRec >,
+ boost::statechart::custom_reaction< MInfoRec >,
+ boost::statechart::custom_reaction< ActMap >,
+ boost::statechart::custom_reaction< RecoveryDone >,
+ boost::statechart::transition<DeleteStart, ToDelete>
+ > reactions;
+ boost::statechart::result react(const MQuery& query);
+ boost::statechart::result react(const MLogRec& logevt);
+ boost::statechart::result react(const MInfoRec& infoevt);
+ boost::statechart::result react(const ActMap&);
+ boost::statechart::result react(const RecoveryDone&) {
+ return discard_event();
+ }
+ };
+
+ struct WaitDeleteReserved;
+ struct ToDelete : boost::statechart::state<ToDelete, Started, WaitDeleteReserved>, NamedState {
+ unsigned priority = 0;
+ typedef boost::mpl::list <
+ boost::statechart::custom_reaction< ActMap >,
+ boost::statechart::custom_reaction< ActivateCommitted >,
+ boost::statechart::custom_reaction< DeleteSome >
+ > reactions;
+ explicit ToDelete(my_context ctx);
+ boost::statechart::result react(const ActMap &evt);
+ boost::statechart::result react(const DeleteSome &evt) {
+ // happens if we drop out of Deleting due to reprioritization etc.
+ return discard_event();
+ }
+ boost::statechart::result react(const ActivateCommitted&) {
+ // Can happen if we were activated as a stray but not actually pulled
+ // from, prior to the pg going clean and sending a delete.
+ return discard_event();
+ }
+ void exit();
+ };
+
+ struct Deleting;
+ struct WaitDeleteReserved : boost::statechart::state<WaitDeleteReserved,
+ ToDelete>, NamedState {
+ typedef boost::mpl::list <
+ boost::statechart::transition<DeleteReserved, Deleting>
+ > reactions;
+ explicit WaitDeleteReserved(my_context ctx);
+ void exit();
+ };
+
+ struct Deleting : boost::statechart::state<Deleting,
+ ToDelete>, NamedState {
+ typedef boost::mpl::list <
+ boost::statechart::custom_reaction< DeleteSome >,
+ boost::statechart::transition<DeleteInterrupted, WaitDeleteReserved>
+ > reactions;
+ ghobject_t next;
+ explicit Deleting(my_context ctx);
+ boost::statechart::result react(const DeleteSome &evt);
+ void exit();
+ };
+
+ struct GetLog;
+
+ struct GetInfo : boost::statechart::state< GetInfo, Peering >, NamedState {
+ std::set<pg_shard_t> peer_info_requested;
+
+ explicit GetInfo(my_context ctx);
+ void exit();
+ void get_infos();
+
+ typedef boost::mpl::list <
+ boost::statechart::custom_reaction< QueryState >,
+ boost::statechart::custom_reaction< QueryUnfound >,
+ boost::statechart::transition< GotInfo, GetLog >,
+ boost::statechart::custom_reaction< MNotifyRec >,
+ boost::statechart::transition< IsDown, Down >
+ > reactions;
+ boost::statechart::result react(const QueryState& q);
+ boost::statechart::result react(const QueryUnfound& q);
+ boost::statechart::result react(const MNotifyRec& infoevt);
+ };
+
+ struct GotLog : boost::statechart::event< GotLog > {
+ GotLog() : boost::statechart::event< GotLog >() {}
+ };
+
+ struct GetLog : boost::statechart::state< GetLog, Peering >, NamedState {
+ pg_shard_t auth_log_shard;
+ boost::intrusive_ptr<MOSDPGLog> msg;
+
+ explicit GetLog(my_context ctx);
+ void exit();
+
+ typedef boost::mpl::list <
+ boost::statechart::custom_reaction< QueryState >,
+ boost::statechart::custom_reaction< QueryUnfound >,
+ boost::statechart::custom_reaction< MLogRec >,
+ boost::statechart::custom_reaction< GotLog >,
+ boost::statechart::custom_reaction< AdvMap >,
+ boost::statechart::transition< NeedActingChange, WaitActingChange >,
+ boost::statechart::transition< IsIncomplete, Incomplete >
+ > reactions;
+ boost::statechart::result react(const AdvMap&);
+ boost::statechart::result react(const QueryState& q);
+ boost::statechart::result react(const QueryUnfound& q);
+ boost::statechart::result react(const MLogRec& logevt);
+ boost::statechart::result react(const GotLog&);
+ };
+
+ struct WaitUpThru;
+
+ struct GetMissing : boost::statechart::state< GetMissing, Peering >, NamedState {
+ std::set<pg_shard_t> peer_missing_requested;
+
+ explicit GetMissing(my_context ctx);
+ void exit();
+
+ typedef boost::mpl::list <
+ boost::statechart::custom_reaction< QueryState >,
+ boost::statechart::custom_reaction< QueryUnfound >,
+ boost::statechart::custom_reaction< MLogRec >,
+ boost::statechart::transition< NeedUpThru, WaitUpThru >
+ > reactions;
+ boost::statechart::result react(const QueryState& q);
+ boost::statechart::result react(const QueryUnfound& q);
+ boost::statechart::result react(const MLogRec& logevt);
+ };
+
+ struct WaitUpThru : boost::statechart::state< WaitUpThru, Peering >, NamedState {
+ explicit WaitUpThru(my_context ctx);
+ void exit();
+
+ typedef boost::mpl::list <
+ boost::statechart::custom_reaction< QueryState >,
+ boost::statechart::custom_reaction< QueryUnfound >,
+ boost::statechart::custom_reaction< ActMap >,
+ boost::statechart::custom_reaction< MLogRec >
+ > reactions;
+ boost::statechart::result react(const QueryState& q);
+ boost::statechart::result react(const QueryUnfound& q);
+ boost::statechart::result react(const ActMap& am);
+ boost::statechart::result react(const MLogRec& logrec);
+ };
+
+ struct Down : boost::statechart::state< Down, Peering>, NamedState {
+ explicit Down(my_context ctx);
+ typedef boost::mpl::list <
+ boost::statechart::custom_reaction< QueryState >,
+ boost::statechart::custom_reaction< QueryUnfound >,
+ boost::statechart::custom_reaction< MNotifyRec >
+ > reactions;
+ boost::statechart::result react(const QueryState& q);
+ boost::statechart::result react(const QueryUnfound& q);
+ boost::statechart::result react(const MNotifyRec& infoevt);
+ void exit();
+ };
+
+ struct Incomplete : boost::statechart::state< Incomplete, Peering>, NamedState {
+ typedef boost::mpl::list <
+ boost::statechart::custom_reaction< AdvMap >,
+ boost::statechart::custom_reaction< MNotifyRec >,
+ boost::statechart::custom_reaction< QueryUnfound >,
+ boost::statechart::custom_reaction< QueryState >
+ > reactions;
+ explicit Incomplete(my_context ctx);
+ boost::statechart::result react(const AdvMap &advmap);
+ boost::statechart::result react(const MNotifyRec& infoevt);
+ boost::statechart::result react(const QueryUnfound& q);
+ boost::statechart::result react(const QueryState& q);
+ void exit();
+ };
+
+ PGStateHistory state_history;
+ CephContext* cct;
+ spg_t spgid;
+ DoutPrefixProvider *dpp;
+ PeeringListener *pl;
+
+ /// context passed in by state machine caller
+ PeeringCtx *orig_ctx;
+
+ /// populated if we are buffering messages pending a flush
+ std::optional<BufferedRecoveryMessages> messages_pending_flush;
+
+ /**
+ * populated between start_handle() and end_handle(), points into
+ * the message lists for messages_pending_flush while blocking messages
+ * or into orig_ctx otherwise
+ */
+ std::optional<PeeringCtxWrapper> rctx;
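+
+ // Rough lifecycle (illustrative): start_handle() constructs rctx around the
+ // caller's PeeringCtx (or around messages_pending_flush if one is already
+ // buffering); begin_block_outgoing() starts buffering outgoing messages into
+ // messages_pending_flush; end_block_outgoing() hands the buffered messages
+ // back to orig_ctx, while clear_blocked_outgoing() drops them; end_handle()
+ // resets rctx.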
+
+ /**
+ * OSDMap state
+ */
+ OSDMapRef osdmap_ref; ///< Reference to current OSDMap
+ PGPool pool; ///< Current pool state
+ epoch_t last_persisted_osdmap = 0; ///< Last osdmap epoch persisted
+
+
+ /**
+ * Peering state information
+ */
+ int role = -1; ///< 0 = primary, 1 = replica, -1=none.
+ uint64_t state = 0; ///< PG_STATE_*
+
+ pg_shard_t primary; ///< id/shard of primary
+ pg_shard_t pg_whoami; ///< my id/shard
+ pg_shard_t up_primary; ///< id/shard of primary of up set
+ std::vector<int> up; ///< crush mapping without temp pgs
+ std::set<pg_shard_t> upset; ///< up in set form
+ std::vector<int> acting; ///< actual acting set for the current interval
+ std::set<pg_shard_t> actingset; ///< acting in set form
+
+ /// union of acting, recovery, and backfill targets
+ std::set<pg_shard_t> acting_recovery_backfill;
+
+ std::vector<HeartbeatStampsRef> hb_stamps;
+
+ ceph::signedspan readable_interval = ceph::signedspan::zero();
+
+ /// how long we can service reads in this interval
+ ceph::signedspan readable_until = ceph::signedspan::zero();
+
+ /// upper bound on any acting OSDs' readable_until in this interval
+ ceph::signedspan readable_until_ub = ceph::signedspan::zero();
+
+ /// upper bound from prior interval(s)
+ ceph::signedspan prior_readable_until_ub = ceph::signedspan::zero();
+
+ /// pg instances from prior interval(s) that may still be readable
+ std::set<int> prior_readable_down_osds;
+
+ /// [replica] upper bound we got from the primary (primary's clock)
+ ceph::signedspan readable_until_ub_from_primary = ceph::signedspan::zero();
+
+ /// [primary] last upper bound shared by primary to replicas
+ ceph::signedspan readable_until_ub_sent = ceph::signedspan::zero();
+
+ /// [primary] readable ub acked by acting set members
+ std::vector<ceph::signedspan> acting_readable_until_ub;
+
+ bool send_notify = false; ///< True if a notify needs to be sent to the primary
+
+ bool dirty_info = false; ///< small info structure on disk is out of date
+ bool dirty_big_info = false; ///< big info structure on disk is out of date
+
+ pg_info_t info; ///< current pg info
+ pg_info_t last_written_info; ///< last written info
+ PastIntervals past_intervals; ///< information about prior pg mappings
+ PGLog pg_log; ///< pg log
+
+ epoch_t last_peering_reset = 0; ///< epoch of last peering reset
+
+ /// last_update that has committed; ONLY DEFINED WHEN is_active()
+ eversion_t last_update_ondisk;
+ eversion_t last_complete_ondisk; ///< last_complete that has committed.
+ eversion_t last_update_applied; ///< last_update readable
+ /// last version to which rollback_info trimming has been applied
+ eversion_t last_rollback_info_trimmed_to_applied;
+
+ /// Counter to determine when pending flushes have completed
+ unsigned flushes_in_progress = 0;
+
+ /**
+ * Primary state
+ */
+ std::set<pg_shard_t> stray_set; ///< non-acting osds that have PG data.
+ std::map<pg_shard_t, pg_info_t> peer_info; ///< info from peers (stray or prior)
+ std::map<pg_shard_t, int64_t> peer_bytes; ///< Peer's num_bytes from peer_info
+ std::set<pg_shard_t> peer_purged; ///< peers purged
+ std::map<pg_shard_t, pg_missing_t> peer_missing; ///< peer missing sets
+ std::set<pg_shard_t> peer_log_requested; ///< logs I've requested (and start stamps)
+ std::set<pg_shard_t> peer_missing_requested; ///< missing sets requested
+
+ /// features supported by all peers
+ uint64_t peer_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
+ /// features supported by acting set
+ uint64_t acting_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
+ /// features supported by up and acting
+ uint64_t upacting_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
+
+ /// most recently consumed osdmap's require_osd_release
+ ceph_release_t last_require_osd_release = ceph_release_t::unknown;
+
+ std::vector<int> want_acting; ///< non-empty while peering needs a new acting set
+
+ // acting_recovery_backfill contains shards that are acting,
+ // async recovery targets, or backfill targets.
+ std::map<pg_shard_t,eversion_t> peer_last_complete_ondisk;
+
+ /// up: min over last_complete_ondisk, peer_last_complete_ondisk
+ eversion_t min_last_complete_ondisk;
+ /// point to which the log should be trimmed
+ eversion_t pg_trim_to;
+
+ std::set<int> blocked_by; ///< osds we are blocked by (for pg stats)
+
+ bool need_up_thru = false; ///< true if osdmap with updated up_thru needed
+
+ /// I deleted these strays; ignore racing PGInfo from them
+ std::set<pg_shard_t> peer_activated;
+
+ std::set<pg_shard_t> backfill_targets; ///< osds to be backfilled
+ std::set<pg_shard_t> async_recovery_targets; ///< osds to be async recovered
+
+ /// osds which might have objects on them which are unfound on the primary
+ std::set<pg_shard_t> might_have_unfound;
+
+ bool deleting = false; ///< true while the PG is being removed or the OSD is shutting down
+ std::atomic<bool> deleted = {false}; ///< true once deletion is complete
+
+ MissingLoc missing_loc; ///< information about missing objects
+
+ bool backfill_reserved = false;
+ bool backfill_reserving = false;
+
+ PeeringMachine machine;
+
+ void update_osdmap_ref(OSDMapRef newmap) {
+ osdmap_ref = std::move(newmap);
+ }
+
+ void update_heartbeat_peers();
+ void query_unfound(ceph::Formatter *f, std::string state);
+ bool proc_replica_info(
+ pg_shard_t from, const pg_info_t &oinfo, epoch_t send_epoch);
+ void remove_down_peer_info(const OSDMapRef &osdmap);
+ void check_recovery_sources(const OSDMapRef& map);
+ void set_last_peering_reset();
+ void check_full_transition(OSDMapRef lastmap, OSDMapRef osdmap);
+ bool should_restart_peering(
+ int newupprimary,
+ int newactingprimary,
+ const std::vector<int>& newup,
+ const std::vector<int>& newacting,
+ OSDMapRef lastmap,
+ OSDMapRef osdmap);
+ void start_peering_interval(
+ const OSDMapRef lastmap,
+ const std::vector<int>& newup, int up_primary,
+ const std::vector<int>& newacting, int acting_primary,
+ ObjectStore::Transaction &t);
+ void on_new_interval();
+ void clear_recovery_state();
+ void clear_primary_state();
+ void check_past_interval_bounds() const;
+ bool set_force_recovery(bool b);
+ bool set_force_backfill(bool b);
+
+ /// clip calculated priority to reasonable range
+ int clamp_recovery_priority(int prio, int pool_recovery_prio, int max);
+ /// get log recovery reservation priority
+ unsigned get_recovery_priority();
+ /// get backfill reservation priority
+ unsigned get_backfill_priority();
+ /// get priority for pg deletion
+ unsigned get_delete_priority();
+
+ bool check_prior_readable_down_osds(const OSDMapRef& map);
+
+ bool adjust_need_up_thru(const OSDMapRef osdmap);
+ PastIntervals::PriorSet build_prior();
+
+ void reject_reservation();
+
+ // acting set
+ std::map<pg_shard_t, pg_info_t>::const_iterator find_best_info(
+ const std::map<pg_shard_t, pg_info_t> &infos,
+ bool restrict_to_up_acting,
+ bool *history_les_bound) const;
+
+ static void calc_ec_acting(
+ std::map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
+ unsigned size,
+ const std::vector<int> &acting,
+ const std::vector<int> &up,
+ const std::map<pg_shard_t, pg_info_t> &all_info,
+ bool restrict_to_up_acting,
+ std::vector<int> *want,
+ std::set<pg_shard_t> *backfill,
+ std::set<pg_shard_t> *acting_backfill,
+ std::ostream &ss);
+
+ static std::pair<std::map<pg_shard_t, pg_info_t>::const_iterator, eversion_t>
+ select_replicated_primary(
+ std::map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
+ uint64_t force_auth_primary_missing_objects,
+ const std::vector<int> &up,
+ pg_shard_t up_primary,
+ const std::map<pg_shard_t, pg_info_t> &all_info,
+ const OSDMapRef osdmap,
+ std::ostream &ss);
+
+ static void calc_replicated_acting(
+ std::map<pg_shard_t, pg_info_t>::const_iterator primary_shard,
+ eversion_t oldest_auth_log_entry,
+ unsigned size,
+ const std::vector<int> &acting,
+ const std::vector<int> &up,
+ pg_shard_t up_primary,
+ const std::map<pg_shard_t, pg_info_t> &all_info,
+ bool restrict_to_up_acting,
+ std::vector<int> *want,
+ std::set<pg_shard_t> *backfill,
+ std::set<pg_shard_t> *acting_backfill,
+ const OSDMapRef osdmap,
+ const PGPool& pool,
+ std::ostream &ss);
+ static void calc_replicated_acting_stretch(
+ std::map<pg_shard_t, pg_info_t>::const_iterator primary_shard,
+ eversion_t oldest_auth_log_entry,
+ unsigned size,
+ const std::vector<int> &acting,
+ const std::vector<int> &up,
+ pg_shard_t up_primary,
+ const std::map<pg_shard_t, pg_info_t> &all_info,
+ bool restrict_to_up_acting,
+ std::vector<int> *want,
+ std::set<pg_shard_t> *backfill,
+ std::set<pg_shard_t> *acting_backfill,
+ const OSDMapRef osdmap,
+ const PGPool& pool,
+ std::ostream &ss);
+
+ void choose_async_recovery_ec(
+ const std::map<pg_shard_t, pg_info_t> &all_info,
+ const pg_info_t &auth_info,
+ std::vector<int> *want,
+ std::set<pg_shard_t> *async_recovery,
+ const OSDMapRef osdmap) const;
+ void choose_async_recovery_replicated(
+ const std::map<pg_shard_t, pg_info_t> &all_info,
+ const pg_info_t &auth_info,
+ std::vector<int> *want,
+ std::set<pg_shard_t> *async_recovery,
+ const OSDMapRef osdmap) const;
+
+ bool recoverable(const std::vector<int> &want) const;
+ bool choose_acting(pg_shard_t &auth_log_shard,
+ bool restrict_to_up_acting,
+ bool *history_les_bound,
+ bool request_pg_temp_change_only = false);
+
+ bool search_for_missing(
+ const pg_info_t &oinfo, const pg_missing_t &omissing,
+ pg_shard_t fromosd,
+ PeeringCtxWrapper &rctx);
+ void build_might_have_unfound();
+ void log_weirdness();
+ void activate(
+ ObjectStore::Transaction& t,
+ epoch_t activation_epoch,
+ PeeringCtxWrapper &ctx);
+
+ void rewind_divergent_log(ObjectStore::Transaction& t, eversion_t newhead);
+ void merge_log(
+ ObjectStore::Transaction& t, pg_info_t &oinfo,
+ pg_log_t&& olog, pg_shard_t from);
+
+ void proc_primary_info(ObjectStore::Transaction &t, const pg_info_t &info);
+ void proc_master_log(ObjectStore::Transaction& t, pg_info_t &oinfo,
+ pg_log_t&& olog, pg_missing_t&& omissing,
+ pg_shard_t from);
+ void proc_replica_log(pg_info_t &oinfo, const pg_log_t &olog,
+ pg_missing_t&& omissing, pg_shard_t from);
+
+ void calc_min_last_complete_ondisk() {
+ eversion_t min = last_complete_ondisk;
+ ceph_assert(!acting_recovery_backfill.empty());
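+ // take the min over our own last_complete_ondisk and every peer's value,
+ // skipping the primary itself; if any member of acting_recovery_backfill
+ // has not reported yet, bail out and leave min_last_complete_ondisk alone.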
+ for (std::set<pg_shard_t>::iterator i = acting_recovery_backfill.begin();
+ i != acting_recovery_backfill.end();
+ ++i) {
+ if (*i == get_primary()) continue;
+ if (peer_last_complete_ondisk.count(*i) == 0)
+ return; // we don't have complete info
+ eversion_t a = peer_last_complete_ondisk[*i];
+ if (a < min)
+ min = a;
+ }
+ if (min == min_last_complete_ondisk)
+ return;
+ min_last_complete_ondisk = min;
+ return;
+ }
+
+ void fulfill_info(
+ pg_shard_t from, const pg_query_t &query,
+ std::pair<pg_shard_t, pg_info_t> &notify_info);
+ void fulfill_log(
+ pg_shard_t from, const pg_query_t &query, epoch_t query_epoch);
+ void fulfill_query(const MQuery& q, PeeringCtxWrapper &rctx);
+
+ void try_mark_clean();
+
+ void update_blocked_by();
+ void update_calc_stats();
+
+ void add_log_entry(const pg_log_entry_t& e, bool applied);
+
+ void calc_trim_to();
+ void calc_trim_to_aggressive();
+
+public:
+ PeeringState(
+ CephContext *cct,
+ pg_shard_t pg_whoami,
+ spg_t spgid,
+ const PGPool &pool,
+ OSDMapRef curmap,
+ DoutPrefixProvider *dpp,
+ PeeringListener *pl);
+
+ /// Process evt
+ void handle_event(const boost::statechart::event_base &evt,
+ PeeringCtx *rctx) {
+ start_handle(rctx);
+ machine.process_event(evt);
+ end_handle();
+ }
+
+ /// Process evt
+ void handle_event(PGPeeringEventRef evt,
+ PeeringCtx *rctx) {
+ start_handle(rctx);
+ machine.process_event(evt->get_event());
+ end_handle();
+ }
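+
+ // Illustrative driver sketch (hypothetical caller, not part of the API):
+ //   PeeringCtx rctx(release);  // 'release' stands for the ceph_release_t in use
+ //   ps.handle_event(PeeringState::Initialize(), &rctx);
+ //   // the caller then applies rctx.transaction and sends the buffered messages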
+
+ /// Init fresh instance of PG
+ void init(
+ int role,
+ const std::vector<int>& newup, int new_up_primary,
+ const std::vector<int>& newacting, int new_acting_primary,
+ const pg_history_t& history,
+ const PastIntervals& pi,
+ bool backfill,
+ ObjectStore::Transaction &t);
+
+ /// Init pg instance from disk state
+ template <typename F>
+ auto init_from_disk_state(
+ pg_info_t &&info_from_disk,
+ PastIntervals &&past_intervals_from_disk,
+ F &&pg_log_init) {
+ info = std::move(info_from_disk);
+ last_written_info = info;
+ past_intervals = std::move(past_intervals_from_disk);
+ auto ret = pg_log_init(pg_log);
+ log_weirdness();
+ return ret;
+ }
+
+ /// Set initial primary/acting
+ void init_primary_up_acting(
+ const std::vector<int> &newup,
+ const std::vector<int> &newacting,
+ int new_up_primary,
+ int new_acting_primary);
+ void init_hb_stamps();
+
+ /// Set initial role
+ void set_role(int r) {
+ role = r;
+ }
+
+ /// Set predicates used for determining readable and recoverable
+ void set_backend_predicates(
+ IsPGReadablePredicate *is_readable,
+ IsPGRecoverablePredicate *is_recoverable) {
+ missing_loc.set_backend_predicates(is_readable, is_recoverable);
+ }
+
+ /// Send current pg_info to peers
+ void share_pg_info();
+
+ /// Get stats for child pgs
+ void start_split_stats(
+ const std::set<spg_t>& childpgs, std::vector<object_stat_sum_t> *out);
+
+ /// Update new child with stats
+ void finish_split_stats(
+ const object_stat_sum_t& stats, ObjectStore::Transaction &t);
+
+ /// Split state for child_pgid into *child
+ void split_into(
+ pg_t child_pgid, PeeringState *child, unsigned split_bits);
+
+ /// Merge state from sources
+ void merge_from(
+ std::map<spg_t,PeeringState *>& sources,
+ PeeringCtx &rctx,
+ unsigned split_bits,
+ const pg_merge_meta_t& last_pg_merge_meta);
+
+ /// Permit stray replicas to purge now unnecessary state
+ void purge_strays();
+
+ /**
+ * update_stats
+ *
+ * Mechanism for updating stats and/or history. Pass t to mark
+ * dirty and write out. f should return true if stats should be
+ * published to the osd.
+ */
+ void update_stats(
+ std::function<bool(pg_history_t &, pg_stat_t &)> f,
+ ObjectStore::Transaction *t = nullptr);
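+
+ // Usage sketch (illustrative; the lambda body is a hypothetical example):
+ //   update_stats([](pg_history_t &h, pg_stat_t &s) {
+ //     h.last_scrub_stamp = ceph_clock_now();
+ //     return true;   // publish to the osd
+ //   }, &t);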
+
+ /**
+ * adjust_purged_snaps
+ *
+ * Mechanism for updating purged_snaps. Marks dirty_info, big_dirty_info.
+ */
+ void adjust_purged_snaps(
+ std::function<void(interval_set<snapid_t> &snaps)> f);
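+
+ // Usage sketch (illustrative; 'snap' is a hypothetical snapid_t):
+ //   adjust_purged_snaps([&](interval_set<snapid_t> &snaps) {
+ //     snaps.insert(snap);
+ //   });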
+
+ /// Updates info.hit_set to hset_history, does not dirty
+ void update_hset(const pg_hit_set_history_t &hset_history);
+
+ /// Get all pg_shards that need recovery
+ std::vector<pg_shard_t> get_replica_recovery_order() const;
+
+ /**
+ * update_history
+ *
+ * Merges new_history into info.history clearing past_intervals and
+ * dirtying as needed.
+ *
+ * Calls PeeringListener::on_info_history_change()
+ */
+ void update_history(const pg_history_t& new_history);
+
+ /**
+ * prepare_stats_for_publish
+ *
+ * Returns an updated pg_stat_t, with unstable_stats folded in, if the
+ * stats have changed since pg_stats_publish.
+ */
+ std::optional<pg_stat_t> prepare_stats_for_publish(
+ bool pg_stats_publish_valid,
+ const pg_stat_t &pg_stats_publish,
+ const object_stat_collection_t &unstable_stats);
+
+ /**
+ * Merge entries, updating the missing sets as necessary, on all
+ * acting_recovery_backfill logs and missing sets (and missing_loc)
+ */
+ bool append_log_entries_update_missing(
+ const mempool::osd_pglog::list<pg_log_entry_t> &entries,
+ ObjectStore::Transaction &t,
+ std::optional<eversion_t> trim_to,
+ std::optional<eversion_t> roll_forward_to);
+
+ void append_log_with_trim_to_updated(
+ std::vector<pg_log_entry_t>&& log_entries,
+ eversion_t roll_forward_to,
+ ObjectStore::Transaction &t,
+ bool transaction_applied,
+ bool async) {
+ update_trim_to();
+ append_log(std::move(log_entries), pg_trim_to, roll_forward_to,
+ min_last_complete_ondisk, t, transaction_applied, async);
+ }
+
+ /**
+ * Updates local log to reflect new write from primary.
+ */
+ void append_log(
+ std::vector<pg_log_entry_t>&& logv,
+ eversion_t trim_to,
+ eversion_t roll_forward_to,
+ eversion_t min_last_complete_ondisk,
+ ObjectStore::Transaction &t,
+ bool transaction_applied,
+ bool async);
+
+ /**
+ * retrieve the min last_backfill among backfill targets
+ */
+ hobject_t earliest_backfill() const;
+
+
+ /**
+ * Updates the local log/missing to reflect a new out-of-band (oob) log
+ * update from the primary
+ */
+ void merge_new_log_entries(
+ const mempool::osd_pglog::list<pg_log_entry_t> &entries,
+ ObjectStore::Transaction &t,
+ std::optional<eversion_t> trim_to,
+ std::optional<eversion_t> roll_forward_to);
+
+ /// Update missing set to reflect e (TODOSAM: not sure why this is needed)
+ void add_local_next_event(const pg_log_entry_t& e) {
+ pg_log.missing_add_next_entry(e);
+ }
+
+ /// Update log trim boundary
+ void update_trim_to() {
+ bool hard_limit = (get_osdmap()->test_flag(CEPH_OSDMAP_PGLOG_HARDLIMIT));
+ if (hard_limit)
+ calc_trim_to_aggressive();
+ else
+ calc_trim_to();
+ }
+
+ /// Pre-process pending update on hoid represented by logv
+ void pre_submit_op(
+ const hobject_t &hoid,
+ const std::vector<pg_log_entry_t>& logv,
+ eversion_t at_version);
+
+ /// Signal that oid has been locally recovered to version v
+ void recover_got(
+ const hobject_t &oid, eversion_t v,
+ bool is_delete,
+ ObjectStore::Transaction &t);
+
+ /// Signal that oid has been recovered on peer to version
+ void on_peer_recover(
+ pg_shard_t peer,
+ const hobject_t &soid,
+ const eversion_t &version);
+
+ /// Notify that soid is being recovered on peer
+ void begin_peer_recover(
+ pg_shard_t peer,
+ const hobject_t soid);
+
+ /// Pull missing sets from all candidate peers
+ bool discover_all_missing(
+ BufferedRecoveryMessages &rctx);
+
+ /// Notify that hoid has been fully recovered
+ void object_recovered(
+ const hobject_t &hoid,
+ const object_stat_sum_t &stat_diff) {
+ info.stats.stats.sum.add(stat_diff);
+ missing_loc.recovered(hoid);
+ }
+
+ /// Update info/stats to reflect backfill progress
+ void update_backfill_progress(
+ const hobject_t &updated_backfill,
+ const pg_stat_t &updated_stats,
+ bool preserve_local_num_bytes,
+ ObjectStore::Transaction &t);
+
+ /// Update info/stats to reflect completed backfill on hoid
+ void update_complete_backfill_object_stats(
+ const hobject_t &hoid,
+ const pg_stat_t &stats);
+
+ /// Update last_backfill for peer to new_last_backfill
+ void update_peer_last_backfill(
+ pg_shard_t peer,
+ const hobject_t &new_last_backfill);
+
+ /// Update info.stats with delta_stats for operation on soid
+ void apply_op_stats(
+ const hobject_t &soid,
+ const object_stat_sum_t &delta_stats);
+
+ /**
+ * force_object_missing
+ *
+ * Force oid on peer to be missing at version. If the object does not
+ * currently need recovery, either the candidates (if provided) or the
+ * remainder of the acting set will be deemed to have the object.
+ */
+ void force_object_missing(
+ const pg_shard_t &peer,
+ const hobject_t &oid,
+ eversion_t version) {
+ force_object_missing(std::set<pg_shard_t>{peer}, oid, version);
+ }
+ void force_object_missing(
+ const std::set<pg_shard_t> &peer,
+ const hobject_t &oid,
+ eversion_t version);
+
+ /// Update state prior to backfilling soid on targets
+ void prepare_backfill_for_missing(
+ const hobject_t &soid,
+ const eversion_t &version,
+ const std::vector<pg_shard_t> &targets);
+
+ /// Set targets with the right version for revert (see recover_primary)
+ void set_revert_with_targets(
+ const hobject_t &soid,
+ const std::set<pg_shard_t> &good_peers);
+
+ /// Update lcod for fromosd
+ void update_peer_last_complete_ondisk(
+ pg_shard_t fromosd,
+ eversion_t lcod) {
+ peer_last_complete_ondisk[fromosd] = lcod;
+ }
+
+ /// Update lcod
+ void update_last_complete_ondisk(
+ eversion_t lcod) {
+ last_complete_ondisk = lcod;
+ }
+
+ /// Update state to reflect recovery up to version
+ void recovery_committed_to(eversion_t version);
+
+ /// Mark recovery complete
+ void local_recovery_complete() {
+ info.last_complete = info.last_update;
+ }
+
+ /// Update last_requested pointer to v
+ void set_last_requested(version_t v) {
+ pg_log.set_last_requested(v);
+ }
+
+ /// Write dirty state to t
+ void write_if_dirty(ObjectStore::Transaction& t);
+
+ /// Mark write completed to v with persisted lc
+ void complete_write(eversion_t v, eversion_t lc);
+
+ /// Update local write applied pointer
+ void local_write_applied(eversion_t v) {
+ last_update_applied = v;
+ }
+
+ /// Updates peering state with new map
+ void advance_map(
+ OSDMapRef osdmap, ///< [in] new osdmap
+ OSDMapRef lastmap, ///< [in] prev osdmap
+ std::vector<int>& newup, ///< [in] new up set
+ int up_primary, ///< [in] new up primary
+ std::vector<int>& newacting, ///< [in] new acting
+ int acting_primary, ///< [in] new acting primary
+ PeeringCtx &rctx ///< [out] recovery context
+ );
+
+ /// Activates most recently updated map
+ void activate_map(
+ PeeringCtx &rctx ///< [out] recovery context
+ );
+
+ /// resets last_persisted_osdmap
+ void reset_last_persisted() {
+ last_persisted_osdmap = 0;
+ dirty_info = true;
+ dirty_big_info = true;
+ }
+
+ /// Signal shutdown beginning
+ void shutdown() {
+ deleting = true;
+ }
+
+ /// Signal shutdown complete
+ void set_delete_complete() {
+ deleted = true;
+ }
+
+ /// Dirty info and write out
+ void force_write_state(ObjectStore::Transaction &t) {
+ dirty_info = true;
+ dirty_big_info = true;
+ write_if_dirty(t);
+ }
+
+ /// Get current interval's readable_until
+ ceph::signedspan get_readable_until() const {
+ return readable_until;
+ }
+
+ /// Get prior intervals' readable_until upper bound
+ ceph::signedspan get_prior_readable_until_ub() const {
+ return prior_readable_until_ub;
+ }
+
+ /// Get the down OSDs of note limiting prior intervals' readable_until
+ const std::set<int>& get_prior_readable_down_osds() const {
+ return prior_readable_down_osds;
+ }
+
+ /// Reset prior intervals' readable_until upper bound (e.g., because it has passed)
+ void clear_prior_readable_until_ub() {
+ prior_readable_until_ub = ceph::signedspan::zero();
+ prior_readable_down_osds.clear();
+ info.history.prior_readable_until_ub = ceph::signedspan::zero();
+ }
+
+ void renew_lease(ceph::signedspan now) {
+ bool was_min = (readable_until_ub == readable_until);
+ readable_until_ub_sent = now + readable_interval;
+ if (was_min) {
+ recalc_readable_until();
+ }
+ }
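+
+  // Descriptive note: renewing advances readable_until_ub_sent by one
+  // readable_interval; if the previous upper bound was the value currently
+  // capping readable_until, recalc_readable_until() recomputes it right away.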
+
+ void send_lease();
+ void schedule_renew_lease();
+
+ pg_lease_t get_lease() {
+ return pg_lease_t(readable_until, readable_until_ub_sent, readable_interval);
+ }
+
+ void proc_lease(const pg_lease_t& l);
+ void proc_lease_ack(int from, const pg_lease_ack_t& la);
+ void proc_renew_lease();
+
+ pg_lease_ack_t get_lease_ack() {
+ return pg_lease_ack_t(readable_until_ub_from_primary);
+ }
+
+ /// [primary] recalc readable_until[_ub] for the current interval
+ void recalc_readable_until();
+
+ //============================ const helpers ================================
+ const char *get_current_state() const {
+ return state_history.get_current_state();
+ }
+ epoch_t get_last_peering_reset() const {
+ return last_peering_reset;
+ }
+ eversion_t get_last_rollback_info_trimmed_to_applied() const {
+ return last_rollback_info_trimmed_to_applied;
+ }
+ /// Returns stable reference to internal pool structure
+ const PGPool &get_pool() const {
+ return pool;
+ }
+ /// Returns reference to current osdmap
+ const OSDMapRef &get_osdmap() const {
+ ceph_assert(osdmap_ref);
+ return osdmap_ref;
+ }
+ /// Returns epoch of current osdmap
+ epoch_t get_osdmap_epoch() const {
+ return get_osdmap()->get_epoch();
+ }
+
+ bool is_ec_pg() const override {
+ return pool.info.is_erasure();
+ }
+ int get_pg_size() const override {
+ return pool.info.size;
+ }
+ bool is_deleting() const {
+ return deleting;
+ }
+ bool is_deleted() const {
+ return deleted;
+ }
+ const std::set<pg_shard_t> &get_upset() const override {
+ return upset;
+ }
+ bool is_acting_recovery_backfill(pg_shard_t osd) const {
+ return acting_recovery_backfill.count(osd);
+ }
+ bool is_acting(pg_shard_t osd) const {
+ return has_shard(pool.info.is_erasure(), acting, osd);
+ }
+ bool is_up(pg_shard_t osd) const {
+ return has_shard(pool.info.is_erasure(), up, osd);
+ }
+ static bool has_shard(bool ec, const std::vector<int>& v, pg_shard_t osd) {
+ if (ec) {
+ return v.size() > (unsigned)osd.shard && v[osd.shard] == osd.osd;
+ } else {
+ return std::find(v.begin(), v.end(), osd.osd) != v.end();
+ }
+ }
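+
+  // Example (illustrative values): for an erasure-coded PG with
+  // acting = [3,1,7], an osd 7 at shard 2 matches because v[2] == 7; for a
+  // replicated PG the shard id is ignored and a plain membership test on the
+  // osd id is used instead.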
+ const PastIntervals& get_past_intervals() const {
+ return past_intervals;
+ }
+ /// acting osd that is not the primary
+ bool is_nonprimary() const {
+ return role >= 0 && pg_whoami != primary;
+ }
+ /// primary osd
+ bool is_primary() const {
+ return pg_whoami == primary;
+ }
+ bool pg_has_reset_since(epoch_t e) const {
+ return deleted || e < get_last_peering_reset();
+ }
+
+ int get_role() const {
+ return role;
+ }
+ const std::vector<int> &get_acting() const {
+ return acting;
+ }
+ const std::set<pg_shard_t> &get_actingset() const {
+ return actingset;
+ }
+ int get_acting_primary() const {
+ return primary.osd;
+ }
+ pg_shard_t get_primary() const {
+ return primary;
+ }
+ const std::vector<int> &get_up() const {
+ return up;
+ }
+ int get_up_primary() const {
+ return up_primary.osd;
+ }
+
+ bool is_backfill_target(pg_shard_t osd) const {
+ return backfill_targets.count(osd);
+ }
+ const std::set<pg_shard_t> &get_backfill_targets() const {
+ return backfill_targets;
+ }
+ bool is_async_recovery_target(pg_shard_t peer) const {
+ return async_recovery_targets.count(peer);
+ }
+ const std::set<pg_shard_t> &get_async_recovery_targets() const {
+ return async_recovery_targets;
+ }
+ const std::set<pg_shard_t> &get_acting_recovery_backfill() const {
+ return acting_recovery_backfill;
+ }
+
+ const PGLog &get_pg_log() const {
+ return pg_log;
+ }
+
+ bool state_test(uint64_t m) const { return (state & m) != 0; }
+ void state_set(uint64_t m) { state |= m; }
+ void state_clear(uint64_t m) { state &= ~m; }
+
+ bool is_complete() const { return info.last_complete == info.last_update; }
+ bool should_send_notify() const { return send_notify; }
+
+ uint64_t get_state() const { return state; }
+ bool is_active() const { return state_test(PG_STATE_ACTIVE); }
+ bool is_activating() const { return state_test(PG_STATE_ACTIVATING); }
+ bool is_peering() const { return state_test(PG_STATE_PEERING); }
+ bool is_down() const { return state_test(PG_STATE_DOWN); }
+ bool is_recovery_unfound() const {
+ return state_test(PG_STATE_RECOVERY_UNFOUND);
+ }
+ bool is_backfilling() const {
+ return state_test(PG_STATE_BACKFILLING);
+ }
+ bool is_backfill_unfound() const {
+ return state_test(PG_STATE_BACKFILL_UNFOUND);
+ }
+ bool is_incomplete() const { return state_test(PG_STATE_INCOMPLETE); }
+ bool is_clean() const { return state_test(PG_STATE_CLEAN); }
+ bool is_degraded() const { return state_test(PG_STATE_DEGRADED); }
+ bool is_undersized() const { return state_test(PG_STATE_UNDERSIZED); }
+ bool is_remapped() const { return state_test(PG_STATE_REMAPPED); }
+ bool is_peered() const {
+ return state_test(PG_STATE_ACTIVE) || state_test(PG_STATE_PEERED);
+ }
+ bool is_recovering() const { return state_test(PG_STATE_RECOVERING); }
+ bool is_premerge() const { return state_test(PG_STATE_PREMERGE); }
+ bool is_repair() const { return state_test(PG_STATE_REPAIR); }
+ bool is_empty() const { return info.last_update == eversion_t(0,0); }
+
+ bool get_need_up_thru() const {
+ return need_up_thru;
+ }
+
+ bool is_forced_recovery_or_backfill() const {
+ return get_state() & (PG_STATE_FORCED_RECOVERY | PG_STATE_FORCED_BACKFILL);
+ }
+
+ bool is_backfill_reserved() const {
+ return backfill_reserved;
+ }
+
+ bool is_backfill_reserving() const {
+ return backfill_reserving;
+ }
+
+ ceph_release_t get_last_require_osd_release() const {
+ return last_require_osd_release;
+ }
+
+ const pg_info_t &get_info() const {
+ return info;
+ }
+
+ const decltype(peer_info) &get_peer_info() const {
+ return peer_info;
+ }
+ const decltype(peer_missing) &get_peer_missing() const {
+ return peer_missing;
+ }
+ const pg_missing_const_i &get_peer_missing(const pg_shard_t &peer) const {
+ if (peer == pg_whoami) {
+ return pg_log.get_missing();
+ } else {
+ assert(peer_missing.count(peer));
+ return peer_missing.find(peer)->second;
+ }
+ }
+ const pg_info_t &get_peer_info(pg_shard_t peer) const {
+ assert(peer_info.count(peer));
+ return peer_info.find(peer)->second;
+ }
+ bool has_peer_info(pg_shard_t peer) const {
+ return peer_info.count(peer);
+ }
+
+ bool needs_recovery() const;
+ bool needs_backfill() const;
+
+ /**
+ * Returns whether a particular object can be safely read on this replica
+ */
+ bool can_serve_replica_read(const hobject_t &hoid) {
+ ceph_assert(!is_primary());
+ return !pg_log.get_log().has_write_since(
+ hoid, get_min_last_complete_ondisk());
+ }
+
+ /**
+ * Returns whether the current acting set is able to go active
+ * and serve writes. It needs to satisfy min_size and any
+ * applicable stretch cluster constraints.
+ */
+ bool acting_set_writeable() {
+ return (actingset.size() >= pool.info.min_size) &&
+ (pool.info.stretch_set_can_peer(acting, *get_osdmap(), NULL));
+ }
+
+ /**
+ * Returns whether all peers which might have unfound objects have been
+ * queried or marked lost.
+ */
+ bool all_unfound_are_queried_or_lost(const OSDMapRef osdmap) const;
+ bool all_missing_unfound() const {
+ const auto& missing = pg_log.get_missing();
+ if (!missing.have_missing())
+ return false;
+ for (auto& m : missing.get_items()) {
+ if (!missing_loc.is_unfound(m.first))
+ return false;
+ }
+ return true;
+ }
+
+ bool perform_deletes_during_peering() const {
+ return !(get_osdmap()->test_flag(CEPH_OSDMAP_RECOVERY_DELETES));
+ }
+
+
+ bool have_unfound() const {
+ return missing_loc.have_unfound();
+ }
+ uint64_t get_num_unfound() const {
+ return missing_loc.num_unfound();
+ }
+
+ bool have_missing() const {
+ return pg_log.get_missing().num_missing() > 0;
+ }
+ unsigned int get_num_missing() const {
+ return pg_log.get_missing().num_missing();
+ }
+
+ const MissingLoc &get_missing_loc() const {
+ return missing_loc;
+ }
+
+ const MissingLoc::missing_by_count_t &get_missing_by_count() const {
+ return missing_loc.get_missing_by_count();
+ }
+
+ eversion_t get_min_last_complete_ondisk() const {
+ return min_last_complete_ondisk;
+ }
+
+ eversion_t get_pg_trim_to() const {
+ return pg_trim_to;
+ }
+
+ eversion_t get_last_update_applied() const {
+ return last_update_applied;
+ }
+
+ eversion_t get_last_update_ondisk() const {
+ return last_update_ondisk;
+ }
+
+ bool debug_has_dirty_state() const {
+ return dirty_info || dirty_big_info;
+ }
+
+ std::string get_pg_state_string() const {
+ return pg_state_string(state);
+ }
+
+ /// Dump representation of past_intervals to out
+ void print_past_intervals(std::ostream &out) const {
+ out << "[" << past_intervals.get_bounds()
+ << ")/" << past_intervals.size();
+ }
+
+ void dump_history(ceph::Formatter *f) const {
+ state_history.dump(f);
+ }
+
+ /// Dump formatted peering status
+ void dump_peering_state(ceph::Formatter *f);
+
+private:
+ /// Mask feature vector with feature set from new peer
+ void apply_peer_features(uint64_t f) { peer_features &= f; }
+
+ /// Reset feature vector to default
+ void reset_min_peer_features() {
+ peer_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
+ }
+public:
+ /// Get feature vector common to all known peers with this pg
+ uint64_t get_min_peer_features() const { return peer_features; }
+
+ /// Get feature vector common to acting set
+ uint64_t get_min_acting_features() const { return acting_features; }
+
+ /// Get feature vector common to up/acting set
+ uint64_t get_min_upacting_features() const { return upacting_features; }
+
+
+ // Flush control interface
+private:
+ /**
+ * Start additional flush (blocks needs_flush/activation until
+ * complete_flush is called once for each start_flush call as
+ * required by start_flush_on_transaction).
+ */
+ void start_flush(ObjectStore::Transaction &t) {
+ flushes_in_progress++;
+ pl->start_flush_on_transaction(t);
+ }
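+
+  // Illustrative lifecycle, inferred from the declarations in this section:
+  // each start_flush(t) bumps flushes_in_progress and asks the listener to
+  // flush on t; needs_flush() stays true until complete_flush() has been
+  // called once for every outstanding start_flush().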
+public:
+ /// True if there are outstanding flushes
+ bool needs_flush() const {
+ return flushes_in_progress > 0;
+ }
+ /// Must be called once per start_flush
+ void complete_flush();
+
+ friend std::ostream &operator<<(std::ostream &out, const PeeringState &ps);
+};
+
+std::ostream &operator<<(std::ostream &out, const PeeringState &ps);
diff --git a/src/osd/PrimaryLogPG.cc b/src/osd/PrimaryLogPG.cc
new file mode 100644
index 000000000..c1673bf70
--- /dev/null
+++ b/src/osd/PrimaryLogPG.cc
@@ -0,0 +1,15470 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
+ *
+ * Author: Loic Dachary <loic@dachary.org>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "boost/tuple/tuple.hpp"
+#include "boost/intrusive_ptr.hpp"
+#include "PG.h"
+#include "pg_scrubber.h"
+#include "PrimaryLogPG.h"
+#include "OSD.h"
+#include "PrimaryLogScrub.h"
+#include "OpRequest.h"
+#include "ScrubStore.h"
+#include "Session.h"
+#include "objclass/objclass.h"
+#include "osd/ClassHandler.h"
+
+#include "cls/cas/cls_cas_ops.h"
+#include "common/ceph_crypto.h"
+#include "common/errno.h"
+#include "common/scrub_types.h"
+#include "common/perf_counters.h"
+
+#include "messages/MOSDOp.h"
+#include "messages/MOSDBackoff.h"
+#include "messages/MOSDPGTrim.h"
+#include "messages/MOSDPGScan.h"
+#include "messages/MOSDRepScrub.h"
+#include "messages/MOSDPGBackfill.h"
+#include "messages/MOSDPGBackfillRemove.h"
+#include "messages/MOSDPGUpdateLogMissing.h"
+#include "messages/MOSDPGUpdateLogMissingReply.h"
+#include "messages/MCommandReply.h"
+#include "messages/MOSDScrubReserve.h"
+#include "common/EventTrace.h"
+
+#include "common/config.h"
+#include "include/compat.h"
+#include "mon/MonClient.h"
+#include "osdc/Objecter.h"
+#include "json_spirit/json_spirit_value.h"
+#include "json_spirit/json_spirit_reader.h"
+#include "include/ceph_assert.h" // json_spirit clobbers it
+#include "include/rados/rados_types.hpp"
+
+#ifdef WITH_LTTNG
+#include "tracing/osd.h"
+#else
+#define tracepoint(...)
+#endif
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_osd
+#define DOUT_PREFIX_ARGS this, osd->whoami, get_osdmap()
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, this)
+
+#include <sstream>
+#include <utility>
+
+#include <errno.h>
+#ifdef HAVE_JAEGER
+#include "common/tracer.h"
+#endif
+
+#include <common/CDC.h>
+
+MEMPOOL_DEFINE_OBJECT_FACTORY(PrimaryLogPG, replicatedpg, osd);
+
+using std::list;
+using std::ostream;
+using std::pair;
+using std::make_pair;
+using std::map;
+using std::ostringstream;
+using std::set;
+using std::string;
+using std::string_view;
+using std::stringstream;
+using std::unique_ptr;
+using std::vector;
+
+using ceph::bufferlist;
+using ceph::bufferptr;
+using ceph::Formatter;
+using ceph::decode;
+using ceph::decode_noclear;
+using ceph::encode;
+using ceph::encode_destructively;
+
+using namespace ceph::osd::scheduler;
+using TOPNSPC::common::cmd_getval;
+
+template <typename T>
+static ostream& _prefix(std::ostream *_dout, T *pg) {
+ return pg->gen_prefix(*_dout);
+}
+
+/**
+ * The CopyCallback class defines an interface for completions to the
+ * copy_start code. Users of the copy infrastructure must implement
+ * one and give an instance of the class to start_copy.
+ *
+ * The implementer is responsible for making sure that the CopyCallback
+ * can associate itself with the correct copy operation.
+ */
+class PrimaryLogPG::CopyCallback : public GenContext<CopyCallbackResults> {
+protected:
+ CopyCallback() {}
+ /**
+ * results.get<0>() is the return code: 0 for success; -ECANCELED if
+ * the operation was cancelled by the local OSD; -errno for other issues.
+ * results.get<1>() is a pointer to a CopyResults object, which you are
+ * responsible for deleting.
+ */
+ void finish(CopyCallbackResults results_) override = 0;
+
+public:
+ /// Provide the final size of the copied object to the CopyCallback
+ ~CopyCallback() override {}
+};
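+
+// CopyFromCallback, defined later in this file, is one concrete implementation
+// of this interface: on success it re-executes the originating OpContext, and
+// on failure it replies with an error (or requeues a cancelled op) before
+// closing the op context.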
+
+template <typename T>
+class PrimaryLogPG::BlessedGenContext : public GenContext<T> {
+ PrimaryLogPGRef pg;
+ unique_ptr<GenContext<T>> c;
+ epoch_t e;
+public:
+ BlessedGenContext(PrimaryLogPG *pg, GenContext<T> *c, epoch_t e)
+ : pg(pg), c(c), e(e) {}
+ void finish(T t) override {
+ std::scoped_lock locker{*pg};
+ if (pg->pg_has_reset_since(e))
+ c.reset();
+ else
+ c.release()->complete(t);
+ }
+ bool sync_finish(T t) {
+ // we assume here all blessed/wrapped Contexts can complete synchronously.
+ c.release()->complete(t);
+ return true;
+ }
+};
+
+GenContext<ThreadPool::TPHandle&> *PrimaryLogPG::bless_gencontext(
+ GenContext<ThreadPool::TPHandle&> *c) {
+ return new BlessedGenContext<ThreadPool::TPHandle&>(
+ this, c, get_osdmap_epoch());
+}
+
+template <typename T>
+class PrimaryLogPG::UnlockedBlessedGenContext : public GenContext<T> {
+ PrimaryLogPGRef pg;
+ unique_ptr<GenContext<T>> c;
+ epoch_t e;
+public:
+ UnlockedBlessedGenContext(PrimaryLogPG *pg, GenContext<T> *c, epoch_t e)
+ : pg(pg), c(c), e(e) {}
+ void finish(T t) override {
+ if (pg->pg_has_reset_since(e))
+ c.reset();
+ else
+ c.release()->complete(t);
+ }
+ bool sync_finish(T t) {
+ // we assume here all blessed/wrapped Contexts can complete synchronously.
+ c.release()->complete(t);
+ return true;
+ }
+};
+
+GenContext<ThreadPool::TPHandle&> *PrimaryLogPG::bless_unlocked_gencontext(
+ GenContext<ThreadPool::TPHandle&> *c) {
+ return new UnlockedBlessedGenContext<ThreadPool::TPHandle&>(
+ this, c, get_osdmap_epoch());
+}
+
+class PrimaryLogPG::BlessedContext : public Context {
+ PrimaryLogPGRef pg;
+ unique_ptr<Context> c;
+ epoch_t e;
+public:
+ BlessedContext(PrimaryLogPG *pg, Context *c, epoch_t e)
+ : pg(pg), c(c), e(e) {}
+ void finish(int r) override {
+ std::scoped_lock locker{*pg};
+ if (pg->pg_has_reset_since(e))
+ c.reset();
+ else
+ c.release()->complete(r);
+ }
+ bool sync_finish(int r) override {
+ // we assume here all blessed/wrapped Contexts can complete synchronously.
+ c.release()->complete(r);
+ return true;
+ }
+};
+
+Context *PrimaryLogPG::bless_context(Context *c) {
+ return new BlessedContext(this, c, get_osdmap_epoch());
+}
+
+class PrimaryLogPG::C_PG_ObjectContext : public Context {
+ PrimaryLogPGRef pg;
+ ObjectContext *obc;
+ public:
+ C_PG_ObjectContext(PrimaryLogPG *p, ObjectContext *o) :
+ pg(p), obc(o) {}
+ void finish(int r) override {
+ pg->object_context_destructor_callback(obc);
+ }
+};
+
+struct OnReadComplete : public Context {
+ PrimaryLogPG *pg;
+ PrimaryLogPG::OpContext *opcontext;
+ OnReadComplete(
+ PrimaryLogPG *pg,
+ PrimaryLogPG::OpContext *ctx) : pg(pg), opcontext(ctx) {}
+ void finish(int r) override {
+ opcontext->finish_read(pg);
+ }
+ ~OnReadComplete() override {}
+};
+
+class PrimaryLogPG::C_OSD_AppliedRecoveredObject : public Context {
+ PrimaryLogPGRef pg;
+ ObjectContextRef obc;
+ public:
+ C_OSD_AppliedRecoveredObject(PrimaryLogPG *p, ObjectContextRef o) :
+ pg(p), obc(o) {}
+ bool sync_finish(int r) override {
+ pg->_applied_recovered_object(obc);
+ return true;
+ }
+ void finish(int r) override {
+ std::scoped_lock locker{*pg};
+ pg->_applied_recovered_object(obc);
+ }
+};
+
+class PrimaryLogPG::C_OSD_CommittedPushedObject : public Context {
+ PrimaryLogPGRef pg;
+ epoch_t epoch;
+ eversion_t last_complete;
+ public:
+ C_OSD_CommittedPushedObject(
+ PrimaryLogPG *p, epoch_t epoch, eversion_t lc) :
+ pg(p), epoch(epoch), last_complete(lc) {
+ }
+ void finish(int r) override {
+ pg->_committed_pushed_object(epoch, last_complete);
+ }
+};
+
+class PrimaryLogPG::C_OSD_AppliedRecoveredObjectReplica : public Context {
+ PrimaryLogPGRef pg;
+ public:
+ explicit C_OSD_AppliedRecoveredObjectReplica(PrimaryLogPG *p) :
+ pg(p) {}
+ bool sync_finish(int r) override {
+ pg->_applied_recovered_object_replica();
+ return true;
+ }
+ void finish(int r) override {
+ std::scoped_lock locker{*pg};
+ pg->_applied_recovered_object_replica();
+ }
+};
+
+// OpContext
+void PrimaryLogPG::OpContext::start_async_reads(PrimaryLogPG *pg)
+{
+ inflightreads = 1;
+ list<pair<boost::tuple<uint64_t, uint64_t, unsigned>,
+ pair<bufferlist*, Context*> > > in;
+ in.swap(pending_async_reads);
+ pg->pgbackend->objects_read_async(
+ obc->obs.oi.soid,
+ in,
+ new OnReadComplete(pg, this), pg->get_pool().fast_read);
+}
+void PrimaryLogPG::OpContext::finish_read(PrimaryLogPG *pg)
+{
+ ceph_assert(inflightreads > 0);
+ --inflightreads;
+ if (async_reads_complete()) {
+ ceph_assert(pg->in_progress_async_reads.size());
+ ceph_assert(pg->in_progress_async_reads.front().second == this);
+ pg->in_progress_async_reads.pop_front();
+
+ // Restart the op context now that all reads have been
+ // completed. Read failures will be handled by the op finisher
+ pg->execute_ctx(this);
+ }
+}
+
+class CopyFromCallback : public PrimaryLogPG::CopyCallback {
+public:
+ PrimaryLogPG::CopyResults *results = nullptr;
+ PrimaryLogPG::OpContext *ctx;
+ OSDOp &osd_op;
+ uint32_t truncate_seq;
+ uint64_t truncate_size;
+ bool have_truncate = false;
+
+ CopyFromCallback(PrimaryLogPG::OpContext *ctx, OSDOp &osd_op)
+ : ctx(ctx), osd_op(osd_op) {
+ }
+ ~CopyFromCallback() override {}
+
+ void finish(PrimaryLogPG::CopyCallbackResults results_) override {
+ results = results_.get<1>();
+ int r = results_.get<0>();
+
+ // Only use truncate_{seq,size} from the original object if the client
+ // did not send us these parameters
+ if (!have_truncate) {
+ truncate_seq = results->truncate_seq;
+ truncate_size = results->truncate_size;
+ }
+
+ // for finish_copyfrom
+ ctx->user_at_version = results->user_version;
+
+ if (r >= 0) {
+ ctx->pg->execute_ctx(ctx);
+ } else {
+ if (r != -ECANCELED) { // on cancel just toss it out; client resends
+ if (ctx->op)
+ ctx->pg->osd->reply_op_error(ctx->op, r);
+ } else if (results->should_requeue) {
+ if (ctx->op)
+ ctx->pg->requeue_op(ctx->op);
+ }
+ ctx->pg->close_op_ctx(ctx);
+ }
+ }
+
+ bool is_temp_obj_used() {
+ return results->started_temp_obj;
+ }
+ uint64_t get_data_size() {
+ return results->object_size;
+ }
+ void set_truncate(uint32_t seq, uint64_t size) {
+ truncate_seq = seq;
+ truncate_size = size;
+ have_truncate = true;
+ }
+};
+
+struct CopyFromFinisher : public PrimaryLogPG::OpFinisher {
+ CopyFromCallback *copy_from_callback;
+
+ explicit CopyFromFinisher(CopyFromCallback *copy_from_callback)
+ : copy_from_callback(copy_from_callback) {
+ }
+
+ int execute() override {
+ // instance will be destructed after this method completes
+ copy_from_callback->ctx->pg->finish_copyfrom(copy_from_callback);
+ return 0;
+ }
+};
+
+// ======================
+// PGBackend::Listener
+
+void PrimaryLogPG::on_local_recover(
+ const hobject_t &hoid,
+ const ObjectRecoveryInfo &_recovery_info,
+ ObjectContextRef obc,
+ bool is_delete,
+ ObjectStore::Transaction *t
+ )
+{
+ dout(10) << __func__ << ": " << hoid << dendl;
+
+ ObjectRecoveryInfo recovery_info(_recovery_info);
+ clear_object_snap_mapping(t, hoid);
+ if (!is_delete && recovery_info.soid.is_snap()) {
+ OSDriver::OSTransaction _t(osdriver.get_transaction(t));
+ set<snapid_t> snaps;
+ dout(20) << " snapset " << recovery_info.ss << dendl;
+ auto p = recovery_info.ss.clone_snaps.find(hoid.snap);
+ if (p != recovery_info.ss.clone_snaps.end()) {
+ snaps.insert(p->second.begin(), p->second.end());
+ dout(20) << " snaps " << snaps << dendl;
+ snap_mapper.add_oid(
+ recovery_info.soid,
+ snaps,
+ &_t);
+ } else {
+ derr << __func__ << " " << hoid << " had no clone_snaps" << dendl;
+ }
+ }
+ if (!is_delete && recovery_state.get_pg_log().get_missing().is_missing(recovery_info.soid) &&
+ recovery_state.get_pg_log().get_missing().get_items().find(recovery_info.soid)->second.need > recovery_info.version) {
+ ceph_assert(is_primary());
+ const pg_log_entry_t *latest = recovery_state.get_pg_log().get_log().objects.find(recovery_info.soid)->second;
+ if (latest->op == pg_log_entry_t::LOST_REVERT &&
+ latest->reverting_to == recovery_info.version) {
+ dout(10) << " got old revert version " << recovery_info.version
+ << " for " << *latest << dendl;
+ recovery_info.version = latest->version;
+ // update the attr to the revert event version
+ recovery_info.oi.prior_version = recovery_info.oi.version;
+ recovery_info.oi.version = latest->version;
+ bufferlist bl;
+ encode(recovery_info.oi, bl,
+ get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
+ ceph_assert(!pool.info.is_erasure());
+ t->setattr(coll, ghobject_t(recovery_info.soid), OI_ATTR, bl);
+ if (obc)
+ obc->attr_cache[OI_ATTR] = bl;
+ }
+ }
+
+ // keep track of active pushes for scrub
+ ++active_pushes;
+
+ recovery_state.recover_got(
+ recovery_info.soid,
+ recovery_info.version,
+ is_delete,
+ *t);
+
+ if (is_primary()) {
+ if (!is_delete) {
+ obc->obs.exists = true;
+
+ bool got = obc->get_recovery_read();
+ ceph_assert(got);
+
+ ceph_assert(recovering.count(obc->obs.oi.soid));
+ recovering[obc->obs.oi.soid] = obc;
+ obc->obs.oi = recovery_info.oi; // may have been updated above
+ }
+
+ t->register_on_applied(new C_OSD_AppliedRecoveredObject(this, obc));
+
+ publish_stats_to_osd();
+ release_backoffs(hoid);
+ if (!is_unreadable_object(hoid)) {
+ auto unreadable_object_entry = waiting_for_unreadable_object.find(hoid);
+ if (unreadable_object_entry != waiting_for_unreadable_object.end()) {
+ dout(20) << " kicking unreadable waiters on " << hoid << dendl;
+ requeue_ops(unreadable_object_entry->second);
+ waiting_for_unreadable_object.erase(unreadable_object_entry);
+ }
+ }
+ } else {
+ t->register_on_applied(
+ new C_OSD_AppliedRecoveredObjectReplica(this));
+
+ }
+
+ t->register_on_commit(
+ new C_OSD_CommittedPushedObject(
+ this,
+ get_osdmap_epoch(),
+ info.last_complete));
+}
+
+void PrimaryLogPG::on_global_recover(
+ const hobject_t &soid,
+ const object_stat_sum_t &stat_diff,
+ bool is_delete)
+{
+ recovery_state.object_recovered(soid, stat_diff);
+ publish_stats_to_osd();
+ dout(10) << "pushed " << soid << " to all replicas" << dendl;
+ auto i = recovering.find(soid);
+ ceph_assert(i != recovering.end());
+
+ if (i->second && i->second->rwstate.recovery_read_marker) {
+ // recover missing won't have had an obc, but it gets filled in
+ // during on_local_recover
+ ceph_assert(i->second);
+ list<OpRequestRef> requeue_list;
+ i->second->drop_recovery_read(&requeue_list);
+ requeue_ops(requeue_list);
+ }
+
+ backfills_in_flight.erase(soid);
+
+ recovering.erase(i);
+ finish_recovery_op(soid);
+ release_backoffs(soid);
+ auto degraded_object_entry = waiting_for_degraded_object.find(soid);
+ if (degraded_object_entry != waiting_for_degraded_object.end()) {
+ dout(20) << " kicking degraded waiters on " << soid << dendl;
+ requeue_ops(degraded_object_entry->second);
+ waiting_for_degraded_object.erase(degraded_object_entry);
+ }
+ auto unreadable_object_entry = waiting_for_unreadable_object.find(soid);
+ if (unreadable_object_entry != waiting_for_unreadable_object.end()) {
+ dout(20) << " kicking unreadable waiters on " << soid << dendl;
+ requeue_ops(unreadable_object_entry->second);
+ waiting_for_unreadable_object.erase(unreadable_object_entry);
+ }
+ finish_degraded_object(soid);
+}
+
+void PrimaryLogPG::schedule_recovery_work(
+ GenContext<ThreadPool::TPHandle&> *c)
+{
+ osd->queue_recovery_context(this, c);
+}
+
+void PrimaryLogPG::replica_clear_repop_obc(
+ const vector<pg_log_entry_t> &logv,
+ ObjectStore::Transaction &t)
+{
+ for (auto &&e: logv) {
+ /* Have to blast all clones, they share a snapset */
+ object_contexts.clear_range(
+ e.soid.get_object_boundary(), e.soid.get_head());
+ ceph_assert(
+ snapset_contexts.find(e.soid.get_head()) ==
+ snapset_contexts.end());
+ }
+}
+
+bool PrimaryLogPG::should_send_op(
+ pg_shard_t peer,
+ const hobject_t &hoid) {
+ if (peer == get_primary())
+ return true;
+ ceph_assert(recovery_state.has_peer_info(peer));
+ bool should_send =
+ hoid.pool != (int64_t)info.pgid.pool() ||
+ hoid <= last_backfill_started ||
+ hoid <= recovery_state.get_peer_info(peer).last_backfill;
+ if (!should_send) {
+ ceph_assert(is_backfill_target(peer));
+ dout(10) << __func__ << " issue_repop shipping empty opt to osd." << peer
+ << ", object " << hoid
+ << " beyond std::max(last_backfill_started "
+ << ", peer_info[peer].last_backfill "
+ << recovery_state.get_peer_info(peer).last_backfill
+ << ")" << dendl;
+ return should_send;
+ }
+ if (is_async_recovery_target(peer) &&
+ recovery_state.get_peer_missing(peer).is_missing(hoid)) {
+ should_send = false;
+ dout(10) << __func__ << " issue_repop shipping empty opt to osd." << peer
+ << ", object " << hoid
+ << " which is pending recovery in async_recovery_targets" << dendl;
+ }
+ return should_send;
+}
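+
+// Descriptive summary: ops destined for the primary, or for objects outside
+// this PG's pool id, are always sent; otherwise a repop goes to a peer only
+// when the object lies at or below that peer's last_backfill (or below
+// last_backfill_started), and it is additionally suppressed for async
+// recovery targets still missing the object. Per the debug messages above,
+// peers that are skipped receive an empty transaction instead.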
+
+
+ConnectionRef PrimaryLogPG::get_con_osd_cluster(
+ int peer, epoch_t from_epoch)
+{
+ return osd->get_con_osd_cluster(peer, from_epoch);
+}
+
+PerfCounters *PrimaryLogPG::get_logger()
+{
+ return osd->logger;
+}
+
+
+// ====================
+// missing objects
+
+bool PrimaryLogPG::is_missing_object(const hobject_t& soid) const
+{
+ return recovery_state.get_pg_log().get_missing().get_items().count(soid);
+}
+
+void PrimaryLogPG::maybe_kick_recovery(
+ const hobject_t &soid)
+{
+ eversion_t v;
+ bool work_started = false;
+ if (!recovery_state.get_missing_loc().needs_recovery(soid, &v))
+ return;
+
+ map<hobject_t, ObjectContextRef>::const_iterator p = recovering.find(soid);
+ if (p != recovering.end()) {
+ dout(7) << "object " << soid << " v " << v << ", already recovering." << dendl;
+ } else if (recovery_state.get_missing_loc().is_unfound(soid)) {
+ dout(7) << "object " << soid << " v " << v << ", is unfound." << dendl;
+ } else {
+ dout(7) << "object " << soid << " v " << v << ", recovering." << dendl;
+ PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
+ if (is_missing_object(soid)) {
+ recover_missing(soid, v, CEPH_MSG_PRIO_HIGH, h);
+ } else if (recovery_state.get_missing_loc().is_deleted(soid)) {
+ prep_object_replica_deletes(soid, v, h, &work_started);
+ } else {
+ prep_object_replica_pushes(soid, v, h, &work_started);
+ }
+ pgbackend->run_recovery_op(h, CEPH_MSG_PRIO_HIGH);
+ }
+}
+
+void PrimaryLogPG::wait_for_unreadable_object(
+ const hobject_t& soid, OpRequestRef op)
+{
+ ceph_assert(is_unreadable_object(soid));
+ maybe_kick_recovery(soid);
+ waiting_for_unreadable_object[soid].push_back(op);
+ op->mark_delayed("waiting for missing object");
+}
+
+bool PrimaryLogPG::is_degraded_or_backfilling_object(const hobject_t& soid)
+{
+ /* The conditions below may clear (on_local_recover, before we queue
+ * the transaction) before we actually requeue the degraded waiters
+ * in on_global_recover after the transaction completes.
+ */
+ if (waiting_for_degraded_object.count(soid))
+ return true;
+ if (recovery_state.get_pg_log().get_missing().get_items().count(soid))
+ return true;
+ ceph_assert(!get_acting_recovery_backfill().empty());
+ for (set<pg_shard_t>::iterator i = get_acting_recovery_backfill().begin();
+ i != get_acting_recovery_backfill().end();
+ ++i) {
+ if (*i == get_primary()) continue;
+ pg_shard_t peer = *i;
+ auto peer_missing_entry = recovery_state.get_peer_missing().find(peer);
+ // If an object is missing on an async_recovery_target, return false.
+ // This will not block the op and the object is async recovered later.
+ if (peer_missing_entry != recovery_state.get_peer_missing().end() &&
+ peer_missing_entry->second.get_items().count(soid)) {
+ if (is_async_recovery_target(peer))
+ continue;
+ else
+ return true;
+ }
+ // Object is degraded if after last_backfill AND
+ // we are backfilling it
+ if (is_backfill_target(peer) &&
+ recovery_state.get_peer_info(peer).last_backfill <= soid &&
+ last_backfill_started >= soid &&
+ backfills_in_flight.count(soid))
+ return true;
+ }
+ return false;
+}
+
+bool PrimaryLogPG::is_degraded_on_async_recovery_target(const hobject_t& soid)
+{
+ for (auto &i: get_async_recovery_targets()) {
+ auto peer_missing_entry = recovery_state.get_peer_missing().find(i);
+ if (peer_missing_entry != recovery_state.get_peer_missing().end() &&
+ peer_missing_entry->second.get_items().count(soid)) {
+ dout(30) << __func__ << " " << soid << dendl;
+ return true;
+ }
+ }
+ return false;
+}
+
+void PrimaryLogPG::wait_for_degraded_object(const hobject_t& soid, OpRequestRef op)
+{
+ ceph_assert(is_degraded_or_backfilling_object(soid) || is_degraded_on_async_recovery_target(soid));
+
+ maybe_kick_recovery(soid);
+ waiting_for_degraded_object[soid].push_back(op);
+ op->mark_delayed("waiting for degraded object");
+}
+
+void PrimaryLogPG::block_write_on_full_cache(
+ const hobject_t& _oid, OpRequestRef op)
+{
+ const hobject_t oid = _oid.get_head();
+ dout(20) << __func__ << ": blocking object " << oid
+ << " on full cache" << dendl;
+ objects_blocked_on_cache_full.insert(oid);
+ waiting_for_cache_not_full.push_back(op);
+ op->mark_delayed("waiting for cache not full");
+}
+
+void PrimaryLogPG::block_for_clean(
+ const hobject_t& oid, OpRequestRef op)
+{
+ dout(20) << __func__ << ": blocking object " << oid
+ << " on primary repair" << dendl;
+ waiting_for_clean_to_primary_repair.push_back(op);
+ op->mark_delayed("waiting for clean to repair");
+}
+
+void PrimaryLogPG::block_write_on_snap_rollback(
+ const hobject_t& oid, ObjectContextRef obc, OpRequestRef op)
+{
+ dout(20) << __func__ << ": blocking object " << oid.get_head()
+ << " on snap promotion " << obc->obs.oi.soid << dendl;
+ // otherwise, we'd have blocked in do_op
+ ceph_assert(oid.is_head());
+ ceph_assert(objects_blocked_on_snap_promotion.count(oid) == 0);
+ objects_blocked_on_snap_promotion[oid] = obc;
+ wait_for_blocked_object(obc->obs.oi.soid, op);
+}
+
+void PrimaryLogPG::block_write_on_degraded_snap(
+ const hobject_t& snap, OpRequestRef op)
+{
+ dout(20) << __func__ << ": blocking object " << snap.get_head()
+ << " on degraded snap " << snap << dendl;
+ // otherwise, we'd have blocked in do_op
+ ceph_assert(objects_blocked_on_degraded_snap.count(snap.get_head()) == 0);
+ objects_blocked_on_degraded_snap[snap.get_head()] = snap.snap;
+ wait_for_degraded_object(snap, op);
+}
+
+bool PrimaryLogPG::maybe_await_blocked_head(
+ const hobject_t &hoid,
+ OpRequestRef op)
+{
+ ObjectContextRef obc;
+ obc = object_contexts.lookup(hoid.get_head());
+ if (obc) {
+ if (obc->is_blocked()) {
+ wait_for_blocked_object(obc->obs.oi.soid, op);
+ return true;
+ } else {
+ return false;
+ }
+ }
+ return false;
+}
+
+void PrimaryLogPG::wait_for_blocked_object(const hobject_t& soid, OpRequestRef op)
+{
+ dout(10) << __func__ << " " << soid << " " << op << dendl;
+ waiting_for_blocked_object[soid].push_back(op);
+ op->mark_delayed("waiting for blocked object");
+}
+
+void PrimaryLogPG::maybe_force_recovery()
+{
+ // no force if not in degraded/recovery/backfill states
+ if (!is_degraded() &&
+ !state_test(PG_STATE_RECOVERING |
+ PG_STATE_RECOVERY_WAIT |
+ PG_STATE_BACKFILLING |
+ PG_STATE_BACKFILL_WAIT |
+ PG_STATE_BACKFILL_TOOFULL))
+ return;
+
+ if (recovery_state.get_pg_log().get_log().approx_size() <
+ cct->_conf->osd_max_pg_log_entries *
+ cct->_conf->osd_force_recovery_pg_log_entries_factor)
+ return;
+
+ // find the oldest missing object
+ version_t min_version = recovery_state.get_pg_log().get_log().head.version;
+ hobject_t soid;
+ if (!recovery_state.get_pg_log().get_missing().get_rmissing().empty()) {
+ min_version = recovery_state.get_pg_log().get_missing().get_rmissing().begin()->first;
+ soid = recovery_state.get_pg_log().get_missing().get_rmissing().begin()->second;
+ }
+ ceph_assert(!get_acting_recovery_backfill().empty());
+ for (set<pg_shard_t>::iterator it = get_acting_recovery_backfill().begin();
+ it != get_acting_recovery_backfill().end();
+ ++it) {
+ if (*it == get_primary()) continue;
+ pg_shard_t peer = *it;
+ auto it_missing = recovery_state.get_peer_missing().find(peer);
+ if (it_missing != recovery_state.get_peer_missing().end() &&
+ !it_missing->second.get_rmissing().empty()) {
+ const auto& min_obj = recovery_state.get_peer_missing(peer).get_rmissing().begin();
+ dout(20) << __func__ << " peer " << peer << " min_version " << min_obj->first
+ << " oid " << min_obj->second << dendl;
+ if (min_version > min_obj->first) {
+ min_version = min_obj->first;
+ soid = min_obj->second;
+ }
+ }
+ }
+
+ // recover it
+ if (soid != hobject_t())
+ maybe_kick_recovery(soid);
+}
+
+bool PrimaryLogPG::check_laggy(OpRequestRef& op)
+{
+ if (!HAVE_FEATURE(recovery_state.get_min_upacting_features(),
+ SERVER_OCTOPUS)) {
+ dout(20) << __func__ << " not all upacting has SERVER_OCTOPUS" << dendl;
+ return true;
+ }
+ if (state_test(PG_STATE_WAIT)) {
+ dout(10) << __func__ << " PG is WAIT state" << dendl;
+ } else if (!state_test(PG_STATE_LAGGY)) {
+ auto mnow = osd->get_mnow();
+ auto ru = recovery_state.get_readable_until();
+ if (mnow <= ru) {
+ // not laggy
+ return true;
+ }
+ dout(10) << __func__
+ << " mnow " << mnow
+ << " > readable_until " << ru << dendl;
+
+ if (!is_primary()) {
+ osd->reply_op_error(op, -EAGAIN);
+ return false;
+ }
+
+ // go to laggy state
+ state_set(PG_STATE_LAGGY);
+ publish_stats_to_osd();
+ }
+ dout(10) << __func__ << " not readable" << dendl;
+ waiting_for_readable.push_back(op);
+ op->mark_delayed("waiting for readable");
+ return false;
+}
+
+bool PrimaryLogPG::check_laggy_requeue(OpRequestRef& op)
+{
+ if (!HAVE_FEATURE(recovery_state.get_min_upacting_features(),
+ SERVER_OCTOPUS)) {
+ return true;
+ }
+ if (!state_test(PG_STATE_WAIT) && !state_test(PG_STATE_LAGGY)) {
+ return true; // not laggy
+ }
+ dout(10) << __func__ << " not readable" << dendl;
+ waiting_for_readable.push_front(op);
+ op->mark_delayed("waiting for readable");
+ return false;
+}
+
+void PrimaryLogPG::recheck_readable()
+{
+ if (!is_wait() && !is_laggy()) {
+ dout(20) << __func__ << " wasn't wait or laggy" << dendl;
+ return;
+ }
+ auto mnow = osd->get_mnow();
+ bool pub = false;
+ if (is_wait()) {
+ auto prior_readable_until_ub = recovery_state.get_prior_readable_until_ub();
+ if (mnow < prior_readable_until_ub) {
+ dout(10) << __func__ << " still wait (mnow " << mnow
+ << " < prior_readable_until_ub " << prior_readable_until_ub
+ << ")" << dendl;
+ } else {
+ dout(10) << __func__ << " no longer wait (mnow " << mnow
+ << " >= prior_readable_until_ub " << prior_readable_until_ub
+ << ")" << dendl;
+ state_clear(PG_STATE_WAIT);
+ recovery_state.clear_prior_readable_until_ub();
+ pub = true;
+ }
+ }
+ if (is_laggy()) {
+ auto ru = recovery_state.get_readable_until();
+ if (ru == ceph::signedspan::zero()) {
+ dout(10) << __func__ << " still laggy (mnow " << mnow
+ << ", readable_until zero)" << dendl;
+ } else if (mnow >= ru) {
+ dout(10) << __func__ << " still laggy (mnow " << mnow
+ << " >= readable_until " << ru << ")" << dendl;
+ } else {
+ dout(10) << __func__ << " no longer laggy (mnow " << mnow
+ << " < readable_until " << ru << ")" << dendl;
+ state_clear(PG_STATE_LAGGY);
+ pub = true;
+ }
+ }
+ if (pub) {
+ publish_stats_to_osd();
+ }
+ if (!is_laggy() && !is_wait()) {
+ requeue_ops(waiting_for_readable);
+ }
+}
+
+bool PrimaryLogPG::pgls_filter(const PGLSFilter& filter, const hobject_t& sobj)
+{
+ bufferlist bl;
+
+ // If filter has expressed an interest in an xattr, load it.
+ if (!filter.get_xattr().empty()) {
+ int ret = pgbackend->objects_get_attr(
+ sobj,
+ filter.get_xattr(),
+ &bl);
+ dout(0) << "getattr (sobj=" << sobj << ", attr=" << filter.get_xattr() << ") returned " << ret << dendl;
+ if (ret < 0) {
+ if (ret != -ENODATA || filter.reject_empty_xattr()) {
+ return false;
+ }
+ }
+ }
+
+ return filter.filter(sobj, bl);
+}
+
+std::pair<int, std::unique_ptr<const PGLSFilter>>
+PrimaryLogPG::get_pgls_filter(bufferlist::const_iterator& iter)
+{
+ string type;
+ // storing non-const PGLSFilter for the sake of ::init()
+ std::unique_ptr<PGLSFilter> filter;
+
+ try {
+ decode(type, iter);
+ }
+ catch (ceph::buffer::error& e) {
+ return { -EINVAL, nullptr };
+ }
+
+ if (type.compare("plain") == 0) {
+ filter = std::make_unique<PGLSPlainFilter>();
+ } else {
+ std::size_t dot = type.find(".");
+ if (dot == std::string::npos || dot == 0 || dot == type.size() - 1) {
+ return { -EINVAL, nullptr };
+ }
+
+ const std::string class_name = type.substr(0, dot);
+ const std::string filter_name = type.substr(dot + 1);
+ ClassHandler::ClassData *cls = NULL;
+ int r = ClassHandler::get_instance().open_class(class_name, &cls);
+ if (r != 0) {
+ derr << "Error opening class '" << class_name << "': "
+ << cpp_strerror(r) << dendl;
+ if (r != -EPERM) // propagate permission error
+ r = -EINVAL;
+ return { r, nullptr };
+ } else {
+ ceph_assert(cls);
+ }
+
+ ClassHandler::ClassFilter *class_filter = cls->get_filter(filter_name);
+ if (class_filter == NULL) {
+ derr << "Error finding filter '" << filter_name << "' in class "
+ << class_name << dendl;
+ return { -EINVAL, nullptr };
+ }
+ filter.reset(class_filter->fn());
+ if (!filter) {
+ // Object classes are obliged to return us something, but let's
+ // give an error rather than asserting out.
+ derr << "Buggy class " << class_name << " failed to construct "
+ "filter " << filter_name << dendl;
+ return { -EINVAL, nullptr };
+ }
+ }
+
+ ceph_assert(filter);
+ int r = filter->init(iter);
+ if (r < 0) {
+ derr << "Error initializing filter " << type << ": "
+ << cpp_strerror(r) << dendl;
+ return { -EINVAL, nullptr };
+ } else {
+ // Successfully constructed and initialized, return it.
+ return std::make_pair(0, std::move(filter));
+ }
+}
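+
+// Filter type strings are either "plain" (the built-in PGLSPlainFilter) or
+// "<class>.<filter>", for example a hypothetical "mycls.myfilter", which
+// loads the object class "mycls" via ClassHandler, asks it for the filter
+// named "myfilter", and then initializes the filter from the remaining
+// payload.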
+
+
+// ==========================================================
+
+void PrimaryLogPG::do_command(
+ const string_view& orig_prefix,
+ const cmdmap_t& cmdmap,
+ const bufferlist& idata,
+ std::function<void(int,const std::string&,bufferlist&)> on_finish)
+{
+ string format;
+ cmd_getval(cmdmap, "format", format);
+ std::unique_ptr<Formatter> f(Formatter::create(
+ format, "json-pretty", "json-pretty"));
+ int ret = 0;
+ stringstream ss; // stderr error message stream
+ bufferlist outbl; // if empty at end, we'll dump formatter as output
+
+ // get final prefix:
+ // - ceph pg <pgid> foo -> prefix=pg, cmd=foo
+ // - ceph tell <pgid> foo -> prefix=foo
+ string prefix(orig_prefix);
+ string command;
+ cmd_getval(cmdmap, "cmd", command);
+ if (command.size()) {
+ prefix = command;
+ }
+
+ if (prefix == "query") {
+ f->open_object_section("pg");
+ f->dump_stream("snap_trimq") << snap_trimq;
+ f->dump_unsigned("snap_trimq_len", snap_trimq.size());
+ recovery_state.dump_peering_state(f.get());
+
+ f->open_array_section("recovery_state");
+ handle_query_state(f.get());
+ f->close_section();
+
+ if (is_primary() && is_active() && m_scrubber) {
+ m_scrubber->dump(f.get());
+ }
+
+ f->open_object_section("agent_state");
+ if (agent_state)
+ agent_state->dump(f.get());
+ f->close_section();
+
+ f->close_section();
+ }
+
+ else if (prefix == "mark_unfound_lost") {
+ string mulcmd;
+ cmd_getval(cmdmap, "mulcmd", mulcmd);
+ int mode = -1;
+ if (mulcmd == "revert") {
+ if (pool.info.is_erasure()) {
+ ss << "mode must be 'delete' for ec pool";
+ ret = -EINVAL;
+ goto out;
+ }
+ mode = pg_log_entry_t::LOST_REVERT;
+ } else if (mulcmd == "delete") {
+ mode = pg_log_entry_t::LOST_DELETE;
+ } else {
+ ss << "mode must be 'revert' or 'delete'; mark not yet implemented";
+ ret = -EINVAL;
+ goto out;
+ }
+ ceph_assert(mode == pg_log_entry_t::LOST_REVERT ||
+ mode == pg_log_entry_t::LOST_DELETE);
+
+ if (!is_primary()) {
+ ss << "not primary";
+ ret = -EROFS;
+ goto out;
+ }
+
+ uint64_t unfound = recovery_state.get_missing_loc().num_unfound();
+ if (!unfound) {
+ ss << "pg has no unfound objects";
+ goto out; // make command idempotent
+ }
+
+ if (!recovery_state.all_unfound_are_queried_or_lost(get_osdmap())) {
+ ss << "pg has " << unfound
+ << " unfound objects but we haven't probed all sources, not marking lost";
+ ret = -EINVAL;
+ goto out;
+ }
+
+ mark_all_unfound_lost(mode, on_finish);
+ return;
+ }
+
+ else if (prefix == "list_unfound") {
+ hobject_t offset;
+ string offset_json;
+ bool show_offset = false;
+ if (cmd_getval(cmdmap, "offset", offset_json)) {
+ json_spirit::Value v;
+ try {
+ if (!json_spirit::read(offset_json, v))
+ throw std::runtime_error("bad json");
+ offset.decode(v);
+ } catch (std::runtime_error& e) {
+ ss << "error parsing offset: " << e.what();
+ ret = -EINVAL;
+ goto out;
+ }
+ show_offset = true;
+ }
+ f->open_object_section("missing");
+ if (show_offset) {
+ f->open_object_section("offset");
+ offset.dump(f.get());
+ f->close_section();
+ }
+ auto &needs_recovery_map = recovery_state.get_missing_loc()
+ .get_needs_recovery();
+ f->dump_int("num_missing", needs_recovery_map.size());
+ f->dump_int("num_unfound", get_num_unfound());
+ map<hobject_t, pg_missing_item>::const_iterator p =
+ needs_recovery_map.upper_bound(offset);
+ {
+ f->open_array_section("objects");
+ int32_t num = 0;
+ for (; p != needs_recovery_map.end() &&
+ num < cct->_conf->osd_command_max_records;
+ ++p) {
+ if (recovery_state.get_missing_loc().is_unfound(p->first)) {
+ f->open_object_section("object");
+ {
+ f->open_object_section("oid");
+ p->first.dump(f.get());
+ f->close_section();
+ }
+ p->second.dump(f.get()); // have, need keys
+ {
+ f->open_array_section("locations");
+ for (auto &&r : recovery_state.get_missing_loc().get_locations(
+ p->first)) {
+ f->dump_stream("shard") << r;
+ }
+ f->close_section();
+ }
+ f->close_section();
+ num++;
+ }
+ }
+ f->close_section();
+ }
+ // Get possible locations of missing objects from pg information
+ PeeringState::QueryUnfound q(f.get());
+ recovery_state.handle_event(q, 0);
+ f->dump_bool("more", p != needs_recovery_map.end());
+ f->close_section();
+ }
+
+ else if (prefix == "scrub" ||
+ prefix == "deep_scrub") {
+ bool deep = (prefix == "deep_scrub");
+ int64_t time;
+ cmd_getval(cmdmap, "time", time, (int64_t)0);
+
+ if (is_primary()) {
+ const pg_pool_t *p = &pool.info;
+ double pool_scrub_max_interval = 0;
+ double scrub_max_interval;
+ if (deep) {
+ p->opts.get(pool_opts_t::DEEP_SCRUB_INTERVAL, &pool_scrub_max_interval);
+ scrub_max_interval = pool_scrub_max_interval > 0 ?
+ pool_scrub_max_interval : g_conf()->osd_deep_scrub_interval;
+ } else {
+ p->opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &pool_scrub_max_interval);
+ scrub_max_interval = pool_scrub_max_interval > 0 ?
+ pool_scrub_max_interval : g_conf()->osd_scrub_max_interval;
+ }
+ // Instead of marking must_scrub, force a scheduled scrub
+ utime_t stamp = ceph_clock_now();
+ if (time == 0)
+ stamp -= scrub_max_interval;
+ else
+ stamp -= (float)time;
+ stamp -= 100.0; // push back last scrub more for good measure
+ if (deep) {
+ set_last_deep_scrub_stamp(stamp);
+ } else {
+ set_last_scrub_stamp(stamp);
+ }
+ f->open_object_section("result");
+ f->dump_bool("deep", deep);
+ f->dump_stream("stamp") << stamp;
+ f->close_section();
+ } else {
+ ss << "Not primary";
+ ret = -EPERM;
+ }
+ outbl.append(ss.str());
+ }
+
+ else {
+ ret = -ENOSYS;
+ ss << "prefix '" << prefix << "' not implemented";
+ }
+
+ out:
+ if (ret >= 0 && outbl.length() == 0) {
+ f->flush(outbl);
+ }
+ on_finish(ret, ss.str(), outbl);
+}
+
+
+// ==========================================================
+
+void PrimaryLogPG::do_pg_op(OpRequestRef op)
+{
+ const MOSDOp *m = static_cast<const MOSDOp *>(op->get_req());
+ ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);
+ dout(10) << "do_pg_op " << *m << dendl;
+
+ op->mark_started();
+
+ int result = 0;
+ string cname, mname;
+
+ snapid_t snapid = m->get_snapid();
+
+ vector<OSDOp> ops = m->ops;
+
+ for (vector<OSDOp>::iterator p = ops.begin(); p != ops.end(); ++p) {
+ std::unique_ptr<const PGLSFilter> filter;
+ OSDOp& osd_op = *p;
+ auto bp = p->indata.cbegin();
+ switch (p->op.op) {
+ case CEPH_OSD_OP_PGNLS_FILTER:
+ try {
+ decode(cname, bp);
+ decode(mname, bp);
+ }
+ catch (const ceph::buffer::error& e) {
+ dout(0) << "unable to decode PGLS_FILTER description in " << *m << dendl;
+ result = -EINVAL;
+ break;
+ }
+ std::tie(result, filter) = get_pgls_filter(bp);
+ if (result < 0)
+ break;
+
+ ceph_assert(filter);
+
+ // fall through
+
+ case CEPH_OSD_OP_PGNLS:
+ if (snapid != CEPH_NOSNAP) {
+ result = -EINVAL;
+ break;
+ }
+ if (get_osdmap()->raw_pg_to_pg(m->get_pg()) != info.pgid.pgid) {
+ dout(10) << " pgnls pg=" << m->get_pg()
+ << " " << get_osdmap()->raw_pg_to_pg(m->get_pg())
+ << " != " << info.pgid << dendl;
+ result = 0; // hmm?
+ } else {
+ unsigned list_size = std::min<uint64_t>(cct->_conf->osd_max_pgls,
+ p->op.pgls.count);
+
+ dout(10) << " pgnls pg=" << m->get_pg() << " count " << list_size
+ << dendl;
+ // read into a buffer
+ vector<hobject_t> sentries;
+ pg_nls_response_t response;
+ try {
+ decode(response.handle, bp);
+ }
+ catch (const ceph::buffer::error& e) {
+ dout(0) << "unable to decode PGNLS handle in " << *m << dendl;
+ result = -EINVAL;
+ break;
+ }
+
+ hobject_t next;
+ hobject_t lower_bound = response.handle;
+ hobject_t pg_start = info.pgid.pgid.get_hobj_start();
+ hobject_t pg_end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
+ dout(10) << " pgnls lower_bound " << lower_bound
+ << " pg_end " << pg_end << dendl;
+ if (((!lower_bound.is_max() && lower_bound >= pg_end) ||
+ (lower_bound != hobject_t() && lower_bound < pg_start))) {
+ // this should only happen with a buggy client.
+ dout(10) << "outside of PG bounds " << pg_start << " .. "
+ << pg_end << dendl;
+ result = -EINVAL;
+ break;
+ }
+
+ hobject_t current = lower_bound;
+ int r = pgbackend->objects_list_partial(
+ current,
+ list_size,
+ list_size,
+ &sentries,
+ &next);
+ if (r != 0) {
+ result = -EINVAL;
+ break;
+ }
+
+ map<hobject_t, pg_missing_item>::const_iterator missing_iter =
+ recovery_state.get_pg_log().get_missing().get_items().lower_bound(current);
+ vector<hobject_t>::iterator ls_iter = sentries.begin();
+ hobject_t _max = hobject_t::get_max();
+ while (1) {
+ const hobject_t &mcand =
+ missing_iter == recovery_state.get_pg_log().get_missing().get_items().end() ?
+ _max :
+ missing_iter->first;
+ const hobject_t &lcand =
+ ls_iter == sentries.end() ?
+ _max :
+ *ls_iter;
+
+ hobject_t candidate;
+ if (mcand == lcand) {
+ candidate = mcand;
+ if (!mcand.is_max()) {
+ ++ls_iter;
+ ++missing_iter;
+ }
+ } else if (mcand < lcand) {
+ candidate = mcand;
+ ceph_assert(!mcand.is_max());
+ ++missing_iter;
+ } else {
+ candidate = lcand;
+ ceph_assert(!lcand.is_max());
+ ++ls_iter;
+ }
+
+ dout(10) << " pgnls candidate 0x" << std::hex << candidate.get_hash()
+ << " vs lower bound 0x" << lower_bound.get_hash()
+ << std::dec << dendl;
+
+ if (candidate >= next) {
+ break;
+ }
+
+ if (response.entries.size() == list_size) {
+ next = candidate;
+ break;
+ }
+
+ if (candidate.snap != CEPH_NOSNAP)
+ continue;
+
+ // skip internal namespace
+ if (candidate.get_namespace() == cct->_conf->osd_hit_set_namespace)
+ continue;
+
+ if (recovery_state.get_missing_loc().is_deleted(candidate))
+ continue;
+
+ // skip wrong namespace
+ if (m->get_hobj().nspace != librados::all_nspaces &&
+ candidate.get_namespace() != m->get_hobj().nspace)
+ continue;
+
+ if (filter && !pgls_filter(*filter, candidate))
+ continue;
+
+ dout(20) << "pgnls item 0x" << std::hex
+ << candidate.get_hash()
+ << ", rev 0x" << hobject_t::_reverse_bits(candidate.get_hash())
+ << std::dec << " "
+ << candidate.oid.name << dendl;
+
+ librados::ListObjectImpl item;
+ item.nspace = candidate.get_namespace();
+ item.oid = candidate.oid.name;
+ item.locator = candidate.get_key();
+ response.entries.push_back(item);
+ }
+
+ if (next.is_max() &&
+ missing_iter == recovery_state.get_pg_log().get_missing().get_items().end() &&
+ ls_iter == sentries.end()) {
+ result = 1;
+
+ // Set response.handle to the start of the next PG according
+ // to the object sort order.
+ response.handle = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
+ } else {
+ response.handle = next;
+ }
+ dout(10) << "pgnls handle=" << response.handle << dendl;
+ encode(response, osd_op.outdata);
+ dout(10) << " pgnls result=" << result << " outdata.length()="
+ << osd_op.outdata.length() << dendl;
+ }
+ break;
+
+ case CEPH_OSD_OP_PGLS_FILTER:
+ try {
+ decode(cname, bp);
+ decode(mname, bp);
+ }
+ catch (const ceph::buffer::error& e) {
+ dout(0) << "unable to decode PGLS_FILTER description in " << *m << dendl;
+ result = -EINVAL;
+ break;
+ }
+ std::tie(result, filter) = get_pgls_filter(bp);
+ if (result < 0)
+ break;
+
+ ceph_assert(filter);
+
+ // fall through
+
+ case CEPH_OSD_OP_PGLS:
+ if (snapid != CEPH_NOSNAP) {
+ result = -EINVAL;
+ break;
+ }
+ if (get_osdmap()->raw_pg_to_pg(m->get_pg()) != info.pgid.pgid) {
+ dout(10) << " pgls pg=" << m->get_pg()
+ << " " << get_osdmap()->raw_pg_to_pg(m->get_pg())
+ << " != " << info.pgid << dendl;
+ result = 0; // hmm?
+ } else {
+ unsigned list_size = std::min<uint64_t>(cct->_conf->osd_max_pgls,
+ p->op.pgls.count);
+
+ dout(10) << " pgls pg=" << m->get_pg() << " count " << list_size << dendl;
+ // read into a buffer
+ vector<hobject_t> sentries;
+ pg_ls_response_t response;
+ try {
+ decode(response.handle, bp);
+ }
+ catch (const ceph::buffer::error& e) {
+ dout(0) << "unable to decode PGLS handle in " << *m << dendl;
+ result = -EINVAL;
+ break;
+ }
+
+ hobject_t next;
+ hobject_t current = response.handle;
+ int r = pgbackend->objects_list_partial(
+ current,
+ list_size,
+ list_size,
+ &sentries,
+ &next);
+ if (r != 0) {
+ result = -EINVAL;
+ break;
+ }
+
+ ceph_assert(snapid == CEPH_NOSNAP || recovery_state.get_pg_log().get_missing().get_items().empty());
+
+ map<hobject_t, pg_missing_item>::const_iterator missing_iter =
+ recovery_state.get_pg_log().get_missing().get_items().lower_bound(current);
+ vector<hobject_t>::iterator ls_iter = sentries.begin();
+ hobject_t _max = hobject_t::get_max();
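+      // same merge of the backend listing and the missing set as in PGNLS above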
+ while (1) {
+ const hobject_t &mcand =
+ missing_iter == recovery_state.get_pg_log().get_missing().get_items().end() ?
+ _max :
+ missing_iter->first;
+ const hobject_t &lcand =
+ ls_iter == sentries.end() ?
+ _max :
+ *ls_iter;
+
+ hobject_t candidate;
+ if (mcand == lcand) {
+ candidate = mcand;
+ if (!mcand.is_max()) {
+ ++ls_iter;
+ ++missing_iter;
+ }
+ } else if (mcand < lcand) {
+ candidate = mcand;
+ ceph_assert(!mcand.is_max());
+ ++missing_iter;
+ } else {
+ candidate = lcand;
+ ceph_assert(!lcand.is_max());
+ ++ls_iter;
+ }
+
+ if (candidate >= next) {
+ break;
+ }
+
+ if (response.entries.size() == list_size) {
+ next = candidate;
+ break;
+ }
+
+ if (candidate.snap != CEPH_NOSNAP)
+ continue;
+
+ // skip wrong namespace
+ if (candidate.get_namespace() != m->get_hobj().nspace)
+ continue;
+
+ if (recovery_state.get_missing_loc().is_deleted(candidate))
+ continue;
+
+ if (filter && !pgls_filter(*filter, candidate))
+ continue;
+
+ response.entries.push_back(make_pair(candidate.oid,
+ candidate.get_key()));
+ }
+ if (next.is_max() &&
+ missing_iter == recovery_state.get_pg_log().get_missing().get_items().end() &&
+ ls_iter == sentries.end()) {
+ result = 1;
+ }
+ response.handle = next;
+ encode(response, osd_op.outdata);
+ dout(10) << " pgls result=" << result << " outdata.length()="
+ << osd_op.outdata.length() << dendl;
+ }
+ break;
+
+ case CEPH_OSD_OP_PG_HITSET_LS:
+ {
+ list< pair<utime_t,utime_t> > ls;
+ for (list<pg_hit_set_info_t>::const_iterator p = info.hit_set.history.begin();
+ p != info.hit_set.history.end();
+ ++p)
+ ls.push_back(make_pair(p->begin, p->end));
+ if (hit_set)
+ ls.push_back(make_pair(hit_set_start_stamp, utime_t()));
+ encode(ls, osd_op.outdata);
+ }
+ break;
+
+ case CEPH_OSD_OP_PG_HITSET_GET:
+ {
+ utime_t stamp(osd_op.op.hit_set_get.stamp);
+ if (hit_set_start_stamp && stamp >= hit_set_start_stamp) {
+ // read the current in-memory HitSet, not the version we've
+ // checkpointed.
+ if (!hit_set) {
+	result = -ENOENT;
+ break;
+ }
+ encode(*hit_set, osd_op.outdata);
+ result = osd_op.outdata.length();
+ } else {
+ // read an archived HitSet.
+ hobject_t oid;
+ for (list<pg_hit_set_info_t>::const_iterator p = info.hit_set.history.begin();
+ p != info.hit_set.history.end();
+ ++p) {
+ if (stamp >= p->begin && stamp <= p->end) {
+ oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
+ break;
+ }
+ }
+ if (oid == hobject_t()) {
+ result = -ENOENT;
+ break;
+ }
+ if (!pool.info.is_replicated()) {
+ // FIXME: EC not supported yet
+ result = -EOPNOTSUPP;
+ break;
+ }
+ if (is_unreadable_object(oid)) {
+ wait_for_unreadable_object(oid, op);
+ return;
+ }
+ result = osd->store->read(ch, ghobject_t(oid), 0, 0, osd_op.outdata);
+ }
+ }
+ break;
+
+ case CEPH_OSD_OP_SCRUBLS:
+ result = do_scrub_ls(m, &osd_op);
+ break;
+
+ default:
+ result = -EINVAL;
+ break;
+ }
+
+ if (result < 0)
+ break;
+ }
+
+ // reply
+ MOSDOpReply *reply = new MOSDOpReply(m, 0, get_osdmap_epoch(),
+ CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK,
+ false);
+ reply->claim_op_out_data(ops);
+ reply->set_result(result);
+ reply->set_reply_versions(info.last_update, info.last_user_version);
+ osd->send_message_osd_client(reply, m->get_connection());
+}
+
+int PrimaryLogPG::do_scrub_ls(const MOSDOp *m, OSDOp *osd_op)
+{
+ if (m->get_pg() != info.pgid.pgid) {
+ dout(10) << " scrubls pg=" << m->get_pg() << " != " << info.pgid << dendl;
+ return -EINVAL; // hmm?
+ }
+ auto bp = osd_op->indata.cbegin();
+ scrub_ls_arg_t arg;
+ try {
+ arg.decode(bp);
+ } catch (ceph::buffer::error&) {
+ dout(10) << " corrupted scrub_ls_arg_t" << dendl;
+ return -EINVAL;
+ }
+
+ int r = 0;
+ scrub_ls_result_t result = {.interval = info.history.same_interval_since};
+
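+  // the stored scrub errors are tied to the current interval; if the client
+  // asked about a different (stale) interval, tell it to retry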
+ if (arg.interval != 0 && arg.interval != info.history.same_interval_since) {
+ r = -EAGAIN;
+ } else {
+ bool store_queried = m_scrubber && m_scrubber->get_store_errors(arg, result);
+ if (store_queried) {
+ encode(result, osd_op->outdata);
+ } else {
+ // the scrubber's store is not initialized
+ r = -ENOENT;
+ }
+ }
+
+ return r;
+}
+
+/**
+ * Releases locks
+ *
+ * @param lock_manager [in] manager with locks to release
+ */
+void PrimaryLogPG::release_object_locks(
+ ObcLockManager &lock_manager) {
+ std::list<std::pair<ObjectContextRef, std::list<OpRequestRef> > > to_req;
+ bool requeue_recovery = false;
+ bool requeue_snaptrim = false;
+ lock_manager.put_locks(
+ &to_req,
+ &requeue_recovery,
+ &requeue_snaptrim);
+ if (requeue_recovery)
+ queue_recovery();
+ if (requeue_snaptrim)
+ snap_trimmer_machine.process_event(TrimWriteUnblocked());
+
+ if (!to_req.empty()) {
+ // requeue at front of scrub blocking queue if we are blocked by scrub
+ for (auto &&p: to_req) {
+ if (m_scrubber->write_blocked_by_scrub(p.first->obs.oi.soid.get_head())) {
+ for (auto& op : p.second) {
+ op->mark_delayed("waiting for scrub");
+ }
+
+ waiting_for_scrub.splice(
+ waiting_for_scrub.begin(),
+ p.second,
+ p.second.begin(),
+ p.second.end());
+ } else if (is_laggy()) {
+ for (auto& op : p.second) {
+ op->mark_delayed("waiting for readable");
+ }
+ waiting_for_readable.splice(
+ waiting_for_readable.begin(),
+ p.second,
+ p.second.begin(),
+ p.second.end());
+ } else {
+ requeue_ops(p.second);
+ }
+ }
+ }
+}
+
+PrimaryLogPG::PrimaryLogPG(OSDService *o, OSDMapRef curmap,
+ const PGPool &_pool,
+ const map<string,string>& ec_profile, spg_t p) :
+ PG(o, curmap, _pool, p),
+ pgbackend(
+ PGBackend::build_pg_backend(
+ _pool.info, ec_profile, this, coll_t(p), ch, o->store, cct)),
+ object_contexts(o->cct, o->cct->_conf->osd_pg_object_context_cache_count),
+ new_backfill(false),
+ temp_seq(0),
+ snap_trimmer_machine(this)
+{
+ recovery_state.set_backend_predicates(
+ pgbackend->get_is_readable_predicate(),
+ pgbackend->get_is_recoverable_predicate());
+ snap_trimmer_machine.initiate();
+
+ m_scrubber = make_unique<PrimaryLogScrub>(this);
+}
+
+PrimaryLogPG::~PrimaryLogPG()
+{
+ m_scrubber.reset();
+}
+
+void PrimaryLogPG::get_src_oloc(const object_t& oid, const object_locator_t& oloc, object_locator_t& src_oloc)
+{
+ src_oloc = oloc;
+ if (oloc.key.empty())
+ src_oloc.key = oid.name;
+}
+
+void PrimaryLogPG::handle_backoff(OpRequestRef& op)
+{
+ auto m = op->get_req<MOSDBackoff>();
+ auto session = ceph::ref_cast<Session>(m->get_connection()->get_priv());
+ if (!session)
+ return; // drop it.
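+  // ack only the intersection of the range in the message and this PG's bounds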
+ hobject_t begin = info.pgid.pgid.get_hobj_start();
+ hobject_t end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
+ if (begin < m->begin) {
+ begin = m->begin;
+ }
+ if (end > m->end) {
+ end = m->end;
+ }
+ dout(10) << __func__ << " backoff ack id " << m->id
+ << " [" << begin << "," << end << ")" << dendl;
+ session->ack_backoff(cct, m->pgid, m->id, begin, end);
+}
+
+void PrimaryLogPG::do_request(
+ OpRequestRef& op,
+ ThreadPool::TPHandle &handle)
+{
+ if (op->osd_trace) {
+ op->pg_trace.init("pg op", &trace_endpoint, &op->osd_trace);
+ op->pg_trace.event("do request");
+ }
+#ifdef HAVE_JAEGER
+ if (op->osd_parent_span) {
+ auto do_req_span = jaeger_tracing::child_span(__func__, op->osd_parent_span);
+ }
+#endif
+  // make sure we have a new enough map
+ auto p = waiting_for_map.find(op->get_source());
+ if (p != waiting_for_map.end()) {
+ // preserve ordering
+ dout(20) << __func__ << " waiting_for_map "
+ << p->first << " not empty, queueing" << dendl;
+ p->second.push_back(op);
+ op->mark_delayed("waiting_for_map not empty");
+ return;
+ }
+ if (!have_same_or_newer_map(op->min_epoch)) {
+ dout(20) << __func__ << " min " << op->min_epoch
+ << ", queue on waiting_for_map " << op->get_source() << dendl;
+ waiting_for_map[op->get_source()].push_back(op);
+ op->mark_delayed("op must wait for map");
+ osd->request_osdmap_update(op->min_epoch);
+ return;
+ }
+
+ if (can_discard_request(op)) {
+ return;
+ }
+
+ // pg-wide backoffs
+ const Message *m = op->get_req();
+ int msg_type = m->get_type();
+ if (m->get_connection()->has_feature(CEPH_FEATURE_RADOS_BACKOFF)) {
+ auto session = ceph::ref_cast<Session>(m->get_connection()->get_priv());
+ if (!session)
+ return; // drop it.
+ if (msg_type == CEPH_MSG_OSD_OP) {
+ if (session->check_backoff(cct, info.pgid,
+ info.pgid.pgid.get_hobj_start(), m)) {
+ return;
+ }
+
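+      // install a PG-wide backoff while we cannot serve ops (down, incomplete,
+      // or peered but not yet active; optionally while peering)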
+ bool backoff =
+ is_down() ||
+ is_incomplete() ||
+ (!is_active() && is_peered());
+ if (g_conf()->osd_backoff_on_peering && !backoff) {
+ if (is_peering()) {
+ backoff = true;
+ }
+ }
+ if (backoff) {
+ add_pg_backoff(session);
+ return;
+ }
+ }
+ // pg backoff acks at pg-level
+ if (msg_type == CEPH_MSG_OSD_BACKOFF) {
+ const MOSDBackoff *ba = static_cast<const MOSDBackoff*>(m);
+ if (ba->begin != ba->end) {
+ handle_backoff(op);
+ return;
+ }
+ }
+ }
+
+ if (!is_peered()) {
+ // Delay unless PGBackend says it's ok
+ if (pgbackend->can_handle_while_inactive(op)) {
+ bool handled = pgbackend->handle_message(op);
+ ceph_assert(handled);
+ return;
+ } else {
+ waiting_for_peered.push_back(op);
+ op->mark_delayed("waiting for peered");
+ return;
+ }
+ }
+
+ if (recovery_state.needs_flush()) {
+ dout(20) << "waiting for flush on " << op << dendl;
+ waiting_for_flush.push_back(op);
+ op->mark_delayed("waiting for flush");
+ return;
+ }
+
+ ceph_assert(is_peered() && !recovery_state.needs_flush());
+ if (pgbackend->handle_message(op))
+ return;
+
+ switch (msg_type) {
+ case CEPH_MSG_OSD_OP:
+ case CEPH_MSG_OSD_BACKOFF:
+ if (!is_active()) {
+ dout(20) << " peered, not active, waiting for active on " << op << dendl;
+ waiting_for_active.push_back(op);
+ op->mark_delayed("waiting for active");
+ return;
+ }
+ switch (msg_type) {
+ case CEPH_MSG_OSD_OP:
+ // verify client features
+ if ((pool.info.has_tiers() || pool.info.is_tier()) &&
+ !op->has_feature(CEPH_FEATURE_OSD_CACHEPOOL)) {
+ osd->reply_op_error(op, -EOPNOTSUPP);
+ return;
+ }
+ do_op(op);
+ break;
+ case CEPH_MSG_OSD_BACKOFF:
+ // object-level backoff acks handled in osdop context
+ handle_backoff(op);
+ break;
+ }
+ break;
+
+ case MSG_OSD_PG_SCAN:
+ do_scan(op, handle);
+ break;
+
+ case MSG_OSD_PG_BACKFILL:
+ do_backfill(op);
+ break;
+
+ case MSG_OSD_PG_BACKFILL_REMOVE:
+ do_backfill_remove(op);
+ break;
+
+ case MSG_OSD_SCRUB_RESERVE:
+ {
+ if (!m_scrubber) {
+ osd->reply_op_error(op, -EAGAIN);
+ return;
+ }
+ auto m = op->get_req<MOSDScrubReserve>();
+ switch (m->type) {
+ case MOSDScrubReserve::REQUEST:
+ m_scrubber->handle_scrub_reserve_request(op);
+ break;
+ case MOSDScrubReserve::GRANT:
+ m_scrubber->handle_scrub_reserve_grant(op, m->from);
+ break;
+ case MOSDScrubReserve::REJECT:
+ m_scrubber->handle_scrub_reserve_reject(op, m->from);
+ break;
+ case MOSDScrubReserve::RELEASE:
+ m_scrubber->handle_scrub_reserve_release(op);
+ break;
+ }
+ }
+ break;
+
+ case MSG_OSD_REP_SCRUB:
+ replica_scrub(op, handle);
+ break;
+
+ case MSG_OSD_REP_SCRUBMAP:
+ do_replica_scrub_map(op);
+ break;
+
+ case MSG_OSD_PG_UPDATE_LOG_MISSING:
+ do_update_log_missing(op);
+ break;
+
+ case MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY:
+ do_update_log_missing_reply(op);
+ break;
+
+ default:
+ ceph_abort_msg("bad message type in do_request");
+ }
+}
+
+/** do_op - do an op
+ * pg lock will be held (if multithreaded)
+ * osd_lock NOT held.
+ */
+void PrimaryLogPG::do_op(OpRequestRef& op)
+{
+ FUNCTRACE(cct);
+ // NOTE: take a non-const pointer here; we must be careful not to
+ // change anything that will break other reads on m (operator<<).
+ MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
+ ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);
+ if (m->finish_decode()) {
+ op->reset_desc(); // for TrackedOp
+ m->clear_payload();
+ }
+
+ dout(20) << __func__ << ": op " << *m << dendl;
+
+ const hobject_t head = m->get_hobj().get_head();
+
+ if (!info.pgid.pgid.contains(
+ info.pgid.pgid.get_split_bits(pool.info.get_pg_num()), head)) {
+ derr << __func__ << " " << info.pgid.pgid << " does not contain "
+ << head << " pg_num " << pool.info.get_pg_num() << " hash "
+ << std::hex << head.get_hash() << std::dec << dendl;
+ osd->clog->warn() << info.pgid.pgid << " does not contain " << head
+ << " op " << *m;
+ ceph_assert(!cct->_conf->osd_debug_misdirected_ops);
+ return;
+ }
+
+ bool can_backoff =
+ m->get_connection()->has_feature(CEPH_FEATURE_RADOS_BACKOFF);
+ ceph::ref_t<Session> session;
+ if (can_backoff) {
+ session = static_cast<Session*>(m->get_connection()->get_priv().get());
+ if (!session.get()) {
+ dout(10) << __func__ << " no session" << dendl;
+ return;
+ }
+
+ if (session->check_backoff(cct, info.pgid, head, m)) {
+ return;
+ }
+ }
+
+ if (m->has_flag(CEPH_OSD_FLAG_PARALLELEXEC)) {
+ // not implemented.
+ dout(20) << __func__ << ": PARALLELEXEC not implemented " << *m << dendl;
+ osd->reply_op_error(op, -EINVAL);
+ return;
+ }
+
+ {
+ int r = op->maybe_init_op_info(*get_osdmap());
+ if (r) {
+ osd->reply_op_error(op, r);
+ return;
+ }
+ }
+
+ if ((m->get_flags() & (CEPH_OSD_FLAG_BALANCE_READS |
+ CEPH_OSD_FLAG_LOCALIZE_READS)) &&
+ op->may_read() &&
+ !(op->may_write() || op->may_cache())) {
+ // balanced reads; any replica will do
+ if (!(is_primary() || is_nonprimary())) {
+ osd->handle_misdirected_op(this, op);
+ return;
+ }
+ } else {
+ // normal case; must be primary
+ if (!is_primary()) {
+ osd->handle_misdirected_op(this, op);
+ return;
+ }
+ }
+
+ if (!check_laggy(op)) {
+ return;
+ }
+
+ if (!op_has_sufficient_caps(op)) {
+ osd->reply_op_error(op, -EPERM);
+ return;
+ }
+
+ if (op->includes_pg_op()) {
+ return do_pg_op(op);
+ }
+
+ // object name too long?
+ if (m->get_oid().name.size() > cct->_conf->osd_max_object_name_len) {
+ dout(4) << "do_op name is longer than "
+ << cct->_conf->osd_max_object_name_len
+ << " bytes" << dendl;
+ osd->reply_op_error(op, -ENAMETOOLONG);
+ return;
+ }
+ if (m->get_hobj().get_key().size() > cct->_conf->osd_max_object_name_len) {
+ dout(4) << "do_op locator is longer than "
+ << cct->_conf->osd_max_object_name_len
+ << " bytes" << dendl;
+ osd->reply_op_error(op, -ENAMETOOLONG);
+ return;
+ }
+ if (m->get_hobj().nspace.size() > cct->_conf->osd_max_object_namespace_len) {
+ dout(4) << "do_op namespace is longer than "
+ << cct->_conf->osd_max_object_namespace_len
+ << " bytes" << dendl;
+ osd->reply_op_error(op, -ENAMETOOLONG);
+ return;
+ }
+ if (m->get_hobj().oid.name.empty()) {
+ dout(4) << "do_op empty oid name is not allowed" << dendl;
+ osd->reply_op_error(op, -EINVAL);
+ return;
+ }
+
+ if (int r = osd->store->validate_hobject_key(head)) {
+ dout(4) << "do_op object " << head << " invalid for backing store: "
+ << r << dendl;
+ osd->reply_op_error(op, r);
+ return;
+ }
+
+ // blocklisted?
+ if (get_osdmap()->is_blocklisted(m->get_source_addr())) {
+ dout(10) << "do_op " << m->get_source_addr() << " is blocklisted" << dendl;
+ osd->reply_op_error(op, -EBLOCKLISTED);
+ return;
+ }
+
+ // order this op as a write?
+ bool write_ordered = op->rwordered();
+
+ // discard due to cluster full transition? (we discard any op that
+ // originates before the cluster or pool is marked full; the client
+ // will resend after the full flag is removed or if they expect the
+  // op to succeed despite being full). The exceptions are FULL_FORCE and
+  // FULL_TRY ops, which there is no reason to discard because they
+  // bypass all full checks anyway. If this op isn't write-ordered, we
+  // skip this check.
+ // FIXME: we exclude mds writes for now.
+ if (write_ordered && !(m->get_source().is_mds() ||
+ m->has_flag(CEPH_OSD_FLAG_FULL_TRY) ||
+ m->has_flag(CEPH_OSD_FLAG_FULL_FORCE)) &&
+ info.history.last_epoch_marked_full > m->get_map_epoch()) {
+ dout(10) << __func__ << " discarding op sent before full " << m << " "
+ << *m << dendl;
+ return;
+ }
+  // The MDS should have stopped writing before this point. We can't
+  // allow the OSD to become non-startable even if the MDS could still be
+  // writing as part of file removals.
+ if (write_ordered && osd->check_failsafe_full(get_dpp()) &&
+ !m->has_flag(CEPH_OSD_FLAG_FULL_TRY)) {
+ dout(10) << __func__ << " fail-safe full check failed, dropping request." << dendl;
+ return;
+ }
+ int64_t poolid = get_pgid().pool();
+ if (op->may_write()) {
+
+ const pg_pool_t *pi = get_osdmap()->get_pg_pool(poolid);
+ if (!pi) {
+ return;
+ }
+
+ // invalid?
+ if (m->get_snapid() != CEPH_NOSNAP) {
+ dout(20) << __func__ << ": write to clone not valid " << *m << dendl;
+ osd->reply_op_error(op, -EINVAL);
+ return;
+ }
+
+ // too big?
+ if (cct->_conf->osd_max_write_size &&
+ m->get_data_len() > cct->_conf->osd_max_write_size << 20) {
+ // journal can't hold commit!
+ derr << "do_op msg data len " << m->get_data_len()
+ << " > osd_max_write_size " << (cct->_conf->osd_max_write_size << 20)
+ << " on " << *m << dendl;
+ osd->reply_op_error(op, -OSD_WRITETOOBIG);
+ return;
+ }
+ }
+
+ dout(10) << "do_op " << *m
+ << (op->may_write() ? " may_write" : "")
+ << (op->may_read() ? " may_read" : "")
+ << (op->may_cache() ? " may_cache" : "")
+ << " -> " << (write_ordered ? "write-ordered" : "read-ordered")
+ << " flags " << ceph_osd_flag_string(m->get_flags())
+ << dendl;
+
+#ifdef HAVE_JAEGER
+ if (op->osd_parent_span) {
+ auto do_op_span = jaeger_tracing::child_span(__func__, op->osd_parent_span);
+ }
+#endif
+ // missing object?
+ if (is_unreadable_object(head)) {
+ if (!is_primary()) {
+ osd->reply_op_error(op, -EAGAIN);
+ return;
+ }
+ if (can_backoff &&
+ (g_conf()->osd_backoff_on_degraded ||
+ (g_conf()->osd_backoff_on_unfound &&
+ recovery_state.get_missing_loc().is_unfound(head)))) {
+ add_backoff(session, head, head);
+ maybe_kick_recovery(head);
+ } else {
+ wait_for_unreadable_object(head, op);
+ }
+ return;
+ }
+
+ if (write_ordered) {
+ // degraded object?
+ if (is_degraded_or_backfilling_object(head)) {
+ if (can_backoff && g_conf()->osd_backoff_on_degraded) {
+ add_backoff(session, head, head);
+ maybe_kick_recovery(head);
+ } else {
+ wait_for_degraded_object(head, op);
+ }
+ return;
+ }
+
+ if (m_scrubber->is_scrub_active() && m_scrubber->write_blocked_by_scrub(head)) {
+ dout(20) << __func__ << ": waiting for scrub" << dendl;
+ waiting_for_scrub.push_back(op);
+ op->mark_delayed("waiting for scrub");
+ return;
+ }
+ if (!check_laggy_requeue(op)) {
+ return;
+ }
+
+ // blocked on snap?
+ if (auto blocked_iter = objects_blocked_on_degraded_snap.find(head);
+ blocked_iter != std::end(objects_blocked_on_degraded_snap)) {
+ hobject_t to_wait_on(head);
+ to_wait_on.snap = blocked_iter->second;
+ wait_for_degraded_object(to_wait_on, op);
+ return;
+ }
+ if (auto blocked_snap_promote_iter = objects_blocked_on_snap_promotion.find(head);
+ blocked_snap_promote_iter != std::end(objects_blocked_on_snap_promotion)) {
+ wait_for_blocked_object(blocked_snap_promote_iter->second->obs.oi.soid, op);
+ return;
+ }
+ if (objects_blocked_on_cache_full.count(head)) {
+ block_write_on_full_cache(head, op);
+ return;
+ }
+ }
+
+ // dup/resent?
+ if (op->may_write() || op->may_cache()) {
+ // warning: we will get back *a* request for this reqid, but not
+ // necessarily the most recent. this happens with flush and
+    // promote ops, but we can't possibly have both in our log where
+ // the original request is still not stable on disk, so for our
+ // purposes here it doesn't matter which one we get.
+ eversion_t version;
+ version_t user_version;
+ int return_code = 0;
+ vector<pg_log_op_return_item_t> op_returns;
+ bool got = check_in_progress_op(
+ m->get_reqid(), &version, &user_version, &return_code, &op_returns);
+ if (got) {
+ dout(3) << __func__ << " dup " << m->get_reqid()
+ << " version " << version << dendl;
+ if (already_complete(version)) {
+ osd->reply_op_error(op, return_code, version, user_version, op_returns);
+ } else {
+ dout(10) << " waiting for " << version << " to commit" << dendl;
+ // always queue ondisk waiters, so that we can requeue if needed
+ waiting_for_ondisk[version].emplace_back(op, user_version, return_code,
+ op_returns);
+ op->mark_delayed("waiting for ondisk");
+ }
+ return;
+ }
+ }
+
+ ObjectContextRef obc;
+ bool can_create = op->may_write();
+ hobject_t missing_oid;
+
+  // kludge around the fact that LIST_SNAPS uses the CEPH_SNAPDIR snapid; map it to the head
+ const hobject_t& oid =
+ m->get_snapid() == CEPH_SNAPDIR ? head : m->get_hobj();
+
+ // make sure LIST_SNAPS is on CEPH_SNAPDIR and nothing else
+ for (vector<OSDOp>::iterator p = m->ops.begin(); p != m->ops.end(); ++p) {
+ OSDOp& osd_op = *p;
+
+ if (osd_op.op.op == CEPH_OSD_OP_LIST_SNAPS) {
+ if (m->get_snapid() != CEPH_SNAPDIR) {
+ dout(10) << "LIST_SNAPS with incorrect context" << dendl;
+ osd->reply_op_error(op, -EINVAL);
+ return;
+ }
+ } else {
+ if (m->get_snapid() == CEPH_SNAPDIR) {
+ dout(10) << "non-LIST_SNAPS on snapdir" << dendl;
+ osd->reply_op_error(op, -EINVAL);
+ return;
+ }
+ }
+ }
+
+ // io blocked on obc?
+ if (!m->has_flag(CEPH_OSD_FLAG_FLUSH) &&
+ maybe_await_blocked_head(oid, op)) {
+ return;
+ }
+
+ if (!is_primary()) {
+ if (!recovery_state.can_serve_replica_read(oid)) {
+ dout(20) << __func__
+ << ": unstable write on replica, bouncing to primary "
+ << *m << dendl;
+ osd->reply_op_error(op, -EAGAIN);
+ return;
+ }
+ dout(20) << __func__ << ": serving replica read on oid " << oid
+ << dendl;
+ }
+
+ int r = find_object_context(
+ oid, &obc, can_create,
+ m->has_flag(CEPH_OSD_FLAG_MAP_SNAP_CLONE),
+ &missing_oid);
+
+ // LIST_SNAPS needs the ssc too
+ if (obc &&
+ m->get_snapid() == CEPH_SNAPDIR &&
+ !obc->ssc) {
+ obc->ssc = get_snapset_context(oid, true);
+ }
+
+ if (r == -EAGAIN) {
+    // If we're not the primary for this PG, we just return -EAGAIN. Otherwise,
+    // we have to wait for the object.
+ if (is_primary()) {
+ // missing the specific snap we need; requeue and wait.
+ ceph_assert(!op->may_write()); // only happens on a read/cache
+ wait_for_unreadable_object(missing_oid, op);
+ return;
+ }
+ } else if (r == 0) {
+ if (is_unreadable_object(obc->obs.oi.soid)) {
+ dout(10) << __func__ << ": clone " << obc->obs.oi.soid
+ << " is unreadable, waiting" << dendl;
+ wait_for_unreadable_object(obc->obs.oi.soid, op);
+ return;
+ }
+
+ // degraded object? (the check above was for head; this could be a clone)
+ if (write_ordered &&
+ obc->obs.oi.soid.snap != CEPH_NOSNAP &&
+ is_degraded_or_backfilling_object(obc->obs.oi.soid)) {
+ dout(10) << __func__ << ": clone " << obc->obs.oi.soid
+ << " is degraded, waiting" << dendl;
+ wait_for_degraded_object(obc->obs.oi.soid, op);
+ return;
+ }
+ }
+
+ bool in_hit_set = false;
+ if (hit_set) {
+ if (obc.get()) {
+ if (obc->obs.oi.soid != hobject_t() && hit_set->contains(obc->obs.oi.soid))
+ in_hit_set = true;
+ } else {
+ if (missing_oid != hobject_t() && hit_set->contains(missing_oid))
+ in_hit_set = true;
+ }
+ if (!op->hitset_inserted) {
+ hit_set->insert(oid);
+ op->hitset_inserted = true;
+ if (hit_set->is_full() ||
+ hit_set_start_stamp + pool.info.hit_set_period <= m->get_recv_stamp()) {
+ hit_set_persist();
+ }
+ }
+ }
+
+ if (agent_state) {
+ if (agent_choose_mode(false, op))
+ return;
+ }
+
+ if (obc.get() && obc->obs.exists && obc->obs.oi.has_manifest()) {
+ if (recover_adjacent_clones(obc, op)) {
+ return;
+ }
+ if (maybe_handle_manifest(op,
+ write_ordered,
+ obc))
+ return;
+ }
+
+ if (maybe_handle_cache(op,
+ write_ordered,
+ obc,
+ r,
+ missing_oid,
+ false,
+ in_hit_set))
+ return;
+
+ if (r && (r != -ENOENT || !obc)) {
+ // copy the reqids for copy get on ENOENT
+ if (r == -ENOENT &&
+ (m->ops[0].op.op == CEPH_OSD_OP_COPY_GET)) {
+ fill_in_copy_get_noent(op, oid, m->ops[0]);
+ return;
+ }
+ dout(20) << __func__ << ": find_object_context got error " << r << dendl;
+ if (op->may_write() &&
+ get_osdmap()->require_osd_release >= ceph_release_t::kraken) {
+ record_write_error(op, oid, nullptr, r);
+ } else {
+ osd->reply_op_error(op, r);
+ }
+ return;
+ }
+
+ // make sure locator is consistent
+ object_locator_t oloc(obc->obs.oi.soid);
+ if (m->get_object_locator() != oloc) {
+ dout(10) << " provided locator " << m->get_object_locator()
+ << " != object's " << obc->obs.oi.soid << dendl;
+ osd->clog->warn() << "bad locator " << m->get_object_locator()
+ << " on object " << oloc
+ << " op " << *m;
+ }
+
+ // io blocked on obc?
+ if (obc->is_blocked() &&
+ !m->has_flag(CEPH_OSD_FLAG_FLUSH)) {
+ wait_for_blocked_object(obc->obs.oi.soid, op);
+ return;
+ }
+
+ dout(25) << __func__ << " oi " << obc->obs.oi << dendl;
+
+ OpContext *ctx = new OpContext(op, m->get_reqid(), &m->ops, obc, this);
+
+ if (m->has_flag(CEPH_OSD_FLAG_SKIPRWLOCKS)) {
+ dout(20) << __func__ << ": skipping rw locks" << dendl;
+ } else if (m->get_flags() & CEPH_OSD_FLAG_FLUSH) {
+ dout(20) << __func__ << ": part of flush, will ignore write lock" << dendl;
+
+ // verify there is in fact a flush in progress
+ // FIXME: we could make this a stronger test.
+ map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(obc->obs.oi.soid);
+ if (p == flush_ops.end()) {
+ dout(10) << __func__ << " no flush in progress, aborting" << dendl;
+ reply_ctx(ctx, -EINVAL);
+ return;
+ }
+ } else if (!get_rw_locks(write_ordered, ctx)) {
+ dout(20) << __func__ << " waiting for rw locks " << dendl;
+ op->mark_delayed("waiting for rw locks");
+ close_op_ctx(ctx);
+ return;
+ }
+ dout(20) << __func__ << " obc " << *obc << dendl;
+
+ if (r) {
+ dout(20) << __func__ << " returned an error: " << r << dendl;
+ if (op->may_write() &&
+ get_osdmap()->require_osd_release >= ceph_release_t::kraken) {
+ record_write_error(op, oid, nullptr, r,
+ ctx->op->allows_returnvec() ? ctx : nullptr);
+ } else {
+ osd->reply_op_error(op, r);
+ }
+ close_op_ctx(ctx);
+ return;
+ }
+
+ if (m->has_flag(CEPH_OSD_FLAG_IGNORE_CACHE)) {
+ ctx->ignore_cache = true;
+ }
+
+ if ((op->may_read()) && (obc->obs.oi.is_lost())) {
+ // This object is lost. Reading from it returns an error.
+ dout(20) << __func__ << ": object " << obc->obs.oi.soid
+ << " is lost" << dendl;
+ reply_ctx(ctx, -ENFILE);
+ return;
+ }
+ if (!op->may_write() &&
+ !op->may_cache() &&
+ (!obc->obs.exists ||
+ ((m->get_snapid() != CEPH_SNAPDIR) &&
+ obc->obs.oi.is_whiteout()))) {
+ // copy the reqids for copy get on ENOENT
+ if (m->ops[0].op.op == CEPH_OSD_OP_COPY_GET) {
+ fill_in_copy_get_noent(op, oid, m->ops[0]);
+ close_op_ctx(ctx);
+ return;
+ }
+ reply_ctx(ctx, -ENOENT);
+ return;
+ }
+
+ op->mark_started();
+
+ execute_ctx(ctx);
+ utime_t prepare_latency = ceph_clock_now();
+ prepare_latency -= op->get_dequeued_time();
+ osd->logger->tinc(l_osd_op_prepare_lat, prepare_latency);
+ if (op->may_read() && op->may_write()) {
+ osd->logger->tinc(l_osd_op_rw_prepare_lat, prepare_latency);
+ } else if (op->may_read()) {
+ osd->logger->tinc(l_osd_op_r_prepare_lat, prepare_latency);
+ } else if (op->may_write() || op->may_cache()) {
+ osd->logger->tinc(l_osd_op_w_prepare_lat, prepare_latency);
+ }
+
+ // force recovery of the oldest missing object if too many logs
+ maybe_force_recovery();
+}
+
+PrimaryLogPG::cache_result_t PrimaryLogPG::maybe_handle_manifest_detail(
+ OpRequestRef op,
+ bool write_ordered,
+ ObjectContextRef obc)
+{
+ ceph_assert(obc);
+ if (op->get_req<MOSDOp>()->get_flags() & CEPH_OSD_FLAG_IGNORE_REDIRECT) {
+ dout(20) << __func__ << ": ignoring redirect due to flag" << dendl;
+ return cache_result_t::NOOP;
+ }
+
+ // if it is write-ordered and blocked, stop now
+ if (obc->is_blocked() && write_ordered) {
+ // we're already doing something with this object
+ dout(20) << __func__ << " blocked on " << obc->obs.oi.soid << dendl;
+ return cache_result_t::NOOP;
+ }
+
+ vector<OSDOp> ops = op->get_req<MOSDOp>()->ops;
+ for (vector<OSDOp>::iterator p = ops.begin(); p != ops.end(); ++p) {
+ OSDOp& osd_op = *p;
+ ceph_osd_op& op = osd_op.op;
+ if (op.op == CEPH_OSD_OP_SET_REDIRECT ||
+ op.op == CEPH_OSD_OP_SET_CHUNK ||
+ op.op == CEPH_OSD_OP_UNSET_MANIFEST ||
+ op.op == CEPH_OSD_OP_TIER_PROMOTE ||
+ op.op == CEPH_OSD_OP_TIER_FLUSH ||
+ op.op == CEPH_OSD_OP_TIER_EVICT) {
+ return cache_result_t::NOOP;
+ }
+ }
+
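+  // dispatch on the manifest type: redirects are proxied to the redirect
+  // target; chunked objects are proxied per chunk, or promoted if a chunk
+  // is missing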
+ switch (obc->obs.oi.manifest.type) {
+ case object_manifest_t::TYPE_REDIRECT:
+ if (op->may_write() || write_ordered) {
+ do_proxy_write(op, obc);
+ } else {
+      // already promoted: the local object has data, so handle it here
+ if (obc->obs.oi.size != 0) {
+ return cache_result_t::NOOP;
+ }
+ do_proxy_read(op, obc);
+ }
+ return cache_result_t::HANDLED_PROXY;
+ case object_manifest_t::TYPE_CHUNKED:
+ {
+ if (can_proxy_chunked_read(op, obc)) {
+ map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(obc->obs.oi.soid);
+ if (p != flush_ops.end()) {
+ do_proxy_chunked_op(op, obc->obs.oi.soid, obc, true);
+ return cache_result_t::HANDLED_PROXY;
+ }
+ do_proxy_chunked_op(op, obc->obs.oi.soid, obc, write_ordered);
+ return cache_result_t::HANDLED_PROXY;
+ }
+
+ MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
+ ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);
+ hobject_t head = m->get_hobj();
+
+ if (is_degraded_or_backfilling_object(head)) {
+ dout(20) << __func__ << ": " << head << " is degraded, waiting" << dendl;
+ wait_for_degraded_object(head, op);
+ return cache_result_t::BLOCKED_RECOVERY;
+ }
+
+ if (m_scrubber->write_blocked_by_scrub(head)) {
+ dout(20) << __func__ << ": waiting for scrub" << dendl;
+ waiting_for_scrub.push_back(op);
+ op->mark_delayed("waiting for scrub");
+ return cache_result_t::BLOCKED_RECOVERY;
+ }
+ if (!check_laggy_requeue(op)) {
+ return cache_result_t::BLOCKED_RECOVERY;
+ }
+
+ for (auto& p : obc->obs.oi.manifest.chunk_map) {
+ if (p.second.is_missing()) {
+ auto m = op->get_req<MOSDOp>();
+ const object_locator_t oloc = m->get_object_locator();
+ promote_object(obc, obc->obs.oi.soid, oloc, op, NULL);
+ return cache_result_t::BLOCKED_PROMOTE;
+ }
+ }
+ return cache_result_t::NOOP;
+ }
+ default:
+ ceph_abort_msg("unrecognized manifest type");
+ }
+
+ return cache_result_t::NOOP;
+}
+
+void PrimaryLogPG::record_write_error(OpRequestRef op, const hobject_t &soid,
+ MOSDOpReply *orig_reply, int r,
+ OpContext *ctx_for_op_returns)
+{
+ dout(20) << __func__ << " r=" << r << dendl;
+ ceph_assert(op->may_write());
+ const osd_reqid_t &reqid = op->get_req<MOSDOp>()->get_reqid();
+ mempool::osd_pglog::list<pg_log_entry_t> entries;
+ entries.push_back(pg_log_entry_t(pg_log_entry_t::ERROR, soid,
+ get_next_version(), eversion_t(), 0,
+ reqid, utime_t(), r));
+ if (ctx_for_op_returns) {
+ entries.back().set_op_returns(*ctx_for_op_returns->ops);
+ dout(20) << __func__ << " op_returns=" << entries.back().op_returns << dendl;
+ }
+
+ struct OnComplete {
+ PrimaryLogPG *pg;
+ OpRequestRef op;
+ boost::intrusive_ptr<MOSDOpReply> orig_reply;
+ int r;
+ OnComplete(
+ PrimaryLogPG *pg,
+ OpRequestRef op,
+ MOSDOpReply *orig_reply,
+ int r)
+ : pg(pg), op(op),
+ orig_reply(orig_reply, false /* take over ref */), r(r)
+ {}
+ void operator()() {
+ ldpp_dout(pg, 20) << "finished " << __func__ << " r=" << r << dendl;
+ auto m = op->get_req<MOSDOp>();
+ MOSDOpReply *reply = orig_reply.detach();
+ ldpp_dout(pg, 10) << " sending commit on " << *m << " " << reply << dendl;
+ pg->osd->send_message_osd_client(reply, m->get_connection());
+ }
+ };
+
+ ObcLockManager lock_manager;
+ submit_log_entries(
+ entries,
+ std::move(lock_manager),
+ std::optional<std::function<void(void)> >(
+ OnComplete(this, op, orig_reply, r)),
+ op,
+ r);
+}
+
+PrimaryLogPG::cache_result_t PrimaryLogPG::maybe_handle_cache_detail(
+ OpRequestRef op,
+ bool write_ordered,
+ ObjectContextRef obc,
+ int r, hobject_t missing_oid,
+ bool must_promote,
+ bool in_hit_set,
+ ObjectContextRef *promote_obc)
+{
+ // return quickly if caching is not enabled
+ if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)
+ return cache_result_t::NOOP;
+
+ if (op &&
+ op->get_req() &&
+ op->get_req()->get_type() == CEPH_MSG_OSD_OP &&
+ (op->get_req<MOSDOp>()->get_flags() &
+ CEPH_OSD_FLAG_IGNORE_CACHE)) {
+ dout(20) << __func__ << ": ignoring cache due to flag" << dendl;
+ return cache_result_t::NOOP;
+ }
+
+ must_promote = must_promote || op->need_promote();
+
+ if (obc)
+ dout(25) << __func__ << " " << obc->obs.oi << " "
+ << (obc->obs.exists ? "exists" : "DNE")
+ << " missing_oid " << missing_oid
+ << " must_promote " << (int)must_promote
+ << " in_hit_set " << (int)in_hit_set
+ << dendl;
+ else
+ dout(25) << __func__ << " (no obc)"
+ << " missing_oid " << missing_oid
+ << " must_promote " << (int)must_promote
+ << " in_hit_set " << (int)in_hit_set
+ << dendl;
+
+ // if it is write-ordered and blocked, stop now
+ if (obc.get() && obc->is_blocked() && write_ordered) {
+ // we're already doing something with this object
+ dout(20) << __func__ << " blocked on " << obc->obs.oi.soid << dendl;
+ return cache_result_t::NOOP;
+ }
+
+ if (r == -ENOENT && missing_oid == hobject_t()) {
+ // we know this object is logically absent (e.g., an undefined clone)
+ return cache_result_t::NOOP;
+ }
+
+ if (obc.get() && obc->obs.exists) {
+ osd->logger->inc(l_osd_op_cache_hit);
+ return cache_result_t::NOOP;
+ }
+ if (!is_primary()) {
+ dout(20) << __func__ << " cache miss; ask the primary" << dendl;
+ osd->reply_op_error(op, -EAGAIN);
+ return cache_result_t::REPLIED_WITH_EAGAIN;
+ }
+
+ if (missing_oid == hobject_t() && obc.get()) {
+ missing_oid = obc->obs.oi.soid;
+ }
+
+ auto m = op->get_req<MOSDOp>();
+ const object_locator_t oloc = m->get_object_locator();
+
+ if (op->need_skip_handle_cache()) {
+ return cache_result_t::NOOP;
+ }
+
+ OpRequestRef promote_op;
+
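+  // per-cache-mode dispatch: proxy, promote, redirect, or block the op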
+ switch (pool.info.cache_mode) {
+ case pg_pool_t::CACHEMODE_WRITEBACK:
+ if (agent_state &&
+ agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
+ if (!op->may_write() && !op->may_cache() &&
+ !write_ordered && !must_promote) {
+ dout(20) << __func__ << " cache pool full, proxying read" << dendl;
+ do_proxy_read(op);
+ return cache_result_t::HANDLED_PROXY;
+ }
+ dout(20) << __func__ << " cache pool full, waiting" << dendl;
+ block_write_on_full_cache(missing_oid, op);
+ return cache_result_t::BLOCKED_FULL;
+ }
+
+ if (must_promote || (!hit_set && !op->need_skip_promote())) {
+ promote_object(obc, missing_oid, oloc, op, promote_obc);
+ return cache_result_t::BLOCKED_PROMOTE;
+ }
+
+ if (op->may_write() || op->may_cache()) {
+ do_proxy_write(op);
+
+ // Promote too?
+ if (!op->need_skip_promote() &&
+ maybe_promote(obc, missing_oid, oloc, in_hit_set,
+ pool.info.min_write_recency_for_promote,
+ OpRequestRef(),
+ promote_obc)) {
+ return cache_result_t::BLOCKED_PROMOTE;
+ }
+ return cache_result_t::HANDLED_PROXY;
+ } else {
+ do_proxy_read(op);
+
+ // Avoid duplicate promotion
+ if (obc.get() && obc->is_blocked()) {
+ if (promote_obc)
+ *promote_obc = obc;
+ return cache_result_t::BLOCKED_PROMOTE;
+ }
+
+ // Promote too?
+ if (!op->need_skip_promote()) {
+ (void)maybe_promote(obc, missing_oid, oloc, in_hit_set,
+ pool.info.min_read_recency_for_promote,
+ promote_op, promote_obc);
+ }
+
+ return cache_result_t::HANDLED_PROXY;
+ }
+ ceph_abort_msg("unreachable");
+ return cache_result_t::NOOP;
+
+ case pg_pool_t::CACHEMODE_READONLY:
+ // TODO: clean this case up
+ if (!obc.get() && r == -ENOENT) {
+ // we don't have the object and op's a read
+ promote_object(obc, missing_oid, oloc, op, promote_obc);
+ return cache_result_t::BLOCKED_PROMOTE;
+ }
+ if (!r) { // it must be a write
+ do_cache_redirect(op);
+ return cache_result_t::HANDLED_REDIRECT;
+ }
+ // crap, there was a failure of some kind
+ return cache_result_t::NOOP;
+
+ case pg_pool_t::CACHEMODE_FORWARD:
+ // this mode is deprecated; proxy instead
+ case pg_pool_t::CACHEMODE_PROXY:
+ if (!must_promote) {
+ if (op->may_write() || op->may_cache() || write_ordered) {
+ do_proxy_write(op);
+ return cache_result_t::HANDLED_PROXY;
+ } else {
+ do_proxy_read(op);
+ return cache_result_t::HANDLED_PROXY;
+ }
+ }
+ // ugh, we're forced to promote.
+ if (agent_state &&
+ agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
+ dout(20) << __func__ << " cache pool full, waiting" << dendl;
+ block_write_on_full_cache(missing_oid, op);
+ return cache_result_t::BLOCKED_FULL;
+ }
+ promote_object(obc, missing_oid, oloc, op, promote_obc);
+ return cache_result_t::BLOCKED_PROMOTE;
+
+ case pg_pool_t::CACHEMODE_READFORWARD:
+ // this mode is deprecated; proxy instead
+ case pg_pool_t::CACHEMODE_READPROXY:
+ // Do writeback to the cache tier for writes
+ if (op->may_write() || write_ordered || must_promote) {
+ if (agent_state &&
+ agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
+ dout(20) << __func__ << " cache pool full, waiting" << dendl;
+ block_write_on_full_cache(missing_oid, op);
+ return cache_result_t::BLOCKED_FULL;
+ }
+ promote_object(obc, missing_oid, oloc, op, promote_obc);
+ return cache_result_t::BLOCKED_PROMOTE;
+ }
+
+    // It is a read we can serve by proxying it to the base tier
+ do_proxy_read(op);
+ return cache_result_t::HANDLED_PROXY;
+
+ default:
+ ceph_abort_msg("unrecognized cache_mode");
+ }
+ return cache_result_t::NOOP;
+}
+
+bool PrimaryLogPG::maybe_promote(ObjectContextRef obc,
+ const hobject_t& missing_oid,
+ const object_locator_t& oloc,
+ bool in_hit_set,
+ uint32_t recency,
+ OpRequestRef promote_op,
+ ObjectContextRef *promote_obc)
+{
+ dout(20) << __func__ << " missing_oid " << missing_oid
+ << " in_hit_set " << in_hit_set << dendl;
+
+ switch (recency) {
+ case 0:
+ break;
+ case 1:
+ // Check if in the current hit set
+ if (in_hit_set) {
+ break;
+ } else {
+ // not promoting
+ return false;
+ }
+ break;
+ default:
+ {
+ unsigned count = (int)in_hit_set;
+ if (count) {
+ // Check if in other hit sets
+ const hobject_t& oid = obc.get() ? obc->obs.oi.soid : missing_oid;
+ for (map<time_t,HitSetRef>::reverse_iterator itor =
+ agent_state->hit_set_map.rbegin();
+ itor != agent_state->hit_set_map.rend();
+ ++itor) {
+ if (!itor->second->contains(oid)) {
+ break;
+ }
+ ++count;
+ if (count >= recency) {
+ break;
+ }
+ }
+ }
+ if (count >= recency) {
+ break;
+ }
+ return false; // not promoting
+ }
+ break;
+ }
+
+ if (osd->promote_throttle()) {
+ dout(10) << __func__ << " promote throttled" << dendl;
+ return false;
+ }
+ promote_object(obc, missing_oid, oloc, promote_op, promote_obc);
+ return true;
+}
+
+void PrimaryLogPG::do_cache_redirect(OpRequestRef op)
+{
+ auto m = op->get_req<MOSDOp>();
+ int flags = m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK);
+ MOSDOpReply *reply = new MOSDOpReply(m, -ENOENT, get_osdmap_epoch(),
+ flags, false);
+ request_redirect_t redir(m->get_object_locator(), pool.info.tier_of);
+ reply->set_redirect(redir);
+ dout(10) << "sending redirect to pool " << pool.info.tier_of << " for op "
+ << op << dendl;
+ m->get_connection()->send_message(reply);
+ return;
+}
+
+struct C_ProxyRead : public Context {
+ PrimaryLogPGRef pg;
+ hobject_t oid;
+ epoch_t last_peering_reset;
+ ceph_tid_t tid;
+ PrimaryLogPG::ProxyReadOpRef prdop;
+ utime_t start;
+ C_ProxyRead(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
+ const PrimaryLogPG::ProxyReadOpRef& prd)
+ : pg(p), oid(o), last_peering_reset(lpr),
+ tid(0), prdop(prd), start(ceph_clock_now())
+ {}
+ void finish(int r) override {
+ if (prdop->canceled)
+ return;
+ std::scoped_lock locker{*pg};
+ if (prdop->canceled) {
+ return;
+ }
+ if (last_peering_reset == pg->get_last_peering_reset()) {
+ pg->finish_proxy_read(oid, tid, r);
+ pg->osd->logger->tinc(l_osd_tier_r_lat, ceph_clock_now() - start);
+ }
+ }
+};
+
+struct C_ProxyChunkRead : public Context {
+ PrimaryLogPGRef pg;
+ hobject_t oid;
+ epoch_t last_peering_reset;
+ ceph_tid_t tid;
+ PrimaryLogPG::ProxyReadOpRef prdop;
+ utime_t start;
+ ObjectOperation *obj_op;
+ int op_index = 0;
+ uint64_t req_offset = 0;
+ ObjectContextRef obc;
+ uint64_t req_total_len = 0;
+ C_ProxyChunkRead(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
+ const PrimaryLogPG::ProxyReadOpRef& prd)
+ : pg(p), oid(o), last_peering_reset(lpr),
+ tid(0), prdop(prd), start(ceph_clock_now()), obj_op(NULL)
+ {}
+ void finish(int r) override {
+ if (prdop->canceled)
+ return;
+ std::scoped_lock locker{*pg};
+ if (prdop->canceled) {
+ return;
+ }
+ if (last_peering_reset == pg->get_last_peering_reset()) {
+ if (r >= 0) {
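+        // allocate the full-length output buffer for this op on first use,
+        // then copy the proxied chunk's data into its offset within that buffer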
+ if (!prdop->ops[op_index].outdata.length()) {
+ ceph_assert(req_total_len);
+ bufferlist list;
+ bufferptr bptr(req_total_len);
+ list.push_back(std::move(bptr));
+ prdop->ops[op_index].outdata.append(list);
+ }
+ ceph_assert(obj_op);
+ uint64_t copy_offset;
+ if (req_offset >= prdop->ops[op_index].op.extent.offset) {
+ copy_offset = req_offset - prdop->ops[op_index].op.extent.offset;
+ } else {
+ copy_offset = 0;
+ }
+ prdop->ops[op_index].outdata.begin(copy_offset).copy_in(
+ obj_op->ops[0].outdata.length(),
+ obj_op->ops[0].outdata.c_str());
+ }
+
+ pg->finish_proxy_read(oid, tid, r);
+ pg->osd->logger->tinc(l_osd_tier_r_lat, ceph_clock_now() - start);
+ if (obj_op) {
+ delete obj_op;
+ }
+ }
+ }
+};
+
+void PrimaryLogPG::do_proxy_read(OpRequestRef op, ObjectContextRef obc)
+{
+ // NOTE: non-const here because the ProxyReadOp needs mutable refs to
+ // stash the result in the request's OSDOp vector
+ MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
+ object_locator_t oloc;
+ hobject_t soid;
+ /* extensible tier */
+ if (obc && obc->obs.exists && obc->obs.oi.has_manifest()) {
+ switch (obc->obs.oi.manifest.type) {
+ case object_manifest_t::TYPE_REDIRECT:
+ oloc = object_locator_t(obc->obs.oi.manifest.redirect_target);
+ soid = obc->obs.oi.manifest.redirect_target;
+ break;
+ default:
+ ceph_abort_msg("unrecognized manifest type");
+ }
+ } else {
+ /* proxy */
+ soid = m->get_hobj();
+ oloc = object_locator_t(m->get_object_locator());
+ oloc.pool = pool.info.tier_of;
+ }
+ unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY;
+
+ // pass through some original flags that make sense.
+ // - leave out redirection and balancing flags since we are
+ // already proxying through the primary
+ // - leave off read/write/exec flags that are derived from the op
+ flags |= m->get_flags() & (CEPH_OSD_FLAG_RWORDERED |
+ CEPH_OSD_FLAG_ORDERSNAP |
+ CEPH_OSD_FLAG_ENFORCE_SNAPC |
+ CEPH_OSD_FLAG_MAP_SNAP_CLONE);
+
+ dout(10) << __func__ << " Start proxy read for " << *m << dendl;
+
+ ProxyReadOpRef prdop(std::make_shared<ProxyReadOp>(op, soid, m->ops));
+
+ ObjectOperation obj_op;
+ obj_op.dup(prdop->ops);
+
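+  // for writeback cache pools that are not in full-eviction mode, mark the
+  // proxied reads sequential and strip the DONTNEED/NOCACHE fadvise hints so
+  // the base tier keeps the data cached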
+ if (pool.info.cache_mode == pg_pool_t::CACHEMODE_WRITEBACK &&
+ (agent_state && agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL)) {
+ for (unsigned i = 0; i < obj_op.ops.size(); i++) {
+ ceph_osd_op op = obj_op.ops[i].op;
+ switch (op.op) {
+ case CEPH_OSD_OP_READ:
+ case CEPH_OSD_OP_SYNC_READ:
+ case CEPH_OSD_OP_SPARSE_READ:
+ case CEPH_OSD_OP_CHECKSUM:
+ case CEPH_OSD_OP_CMPEXT:
+ op.flags = (op.flags | CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL) &
+ ~(CEPH_OSD_OP_FLAG_FADVISE_DONTNEED | CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
+ }
+ }
+ }
+
+ C_ProxyRead *fin = new C_ProxyRead(this, soid, get_last_peering_reset(),
+ prdop);
+ ceph_tid_t tid = osd->objecter->read(
+ soid.oid, oloc, obj_op,
+ m->get_snapid(), NULL,
+ flags, new C_OnFinisher(fin, osd->get_objecter_finisher(get_pg_shard())),
+ &prdop->user_version,
+ &prdop->data_offset,
+ m->get_features());
+ fin->tid = tid;
+ prdop->objecter_tid = tid;
+ proxyread_ops[tid] = prdop;
+ in_progress_proxy_ops[soid].push_back(op);
+}
+
+void PrimaryLogPG::finish_proxy_read(hobject_t oid, ceph_tid_t tid, int r)
+{
+ dout(10) << __func__ << " " << oid << " tid " << tid
+ << " " << cpp_strerror(r) << dendl;
+
+ map<ceph_tid_t, ProxyReadOpRef>::iterator p = proxyread_ops.find(tid);
+ if (p == proxyread_ops.end()) {
+ dout(10) << __func__ << " no proxyread_op found" << dendl;
+ return;
+ }
+ ProxyReadOpRef prdop = p->second;
+ if (tid != prdop->objecter_tid) {
+ dout(10) << __func__ << " tid " << tid << " != prdop " << prdop
+ << " tid " << prdop->objecter_tid << dendl;
+ return;
+ }
+ if (oid != prdop->soid) {
+ dout(10) << __func__ << " oid " << oid << " != prdop " << prdop
+ << " soid " << prdop->soid << dendl;
+ return;
+ }
+ proxyread_ops.erase(tid);
+
+ map<hobject_t, list<OpRequestRef>>::iterator q = in_progress_proxy_ops.find(oid);
+ if (q == in_progress_proxy_ops.end()) {
+ dout(10) << __func__ << " no in_progress_proxy_ops found" << dendl;
+ return;
+ }
+ ceph_assert(q->second.size());
+ list<OpRequestRef>::iterator it = std::find(q->second.begin(),
+ q->second.end(),
+ prdop->op);
+ ceph_assert(it != q->second.end());
+ OpRequestRef op = *it;
+ q->second.erase(it);
+ if (q->second.size() == 0) {
+ in_progress_proxy_ops.erase(oid);
+ } else if (std::find(q->second.begin(),
+ q->second.end(),
+ prdop->op) != q->second.end()) {
+ /* multiple read case */
+ dout(20) << __func__ << " " << oid << " is not completed " << dendl;
+ return;
+ }
+
+ osd->logger->inc(l_osd_tier_proxy_read);
+
+ auto m = op->get_req<MOSDOp>();
+ OpContext *ctx = new OpContext(op, m->get_reqid(), &prdop->ops, this);
+ ctx->reply = new MOSDOpReply(m, 0, get_osdmap_epoch(), 0, false);
+ ctx->user_at_version = prdop->user_version;
+ ctx->data_off = prdop->data_offset;
+ ctx->ignore_log_op_stats = true;
+ complete_read_ctx(r, ctx);
+}
+
+void PrimaryLogPG::kick_proxy_ops_blocked(hobject_t& soid)
+{
+ map<hobject_t, list<OpRequestRef>>::iterator p = in_progress_proxy_ops.find(soid);
+ if (p == in_progress_proxy_ops.end())
+ return;
+
+ list<OpRequestRef>& ls = p->second;
+ dout(10) << __func__ << " " << soid << " requeuing " << ls.size() << " requests" << dendl;
+ requeue_ops(ls);
+ in_progress_proxy_ops.erase(p);
+}
+
+void PrimaryLogPG::cancel_proxy_read(ProxyReadOpRef prdop,
+ vector<ceph_tid_t> *tids)
+{
+ dout(10) << __func__ << " " << prdop->soid << dendl;
+ prdop->canceled = true;
+
+ // cancel objecter op, if we can
+ if (prdop->objecter_tid) {
+ tids->push_back(prdop->objecter_tid);
+ for (uint32_t i = 0; i < prdop->ops.size(); i++) {
+ prdop->ops[i].outdata.clear();
+ }
+ proxyread_ops.erase(prdop->objecter_tid);
+ prdop->objecter_tid = 0;
+ }
+}
+
+void PrimaryLogPG::cancel_proxy_ops(bool requeue, vector<ceph_tid_t> *tids)
+{
+ dout(10) << __func__ << dendl;
+
+ // cancel proxy reads
+ map<ceph_tid_t, ProxyReadOpRef>::iterator p = proxyread_ops.begin();
+ while (p != proxyread_ops.end()) {
+ cancel_proxy_read((p++)->second, tids);
+ }
+
+ // cancel proxy writes
+ map<ceph_tid_t, ProxyWriteOpRef>::iterator q = proxywrite_ops.begin();
+ while (q != proxywrite_ops.end()) {
+ cancel_proxy_write((q++)->second, tids);
+ }
+
+ if (requeue) {
+ map<hobject_t, list<OpRequestRef>>::iterator p =
+ in_progress_proxy_ops.begin();
+ while (p != in_progress_proxy_ops.end()) {
+ list<OpRequestRef>& ls = p->second;
+ dout(10) << __func__ << " " << p->first << " requeuing " << ls.size()
+ << " requests" << dendl;
+ requeue_ops(ls);
+ in_progress_proxy_ops.erase(p++);
+ }
+ } else {
+ in_progress_proxy_ops.clear();
+ }
+}
+
+struct C_ProxyWrite_Commit : public Context {
+ PrimaryLogPGRef pg;
+ hobject_t oid;
+ epoch_t last_peering_reset;
+ ceph_tid_t tid;
+ PrimaryLogPG::ProxyWriteOpRef pwop;
+ C_ProxyWrite_Commit(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
+ const PrimaryLogPG::ProxyWriteOpRef& pw)
+ : pg(p), oid(o), last_peering_reset(lpr),
+ tid(0), pwop(pw)
+ {}
+ void finish(int r) override {
+ if (pwop->canceled)
+ return;
+ std::scoped_lock locker{*pg};
+ if (pwop->canceled) {
+ return;
+ }
+ if (last_peering_reset == pg->get_last_peering_reset()) {
+ pg->finish_proxy_write(oid, tid, r);
+ }
+ }
+};
+
+void PrimaryLogPG::do_proxy_write(OpRequestRef op, ObjectContextRef obc)
+{
+ // NOTE: non-const because ProxyWriteOp takes a mutable ref
+ MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
+ object_locator_t oloc;
+ SnapContext snapc(m->get_snap_seq(), m->get_snaps());
+ hobject_t soid;
+ /* extensible tier */
+ if (obc && obc->obs.exists && obc->obs.oi.has_manifest()) {
+ switch (obc->obs.oi.manifest.type) {
+ case object_manifest_t::TYPE_REDIRECT:
+ oloc = object_locator_t(obc->obs.oi.manifest.redirect_target);
+ soid = obc->obs.oi.manifest.redirect_target;
+ break;
+ default:
+ ceph_abort_msg("unrecognized manifest type");
+ }
+ } else {
+ /* proxy */
+ soid = m->get_hobj();
+ oloc = object_locator_t(m->get_object_locator());
+ oloc.pool = pool.info.tier_of;
+ }
+
+ unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY;
+ if (!(op->may_write() || op->may_cache())) {
+ flags |= CEPH_OSD_FLAG_RWORDERED;
+ }
+ if (op->allows_returnvec()) {
+ flags |= CEPH_OSD_FLAG_RETURNVEC;
+ }
+
+ dout(10) << __func__ << " Start proxy write for " << *m << dendl;
+
+ ProxyWriteOpRef pwop(std::make_shared<ProxyWriteOp>(op, soid, m->ops, m->get_reqid()));
+ pwop->ctx = new OpContext(op, m->get_reqid(), &pwop->ops, this);
+ pwop->mtime = m->get_mtime();
+
+ ObjectOperation obj_op;
+ obj_op.dup(pwop->ops);
+
+ C_ProxyWrite_Commit *fin = new C_ProxyWrite_Commit(
+ this, soid, get_last_peering_reset(), pwop);
+ ceph_tid_t tid = osd->objecter->mutate(
+ soid.oid, oloc, obj_op, snapc,
+ ceph::real_clock::from_ceph_timespec(pwop->mtime),
+ flags, new C_OnFinisher(fin, osd->get_objecter_finisher(get_pg_shard())),
+ &pwop->user_version, pwop->reqid);
+ fin->tid = tid;
+ pwop->objecter_tid = tid;
+ proxywrite_ops[tid] = pwop;
+ in_progress_proxy_ops[soid].push_back(op);
+}
+
+void PrimaryLogPG::do_proxy_chunked_op(OpRequestRef op, const hobject_t& missing_oid,
+ ObjectContextRef obc, bool write_ordered)
+{
+ MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
+ OSDOp *osd_op = NULL;
+ for (unsigned int i = 0; i < m->ops.size(); i++) {
+ osd_op = &m->ops[i];
+ uint64_t cursor = osd_op->op.extent.offset;
+ uint64_t op_length = osd_op->op.extent.offset + osd_op->op.extent.length;
+ uint64_t chunk_length = 0, chunk_index = 0, req_len = 0;
+ object_manifest_t *manifest = &obc->obs.oi.manifest;
+ map <uint64_t, map<uint64_t, uint64_t>> chunk_read;
+
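+    // walk the requested extent, splitting the read wherever it crosses a
+    // chunk boundary in the object's manifest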
+ while (cursor < op_length) {
+ chunk_index = 0;
+ chunk_length = 0;
+ /* find the right chunk position for cursor */
+ for (auto &p : manifest->chunk_map) {
+ if (p.first <= cursor && p.first + p.second.length > cursor) {
+ chunk_length = p.second.length;
+ chunk_index = p.first;
+ break;
+ }
+ }
+ /* no index */
+ if (!chunk_index && !chunk_length) {
+ if (cursor == osd_op->op.extent.offset) {
+ OpContext *ctx = new OpContext(op, m->get_reqid(), &m->ops, this);
+ ctx->reply = new MOSDOpReply(m, 0, get_osdmap_epoch(), 0, false);
+ ctx->data_off = osd_op->op.extent.offset;
+ ctx->ignore_log_op_stats = true;
+ complete_read_ctx(0, ctx);
+ }
+ break;
+ }
+ uint64_t next_length = chunk_length;
+ /* the size to read -> | op length | */
+ /* | a chunk | */
+ if (cursor + next_length > op_length) {
+ next_length = op_length - cursor;
+ }
+ /* the size to read -> | op length | */
+ /* | a chunk | */
+ if (cursor + next_length > chunk_index + chunk_length) {
+ next_length = chunk_index + chunk_length - cursor;
+ }
+
+ chunk_read[cursor] = {{chunk_index, next_length}};
+ cursor += next_length;
+ }
+
+ req_len = cursor - osd_op->op.extent.offset;
+ for (auto &p : chunk_read) {
+ auto chunks = p.second.begin();
+ dout(20) << __func__ << " chunk_index: " << chunks->first
+ << " next_length: " << chunks->second << " cursor: "
+ << p.first << dendl;
+ do_proxy_chunked_read(op, obc, i, chunks->first, p.first, chunks->second, req_len, write_ordered);
+ }
+ }
+}
+
+struct RefCountCallback : public Context {
+public:
+ PrimaryLogPG::OpContext *ctx;
+ OSDOp& osd_op;
+ bool requeue = false;
+
+ RefCountCallback(PrimaryLogPG::OpContext *ctx, OSDOp &osd_op)
+ : ctx(ctx), osd_op(osd_op) {}
+ void finish(int r) override {
+ // NB: caller must already have pg->lock held
+ ctx->obc->stop_block();
+ ctx->pg->kick_object_context_blocked(ctx->obc);
+ if (r >= 0) {
+ osd_op.rval = 0;
+ ctx->pg->execute_ctx(ctx);
+ } else {
+ // on cancel simply toss op out,
+ // or requeue as requested
+ if (r != -ECANCELED) {
+ if (ctx->op)
+ ctx->pg->osd->reply_op_error(ctx->op, r);
+ } else if (requeue) {
+ if (ctx->op)
+ ctx->pg->requeue_op(ctx->op);
+ }
+ ctx->pg->close_op_ctx(ctx);
+ }
+ }
+ void set_requeue(bool rq) {
+ requeue = rq;
+ }
+};
+
+struct SetManifestFinisher : public PrimaryLogPG::OpFinisher {
+ OSDOp& osd_op;
+
+ explicit SetManifestFinisher(OSDOp& osd_op) : osd_op(osd_op) {
+ }
+
+ int execute() override {
+ return osd_op.rval;
+ }
+};
+
+struct C_SetManifestRefCountDone : public Context {
+ PrimaryLogPGRef pg;
+ PrimaryLogPG::ManifestOpRef mop;
+ hobject_t soid;
+ C_SetManifestRefCountDone(PrimaryLogPG *p,
+ PrimaryLogPG::ManifestOpRef mop, hobject_t soid) :
+ pg(p), mop(mop), soid(soid) {}
+ void finish(int r) override {
+ if (r == -ECANCELED)
+ return;
+ std::scoped_lock locker{*pg};
+ auto it = pg->manifest_ops.find(soid);
+ if (it == pg->manifest_ops.end()) {
+ // raced with cancel_manifest_ops
+ return;
+ }
+ if (it->second->cb) {
+ it->second->cb->complete(r);
+ }
+ pg->manifest_ops.erase(it);
+ mop.reset();
+ }
+};
+
+struct C_SetDedupChunks : public Context {
+ PrimaryLogPGRef pg;
+ hobject_t oid;
+ epoch_t last_peering_reset;
+ ceph_tid_t tid;
+ uint64_t offset;
+
+ C_SetDedupChunks(PrimaryLogPG *p, hobject_t o, epoch_t lpr, uint64_t offset)
+ : pg(p), oid(o), last_peering_reset(lpr),
+ tid(0), offset(offset)
+ {}
+ void finish(int r) override {
+ if (r == -ECANCELED)
+ return;
+ std::scoped_lock locker{*pg};
+ if (last_peering_reset != pg->get_last_peering_reset()) {
+ return;
+ }
+ pg->finish_set_dedup(oid, r, tid, offset);
+ }
+};
+
+void PrimaryLogPG::cancel_manifest_ops(bool requeue, vector<ceph_tid_t> *tids)
+{
+ dout(10) << __func__ << dendl;
+ auto p = manifest_ops.begin();
+ while (p != manifest_ops.end()) {
+ auto mop = p->second;
+ // cancel objecter op, if we can
+ if (mop->objecter_tid) {
+ tids->push_back(mop->objecter_tid);
+ mop->objecter_tid = 0;
+ }
+ if (mop->cb) {
+ mop->cb->set_requeue(requeue);
+ mop->cb->complete(-ECANCELED);
+ }
+ manifest_ops.erase(p++);
+ }
+}
+
+int PrimaryLogPG::get_manifest_ref_count(ObjectContextRef obc, std::string& fp_oid, OpRequestRef op)
+{
+ int cnt = 0;
+ // head
+ for (auto &p : obc->obs.oi.manifest.chunk_map) {
+ if (p.second.oid.oid.name == fp_oid) {
+ cnt++;
+ }
+ }
+ // snap
+ SnapSet& ss = obc->ssc->snapset;
+ const OSDMapRef& osdmap = get_osdmap();
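+  // walk the clones newest-to-oldest and add the references each clone
+  // would contribute for this fingerprint oid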
+ for (vector<snapid_t>::const_reverse_iterator p = ss.clones.rbegin();
+ p != ss.clones.rend();
+ ++p) {
+ object_ref_delta_t refs;
+ ObjectContextRef obc_l = nullptr;
+ ObjectContextRef obc_g = nullptr;
+ hobject_t clone_oid = obc->obs.oi.soid;
+ clone_oid.snap = *p;
+ if (osdmap->in_removed_snaps_queue(info.pgid.pgid.pool(), *p)) {
+ return -EBUSY;
+ }
+ ObjectContextRef clone_obc = get_object_context(clone_oid, false);
+ if (!clone_obc) {
+ break;
+ }
+ if (recover_adjacent_clones(clone_obc, op)) {
+ return -EAGAIN;
+ }
+ get_adjacent_clones(clone_obc, obc_l, obc_g);
+ clone_obc->obs.oi.manifest.calc_refs_to_inc_on_set(
+ obc_g ? &(obc_g->obs.oi.manifest) : nullptr ,
+ nullptr,
+ refs);
+ for (auto p = refs.begin(); p != refs.end(); ++p) {
+ if (p->first.oid.name == fp_oid && p->second > 0) {
+ cnt += p->second;
+ }
+ }
+ }
+
+ return cnt;
+}
+
+bool PrimaryLogPG::recover_adjacent_clones(ObjectContextRef obc, OpRequestRef op)
+{
+ if (!obc->ssc || !obc->ssc->snapset.clones.size()) {
+ return false;
+ }
+ MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
+ bool has_manifest_op = std::any_of(
+ begin(m->ops),
+ end(m->ops),
+ [](const auto& osd_op) {
+ return osd_op.op.op == CEPH_OSD_OP_SET_CHUNK;
+ });
+ if (!obc->obs.oi.manifest.is_chunked() && !has_manifest_op) {
+ return false;
+ }
+ ceph_assert(op);
+
+ const SnapSet& snapset = obc->ssc->snapset;
+ auto s = std::find(snapset.clones.begin(), snapset.clones.end(), obc->obs.oi.soid.snap);
+ auto is_unreadable_snap = [this, obc, &snapset, op](auto iter) -> bool {
+ hobject_t cid = obc->obs.oi.soid;
+ cid.snap = (iter == snapset.clones.end()) ? snapid_t(CEPH_NOSNAP) : *iter;
+ if (is_unreadable_object(cid)) {
+ dout(10) << __func__ << ": clone " << cid
+ << " is unreadable, waiting" << dendl;
+ wait_for_unreadable_object(cid, op);
+ return true;
+ }
+ return false;
+ };
+ if (s != snapset.clones.begin()) {
+ if (is_unreadable_snap(s - 1)) {
+ return true;
+ }
+ }
+ if (s != snapset.clones.end()) {
+ if (is_unreadable_snap(s + 1)) {
+ return true;
+ }
+ }
+ return false;
+}
+
+ObjectContextRef PrimaryLogPG::get_prev_clone_obc(ObjectContextRef obc)
+{
+ auto s = std::find(obc->ssc->snapset.clones.begin(), obc->ssc->snapset.clones.end(),
+ obc->obs.oi.soid.snap);
+ if (s != obc->ssc->snapset.clones.begin()) {
+ auto s_iter = s - 1;
+ hobject_t cid = obc->obs.oi.soid;
+ object_ref_delta_t refs;
+ cid.snap = *s_iter;
+ ObjectContextRef cobc = get_object_context(cid, false, NULL);
+ ceph_assert(cobc);
+ return cobc;
+ }
+ return nullptr;
+}
+
+void PrimaryLogPG::dec_refcount(const hobject_t& soid, const object_ref_delta_t& refs)
+{
+ for (auto p = refs.begin(); p != refs.end(); ++p) {
+ int dec_ref_count = p->second;
+ ceph_assert(dec_ref_count < 0);
+ while (dec_ref_count < 0) {
+ dout(10) << __func__ << ": decrement reference on offset oid: " << p->first << dendl;
+ refcount_manifest(soid, p->first,
+ refcount_t::DECREMENT_REF, NULL, std::nullopt);
+ dec_ref_count++;
+ }
+ }
+}
+
+
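+ // Fill _l and _g with the object contexts of the clones adjacent to src_obc
+ // in snap order (the head is treated as the newest entry).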
+void PrimaryLogPG::get_adjacent_clones(ObjectContextRef src_obc,
+ ObjectContextRef& _l, ObjectContextRef& _g)
+{
+ const SnapSet& snapset = src_obc->ssc->snapset;
+ const object_info_t& oi = src_obc->obs.oi;
+
+ auto get_context = [this, &oi, &snapset](auto iter)
+ -> ObjectContextRef {
+ hobject_t cid = oi.soid;
+ cid.snap = (iter == snapset.clones.end()) ? snapid_t(CEPH_NOSNAP) : *iter;
+ ObjectContextRef obc = get_object_context(cid, false, NULL);
+ ceph_assert(obc);
+ return obc;
+ };
+
+ // check adjacent clones
+ auto s = std::find(snapset.clones.begin(), snapset.clones.end(), oi.soid.snap);
+
+ // We *must* find the clone iff the object is not head;
+ // s == snapset.clones.end() means head
+ ceph_assert((s == snapset.clones.end()) == oi.soid.is_head());
+
+ if (s != snapset.clones.begin()) {
+ _l = get_context(s - 1);
+ }
+
+ if (s != snapset.clones.end()) {
+ _g = get_context(s + 1);
+ }
+}
+
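+ // Called for set-chunk: compute the refcount delta against the adjacent
+ // clones. If a new reference must be taken, issue it asynchronously, block
+ // the object and return true; a decrement is simply queued for commit.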
+bool PrimaryLogPG::inc_refcount_by_set(OpContext* ctx, object_manifest_t& set_chunk,
+ OSDOp& osd_op)
+{
+ object_ref_delta_t refs;
+ ObjectContextRef obc_l, obc_g;
+ get_adjacent_clones(ctx->obc, obc_l, obc_g);
+ set_chunk.calc_refs_to_inc_on_set(
+ obc_l ? &(obc_l->obs.oi.manifest) : nullptr,
+ obc_g ? &(obc_g->obs.oi.manifest) : nullptr,
+ refs);
+ if (!refs.is_empty()) {
+ /* This is called by set-chunk, so we only consider a single chunk for the time being */
+ ceph_assert(refs.size() == 1);
+ auto p = refs.begin();
+ int inc_ref_count = p->second;
+ if (inc_ref_count > 0) {
+ /*
+ * In the set-chunk case, the first thing we should do is increment
+ * the reference the target object holds before updating object_manifest in object_info_t,
+ * so call refcount_manifest directly.
+ */
+ ManifestOpRef mop = std::make_shared<ManifestOp>(new RefCountCallback(ctx, osd_op));
+ C_SetManifestRefCountDone* fin = new C_SetManifestRefCountDone(this, mop, ctx->obs->oi.soid);
+ ceph_tid_t tid = refcount_manifest(ctx->obs->oi.soid, p->first,
+ refcount_t::INCREMENT_REF, fin, std::nullopt);
+ mop->objecter_tid = tid;
+ manifest_ops[ctx->obs->oi.soid] = mop;
+ ctx->obc->start_block();
+ return true;
+ } else if (inc_ref_count < 0) {
+ hobject_t src = ctx->obs->oi.soid;
+ hobject_t tgt = p->first;
+ ctx->register_on_commit(
+ [src, tgt, this](){
+ refcount_manifest(src, tgt, refcount_t::DECREMENT_REF, NULL, std::nullopt);
+ });
+ return false;
+ }
+ }
+
+ return false;
+}
+
+void PrimaryLogPG::dec_refcount_by_dirty(OpContext* ctx)
+{
+ object_ref_delta_t refs;
+ ObjectContextRef cobc = nullptr;
+ ObjectContextRef obc = ctx->obc;
+ for (auto &p : ctx->obs->oi.manifest.chunk_map) {
+ if (!ctx->clean_regions.is_clean_region(p.first, p.second.length)) {
+ ctx->new_obs.oi.manifest.chunk_map.erase(p.first);
+ if (ctx->new_obs.oi.manifest.chunk_map.empty()) {
+ ctx->new_obs.oi.manifest.type = object_manifest_t::TYPE_NONE;
+ ctx->new_obs.oi.clear_flag(object_info_t::FLAG_MANIFEST);
+ ctx->delta_stats.num_objects_manifest--;
+ }
+ }
+ }
+ // Look at the previous snapshot to figure out whether the updated chunks need to be deleted
+ cobc = get_prev_clone_obc(obc);
+ obc->obs.oi.manifest.calc_refs_to_drop_on_modify(
+ cobc ? &cobc->obs.oi.manifest : nullptr,
+ ctx->clean_regions,
+ refs);
+ if (!refs.is_empty()) {
+ hobject_t soid = obc->obs.oi.soid;
+ ctx->register_on_commit(
+ [soid, this, refs](){
+ dec_refcount(soid, refs);
+ });
+ }
+}
+
+void PrimaryLogPG::dec_all_refcount_manifest(const object_info_t& oi, OpContext* ctx)
+{
+ ceph_assert(oi.has_manifest());
+ ceph_assert(ctx->obc->ssc);
+
+ if (oi.manifest.is_chunked()) {
+ object_ref_delta_t refs;
+ ObjectContextRef obc_l, obc_g;
+ get_adjacent_clones(ctx->obc, obc_l, obc_g);
+ oi.manifest.calc_refs_to_drop_on_removal(
+ obc_l ? &(obc_l->obs.oi.manifest) : nullptr,
+ obc_g ? &(obc_g->obs.oi.manifest) : nullptr,
+ refs);
+
+ if (!refs.is_empty()) {
+ hobject_t soid = ctx->obc->obs.oi.soid;
+ ctx->register_on_commit(
+ [soid, this, refs](){
+ dec_refcount(soid, refs);
+ });
+ }
+ } else if (oi.manifest.is_redirect() &&
+ oi.test_flag(object_info_t::FLAG_REDIRECT_HAS_REFERENCE)) {
+ ctx->register_on_commit(
+ [oi, this](){
+ refcount_manifest(oi.soid, oi.manifest.redirect_target,
+ refcount_t::DECREMENT_REF, NULL, std::nullopt);
+ });
+ }
+}
+
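+ // Issue a cls "cas" call (chunk_get_ref / chunk_put_ref /
+ // chunk_create_or_get_ref) against tgt_soid on behalf of src_soid and
+ // return the objecter tid.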
+ceph_tid_t PrimaryLogPG::refcount_manifest(hobject_t src_soid, hobject_t tgt_soid, refcount_t type,
+ Context *cb, std::optional<bufferlist> chunk)
+{
+ unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY |
+ CEPH_OSD_FLAG_RWORDERED;
+
+ dout(10) << __func__ << " Start refcount from " << src_soid
+ << " to " << tgt_soid << dendl;
+
+ ObjectOperation obj_op;
+ bufferlist in;
+ if (type == refcount_t::INCREMENT_REF) {
+ cls_cas_chunk_get_ref_op call;
+ call.source = src_soid.get_head();
+ ::encode(call, in);
+ obj_op.call("cas", "chunk_get_ref", in);
+ } else if (type == refcount_t::DECREMENT_REF) {
+ cls_cas_chunk_put_ref_op call;
+ call.source = src_soid.get_head();
+ ::encode(call, in);
+ obj_op.call("cas", "chunk_put_ref", in);
+ } else if (type == refcount_t::CREATE_OR_GET_REF) {
+ cls_cas_chunk_create_or_get_ref_op get_call;
+ get_call.source = src_soid.get_head();
+ ceph_assert(chunk);
+ get_call.data = move(*chunk);
+ ::encode(get_call, in);
+ obj_op.call("cas", "chunk_create_or_get_ref", in);
+ } else {
+ ceph_assert(0 == "unrecognized type");
+ }
+
+ Context *c = nullptr;
+ if (cb) {
+ c = new C_OnFinisher(cb, osd->get_objecter_finisher(get_pg_shard()));
+ }
+
+ object_locator_t oloc(tgt_soid);
+ ObjectContextRef src_obc = get_object_context(src_soid, false, NULL);
+ ceph_assert(src_obc);
+ auto tid = osd->objecter->mutate(
+ tgt_soid.oid, oloc, obj_op, SnapContext(),
+ ceph::real_clock::from_ceph_timespec(src_obc->obs.oi.mtime),
+ flags, c);
+ return tid;
+}
+
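+ // Proxy the read of a single manifest chunk to its backing object and track
+ // the in-flight op so the result can be folded back into the client reply.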
+void PrimaryLogPG::do_proxy_chunked_read(OpRequestRef op, ObjectContextRef obc, int op_index,
+ uint64_t chunk_index, uint64_t req_offset, uint64_t req_length,
+ uint64_t req_total_len, bool write_ordered)
+{
+ MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
+ object_manifest_t *manifest = &obc->obs.oi.manifest;
+ if (!manifest->chunk_map.count(chunk_index)) {
+ return;
+ }
+ uint64_t chunk_length = manifest->chunk_map[chunk_index].length;
+ hobject_t soid = manifest->chunk_map[chunk_index].oid;
+ hobject_t ori_soid = m->get_hobj();
+ object_locator_t oloc(soid);
+ unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY;
+ if (write_ordered) {
+ flags |= CEPH_OSD_FLAG_RWORDERED;
+ }
+
+ if (!chunk_length || soid == hobject_t()) {
+ return;
+ }
+
+ /* same as do_proxy_read() */
+ flags |= m->get_flags() & (CEPH_OSD_FLAG_RWORDERED |
+ CEPH_OSD_FLAG_ORDERSNAP |
+ CEPH_OSD_FLAG_ENFORCE_SNAPC |
+ CEPH_OSD_FLAG_MAP_SNAP_CLONE);
+
+ dout(10) << __func__ << " Start do chunk proxy read for " << *m
+ << " index: " << op_index << " oid: " << soid.oid.name << " req_offset: " << req_offset
+ << " req_length: " << req_length << dendl;
+
+ ProxyReadOpRef prdop(std::make_shared<ProxyReadOp>(op, ori_soid, m->ops));
+
+ ObjectOperation *pobj_op = new ObjectOperation;
+ OSDOp &osd_op = pobj_op->add_op(m->ops[op_index].op.op);
+
+ if (chunk_index <= req_offset) {
+ osd_op.op.extent.offset = manifest->chunk_map[chunk_index].offset + req_offset - chunk_index;
+ } else {
+ ceph_abort_msg("chunk_index > req_offset");
+ }
+ osd_op.op.extent.length = req_length;
+
+ ObjectOperation obj_op;
+ obj_op.dup(pobj_op->ops);
+
+ C_ProxyChunkRead *fin = new C_ProxyChunkRead(this, ori_soid, get_last_peering_reset(),
+ prdop);
+ fin->obj_op = pobj_op;
+ fin->op_index = op_index;
+ fin->req_offset = req_offset;
+ fin->obc = obc;
+ fin->req_total_len = req_total_len;
+
+ ceph_tid_t tid = osd->objecter->read(
+ soid.oid, oloc, obj_op,
+ m->get_snapid(), NULL,
+ flags, new C_OnFinisher(fin, osd->get_objecter_finisher(get_pg_shard())),
+ &prdop->user_version,
+ &prdop->data_offset,
+ m->get_features());
+ fin->tid = tid;
+ prdop->objecter_tid = tid;
+ proxyread_ops[tid] = prdop;
+ in_progress_proxy_ops[ori_soid].push_back(op);
+}
+
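+ // A chunked read can be proxied only if every READ/SYNC_READ extent in the
+ // message is fully covered by chunk_map entries flagged missing (i.e. whose
+ // data is not present in the local object).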
+bool PrimaryLogPG::can_proxy_chunked_read(OpRequestRef op, ObjectContextRef obc)
+{
+ MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
+ OSDOp *osd_op = NULL;
+ bool ret = true;
+ for (unsigned int i = 0; i < m->ops.size(); i++) {
+ osd_op = &m->ops[i];
+ ceph_osd_op op = osd_op->op;
+ switch (op.op) {
+ case CEPH_OSD_OP_READ:
+ case CEPH_OSD_OP_SYNC_READ: {
+ uint64_t cursor = osd_op->op.extent.offset;
+ uint64_t remain = osd_op->op.extent.length;
+
+ /* do the requested chunks exist in chunk_map? */
+ for (auto &p : obc->obs.oi.manifest.chunk_map) {
+ if (p.first <= cursor && p.first + p.second.length > cursor) {
+ if (!p.second.is_missing()) {
+ return false;
+ }
+ if (p.second.length >= remain) {
+ remain = 0;
+ break;
+ } else {
+ remain = remain - p.second.length;
+ }
+ cursor += p.second.length;
+ }
+ }
+
+ if (remain) {
+ dout(20) << __func__ << " requested chunks don't exist in chunk_map " << dendl;
+ return false;
+ }
+ continue;
+ }
+ default:
+ return false;
+ }
+ }
+ return ret;
+}
+
+void PrimaryLogPG::finish_proxy_write(hobject_t oid, ceph_tid_t tid, int r)
+{
+ dout(10) << __func__ << " " << oid << " tid " << tid
+ << " " << cpp_strerror(r) << dendl;
+
+ map<ceph_tid_t, ProxyWriteOpRef>::iterator p = proxywrite_ops.find(tid);
+ if (p == proxywrite_ops.end()) {
+ dout(10) << __func__ << " no proxywrite_op found" << dendl;
+ return;
+ }
+ ProxyWriteOpRef pwop = p->second;
+ ceph_assert(tid == pwop->objecter_tid);
+ ceph_assert(oid == pwop->soid);
+
+ proxywrite_ops.erase(tid);
+
+ map<hobject_t, list<OpRequestRef> >::iterator q = in_progress_proxy_ops.find(oid);
+ if (q == in_progress_proxy_ops.end()) {
+ dout(10) << __func__ << " no in_progress_proxy_ops found" << dendl;
+ delete pwop->ctx;
+ pwop->ctx = NULL;
+ return;
+ }
+ list<OpRequestRef>& in_progress_op = q->second;
+ ceph_assert(in_progress_op.size());
+ list<OpRequestRef>::iterator it = std::find(in_progress_op.begin(),
+ in_progress_op.end(),
+ pwop->op);
+ ceph_assert(it != in_progress_op.end());
+ in_progress_op.erase(it);
+ if (in_progress_op.size() == 0) {
+ in_progress_proxy_ops.erase(oid);
+ } else if (std::find(in_progress_op.begin(),
+ in_progress_op.end(),
+ pwop->op) != in_progress_op.end()) {
+ if (pwop->ctx)
+ delete pwop->ctx;
+ pwop->ctx = NULL;
+ dout(20) << __func__ << " " << oid << " tid " << tid
+ << " in_progress_op size: "
+ << in_progress_op.size() << dendl;
+ return;
+ }
+
+ osd->logger->inc(l_osd_tier_proxy_write);
+
+ auto m = pwop->op->get_req<MOSDOp>();
+ ceph_assert(m != NULL);
+
+ if (!pwop->sent_reply) {
+ // send commit.
+ assert(pwop->ctx->reply == nullptr);
+ MOSDOpReply *reply = new MOSDOpReply(m, r, get_osdmap_epoch(), 0,
+ true /* we claim it below */);
+ reply->set_reply_versions(eversion_t(), pwop->user_version);
+ reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
+ reply->claim_op_out_data(pwop->ops);
+ dout(10) << " sending commit on " << pwop << " " << reply << dendl;
+ osd->send_message_osd_client(reply, m->get_connection());
+ pwop->sent_reply = true;
+ pwop->ctx->op->mark_commit_sent();
+ }
+
+ delete pwop->ctx;
+ pwop->ctx = NULL;
+}
+
+void PrimaryLogPG::cancel_proxy_write(ProxyWriteOpRef pwop,
+ vector<ceph_tid_t> *tids)
+{
+ dout(10) << __func__ << " " << pwop->soid << dendl;
+ pwop->canceled = true;
+
+ // cancel objecter op, if we can
+ if (pwop->objecter_tid) {
+ tids->push_back(pwop->objecter_tid);
+ delete pwop->ctx;
+ pwop->ctx = NULL;
+ proxywrite_ops.erase(pwop->objecter_tid);
+ pwop->objecter_tid = 0;
+ }
+}
+
+class PromoteCallback: public PrimaryLogPG::CopyCallback {
+ ObjectContextRef obc;
+ PrimaryLogPG *pg;
+ utime_t start;
+public:
+ PromoteCallback(ObjectContextRef obc_, PrimaryLogPG *pg_)
+ : obc(obc_),
+ pg(pg_),
+ start(ceph_clock_now()) {}
+
+ void finish(PrimaryLogPG::CopyCallbackResults results) override {
+ PrimaryLogPG::CopyResults *results_data = results.get<1>();
+ int r = results.get<0>();
+ pg->finish_promote(r, results_data, obc);
+ pg->osd->logger->tinc(l_osd_tier_promote_lat, ceph_clock_now() - start);
+ }
+};
+
+class PromoteManifestCallback: public PrimaryLogPG::CopyCallback {
+ ObjectContextRef obc;
+ PrimaryLogPG *pg;
+ utime_t start;
+ PrimaryLogPG::OpContext *ctx;
+ PrimaryLogPG::CopyCallbackResults promote_results;
+public:
+ PromoteManifestCallback(ObjectContextRef obc_, PrimaryLogPG *pg_, PrimaryLogPG::OpContext *ctx = NULL)
+ : obc(obc_),
+ pg(pg_),
+ start(ceph_clock_now()), ctx(ctx) {}
+
+ void finish(PrimaryLogPG::CopyCallbackResults results) override {
+ PrimaryLogPG::CopyResults *results_data = results.get<1>();
+ int r = results.get<0>();
+ if (ctx) {
+ promote_results = results;
+ pg->execute_ctx(ctx);
+ } else {
+ pg->finish_promote_manifest(r, results_data, obc);
+ }
+ pg->osd->logger->tinc(l_osd_tier_promote_lat, ceph_clock_now() - start);
+ }
+ friend struct PromoteFinisher;
+};
+
+struct PromoteFinisher : public PrimaryLogPG::OpFinisher {
+ PromoteManifestCallback *promote_callback;
+
+ explicit PromoteFinisher(PromoteManifestCallback *promote_callback)
+ : promote_callback(promote_callback) {
+ }
+
+ int execute() override {
+ if (promote_callback->ctx->obc->obs.oi.manifest.is_redirect()) {
+ promote_callback->ctx->pg->finish_promote(promote_callback->promote_results.get<0>(),
+ promote_callback->promote_results.get<1>(),
+ promote_callback->obc);
+ } else if (promote_callback->ctx->obc->obs.oi.manifest.is_chunked()) {
+ promote_callback->ctx->pg->finish_promote_manifest(promote_callback->promote_results.get<0>(),
+ promote_callback->promote_results.get<1>(),
+ promote_callback->obc);
+ } else {
+ ceph_abort_msg("unrecognized manifest type");
+ }
+ return 0;
+ }
+};
+
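+ // Start promoting an object into this pool from the base tier (or from its
+ // manifest redirect/chunk source): block the obc, start the copy, and park
+ // the triggering op until the promote completes.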
+void PrimaryLogPG::promote_object(ObjectContextRef obc,
+ const hobject_t& missing_oid,
+ const object_locator_t& oloc,
+ OpRequestRef op,
+ ObjectContextRef *promote_obc)
+{
+ hobject_t hoid = obc ? obc->obs.oi.soid : missing_oid;
+ ceph_assert(hoid != hobject_t());
+ if (m_scrubber->write_blocked_by_scrub(hoid)) {
+ dout(10) << __func__ << " " << hoid
+ << " blocked by scrub" << dendl;
+ if (op) {
+ waiting_for_scrub.push_back(op);
+ op->mark_delayed("waiting for scrub");
+ dout(10) << __func__ << " " << hoid
+ << " placing op in waiting_for_scrub" << dendl;
+ } else {
+ dout(10) << __func__ << " " << hoid
+ << " no op, dropping on the floor" << dendl;
+ }
+ return;
+ }
+ if (op && !check_laggy_requeue(op)) {
+ return;
+ }
+ if (!obc) { // we need to create an ObjectContext
+ ceph_assert(missing_oid != hobject_t());
+ obc = get_object_context(missing_oid, true);
+ }
+ if (promote_obc)
+ *promote_obc = obc;
+
+ /*
+ * If there are proxy reads still in flight for the object before the
+ * promote completes, don't use DONTNEED.
+ */
+ unsigned src_fadvise_flags = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL;
+ map<hobject_t, list<OpRequestRef>>::iterator q = in_progress_proxy_ops.find(obc->obs.oi.soid);
+ if (q == in_progress_proxy_ops.end()) {
+ src_fadvise_flags |= LIBRADOS_OP_FLAG_FADVISE_DONTNEED;
+ }
+
+ CopyCallback *cb;
+ object_locator_t my_oloc;
+ hobject_t src_hoid;
+ if (!obc->obs.oi.has_manifest()) {
+ my_oloc = oloc;
+ my_oloc.pool = pool.info.tier_of;
+ src_hoid = obc->obs.oi.soid;
+ cb = new PromoteCallback(obc, this);
+ } else {
+ if (obc->obs.oi.manifest.is_chunked()) {
+ src_hoid = obc->obs.oi.soid;
+ cb = new PromoteManifestCallback(obc, this);
+ } else if (obc->obs.oi.manifest.is_redirect()) {
+ object_locator_t src_oloc(obc->obs.oi.manifest.redirect_target);
+ my_oloc = src_oloc;
+ src_hoid = obc->obs.oi.manifest.redirect_target;
+ cb = new PromoteCallback(obc, this);
+ } else {
+ ceph_abort_msg("unrecognized manifest type");
+ }
+ }
+
+ unsigned flags = CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY |
+ CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE |
+ CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE |
+ CEPH_OSD_COPY_FROM_FLAG_RWORDERED;
+ start_copy(cb, obc, src_hoid, my_oloc, 0, flags,
+ obc->obs.oi.soid.snap == CEPH_NOSNAP,
+ src_fadvise_flags, 0);
+
+ ceph_assert(obc->is_blocked());
+
+ if (op)
+ wait_for_blocked_object(obc->obs.oi.soid, op);
+
+ recovery_state.update_stats(
+ [](auto &history, auto &stats) {
+ stats.stats.sum.num_promote++;
+ return false;
+ });
+}
+
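+ // Main execution path for an op context: build the transaction, reply
+ // directly for reads and errors, and otherwise submit a RepGather to
+ // replicate and commit the update.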
+void PrimaryLogPG::execute_ctx(OpContext *ctx)
+{
+ FUNCTRACE(cct);
+ dout(10) << __func__ << " " << ctx << dendl;
+ ctx->reset_obs(ctx->obc);
+ ctx->update_log_only = false; // reset in case finish_copyfrom() is re-running execute_ctx
+ OpRequestRef op = ctx->op;
+ auto m = op->get_req<MOSDOp>();
+ ObjectContextRef obc = ctx->obc;
+ const hobject_t& soid = obc->obs.oi.soid;
+
+ // this method must be idempotent since we may call it several times
+ // before we finally apply the resulting transaction.
+ ctx->op_t.reset(new PGTransaction);
+
+ if (op->may_write() || op->may_cache()) {
+ // snap
+ if (!(m->has_flag(CEPH_OSD_FLAG_ENFORCE_SNAPC)) &&
+ pool.info.is_pool_snaps_mode()) {
+ // use pool's snapc
+ ctx->snapc = pool.snapc;
+ } else {
+ // client specified snapc
+ ctx->snapc.seq = m->get_snap_seq();
+ ctx->snapc.snaps = m->get_snaps();
+ filter_snapc(ctx->snapc.snaps);
+ }
+ if ((m->has_flag(CEPH_OSD_FLAG_ORDERSNAP)) &&
+ ctx->snapc.seq < obc->ssc->snapset.seq) {
+ dout(10) << " ORDERSNAP flag set and snapc seq " << ctx->snapc.seq
+ << " < snapset seq " << obc->ssc->snapset.seq
+ << " on " << obc->obs.oi.soid << dendl;
+ reply_ctx(ctx, -EOLDSNAPC);
+ return;
+ }
+
+ // version
+ ctx->at_version = get_next_version();
+ ctx->mtime = m->get_mtime();
+
+ dout(10) << __func__ << " " << soid << " " << *ctx->ops
+ << " ov " << obc->obs.oi.version << " av " << ctx->at_version
+ << " snapc " << ctx->snapc
+ << " snapset " << obc->ssc->snapset
+ << dendl;
+ } else {
+ dout(10) << __func__ << " " << soid << " " << *ctx->ops
+ << " ov " << obc->obs.oi.version
+ << dendl;
+ }
+
+ if (!ctx->user_at_version)
+ ctx->user_at_version = obc->obs.oi.user_version;
+ dout(30) << __func__ << " user_at_version " << ctx->user_at_version << dendl;
+
+ {
+#ifdef WITH_LTTNG
+ osd_reqid_t reqid = ctx->op->get_reqid();
+#endif
+ tracepoint(osd, prepare_tx_enter, reqid.name._type,
+ reqid.name._num, reqid.tid, reqid.inc);
+ }
+#ifdef HAVE_JAEGER
+ if (ctx->op->osd_parent_span) {
+ auto execute_span = jaeger_tracing::child_span(__func__, ctx->op->osd_parent_span);
+ }
+#endif
+
+ int result = prepare_transaction(ctx);
+
+ {
+#ifdef WITH_LTTNG
+ osd_reqid_t reqid = ctx->op->get_reqid();
+#endif
+ tracepoint(osd, prepare_tx_exit, reqid.name._type,
+ reqid.name._num, reqid.tid, reqid.inc);
+ }
+
+ bool pending_async_reads = !ctx->pending_async_reads.empty();
+ if (result == -EINPROGRESS || pending_async_reads) {
+ // come back later.
+ if (pending_async_reads) {
+ ceph_assert(pool.info.is_erasure());
+ in_progress_async_reads.push_back(make_pair(op, ctx));
+ ctx->start_async_reads(this);
+ }
+ return;
+ }
+
+ if (result == -EAGAIN) {
+ // clean up after the ctx
+ close_op_ctx(ctx);
+ return;
+ }
+
+ bool ignore_out_data = false;
+ if (!ctx->op_t->empty() &&
+ op->may_write() &&
+ result >= 0) {
+ // successful update
+ if (ctx->op->allows_returnvec()) {
+ // enforce reasonable bound on the return buffer sizes
+ for (auto& i : *ctx->ops) {
+ if (i.outdata.length() > cct->_conf->osd_max_write_op_reply_len) {
+ dout(10) << __func__ << " op " << i << " outdata overflow" << dendl;
+ result = -EOVERFLOW; // overall result is overflow
+ i.rval = -EOVERFLOW;
+ i.outdata.clear();
+ }
+ }
+ } else {
+ // legacy behavior -- zero result and return data etc.
+ ignore_out_data = true;
+ result = 0;
+ }
+ }
+
+ // prepare the reply
+ ctx->reply = new MOSDOpReply(m, result, get_osdmap_epoch(), 0,
+ ignore_out_data);
+ dout(20) << __func__ << " alloc reply " << ctx->reply
+ << " result " << result << dendl;
+
+ // read or error?
+ if ((ctx->op_t->empty() || result < 0) && !ctx->update_log_only) {
+ // finish side-effects
+ if (result >= 0)
+ do_osd_op_effects(ctx, m->get_connection());
+
+ complete_read_ctx(result, ctx);
+ return;
+ }
+
+ ctx->reply->set_reply_versions(ctx->at_version, ctx->user_at_version);
+
+ ceph_assert(op->may_write() || op->may_cache());
+
+ // trim log?
+ recovery_state.update_trim_to();
+
+ // verify that we are doing this in order?
+ if (cct->_conf->osd_debug_op_order && m->get_source().is_client() &&
+ !pool.info.is_tier() && !pool.info.has_tiers()) {
+ map<client_t,ceph_tid_t>& cm = debug_op_order[obc->obs.oi.soid];
+ ceph_tid_t t = m->get_tid();
+ client_t n = m->get_source().num();
+ map<client_t,ceph_tid_t>::iterator p = cm.find(n);
+ if (p == cm.end()) {
+ dout(20) << " op order client." << n << " tid " << t << " (first)" << dendl;
+ cm[n] = t;
+ } else {
+ dout(20) << " op order client." << n << " tid " << t << " last was " << p->second << dendl;
+ if (p->second > t) {
+ derr << "bad op order, already applied " << p->second << " > this " << t << dendl;
+ ceph_abort_msg("out of order op");
+ }
+ p->second = t;
+ }
+ }
+
+ if (ctx->update_log_only) {
+ if (result >= 0)
+ do_osd_op_effects(ctx, m->get_connection());
+
+ dout(20) << __func__ << " update_log_only -- result=" << result << dendl;
+ // save just what we need from ctx
+ MOSDOpReply *reply = ctx->reply;
+ ctx->reply = nullptr;
+ reply->get_header().data_off = (ctx->data_off ? *ctx->data_off : 0);
+
+ if (result == -ENOENT) {
+ reply->set_enoent_reply_versions(info.last_update,
+ info.last_user_version);
+ }
+ reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
+ // append to pg log for dup detection - don't save buffers for now
+ record_write_error(op, soid, reply, result,
+ ctx->op->allows_returnvec() ? ctx : nullptr);
+ close_op_ctx(ctx);
+ return;
+ }
+
+ // no need to capture PG ref, repop cancel will handle that
+ // Can capture the ctx by pointer, it's owned by the repop
+ ctx->register_on_commit(
+ [m, ctx, this](){
+ if (ctx->op)
+ log_op_stats(*ctx->op, ctx->bytes_written, ctx->bytes_read);
+
+ if (m && !ctx->sent_reply) {
+ MOSDOpReply *reply = ctx->reply;
+ ctx->reply = nullptr;
+ reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
+ dout(10) << " sending reply on " << *m << " " << reply << dendl;
+ osd->send_message_osd_client(reply, m->get_connection());
+ ctx->sent_reply = true;
+ ctx->op->mark_commit_sent();
+ }
+ });
+ ctx->register_on_success(
+ [ctx, this]() {
+ do_osd_op_effects(
+ ctx,
+ ctx->op ? ctx->op->get_req()->get_connection() :
+ ConnectionRef());
+ });
+ ctx->register_on_finish(
+ [ctx]() {
+ delete ctx;
+ });
+
+ // issue replica writes
+ ceph_tid_t rep_tid = osd->get_tid();
+
+ RepGather *repop = new_repop(ctx, obc, rep_tid);
+
+ issue_repop(repop, ctx);
+ eval_repop(repop);
+ repop->put();
+}
+
+void PrimaryLogPG::close_op_ctx(OpContext *ctx) {
+ release_object_locks(ctx->lock_manager);
+
+ ctx->op_t.reset();
+
+ for (auto p = ctx->on_finish.begin(); p != ctx->on_finish.end();
+ ctx->on_finish.erase(p++)) {
+ (*p)();
+ }
+ delete ctx;
+}
+
+void PrimaryLogPG::reply_ctx(OpContext *ctx, int r)
+{
+ if (ctx->op)
+ osd->reply_op_error(ctx->op, r);
+ close_op_ctx(ctx);
+}
+
+void PrimaryLogPG::log_op_stats(const OpRequest& op,
+ const uint64_t inb,
+ const uint64_t outb)
+{
+ auto m = op.get_req<MOSDOp>();
+ const utime_t now = ceph_clock_now();
+
+ const utime_t latency = now - m->get_recv_stamp();
+ const utime_t process_latency = now - op.get_dequeued_time();
+
+ osd->logger->inc(l_osd_op);
+
+ osd->logger->inc(l_osd_op_outb, outb);
+ osd->logger->inc(l_osd_op_inb, inb);
+ osd->logger->tinc(l_osd_op_lat, latency);
+ osd->logger->tinc(l_osd_op_process_lat, process_latency);
+
+ if (op.may_read() && op.may_write()) {
+ osd->logger->inc(l_osd_op_rw);
+ osd->logger->inc(l_osd_op_rw_inb, inb);
+ osd->logger->inc(l_osd_op_rw_outb, outb);
+ osd->logger->tinc(l_osd_op_rw_lat, latency);
+ osd->logger->hinc(l_osd_op_rw_lat_inb_hist, latency.to_nsec(), inb);
+ osd->logger->hinc(l_osd_op_rw_lat_outb_hist, latency.to_nsec(), outb);
+ osd->logger->tinc(l_osd_op_rw_process_lat, process_latency);
+ } else if (op.may_read()) {
+ osd->logger->inc(l_osd_op_r);
+ osd->logger->inc(l_osd_op_r_outb, outb);
+ osd->logger->tinc(l_osd_op_r_lat, latency);
+ osd->logger->hinc(l_osd_op_r_lat_outb_hist, latency.to_nsec(), outb);
+ osd->logger->tinc(l_osd_op_r_process_lat, process_latency);
+ } else if (op.may_write() || op.may_cache()) {
+ osd->logger->inc(l_osd_op_w);
+ osd->logger->inc(l_osd_op_w_inb, inb);
+ osd->logger->tinc(l_osd_op_w_lat, latency);
+ osd->logger->hinc(l_osd_op_w_lat_inb_hist, latency.to_nsec(), inb);
+ osd->logger->tinc(l_osd_op_w_process_lat, process_latency);
+ } else {
+ ceph_abort();
+ }
+
+ dout(15) << "log_op_stats " << *m
+ << " inb " << inb
+ << " outb " << outb
+ << " lat " << latency << dendl;
+
+ if (m_dynamic_perf_stats.is_enabled()) {
+ m_dynamic_perf_stats.add(osd, info, op, inb, outb, latency);
+ }
+}
+
+void PrimaryLogPG::set_dynamic_perf_stats_queries(
+ const std::list<OSDPerfMetricQuery> &queries)
+{
+ m_dynamic_perf_stats.set_queries(queries);
+}
+
+void PrimaryLogPG::get_dynamic_perf_stats(DynamicPerfStats *stats)
+{
+ std::swap(m_dynamic_perf_stats, *stats);
+}
+
+void PrimaryLogPG::do_scan(
+ OpRequestRef op,
+ ThreadPool::TPHandle &handle)
+{
+ auto m = op->get_req<MOSDPGScan>();
+ ceph_assert(m->get_type() == MSG_OSD_PG_SCAN);
+ dout(10) << "do_scan " << *m << dendl;
+
+ op->mark_started();
+
+ switch (m->op) {
+ case MOSDPGScan::OP_SCAN_GET_DIGEST:
+ {
+ auto dpp = get_dpp();
+ if (osd->check_backfill_full(dpp)) {
+ dout(1) << __func__ << ": Canceling backfill: Full." << dendl;
+ queue_peering_event(
+ PGPeeringEventRef(
+ std::make_shared<PGPeeringEvent>(
+ get_osdmap_epoch(),
+ get_osdmap_epoch(),
+ PeeringState::BackfillTooFull())));
+ return;
+ }
+
+ BackfillInterval bi;
+ bi.begin = m->begin;
+ // No need to flush; there won't be any in-progress writes occurring
+ // past m->begin
+ scan_range(
+ cct->_conf->osd_backfill_scan_min,
+ cct->_conf->osd_backfill_scan_max,
+ &bi,
+ handle);
+ MOSDPGScan *reply = new MOSDPGScan(
+ MOSDPGScan::OP_SCAN_DIGEST,
+ pg_whoami,
+ get_osdmap_epoch(), m->query_epoch,
+ spg_t(info.pgid.pgid, get_primary().shard), bi.begin, bi.end);
+ encode(bi.objects, reply->get_data());
+ osd->send_message_osd_cluster(reply, m->get_connection());
+ }
+ break;
+
+ case MOSDPGScan::OP_SCAN_DIGEST:
+ {
+ pg_shard_t from = m->from;
+
+ // Check that from is in backfill_targets vector
+ ceph_assert(is_backfill_target(from));
+
+ BackfillInterval& bi = peer_backfill_info[from];
+ bi.begin = m->begin;
+ bi.end = m->end;
+ auto p = m->get_data().cbegin();
+
+ // take care to preserve ordering!
+ bi.clear_objects();
+ decode_noclear(bi.objects, p);
+ dout(10) << __func__ << " bi.begin=" << bi.begin << " bi.end=" << bi.end
+ << " bi.objects.size()=" << bi.objects.size() << dendl;
+
+ if (waiting_on_backfill.erase(from)) {
+ if (waiting_on_backfill.empty()) {
+ ceph_assert(
+ peer_backfill_info.size() ==
+ get_backfill_targets().size());
+ finish_recovery_op(hobject_t::get_max());
+ }
+ } else {
+ // we canceled backfill for a while because a target was too full, and this
+ // is an extra response from a peer that was not too full
+ dout(20) << __func__ << " canceled backfill (too full?)" << dendl;
+ }
+ }
+ break;
+ }
+}
+
+void PrimaryLogPG::do_backfill(OpRequestRef op)
+{
+ auto m = op->get_req<MOSDPGBackfill>();
+ ceph_assert(m->get_type() == MSG_OSD_PG_BACKFILL);
+ dout(10) << "do_backfill " << *m << dendl;
+
+ op->mark_started();
+
+ switch (m->op) {
+ case MOSDPGBackfill::OP_BACKFILL_FINISH:
+ {
+ ceph_assert(cct->_conf->osd_kill_backfill_at != 1);
+
+ MOSDPGBackfill *reply = new MOSDPGBackfill(
+ MOSDPGBackfill::OP_BACKFILL_FINISH_ACK,
+ get_osdmap_epoch(),
+ m->query_epoch,
+ spg_t(info.pgid.pgid, get_primary().shard));
+ reply->set_priority(get_recovery_op_priority());
+ osd->send_message_osd_cluster(reply, m->get_connection());
+ queue_peering_event(
+ PGPeeringEventRef(
+ std::make_shared<PGPeeringEvent>(
+ get_osdmap_epoch(),
+ get_osdmap_epoch(),
+ RecoveryDone())));
+ }
+ // fall-thru
+
+ case MOSDPGBackfill::OP_BACKFILL_PROGRESS:
+ {
+ ceph_assert(cct->_conf->osd_kill_backfill_at != 2);
+
+ ObjectStore::Transaction t;
+ recovery_state.update_backfill_progress(
+ m->last_backfill,
+ m->stats,
+ m->op == MOSDPGBackfill::OP_BACKFILL_PROGRESS,
+ t);
+
+ int tr = osd->store->queue_transaction(ch, std::move(t), NULL);
+ ceph_assert(tr == 0);
+ }
+ break;
+
+ case MOSDPGBackfill::OP_BACKFILL_FINISH_ACK:
+ {
+ ceph_assert(is_primary());
+ ceph_assert(cct->_conf->osd_kill_backfill_at != 3);
+ finish_recovery_op(hobject_t::get_max());
+ }
+ break;
+ }
+}
+
+void PrimaryLogPG::do_backfill_remove(OpRequestRef op)
+{
+ const MOSDPGBackfillRemove *m = static_cast<const MOSDPGBackfillRemove*>(
+ op->get_req());
+ ceph_assert(m->get_type() == MSG_OSD_PG_BACKFILL_REMOVE);
+ dout(7) << __func__ << " " << m->ls << dendl;
+
+ op->mark_started();
+
+ ObjectStore::Transaction t;
+ for (auto& p : m->ls) {
+ if (is_remote_backfilling()) {
+ struct stat st;
+ int r = osd->store->stat(ch, ghobject_t(p.first, ghobject_t::NO_GEN,
+ pg_whoami.shard) , &st);
+ if (r == 0) {
+ sub_local_num_bytes(st.st_size);
+ int64_t usersize;
+ if (pool.info.is_erasure()) {
+ bufferlist bv;
+ int r = osd->store->getattr(
+ ch,
+ ghobject_t(p.first, ghobject_t::NO_GEN, pg_whoami.shard),
+ OI_ATTR,
+ bv);
+ if (r >= 0) {
+ object_info_t oi(bv);
+ usersize = oi.size * pgbackend->get_ec_data_chunk_count();
+ } else {
+ dout(0) << __func__ << " " << ghobject_t(p.first, ghobject_t::NO_GEN, pg_whoami.shard)
+ << " can't get object info" << dendl;
+ usersize = 0;
+ }
+ } else {
+ usersize = st.st_size;
+ }
+ sub_num_bytes(usersize);
+ dout(10) << __func__ << " " << ghobject_t(p.first, ghobject_t::NO_GEN, pg_whoami.shard)
+ << " sub actual data by " << st.st_size
+ << " sub num_bytes by " << usersize
+ << dendl;
+ }
+ }
+ remove_snap_mapped_object(t, p.first);
+ }
+ int r = osd->store->queue_transaction(ch, std::move(t), NULL);
+ ceph_assert(r == 0);
+}
+
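+ // Prepare an OpContext that trims snap_to_trim from the clone coid: delete
+ // the clone if no snaps remain (and possibly the now-useless head), or just
+ // record the clone's reduced snap set; the head's snapset is updated either way.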
+int PrimaryLogPG::trim_object(
+ bool first, const hobject_t &coid, snapid_t snap_to_trim,
+ PrimaryLogPG::OpContextUPtr *ctxp)
+{
+ *ctxp = NULL;
+
+ // load clone info
+ bufferlist bl;
+ ObjectContextRef obc = get_object_context(coid, false, NULL);
+ if (!obc || !obc->ssc || !obc->ssc->exists) {
+ osd->clog->error() << __func__ << ": Can not trim " << coid
+ << " repair needed " << (obc ? "(no obc->ssc or !exists)" : "(no obc)");
+ return -ENOENT;
+ }
+
+ hobject_t head_oid = coid.get_head();
+ ObjectContextRef head_obc = get_object_context(head_oid, false);
+ if (!head_obc) {
+ osd->clog->error() << __func__ << ": Can not trim " << coid
+ << " repair needed, no snapset obc for " << head_oid;
+ return -ENOENT;
+ }
+
+ SnapSet& snapset = obc->ssc->snapset;
+
+ object_info_t &coi = obc->obs.oi;
+ auto citer = snapset.clone_snaps.find(coid.snap);
+ if (citer == snapset.clone_snaps.end()) {
+ osd->clog->error() << "No clone_snaps in snapset " << snapset
+ << " for object " << coid << "\n";
+ return -ENOENT;
+ }
+ set<snapid_t> old_snaps(citer->second.begin(), citer->second.end());
+ if (old_snaps.empty()) {
+ osd->clog->error() << "No object info snaps for object " << coid;
+ return -ENOENT;
+ }
+
+ dout(10) << coid << " old_snaps " << old_snaps
+ << " old snapset " << snapset << dendl;
+ if (snapset.seq == 0) {
+ osd->clog->error() << "No snapset.seq for object " << coid;
+ return -ENOENT;
+ }
+
+ set<snapid_t> new_snaps;
+ const OSDMapRef& osdmap = get_osdmap();
+ for (set<snapid_t>::iterator i = old_snaps.begin();
+ i != old_snaps.end();
+ ++i) {
+ if (!osdmap->in_removed_snaps_queue(info.pgid.pgid.pool(), *i) &&
+ *i != snap_to_trim) {
+ new_snaps.insert(*i);
+ }
+ }
+
+ vector<snapid_t>::iterator p = snapset.clones.end();
+
+ if (new_snaps.empty()) {
+ p = std::find(snapset.clones.begin(), snapset.clones.end(), coid.snap);
+ if (p == snapset.clones.end()) {
+ osd->clog->error() << "Snap " << coid.snap << " not in clones";
+ return -ENOENT;
+ }
+ }
+
+ OpContextUPtr ctx = simple_opc_create(obc);
+ ctx->head_obc = head_obc;
+
+ if (!ctx->lock_manager.get_snaptrimmer_write(
+ coid,
+ obc,
+ first)) {
+ close_op_ctx(ctx.release());
+ dout(10) << __func__ << ": Unable to get a wlock on " << coid << dendl;
+ return -ENOLCK;
+ }
+
+ if (!ctx->lock_manager.get_snaptrimmer_write(
+ head_oid,
+ head_obc,
+ first)) {
+ close_op_ctx(ctx.release());
+ dout(10) << __func__ << ": Unable to get a wlock on " << head_oid << dendl;
+ return -ENOLCK;
+ }
+
+ ctx->at_version = get_next_version();
+
+ PGTransaction *t = ctx->op_t.get();
+
+ if (new_snaps.empty()) {
+ // remove clone
+ dout(10) << coid << " snaps " << old_snaps << " -> "
+ << new_snaps << " ... deleting" << dendl;
+
+ // ...from snapset
+ ceph_assert(p != snapset.clones.end());
+
+ snapid_t last = coid.snap;
+ ctx->delta_stats.num_bytes -= snapset.get_clone_bytes(last);
+
+ if (p != snapset.clones.begin()) {
+ // not the oldest... merge overlap into next older clone
+ vector<snapid_t>::iterator n = p - 1;
+ hobject_t prev_coid = coid;
+ prev_coid.snap = *n;
+ bool adjust_prev_bytes = is_present_clone(prev_coid);
+
+ if (adjust_prev_bytes)
+ ctx->delta_stats.num_bytes -= snapset.get_clone_bytes(*n);
+
+ snapset.clone_overlap[*n].intersection_of(
+ snapset.clone_overlap[*p]);
+
+ if (adjust_prev_bytes)
+ ctx->delta_stats.num_bytes += snapset.get_clone_bytes(*n);
+ }
+ ctx->delta_stats.num_objects--;
+ if (coi.is_dirty())
+ ctx->delta_stats.num_objects_dirty--;
+ if (coi.is_omap())
+ ctx->delta_stats.num_objects_omap--;
+ if (coi.is_whiteout()) {
+ dout(20) << __func__ << " trimming whiteout on " << coid << dendl;
+ ctx->delta_stats.num_whiteouts--;
+ }
+ ctx->delta_stats.num_object_clones--;
+ if (coi.is_cache_pinned())
+ ctx->delta_stats.num_objects_pinned--;
+ if (coi.has_manifest()) {
+ dec_all_refcount_manifest(coi, ctx.get());
+ ctx->delta_stats.num_objects_manifest--;
+ }
+ obc->obs.exists = false;
+
+ snapset.clones.erase(p);
+ snapset.clone_overlap.erase(last);
+ snapset.clone_size.erase(last);
+ snapset.clone_snaps.erase(last);
+
+ ctx->log.push_back(
+ pg_log_entry_t(
+ pg_log_entry_t::DELETE,
+ coid,
+ ctx->at_version,
+ ctx->obs->oi.version,
+ 0,
+ osd_reqid_t(),
+ ctx->mtime,
+ 0)
+ );
+ t->remove(coid);
+ t->update_snaps(
+ coid,
+ old_snaps,
+ new_snaps);
+
+ coi = object_info_t(coid);
+
+ ctx->at_version.version++;
+ } else {
+ // save adjusted snaps for this object
+ dout(10) << coid << " snaps " << old_snaps << " -> " << new_snaps << dendl;
+ snapset.clone_snaps[coid.snap] =
+ vector<snapid_t>(new_snaps.rbegin(), new_snaps.rend());
+ // we still do a 'modify' event on this object just to trigger a
+ // snapmapper.update ... :(
+
+ coi.prior_version = coi.version;
+ coi.version = ctx->at_version;
+ bl.clear();
+ encode(coi, bl, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
+ t->setattr(coid, OI_ATTR, bl);
+
+ ctx->log.push_back(
+ pg_log_entry_t(
+ pg_log_entry_t::MODIFY,
+ coid,
+ coi.version,
+ coi.prior_version,
+ 0,
+ osd_reqid_t(),
+ ctx->mtime,
+ 0)
+ );
+ ctx->at_version.version++;
+
+ t->update_snaps(
+ coid,
+ old_snaps,
+ new_snaps);
+ }
+
+ // save head snapset
+ dout(10) << coid << " new snapset " << snapset << " on "
+ << head_obc->obs.oi << dendl;
+ if (snapset.clones.empty() &&
+ (head_obc->obs.oi.is_whiteout() &&
+ !(head_obc->obs.oi.is_dirty() && pool.info.is_tier()) &&
+ !head_obc->obs.oi.is_cache_pinned())) {
+ // NOTE: this arguably constitutes minor interference with the
+ // tiering agent if this is a cache tier since a snap trim event
+ // is effectively evicting a whiteout we might otherwise want to
+ // keep around.
+ dout(10) << coid << " removing " << head_oid << dendl;
+ ctx->log.push_back(
+ pg_log_entry_t(
+ pg_log_entry_t::DELETE,
+ head_oid,
+ ctx->at_version,
+ head_obc->obs.oi.version,
+ 0,
+ osd_reqid_t(),
+ ctx->mtime,
+ 0)
+ );
+ dout(10) << "removing snap head" << dendl;
+ object_info_t& oi = head_obc->obs.oi;
+ ctx->delta_stats.num_objects--;
+ if (oi.is_dirty()) {
+ ctx->delta_stats.num_objects_dirty--;
+ }
+ if (oi.is_omap())
+ ctx->delta_stats.num_objects_omap--;
+ if (oi.is_whiteout()) {
+ dout(20) << __func__ << " trimming whiteout on " << oi.soid << dendl;
+ ctx->delta_stats.num_whiteouts--;
+ }
+ if (oi.is_cache_pinned()) {
+ ctx->delta_stats.num_objects_pinned--;
+ }
+ if (oi.has_manifest()) {
+ ctx->delta_stats.num_objects_manifest--;
+ dec_all_refcount_manifest(oi, ctx.get());
+ }
+ head_obc->obs.exists = false;
+ head_obc->obs.oi = object_info_t(head_oid);
+ t->remove(head_oid);
+ } else {
+ if (get_osdmap()->require_osd_release < ceph_release_t::octopus) {
+ // filter SnapSet::snaps for the benefit of pre-octopus
+ // peers. This is perhaps overly conservative in that I'm not
+ // certain they need this, but let's be conservative here.
+ dout(10) << coid << " filtering snapset on " << head_oid << dendl;
+ snapset.filter(pool.info);
+ } else {
+ snapset.snaps.clear();
+ }
+ dout(10) << coid << " writing updated snapset on " << head_oid
+ << ", snapset is " << snapset << dendl;
+ ctx->log.push_back(
+ pg_log_entry_t(
+ pg_log_entry_t::MODIFY,
+ head_oid,
+ ctx->at_version,
+ head_obc->obs.oi.version,
+ 0,
+ osd_reqid_t(),
+ ctx->mtime,
+ 0)
+ );
+
+ head_obc->obs.oi.prior_version = head_obc->obs.oi.version;
+ head_obc->obs.oi.version = ctx->at_version;
+
+ map <string, bufferlist> attrs;
+ bl.clear();
+ encode(snapset, bl);
+ attrs[SS_ATTR] = std::move(bl);
+
+ bl.clear();
+ encode(head_obc->obs.oi, bl,
+ get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
+ attrs[OI_ATTR] = std::move(bl);
+ t->setattrs(head_oid, attrs);
+ }
+
+ *ctxp = std::move(ctx);
+ return 0;
+}
+
+void PrimaryLogPG::kick_snap_trim()
+{
+ ceph_assert(is_active());
+ ceph_assert(is_primary());
+ if (is_clean() &&
+ !state_test(PG_STATE_PREMERGE) &&
+ !snap_trimq.empty()) {
+ if (get_osdmap()->test_flag(CEPH_OSDMAP_NOSNAPTRIM)) {
+ dout(10) << __func__ << ": nosnaptrim set, not kicking" << dendl;
+ } else {
+ dout(10) << __func__ << ": clean and snaps to trim, kicking" << dendl;
+ snap_trimmer_machine.process_event(KickTrim());
+ }
+ }
+}
+
+void PrimaryLogPG::snap_trimmer_scrub_complete()
+{
+ if (is_primary() && is_active() && is_clean() && !snap_trimq.empty()) {
+ dout(10) << "scrub finished - requeuing snap_trimmer" << dendl;
+ snap_trimmer_machine.process_event(ScrubComplete());
+ }
+}
+
+void PrimaryLogPG::snap_trimmer(epoch_t queued)
+{
+ if (recovery_state.is_deleting() || pg_has_reset_since(queued)) {
+ return;
+ }
+
+ ceph_assert(is_primary());
+
+ dout(10) << "snap_trimmer posting" << dendl;
+ snap_trimmer_machine.process_event(DoSnapWork());
+ dout(10) << "snap_trimmer complete" << dendl;
+ return;
+}
+
+int PrimaryLogPG::do_xattr_cmp_u64(int op, __u64 v1, bufferlist& xattr)
+{
+ __u64 v2;
+
+ string v2s(xattr.c_str(), xattr.length());
+ if (v2s.length())
+ v2 = strtoull(v2s.c_str(), NULL, 10);
+ else
+ v2 = 0;
+
+ dout(20) << "do_xattr_cmp_u64 '" << v1 << "' vs '" << v2 << "' op " << op << dendl;
+
+ switch (op) {
+ case CEPH_OSD_CMPXATTR_OP_EQ:
+ return (v1 == v2);
+ case CEPH_OSD_CMPXATTR_OP_NE:
+ return (v1 != v2);
+ case CEPH_OSD_CMPXATTR_OP_GT:
+ return (v1 > v2);
+ case CEPH_OSD_CMPXATTR_OP_GTE:
+ return (v1 >= v2);
+ case CEPH_OSD_CMPXATTR_OP_LT:
+ return (v1 < v2);
+ case CEPH_OSD_CMPXATTR_OP_LTE:
+ return (v1 <= v2);
+ default:
+ return -EINVAL;
+ }
+}
+
+int PrimaryLogPG::do_xattr_cmp_str(int op, string& v1s, bufferlist& xattr)
+{
+ string v2s(xattr.c_str(), xattr.length());
+
+ dout(20) << "do_xattr_cmp_str '" << v1s << "' vs '" << v2s << "' op " << op << dendl;
+
+ switch (op) {
+ case CEPH_OSD_CMPXATTR_OP_EQ:
+ return (v1s.compare(v2s) == 0);
+ case CEPH_OSD_CMPXATTR_OP_NE:
+ return (v1s.compare(v2s) != 0);
+ case CEPH_OSD_CMPXATTR_OP_GT:
+ return (v1s.compare(v2s) > 0);
+ case CEPH_OSD_CMPXATTR_OP_GTE:
+ return (v1s.compare(v2s) >= 0);
+ case CEPH_OSD_CMPXATTR_OP_LT:
+ return (v1s.compare(v2s) < 0);
+ case CEPH_OSD_CMPXATTR_OP_LTE:
+ return (v1s.compare(v2s) <= 0);
+ default:
+ return -EINVAL;
+ }
+}
+
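+ // WRITESAME: expand the supplied data pattern across the requested extent
+ // and apply it as a single WRITE via do_osd_ops().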
+int PrimaryLogPG::do_writesame(OpContext *ctx, OSDOp& osd_op)
+{
+ ceph_osd_op& op = osd_op.op;
+ vector<OSDOp> write_ops(1);
+ OSDOp& write_op = write_ops[0];
+ uint64_t write_length = op.writesame.length;
+ int result = 0;
+
+ if (!write_length)
+ return 0;
+
+ if (!op.writesame.data_length || write_length % op.writesame.data_length)
+ return -EINVAL;
+
+ if (op.writesame.data_length != osd_op.indata.length()) {
+ derr << "invalid length ws data length " << op.writesame.data_length << " actual len " << osd_op.indata.length() << dendl;
+ return -EINVAL;
+ }
+
+ while (write_length) {
+ write_op.indata.append(osd_op.indata);
+ write_length -= op.writesame.data_length;
+ }
+
+ write_op.op.op = CEPH_OSD_OP_WRITE;
+ write_op.op.extent.offset = op.writesame.offset;
+ write_op.op.extent.length = op.writesame.length;
+ result = do_osd_ops(ctx, write_ops);
+ if (result < 0)
+ derr << "do_writesame do_osd_ops failed " << result << dendl;
+
+ return result;
+}
+
+// ========================================================================
+// low level osd ops
+
+int PrimaryLogPG::do_tmap2omap(OpContext *ctx, unsigned flags)
+{
+ dout(20) << " convert tmap to omap for " << ctx->new_obs.oi.soid << dendl;
+ bufferlist header, vals;
+ int r = _get_tmap(ctx, &header, &vals);
+ if (r < 0) {
+ if (r == -ENODATA && (flags & CEPH_OSD_TMAP2OMAP_NULLOK))
+ r = 0;
+ return r;
+ }
+
+ vector<OSDOp> ops(3);
+
+ ops[0].op.op = CEPH_OSD_OP_TRUNCATE;
+ ops[0].op.extent.offset = 0;
+ ops[0].op.extent.length = 0;
+
+ ops[1].op.op = CEPH_OSD_OP_OMAPSETHEADER;
+ ops[1].indata = std::move(header);
+
+ ops[2].op.op = CEPH_OSD_OP_OMAPSETVALS;
+ ops[2].indata = std::move(vals);
+
+ return do_osd_ops(ctx, ops);
+}
+
+int PrimaryLogPG::do_tmapup_slow(OpContext *ctx, bufferlist::const_iterator& bp,
+ OSDOp& osd_op, bufferlist& bl)
+{
+ // decode
+ bufferlist header;
+ map<string, bufferlist> m;
+ if (bl.length()) {
+ auto p = bl.cbegin();
+ decode(header, p);
+ decode(m, p);
+ ceph_assert(p.end());
+ }
+
+ // do the update(s)
+ while (!bp.end()) {
+ __u8 op;
+ string key;
+ decode(op, bp);
+
+ switch (op) {
+ case CEPH_OSD_TMAP_SET: // insert key
+ {
+ decode(key, bp);
+ bufferlist data;
+ decode(data, bp);
+ m[key] = data;
+ }
+ break;
+ case CEPH_OSD_TMAP_RM: // remove key
+ decode(key, bp);
+ if (!m.count(key)) {
+ return -ENOENT;
+ }
+ m.erase(key);
+ break;
+ case CEPH_OSD_TMAP_RMSLOPPY: // remove key
+ decode(key, bp);
+ m.erase(key);
+ break;
+ case CEPH_OSD_TMAP_HDR: // update header
+ {
+ decode(header, bp);
+ }
+ break;
+ default:
+ return -EINVAL;
+ }
+ }
+
+ // reencode
+ bufferlist obl;
+ encode(header, obl);
+ encode(m, obl);
+
+ // write it out
+ vector<OSDOp> nops(1);
+ OSDOp& newop = nops[0];
+ newop.op.op = CEPH_OSD_OP_WRITEFULL;
+ newop.op.extent.offset = 0;
+ newop.op.extent.length = obl.length();
+ newop.indata = obl;
+ do_osd_ops(ctx, nops);
+ return 0;
+}
+
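+ // Apply a TMAPUP update stream against the object's tmap contents. Update
+ // keys must arrive sorted; if they do not, fall back to do_tmapup_slow().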
+int PrimaryLogPG::do_tmapup(OpContext *ctx, bufferlist::const_iterator& bp, OSDOp& osd_op)
+{
+ bufferlist::const_iterator orig_bp = bp;
+ int result = 0;
+ if (bp.end()) {
+ dout(10) << "tmapup is a no-op" << dendl;
+ } else {
+ // read the whole object
+ vector<OSDOp> nops(1);
+ OSDOp& newop = nops[0];
+ newop.op.op = CEPH_OSD_OP_READ;
+ newop.op.extent.offset = 0;
+ newop.op.extent.length = 0;
+ result = do_osd_ops(ctx, nops);
+
+ dout(10) << "tmapup read " << newop.outdata.length() << dendl;
+
+ dout(30) << " starting is \n";
+ newop.outdata.hexdump(*_dout);
+ *_dout << dendl;
+
+ auto ip = newop.outdata.cbegin();
+ bufferlist obl;
+
+ dout(30) << "the update command is: \n";
+ osd_op.indata.hexdump(*_dout);
+ *_dout << dendl;
+
+ // header
+ bufferlist header;
+ __u32 nkeys = 0;
+ if (newop.outdata.length()) {
+ decode(header, ip);
+ decode(nkeys, ip);
+ }
+ dout(10) << "tmapup header " << header.length() << dendl;
+
+ if (!bp.end() && *bp == CEPH_OSD_TMAP_HDR) {
+ ++bp;
+ decode(header, bp);
+ dout(10) << "tmapup new header " << header.length() << dendl;
+ }
+
+ encode(header, obl);
+
+ dout(20) << "tmapup initial nkeys " << nkeys << dendl;
+
+ // update keys
+ bufferlist newkeydata;
+ string nextkey, last_in_key;
+ bufferlist nextval;
+ bool have_next = false;
+ if (!ip.end()) {
+ have_next = true;
+ decode(nextkey, ip);
+ decode(nextval, ip);
+ }
+ while (!bp.end() && !result) {
+ __u8 op;
+ string key;
+ try {
+ decode(op, bp);
+ decode(key, bp);
+ }
+ catch (ceph::buffer::error& e) {
+ return -EINVAL;
+ }
+ if (key < last_in_key) {
+ dout(5) << "tmapup warning: key '" << key << "' < previous key '" << last_in_key
+ << "', falling back to an inefficient (unsorted) update" << dendl;
+ bp = orig_bp;
+ return do_tmapup_slow(ctx, bp, osd_op, newop.outdata);
+ }
+ last_in_key = key;
+
+ dout(10) << "tmapup op " << (int)op << " key " << key << dendl;
+
+ // skip existing intervening keys
+ bool key_exists = false;
+ while (have_next && !key_exists) {
+ dout(20) << " (have_next=" << have_next << " nextkey=" << nextkey << ")" << dendl;
+ if (nextkey > key)
+ break;
+ if (nextkey < key) {
+ // copy untouched.
+ encode(nextkey, newkeydata);
+ encode(nextval, newkeydata);
+ dout(20) << " keep " << nextkey << " " << nextval.length() << dendl;
+ } else {
+ // don't copy; discard the old value and stop.
+ dout(20) << " drop " << nextkey << " " << nextval.length() << dendl;
+ key_exists = true;
+ nkeys--;
+ }
+ if (!ip.end()) {
+ decode(nextkey, ip);
+ decode(nextval, ip);
+ } else {
+ have_next = false;
+ }
+ }
+
+ if (op == CEPH_OSD_TMAP_SET) {
+ bufferlist val;
+ try {
+ decode(val, bp);
+ }
+ catch (ceph::buffer::error& e) {
+ return -EINVAL;
+ }
+ encode(key, newkeydata);
+ encode(val, newkeydata);
+ dout(20) << " set " << key << " " << val.length() << dendl;
+ nkeys++;
+ } else if (op == CEPH_OSD_TMAP_CREATE) {
+ if (key_exists) {
+ return -EEXIST;
+ }
+ bufferlist val;
+ try {
+ decode(val, bp);
+ }
+ catch (ceph::buffer::error& e) {
+ return -EINVAL;
+ }
+ encode(key, newkeydata);
+ encode(val, newkeydata);
+ dout(20) << " create " << key << " " << val.length() << dendl;
+ nkeys++;
+ } else if (op == CEPH_OSD_TMAP_RM) {
+ // do nothing.
+ if (!key_exists) {
+ return -ENOENT;
+ }
+ } else if (op == CEPH_OSD_TMAP_RMSLOPPY) {
+ // do nothing
+ } else {
+ dout(10) << " invalid tmap op " << (int)op << dendl;
+ return -EINVAL;
+ }
+ }
+
+ // copy remaining
+ if (have_next) {
+ encode(nextkey, newkeydata);
+ encode(nextval, newkeydata);
+ dout(20) << " keep " << nextkey << " " << nextval.length() << dendl;
+ }
+ if (!ip.end()) {
+ bufferlist rest;
+ rest.substr_of(newop.outdata, ip.get_off(), newop.outdata.length() - ip.get_off());
+ dout(20) << " keep trailing " << rest.length()
+ << " at " << newkeydata.length() << dendl;
+ newkeydata.claim_append(rest);
+ }
+
+ // encode final key count + key data
+ dout(20) << "tmapup final nkeys " << nkeys << dendl;
+ encode(nkeys, obl);
+ obl.claim_append(newkeydata);
+
+ if (0) {
+ dout(30) << " final is \n";
+ obl.hexdump(*_dout);
+ *_dout << dendl;
+
+ // sanity check
+ auto tp = obl.cbegin();
+ bufferlist h;
+ decode(h, tp);
+ map<string,bufferlist> d;
+ decode(d, tp);
+ ceph_assert(tp.end());
+ dout(0) << " **** debug sanity check, looks ok ****" << dendl;
+ }
+
+ // write it out
+ if (!result) {
+ dout(20) << "tmapput write " << obl.length() << dendl;
+ newop.op.op = CEPH_OSD_OP_WRITEFULL;
+ newop.op.extent.offset = 0;
+ newop.op.extent.length = obl.length();
+ newop.indata = obl;
+ do_osd_ops(ctx, nops);
+ }
+ }
+ return result;
+}
+
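+ // Reject any extent whose offset or length would exceed the configured
+ // osd_max_object_size.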
+static int check_offset_and_length(uint64_t offset, uint64_t length,
+ uint64_t max, DoutPrefixProvider *dpp)
+{
+ if (offset >= max ||
+ length > max ||
+ offset + length > max) {
+ ldpp_dout(dpp, 10) << __func__ << " "
+ << "osd_max_object_size: " << max
+ << "; Hard limit of object size is 4GB." << dendl;
+ return -EFBIG;
+ }
+
+ return 0;
+}
+
+struct FillInVerifyExtent : public Context {
+ ceph_le64 *r;
+ int32_t *rval;
+ bufferlist *outdatap;
+ std::optional<uint32_t> maybe_crc;
+ uint64_t size;
+ OSDService *osd;
+ hobject_t soid;
+ uint32_t flags;
+ FillInVerifyExtent(ceph_le64 *r, int32_t *rv, bufferlist *blp,
+ std::optional<uint32_t> mc, uint64_t size,
+ OSDService *osd, hobject_t soid, uint32_t flags) :
+ r(r), rval(rv), outdatap(blp), maybe_crc(mc),
+ size(size), osd(osd), soid(soid), flags(flags) {}
+ void finish(int len) override {
+ if (len < 0) {
+ *rval = len;
+ return;
+ }
+ *r = len;
+ *rval = 0;
+
+ // whole object? can we verify the checksum?
+ if (maybe_crc && *r == size) {
+ uint32_t crc = outdatap->crc32c(-1);
+ if (maybe_crc != crc) {
+ osd->clog->error() << std::hex << " full-object read crc 0x" << crc
+ << " != expected 0x" << *maybe_crc
+ << std::dec << " on " << soid;
+ if (!(flags & CEPH_OSD_OP_FLAG_FAILOK)) {
+ *rval = -EIO;
+ *r = 0;
+ }
+ }
+ }
+ }
+};
+
+struct ToSparseReadResult : public Context {
+ int* result;
+ bufferlist* data_bl;
+ uint64_t data_offset;
+ ceph_le64* len;
+ ToSparseReadResult(int* result, bufferlist* bl, uint64_t offset,
+ ceph_le64* len)
+ : result(result), data_bl(bl), data_offset(offset),len(len) {}
+ void finish(int r) override {
+ if (r < 0) {
+ *result = r;
+ return;
+ }
+ *result = 0;
+ *len = r;
+ bufferlist outdata;
+ map<uint64_t, uint64_t> extents = {{data_offset, r}};
+ encode(extents, outdata);
+ encode_destructively(*data_bl, outdata);
+ data_bl->swap(outdata);
+ }
+};
+
+template<typename V>
+static string list_keys(const map<string, V>& m) {
+ string s;
+ for (typename map<string, V>::const_iterator itr = m.begin(); itr != m.end(); ++itr) {
+ if (!s.empty()) {
+ s.push_back(',');
+ }
+ s.append(itr->first);
+ }
+ return s;
+}
+
+template<typename T>
+static string list_entries(const T& m) {
+ string s;
+ for (typename T::const_iterator itr = m.begin(); itr != m.end(); ++itr) {
+ if (!s.empty()) {
+ s.push_back(',');
+ }
+ s.append(*itr);
+ }
+ return s;
+}
+
+void PrimaryLogPG::maybe_create_new_object(
+ OpContext *ctx,
+ bool ignore_transaction)
+{
+ ObjectState& obs = ctx->new_obs;
+ if (!obs.exists) {
+ ctx->delta_stats.num_objects++;
+ obs.exists = true;
+ ceph_assert(!obs.oi.is_whiteout());
+ obs.oi.new_object();
+ if (!ignore_transaction)
+ ctx->op_t->create(obs.oi.soid);
+ } else if (obs.oi.is_whiteout()) {
+ dout(10) << __func__ << " clearing whiteout on " << obs.oi.soid << dendl;
+ ctx->new_obs.oi.clear_flag(object_info_t::FLAG_WHITEOUT);
+ --ctx->delta_stats.num_whiteouts;
+ }
+}
+
+struct ReadFinisher : public PrimaryLogPG::OpFinisher {
+ OSDOp& osd_op;
+
+ explicit ReadFinisher(OSDOp& osd_op) : osd_op(osd_op) {
+ }
+
+ int execute() override {
+ return osd_op.rval;
+ }
+};
+
+struct C_ChecksumRead : public Context {
+ PrimaryLogPG *primary_log_pg;
+ OSDOp &osd_op;
+ Checksummer::CSumType csum_type;
+ bufferlist init_value_bl;
+ ceph_le64 read_length;
+ bufferlist read_bl;
+ Context *fill_extent_ctx;
+
+ C_ChecksumRead(PrimaryLogPG *primary_log_pg, OSDOp &osd_op,
+ Checksummer::CSumType csum_type, bufferlist &&init_value_bl,
+ std::optional<uint32_t> maybe_crc, uint64_t size,
+ OSDService *osd, hobject_t soid, uint32_t flags)
+ : primary_log_pg(primary_log_pg), osd_op(osd_op),
+ csum_type(csum_type), init_value_bl(std::move(init_value_bl)),
+ fill_extent_ctx(new FillInVerifyExtent(&read_length, &osd_op.rval,
+ &read_bl, maybe_crc, size,
+ osd, soid, flags)) {
+ }
+ ~C_ChecksumRead() override {
+ delete fill_extent_ctx;
+ }
+
+ void finish(int r) override {
+ fill_extent_ctx->complete(r);
+ fill_extent_ctx = nullptr;
+
+ if (osd_op.rval >= 0) {
+ bufferlist::const_iterator init_value_bl_it = init_value_bl.begin();
+ osd_op.rval = primary_log_pg->finish_checksum(osd_op, csum_type,
+ &init_value_bl_it, read_bl);
+ }
+ }
+};
+
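+ // CHECKSUM op: validate the requested range and chunk size, then compute the
+ // checksum from an async read (erasure-coded pools) or a synchronous read.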
+int PrimaryLogPG::do_checksum(OpContext *ctx, OSDOp& osd_op,
+ bufferlist::const_iterator *bl_it)
+{
+ dout(20) << __func__ << dendl;
+
+ auto& op = osd_op.op;
+ if (op.checksum.chunk_size > 0) {
+ if (op.checksum.length == 0) {
+ dout(10) << __func__ << ": length required when chunk size provided"
+ << dendl;
+ return -EINVAL;
+ }
+ if (op.checksum.length % op.checksum.chunk_size != 0) {
+ dout(10) << __func__ << ": length not aligned to chunk size" << dendl;
+ return -EINVAL;
+ }
+ }
+
+ auto& oi = ctx->new_obs.oi;
+ if (op.checksum.offset == 0 && op.checksum.length == 0) {
+ // zeroed offset+length implies checksum whole object
+ op.checksum.length = oi.size;
+ } else if (op.checksum.offset >= oi.size) {
+ // read size was trimmed to zero, do nothing
+ // see PrimaryLogPG::do_read
+ return 0;
+ } else if (op.extent.offset + op.extent.length > oi.size) {
+ op.extent.length = oi.size - op.extent.offset;
+ if (op.checksum.chunk_size > 0 &&
+ op.checksum.length % op.checksum.chunk_size != 0) {
+ dout(10) << __func__ << ": length (trimmed to 0x"
+ << std::hex << op.checksum.length
+ << ") not aligned to chunk size 0x"
+ << op.checksum.chunk_size << std::dec
+ << dendl;
+ return -EINVAL;
+ }
+ }
+
+ Checksummer::CSumType csum_type;
+ switch (op.checksum.type) {
+ case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH32:
+ csum_type = Checksummer::CSUM_XXHASH32;
+ break;
+ case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH64:
+ csum_type = Checksummer::CSUM_XXHASH64;
+ break;
+ case CEPH_OSD_CHECKSUM_OP_TYPE_CRC32C:
+ csum_type = Checksummer::CSUM_CRC32C;
+ break;
+ default:
+ dout(10) << __func__ << ": unknown crc type ("
+ << static_cast<uint32_t>(op.checksum.type) << ")" << dendl;
+ return -EINVAL;
+ }
+
+ size_t csum_init_value_size = Checksummer::get_csum_init_value_size(csum_type);
+ if (bl_it->get_remaining() < csum_init_value_size) {
+ dout(10) << __func__ << ": init value not provided" << dendl;
+ return -EINVAL;
+ }
+
+ bufferlist init_value_bl;
+ init_value_bl.substr_of(bl_it->get_bl(), bl_it->get_off(),
+ csum_init_value_size);
+ *bl_it += csum_init_value_size;
+
+ if (pool.info.is_erasure() && op.checksum.length > 0) {
+ // If there is a data digest and it is possible we are reading
+ // entire object, pass the digest.
+ std::optional<uint32_t> maybe_crc;
+ if (oi.is_data_digest() && op.checksum.offset == 0 &&
+ op.checksum.length >= oi.size) {
+ maybe_crc = oi.data_digest;
+ }
+
+ // async read
+ auto& soid = oi.soid;
+ auto checksum_ctx = new C_ChecksumRead(this, osd_op, csum_type,
+ std::move(init_value_bl), maybe_crc,
+ oi.size, osd, soid, op.flags);
+
+ ctx->pending_async_reads.push_back({
+ {op.checksum.offset, op.checksum.length, op.flags},
+ {&checksum_ctx->read_bl, checksum_ctx}});
+
+ dout(10) << __func__ << ": async_read noted for " << soid << dendl;
+ ctx->op_finishers[ctx->current_osd_subop_num].reset(
+ new ReadFinisher(osd_op));
+ return -EINPROGRESS;
+ }
+
+ // sync read
+ std::vector<OSDOp> read_ops(1);
+ auto& read_op = read_ops[0];
+ if (op.checksum.length > 0) {
+ read_op.op.op = CEPH_OSD_OP_READ;
+ read_op.op.flags = op.flags;
+ read_op.op.extent.offset = op.checksum.offset;
+ read_op.op.extent.length = op.checksum.length;
+ read_op.op.extent.truncate_size = 0;
+ read_op.op.extent.truncate_seq = 0;
+
+ int r = do_osd_ops(ctx, read_ops);
+ if (r < 0) {
+ derr << __func__ << ": do_osd_ops failed: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ }
+
+ bufferlist::const_iterator init_value_bl_it = init_value_bl.begin();
+ return finish_checksum(osd_op, csum_type, &init_value_bl_it,
+ read_op.outdata);
+}
+
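+// Compute the requested checksums over read_bl and append the reply to
+// osd_op.outdata: a 32-bit count followed by that many packed checksum
+// values (one per chunk, or a single value covering the whole buffer when
+// no chunk size was given).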
+int PrimaryLogPG::finish_checksum(OSDOp& osd_op,
+ Checksummer::CSumType csum_type,
+ bufferlist::const_iterator *init_value_bl_it,
+ const bufferlist &read_bl) {
+ dout(20) << __func__ << dendl;
+
+ auto& op = osd_op.op;
+
+ if (op.checksum.length > 0 && read_bl.length() != op.checksum.length) {
+ derr << __func__ << ": bytes read " << read_bl.length() << " != "
+ << op.checksum.length << dendl;
+ return -EINVAL;
+ }
+
+ size_t csum_chunk_size = (op.checksum.chunk_size != 0 ?
+ op.checksum.chunk_size : read_bl.length());
+ uint32_t csum_count = (csum_chunk_size > 0 ?
+ read_bl.length() / csum_chunk_size : 0);
+
+ bufferlist csum;
+ bufferptr csum_data;
+ if (csum_count > 0) {
+ size_t csum_value_size = Checksummer::get_csum_value_size(csum_type);
+ csum_data = ceph::buffer::create(csum_value_size * csum_count);
+ csum_data.zero();
+ csum.append(csum_data);
+
+ switch (csum_type) {
+ case Checksummer::CSUM_XXHASH32:
+ {
+ Checksummer::xxhash32::init_value_t init_value;
+ decode(init_value, *init_value_bl_it);
+ Checksummer::calculate<Checksummer::xxhash32>(
+ init_value, csum_chunk_size, 0, read_bl.length(), read_bl,
+ &csum_data);
+ }
+ break;
+ case Checksummer::CSUM_XXHASH64:
+ {
+ Checksummer::xxhash64::init_value_t init_value;
+ decode(init_value, *init_value_bl_it);
+ Checksummer::calculate<Checksummer::xxhash64>(
+ init_value, csum_chunk_size, 0, read_bl.length(), read_bl,
+ &csum_data);
+ }
+ break;
+ case Checksummer::CSUM_CRC32C:
+ {
+ Checksummer::crc32c::init_value_t init_value;
+ decode(init_value, *init_value_bl_it);
+ Checksummer::calculate<Checksummer::crc32c>(
+ init_value, csum_chunk_size, 0, read_bl.length(), read_bl,
+ &csum_data);
+ }
+ break;
+ default:
+ break;
+ }
+ }
+
+ encode(csum_count, osd_op.outdata);
+ osd_op.outdata.claim_append(csum);
+ return 0;
+}
+
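+// Completion for the async read issued by do_extent_cmp() on erasure-coded
+// pools: -ENOENT is treated as an empty read, and the result is then fed
+// into finish_extent_cmp().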
+struct C_ExtentCmpRead : public Context {
+ PrimaryLogPG *primary_log_pg;
+ OSDOp &osd_op;
+ ceph_le64 read_length{};
+ bufferlist read_bl;
+ Context *fill_extent_ctx;
+
+ C_ExtentCmpRead(PrimaryLogPG *primary_log_pg, OSDOp &osd_op,
+ std::optional<uint32_t> maybe_crc, uint64_t size,
+ OSDService *osd, hobject_t soid, uint32_t flags)
+ : primary_log_pg(primary_log_pg), osd_op(osd_op),
+ fill_extent_ctx(new FillInVerifyExtent(&read_length, &osd_op.rval,
+ &read_bl, maybe_crc, size,
+ osd, soid, flags)) {
+ }
+ ~C_ExtentCmpRead() override {
+ delete fill_extent_ctx;
+ }
+
+ void finish(int r) override {
+ if (r == -ENOENT) {
+ osd_op.rval = 0;
+ read_bl.clear();
+ delete fill_extent_ctx;
+ } else {
+ fill_extent_ctx->complete(r);
+ }
+ fill_extent_ctx = nullptr;
+
+ if (osd_op.rval >= 0) {
+ osd_op.rval = primary_log_pg->finish_extent_cmp(osd_op, read_bl);
+ }
+ }
+};
+
+int PrimaryLogPG::do_extent_cmp(OpContext *ctx, OSDOp& osd_op)
+{
+ dout(20) << __func__ << dendl;
+ ceph_osd_op& op = osd_op.op;
+
+ auto& oi = ctx->new_obs.oi;
+ uint64_t size = oi.size;
+ if ((oi.truncate_seq < op.extent.truncate_seq) &&
+ (op.extent.offset + op.extent.length > op.extent.truncate_size)) {
+ size = op.extent.truncate_size;
+ }
+
+ if (op.extent.offset >= size) {
+ op.extent.length = 0;
+ } else if (op.extent.offset + op.extent.length > size) {
+ op.extent.length = size - op.extent.offset;
+ }
+
+ if (op.extent.length == 0) {
+ dout(20) << __func__ << " zero length extent" << dendl;
+ return finish_extent_cmp(osd_op, bufferlist{});
+ } else if (!ctx->obs->exists || ctx->obs->oi.is_whiteout()) {
+ dout(20) << __func__ << " object DNE" << dendl;
+ return finish_extent_cmp(osd_op, {});
+ } else if (pool.info.is_erasure()) {
+ // If there is a data digest and it is possible we are reading the
+ // entire object, pass the digest.
+ std::optional<uint32_t> maybe_crc;
+ if (oi.is_data_digest() && op.checksum.offset == 0 &&
+ op.checksum.length >= oi.size) {
+ maybe_crc = oi.data_digest;
+ }
+
+ // async read
+ auto& soid = oi.soid;
+ auto extent_cmp_ctx = new C_ExtentCmpRead(this, osd_op, maybe_crc, oi.size,
+ osd, soid, op.flags);
+ ctx->pending_async_reads.push_back({
+ {op.extent.offset, op.extent.length, op.flags},
+ {&extent_cmp_ctx->read_bl, extent_cmp_ctx}});
+
+ dout(10) << __func__ << ": async_read noted for " << soid << dendl;
+
+ ctx->op_finishers[ctx->current_osd_subop_num].reset(
+ new ReadFinisher(osd_op));
+ return -EINPROGRESS;
+ }
+
+ // sync read
+ vector<OSDOp> read_ops(1);
+ OSDOp& read_op = read_ops[0];
+
+ read_op.op.op = CEPH_OSD_OP_SYNC_READ;
+ read_op.op.extent.offset = op.extent.offset;
+ read_op.op.extent.length = op.extent.length;
+ read_op.op.extent.truncate_seq = op.extent.truncate_seq;
+ read_op.op.extent.truncate_size = op.extent.truncate_size;
+
+ int result = do_osd_ops(ctx, read_ops);
+ if (result < 0) {
+ derr << __func__ << " failed " << result << dendl;
+ return result;
+ }
+ return finish_extent_cmp(osd_op, read_op.outdata);
+}
+
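+// Compare the client-supplied buffer (osd_op.indata) byte by byte with the
+// data read from the object; bytes beyond the end of the read data compare
+// against zero. On the first mismatch return -MAX_ERRNO - idx so the client
+// can recover the offset of the mismatching byte.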
+int PrimaryLogPG::finish_extent_cmp(OSDOp& osd_op, const bufferlist &read_bl)
+{
+ for (uint64_t idx = 0; idx < osd_op.indata.length(); ++idx) {
+ char read_byte = (idx < read_bl.length() ? read_bl[idx] : 0);
+ if (osd_op.indata[idx] != read_byte) {
+ return (-MAX_ERRNO - idx);
+ }
+ }
+
+ return 0;
+}
+
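+// Handle CEPH_OSD_OP_READ (and SYNC_READ on replicated pools): apply
+// truncate_seq/truncate_size trimming, then either queue an async read
+// (erasure-coded pools) or read synchronously and, for full-object reads,
+// verify the stored data digest.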
+int PrimaryLogPG::do_read(OpContext *ctx, OSDOp& osd_op) {
+ dout(20) << __func__ << dendl;
+ auto& op = osd_op.op;
+ auto& oi = ctx->new_obs.oi;
+ auto& soid = oi.soid;
+ __u32 seq = oi.truncate_seq;
+ uint64_t size = oi.size;
+ bool trimmed_read = false;
+
+ dout(30) << __func__ << " oi.size: " << oi.size << dendl;
+ dout(30) << __func__ << " oi.truncate_seq: " << oi.truncate_seq << dendl;
+ dout(30) << __func__ << " op.extent.truncate_seq: " << op.extent.truncate_seq << dendl;
+ dout(30) << __func__ << " op.extent.truncate_size: " << op.extent.truncate_size << dendl;
+
+ // are we beyond truncate_size?
+ if ( (seq < op.extent.truncate_seq) &&
+ (op.extent.offset + op.extent.length > op.extent.truncate_size) &&
+ (size > op.extent.truncate_size) )
+ size = op.extent.truncate_size;
+
+ if (op.extent.length == 0) // length of zero means read the whole object
+ op.extent.length = size;
+
+ if (op.extent.offset >= size) {
+ op.extent.length = 0;
+ trimmed_read = true;
+ } else if (op.extent.offset + op.extent.length > size) {
+ op.extent.length = size - op.extent.offset;
+ trimmed_read = true;
+ }
+
+ dout(30) << __func__ << " op.extent.length is now " << op.extent.length << dendl;
+
+ // read into a buffer
+ int result = 0;
+ if (trimmed_read && op.extent.length == 0) {
+ // read size was trimmed to zero and we are expected to do nothing;
+ // a read operation of 0 bytes does *not* do nothing, which is why
+ // the trimmed_read boolean is needed
+ } else if (pool.info.is_erasure()) {
+ // The initialisation below is required to silence a false positive
+ // -Wmaybe-uninitialized warning
+ std::optional<uint32_t> maybe_crc;
+ // If there is a data digest and it is possible we are reading the
+ // entire object, pass the digest. FillInVerifyExtent will
+ // check the oi.size again.
+ if (oi.is_data_digest() && op.extent.offset == 0 &&
+ op.extent.length >= oi.size)
+ maybe_crc = oi.data_digest;
+ ctx->pending_async_reads.push_back(
+ make_pair(
+ boost::make_tuple(op.extent.offset, op.extent.length, op.flags),
+ make_pair(&osd_op.outdata,
+ new FillInVerifyExtent(&op.extent.length, &osd_op.rval,
+ &osd_op.outdata, maybe_crc, oi.size,
+ osd, soid, op.flags))));
+ dout(10) << " async_read noted for " << soid << dendl;
+
+ ctx->op_finishers[ctx->current_osd_subop_num].reset(
+ new ReadFinisher(osd_op));
+ } else {
+ int r = pgbackend->objects_read_sync(
+ soid, op.extent.offset, op.extent.length, op.flags, &osd_op.outdata);
+ // whole object? can we verify the checksum?
+ if (r >= 0 && op.extent.offset == 0 &&
+ (uint64_t)r == oi.size && oi.is_data_digest()) {
+ uint32_t crc = osd_op.outdata.crc32c(-1);
+ if (oi.data_digest != crc) {
+ osd->clog->error() << info.pgid << std::hex
+ << " full-object read crc 0x" << crc
+ << " != expected 0x" << oi.data_digest
+ << std::dec << " on " << soid;
+ r = -EIO; // try repair later
+ }
+ }
+ if (r == -EIO) {
+ r = rep_repair_primary_object(soid, ctx);
+ }
+ if (r >= 0)
+ op.extent.length = r;
+ else if (r == -EAGAIN) {
+ result = -EAGAIN;
+ } else {
+ result = r;
+ op.extent.length = 0;
+ }
+ dout(10) << " read got " << r << " / " << op.extent.length
+ << " bytes from obj " << soid << dendl;
+ }
+ if (result >= 0) {
+ ctx->delta_stats.num_rd_kb += shift_round_up(op.extent.length, 10);
+ ctx->delta_stats.num_rd++;
+ }
+ return result;
+}
+
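+// Handle CEPH_OSD_OP_SPARSE_READ: on erasure-coded pools translate it into
+// a plain async read; otherwise fiemap the object's extents, read them,
+// verify the data digest when the whole object was read, and encode the
+// extent map followed by the data.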
+int PrimaryLogPG::do_sparse_read(OpContext *ctx, OSDOp& osd_op) {
+ dout(20) << __func__ << dendl;
+ auto& op = osd_op.op;
+ auto& oi = ctx->new_obs.oi;
+ auto& soid = oi.soid;
+
+ if (op.extent.truncate_seq) {
+ dout(0) << "sparse_read does not support truncation sequence " << dendl;
+ return -EINVAL;
+ }
+
+ ++ctx->num_read;
+ if (pool.info.is_erasure()) {
+ // translate sparse read to a normal one if not supported
+ uint64_t offset = op.extent.offset;
+ uint64_t length = op.extent.length;
+ if (offset > oi.size) {
+ length = 0;
+ } else if (offset + length > oi.size) {
+ length = oi.size - offset;
+ }
+
+ if (length > 0) {
+ ctx->pending_async_reads.push_back(
+ make_pair(
+ boost::make_tuple(offset, length, op.flags),
+ make_pair(
+ &osd_op.outdata,
+ new ToSparseReadResult(&osd_op.rval, &osd_op.outdata, offset,
+ &op.extent.length))));
+ dout(10) << " async_read (was sparse_read) noted for " << soid << dendl;
+
+ ctx->op_finishers[ctx->current_osd_subop_num].reset(
+ new ReadFinisher(osd_op));
+ } else {
+ dout(10) << " sparse read ended up empty for " << soid << dendl;
+ map<uint64_t, uint64_t> extents;
+ encode(extents, osd_op.outdata);
+ }
+ } else {
+ // read into a buffer
+ map<uint64_t, uint64_t> m;
+ int r = osd->store->fiemap(ch, ghobject_t(soid, ghobject_t::NO_GEN,
+ info.pgid.shard),
+ op.extent.offset, op.extent.length, m);
+ if (r < 0) {
+ return r;
+ }
+
+ bufferlist data_bl;
+ r = pgbackend->objects_readv_sync(soid, std::move(m), op.flags, &data_bl);
+ if (r == -EIO) {
+ r = rep_repair_primary_object(soid, ctx);
+ }
+ if (r < 0) {
+ return r;
+ }
+
+ // Why does SPARSE_READ need a checksum? In practice librbd always uses
+ // sparse-read. At first there may not be many whole objects, but with
+ // continued use more and more whole objects exist, so verifying the
+ // checksum on a whole-object sparse-read makes sense.
+ if ((uint64_t)r == oi.size && oi.is_data_digest()) {
+ uint32_t crc = data_bl.crc32c(-1);
+ if (oi.data_digest != crc) {
+ osd->clog->error() << info.pgid << std::hex
+ << " full-object read crc 0x" << crc
+ << " != expected 0x" << oi.data_digest
+ << std::dec << " on " << soid;
+ r = rep_repair_primary_object(soid, ctx);
+ if (r < 0) {
+ return r;
+ }
+ }
+ }
+
+ op.extent.length = r;
+
+ encode(m, osd_op.outdata); // re-encode since it might be modified
+ ::encode_destructively(data_bl, osd_op.outdata);
+
+ dout(10) << " sparse_read got " << r << " bytes from object "
+ << soid << dendl;
+ }
+
+ ctx->delta_stats.num_rd_kb += shift_round_up(op.extent.length, 10);
+ ctx->delta_stats.num_rd++;
+ return 0;
+}
+
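+// Execute each sub-op of a client request in order, dispatching on op.op.
+// A result of -EINPROGRESS means an async read was queued; the sub-op is
+// completed later through the OpFinisher registered for it.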
+int PrimaryLogPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
+{
+ int result = 0;
+ SnapSetContext *ssc = ctx->obc->ssc;
+ ObjectState& obs = ctx->new_obs;
+ object_info_t& oi = obs.oi;
+ const hobject_t& soid = oi.soid;
+ const bool skip_data_digest = osd->store->has_builtin_csum() &&
+ osd->osd_skip_data_digest;
+
+ PGTransaction* t = ctx->op_t.get();
+
+ dout(10) << "do_osd_op " << soid << " " << ops << dendl;
+#ifdef HAVE_JAEGER
+ if (ctx->op->osd_parent_span) {
+ auto do_osd_op_span = jaeger_tracing::child_span(__func__, ctx->op->osd_parent_span);
+ }
+#endif
+
+ ctx->current_osd_subop_num = 0;
+ for (auto p = ops.begin(); p != ops.end(); ++p, ctx->current_osd_subop_num++, ctx->processed_subop_count++) {
+ OSDOp& osd_op = *p;
+ ceph_osd_op& op = osd_op.op;
+
+ OpFinisher* op_finisher = nullptr;
+ {
+ auto op_finisher_it = ctx->op_finishers.find(ctx->current_osd_subop_num);
+ if (op_finisher_it != ctx->op_finishers.end()) {
+ op_finisher = op_finisher_it->second.get();
+ }
+ }
+
+ // TODO: check endianness (ceph_le32 vs uint32_t, etc.)
+ // The fields in ceph_osd_op are little-endian (according to the definition in rados.h),
+ // but the code in this function seems to treat them as native-endian. What should the
+ // tracepoints do?
+ tracepoint(osd, do_osd_op_pre, soid.oid.name.c_str(), soid.snap.val, op.op, ceph_osd_op_name(op.op), op.flags);
+
+ dout(10) << "do_osd_op " << osd_op << dendl;
+
+ auto bp = osd_op.indata.cbegin();
+
+ // user-visible modification?
+ switch (op.op) {
+ // non user-visible modifications
+ case CEPH_OSD_OP_WATCH:
+ case CEPH_OSD_OP_CACHE_EVICT:
+ case CEPH_OSD_OP_CACHE_FLUSH:
+ case CEPH_OSD_OP_CACHE_TRY_FLUSH:
+ case CEPH_OSD_OP_UNDIRTY:
+ case CEPH_OSD_OP_COPY_FROM: // we handle user_version update explicitly
+ case CEPH_OSD_OP_COPY_FROM2:
+ case CEPH_OSD_OP_CACHE_PIN:
+ case CEPH_OSD_OP_CACHE_UNPIN:
+ case CEPH_OSD_OP_SET_REDIRECT:
+ case CEPH_OSD_OP_SET_CHUNK:
+ case CEPH_OSD_OP_TIER_PROMOTE:
+ case CEPH_OSD_OP_TIER_FLUSH:
+ case CEPH_OSD_OP_TIER_EVICT:
+ break;
+ default:
+ if (op.op & CEPH_OSD_OP_MODE_WR)
+ ctx->user_modify = true;
+ }
+
+ // munge -1 truncate to 0 truncate
+ if (ceph_osd_op_uses_extent(op.op) &&
+ op.extent.truncate_seq == 1 &&
+ op.extent.truncate_size == (-1ULL)) {
+ op.extent.truncate_size = 0;
+ op.extent.truncate_seq = 0;
+ }
+
+ // munge ZERO -> TRUNCATE? (don't munge to DELETE or we risk hosing attributes)
+ if (op.op == CEPH_OSD_OP_ZERO &&
+ obs.exists &&
+ op.extent.offset < static_cast<Option::size_t>(osd->osd_max_object_size) &&
+ op.extent.length >= 1 &&
+ op.extent.length <= static_cast<Option::size_t>(osd->osd_max_object_size) &&
+ op.extent.offset + op.extent.length >= oi.size) {
+ if (op.extent.offset >= oi.size) {
+ // no-op
+ goto fail;
+ }
+ dout(10) << " munging ZERO " << op.extent.offset << "~" << op.extent.length
+ << " -> TRUNCATE " << op.extent.offset << " (old size is " << oi.size << ")" << dendl;
+ op.op = CEPH_OSD_OP_TRUNCATE;
+ }
+
+ switch (op.op) {
+
+ // --- READS ---
+
+ case CEPH_OSD_OP_CMPEXT:
+ ++ctx->num_read;
+ tracepoint(osd, do_osd_op_pre_extent_cmp, soid.oid.name.c_str(),
+ soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset,
+ op.extent.length, op.extent.truncate_size,
+ op.extent.truncate_seq);
+
+ if (op_finisher == nullptr) {
+ result = do_extent_cmp(ctx, osd_op);
+ } else {
+ result = op_finisher->execute();
+ }
+ break;
+
+ case CEPH_OSD_OP_SYNC_READ:
+ if (pool.info.is_erasure()) {
+ result = -EOPNOTSUPP;
+ break;
+ }
+ // fall through
+ case CEPH_OSD_OP_READ:
+ ++ctx->num_read;
+ tracepoint(osd, do_osd_op_pre_read, soid.oid.name.c_str(),
+ soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset,
+ op.extent.length, op.extent.truncate_size,
+ op.extent.truncate_seq);
+ if (op_finisher == nullptr) {
+ if (!ctx->data_off) {
+ ctx->data_off = op.extent.offset;
+ }
+ result = do_read(ctx, osd_op);
+ } else {
+ result = op_finisher->execute();
+ }
+ break;
+
+ case CEPH_OSD_OP_CHECKSUM:
+ ++ctx->num_read;
+ {
+ tracepoint(osd, do_osd_op_pre_checksum, soid.oid.name.c_str(),
+ soid.snap.val, oi.size, oi.truncate_seq, op.checksum.type,
+ op.checksum.offset, op.checksum.length,
+ op.checksum.chunk_size);
+
+ if (op_finisher == nullptr) {
+ result = do_checksum(ctx, osd_op, &bp);
+ } else {
+ result = op_finisher->execute();
+ }
+ }
+ break;
+
+ /* map extents */
+ case CEPH_OSD_OP_MAPEXT:
+ tracepoint(osd, do_osd_op_pre_mapext, soid.oid.name.c_str(), soid.snap.val, op.extent.offset, op.extent.length);
+ if (pool.info.is_erasure()) {
+ result = -EOPNOTSUPP;
+ break;
+ }
+ ++ctx->num_read;
+ {
+ // read into a buffer
+ bufferlist bl;
+ int r = osd->store->fiemap(ch, ghobject_t(soid, ghobject_t::NO_GEN,
+ info.pgid.shard),
+ op.extent.offset, op.extent.length, bl);
+ osd_op.outdata = std::move(bl);
+ if (r < 0)
+ result = r;
+ else
+ ctx->delta_stats.num_rd_kb += shift_round_up(bl.length(), 10);
+ ctx->delta_stats.num_rd++;
+ dout(10) << " map_extents done on object " << soid << dendl;
+ }
+ break;
+
+ /* sparse read */
+ case CEPH_OSD_OP_SPARSE_READ:
+ tracepoint(osd, do_osd_op_pre_sparse_read, soid.oid.name.c_str(),
+ soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset,
+ op.extent.length, op.extent.truncate_size,
+ op.extent.truncate_seq);
+ if (op_finisher == nullptr) {
+ result = do_sparse_read(ctx, osd_op);
+ } else {
+ result = op_finisher->execute();
+ }
+ break;
+
+ case CEPH_OSD_OP_CALL:
+ {
+ string cname, mname;
+ bufferlist indata;
+ try {
+ bp.copy(op.cls.class_len, cname);
+ bp.copy(op.cls.method_len, mname);
+ bp.copy(op.cls.indata_len, indata);
+ } catch (ceph::buffer::error& e) {
+ dout(10) << "call unable to decode class + method + indata" << dendl;
+ dout(30) << "in dump: ";
+ osd_op.indata.hexdump(*_dout);
+ *_dout << dendl;
+ result = -EINVAL;
+ tracepoint(osd, do_osd_op_pre_call, soid.oid.name.c_str(), soid.snap.val, "???", "???");
+ break;
+ }
+ tracepoint(osd, do_osd_op_pre_call, soid.oid.name.c_str(), soid.snap.val, cname.c_str(), mname.c_str());
+
+ ClassHandler::ClassData *cls;
+ result = ClassHandler::get_instance().open_class(cname, &cls);
+ ceph_assert(result == 0); // init_op_flags() already verified this works.
+
+ ClassHandler::ClassMethod *method = cls->get_method(mname);
+ if (!method) {
+ dout(10) << "call method " << cname << "." << mname << " does not exist" << dendl;
+ result = -EOPNOTSUPP;
+ break;
+ }
+
+ int flags = method->get_flags();
+ if (flags & CLS_METHOD_WR)
+ ctx->user_modify = true;
+
+ bufferlist outdata;
+ dout(10) << "call method " << cname << "." << mname << dendl;
+ int prev_rd = ctx->num_read;
+ int prev_wr = ctx->num_write;
+ result = method->exec((cls_method_context_t)&ctx, indata, outdata);
+
+ if (ctx->num_read > prev_rd && !(flags & CLS_METHOD_RD)) {
+ derr << "method " << cname << "." << mname << " tried to read object but is not marked RD" << dendl;
+ result = -EIO;
+ break;
+ }
+ if (ctx->num_write > prev_wr && !(flags & CLS_METHOD_WR)) {
+ derr << "method " << cname << "." << mname << " tried to update object but is not marked WR" << dendl;
+ result = -EIO;
+ break;
+ }
+
+ dout(10) << "method called response length=" << outdata.length() << dendl;
+ op.extent.length = outdata.length();
+ osd_op.outdata.claim_append(outdata);
+ dout(30) << "out dump: ";
+ osd_op.outdata.hexdump(*_dout);
+ *_dout << dendl;
+ }
+ break;
+
+ case CEPH_OSD_OP_STAT:
+ // note: stat does not require RD
+ {
+ tracepoint(osd, do_osd_op_pre_stat, soid.oid.name.c_str(), soid.snap.val);
+
+ if (obs.exists && !oi.is_whiteout()) {
+ encode(oi.size, osd_op.outdata);
+ encode(oi.mtime, osd_op.outdata);
+ dout(10) << "stat oi has " << oi.size << " " << oi.mtime << dendl;
+ } else {
+ result = -ENOENT;
+ dout(10) << "stat oi object does not exist" << dendl;
+ }
+
+ ctx->delta_stats.num_rd++;
+ }
+ break;
+
+ case CEPH_OSD_OP_ISDIRTY:
+ ++ctx->num_read;
+ {
+ tracepoint(osd, do_osd_op_pre_isdirty, soid.oid.name.c_str(), soid.snap.val);
+ bool is_dirty = obs.oi.is_dirty();
+ encode(is_dirty, osd_op.outdata);
+ ctx->delta_stats.num_rd++;
+ result = 0;
+ }
+ break;
+
+ case CEPH_OSD_OP_UNDIRTY:
+ ++ctx->num_write;
+ result = 0;
+ {
+ tracepoint(osd, do_osd_op_pre_undirty, soid.oid.name.c_str(), soid.snap.val);
+ if (oi.is_dirty()) {
+ ctx->undirty = true; // see make_writeable()
+ ctx->modify = true;
+ ctx->delta_stats.num_wr++;
+ }
+ }
+ break;
+
+ case CEPH_OSD_OP_CACHE_TRY_FLUSH:
+ ++ctx->num_write;
+ result = 0;
+ {
+ tracepoint(osd, do_osd_op_pre_try_flush, soid.oid.name.c_str(), soid.snap.val);
+ if (ctx->lock_type != RWState::RWNONE) {
+ dout(10) << "cache-try-flush without SKIPRWLOCKS flag set" << dendl;
+ result = -EINVAL;
+ break;
+ }
+ if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE || obs.oi.has_manifest()) {
+ result = -EINVAL;
+ break;
+ }
+ if (!obs.exists) {
+ result = 0;
+ break;
+ }
+ if (oi.is_cache_pinned()) {
+ dout(10) << "cache-try-flush on a pinned object, consider unpin this object first" << dendl;
+ result = -EPERM;
+ break;
+ }
+ if (oi.is_dirty()) {
+ result = start_flush(ctx->op, ctx->obc, false, NULL, std::nullopt);
+ if (result == -EINPROGRESS)
+ result = -EAGAIN;
+ } else {
+ result = 0;
+ }
+ }
+ break;
+
+ case CEPH_OSD_OP_CACHE_FLUSH:
+ ++ctx->num_write;
+ result = 0;
+ {
+ tracepoint(osd, do_osd_op_pre_cache_flush, soid.oid.name.c_str(), soid.snap.val);
+ if (ctx->lock_type == RWState::RWNONE) {
+ dout(10) << "cache-flush with SKIPRWLOCKS flag set" << dendl;
+ result = -EINVAL;
+ break;
+ }
+ if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE || obs.oi.has_manifest()) {
+ result = -EINVAL;
+ break;
+ }
+ if (!obs.exists) {
+ result = 0;
+ break;
+ }
+ if (oi.is_cache_pinned()) {
+ dout(10) << "cache-flush on a pinned object, consider unpin this object first" << dendl;
+ result = -EPERM;
+ break;
+ }
+ hobject_t missing;
+ if (oi.is_dirty()) {
+ result = start_flush(ctx->op, ctx->obc, true, &missing, std::nullopt);
+ if (result == -EINPROGRESS)
+ result = -EAGAIN;
+ } else {
+ result = 0;
+ }
+ // Check special return value which has set missing_return
+ if (result == -ENOENT) {
+ dout(10) << __func__ << " CEPH_OSD_OP_CACHE_FLUSH got ENOENT" << dendl;
+ ceph_assert(!missing.is_min());
+ wait_for_unreadable_object(missing, ctx->op);
+ // Error code which is used elsewhere when wait_for_unreadable_object() is used
+ result = -EAGAIN;
+ }
+ }
+ break;
+
+ case CEPH_OSD_OP_CACHE_EVICT:
+ ++ctx->num_write;
+ result = 0;
+ {
+ tracepoint(osd, do_osd_op_pre_cache_evict, soid.oid.name.c_str(), soid.snap.val);
+ if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE || obs.oi.has_manifest()) {
+ result = -EINVAL;
+ break;
+ }
+ if (!obs.exists) {
+ result = 0;
+ break;
+ }
+ if (oi.is_cache_pinned()) {
+ dout(10) << "cache-evict on a pinned object, consider unpin this object first" << dendl;
+ result = -EPERM;
+ break;
+ }
+ if (oi.is_dirty()) {
+ result = -EBUSY;
+ break;
+ }
+ if (!oi.watchers.empty()) {
+ result = -EBUSY;
+ break;
+ }
+ if (soid.snap == CEPH_NOSNAP) {
+ result = _verify_no_head_clones(soid, ssc->snapset);
+ if (result < 0)
+ break;
+ }
+ result = _delete_oid(ctx, true, false);
+ if (result >= 0) {
+ // mark that this is a cache eviction to avoid triggering normal
+ // make_writeable() clone creation in finish_ctx()
+ ctx->cache_operation = true;
+ }
+ osd->logger->inc(l_osd_tier_evict);
+ }
+ break;
+
+ case CEPH_OSD_OP_GETXATTR:
+ ++ctx->num_read;
+ {
+ string aname;
+ bp.copy(op.xattr.name_len, aname);
+ tracepoint(osd, do_osd_op_pre_getxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
+ string name = "_" + aname;
+ int r = getattr_maybe_cache(
+ ctx->obc,
+ name,
+ &(osd_op.outdata));
+ if (r >= 0) {
+ op.xattr.value_len = osd_op.outdata.length();
+ result = 0;
+ ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10);
+ } else
+ result = r;
+
+ ctx->delta_stats.num_rd++;
+ }
+ break;
+
+ case CEPH_OSD_OP_GETXATTRS:
+ ++ctx->num_read;
+ {
+ tracepoint(osd, do_osd_op_pre_getxattrs, soid.oid.name.c_str(), soid.snap.val);
+ map<string, bufferlist> out;
+ result = getattrs_maybe_cache(
+ ctx->obc,
+ &out);
+
+ bufferlist bl;
+ encode(out, bl);
+ ctx->delta_stats.num_rd_kb += shift_round_up(bl.length(), 10);
+ ctx->delta_stats.num_rd++;
+ osd_op.outdata.claim_append(bl);
+ }
+ break;
+
+ case CEPH_OSD_OP_CMPXATTR:
+ ++ctx->num_read;
+ {
+ string aname;
+ bp.copy(op.xattr.name_len, aname);
+ tracepoint(osd, do_osd_op_pre_cmpxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
+ string name = "_" + aname;
+ name[op.xattr.name_len + 1] = 0;
+
+ bufferlist xattr;
+ result = getattr_maybe_cache(
+ ctx->obc,
+ name,
+ &xattr);
+ if (result < 0 && result != -EEXIST && result != -ENODATA)
+ break;
+
+ ctx->delta_stats.num_rd++;
+ ctx->delta_stats.num_rd_kb += shift_round_up(xattr.length(), 10);
+
+ switch (op.xattr.cmp_mode) {
+ case CEPH_OSD_CMPXATTR_MODE_STRING:
+ {
+ string val;
+ bp.copy(op.xattr.value_len, val);
+ val[op.xattr.value_len] = 0;
+ dout(10) << "CEPH_OSD_OP_CMPXATTR name=" << name << " val=" << val
+ << " op=" << (int)op.xattr.cmp_op << " mode=" << (int)op.xattr.cmp_mode << dendl;
+ result = do_xattr_cmp_str(op.xattr.cmp_op, val, xattr);
+ }
+ break;
+
+ case CEPH_OSD_CMPXATTR_MODE_U64:
+ {
+ uint64_t u64val;
+ try {
+ decode(u64val, bp);
+ }
+ catch (ceph::buffer::error& e) {
+ result = -EINVAL;
+ goto fail;
+ }
+ dout(10) << "CEPH_OSD_OP_CMPXATTR name=" << name << " val=" << u64val
+ << " op=" << (int)op.xattr.cmp_op << " mode=" << (int)op.xattr.cmp_mode << dendl;
+ result = do_xattr_cmp_u64(op.xattr.cmp_op, u64val, xattr);
+ }
+ break;
+
+ default:
+ dout(10) << "bad cmp mode " << (int)op.xattr.cmp_mode << dendl;
+ result = -EINVAL;
+ }
+
+ if (!result) {
+ dout(10) << "comparison returned false" << dendl;
+ result = -ECANCELED;
+ break;
+ }
+ if (result < 0) {
+ dout(10) << "comparison returned " << result << " " << cpp_strerror(-result) << dendl;
+ break;
+ }
+
+ dout(10) << "comparison returned true" << dendl;
+ }
+ break;
+
+ case CEPH_OSD_OP_ASSERT_VER:
+ ++ctx->num_read;
+ {
+ uint64_t ver = op.assert_ver.ver;
+ tracepoint(osd, do_osd_op_pre_assert_ver, soid.oid.name.c_str(), soid.snap.val, ver);
+ if (!ver)
+ result = -EINVAL;
+ else if (ver < oi.user_version)
+ result = -ERANGE;
+ else if (ver > oi.user_version)
+ result = -EOVERFLOW;
+ }
+ break;
+
+ case CEPH_OSD_OP_LIST_WATCHERS:
+ ++ctx->num_read;
+ {
+ tracepoint(osd, do_osd_op_pre_list_watchers, soid.oid.name.c_str(), soid.snap.val);
+ obj_list_watch_response_t resp;
+
+ map<pair<uint64_t, entity_name_t>, watch_info_t>::const_iterator oi_iter;
+ for (oi_iter = oi.watchers.begin(); oi_iter != oi.watchers.end();
+ ++oi_iter) {
+ dout(20) << "key cookie=" << oi_iter->first.first
+ << " entity=" << oi_iter->first.second << " "
+ << oi_iter->second << dendl;
+ ceph_assert(oi_iter->first.first == oi_iter->second.cookie);
+ ceph_assert(oi_iter->first.second.is_client());
+
+ watch_item_t wi(oi_iter->first.second, oi_iter->second.cookie,
+ oi_iter->second.timeout_seconds, oi_iter->second.addr);
+ resp.entries.push_back(wi);
+ }
+
+ resp.encode(osd_op.outdata, ctx->get_features());
+ result = 0;
+
+ ctx->delta_stats.num_rd++;
+ break;
+ }
+
+ case CEPH_OSD_OP_LIST_SNAPS:
+ ++ctx->num_read;
+ {
+ tracepoint(osd, do_osd_op_pre_list_snaps, soid.oid.name.c_str(), soid.snap.val);
+ obj_list_snap_response_t resp;
+
+ if (!ssc) {
+ ssc = ctx->obc->ssc = get_snapset_context(soid, false);
+ }
+ ceph_assert(ssc);
+ dout(20) << " snapset " << ssc->snapset << dendl;
+
+ int clonecount = ssc->snapset.clones.size();
+ clonecount++; // for head
+ resp.clones.reserve(clonecount);
+ for (auto clone_iter = ssc->snapset.clones.begin();
+ clone_iter != ssc->snapset.clones.end(); ++clone_iter) {
+ clone_info ci;
+ ci.cloneid = *clone_iter;
+
+ hobject_t clone_oid = soid;
+ clone_oid.snap = *clone_iter;
+
+ auto p = ssc->snapset.clone_snaps.find(*clone_iter);
+ if (p == ssc->snapset.clone_snaps.end()) {
+ osd->clog->error() << "osd." << osd->whoami
+ << ": inconsistent clone_snaps found for oid "
+ << soid << " clone " << *clone_iter
+ << " snapset " << ssc->snapset;
+ result = -EINVAL;
+ break;
+ }
+ for (auto q = p->second.rbegin(); q != p->second.rend(); ++q) {
+ ci.snaps.push_back(*q);
+ }
+
+ dout(20) << " clone " << *clone_iter << " snaps " << ci.snaps << dendl;
+
+ map<snapid_t, interval_set<uint64_t> >::const_iterator coi;
+ coi = ssc->snapset.clone_overlap.find(ci.cloneid);
+ if (coi == ssc->snapset.clone_overlap.end()) {
+ osd->clog->error() << "osd." << osd->whoami
+ << ": inconsistent clone_overlap found for oid "
+ << soid << " clone " << *clone_iter;
+ result = -EINVAL;
+ break;
+ }
+ const interval_set<uint64_t> &o = coi->second;
+ ci.overlap.reserve(o.num_intervals());
+ for (interval_set<uint64_t>::const_iterator r = o.begin();
+ r != o.end(); ++r) {
+ ci.overlap.push_back(pair<uint64_t,uint64_t>(r.get_start(),
+ r.get_len()));
+ }
+
+ map<snapid_t, uint64_t>::const_iterator si;
+ si = ssc->snapset.clone_size.find(ci.cloneid);
+ if (si == ssc->snapset.clone_size.end()) {
+ osd->clog->error() << "osd." << osd->whoami
+ << ": inconsistent clone_size found for oid "
+ << soid << " clone " << *clone_iter;
+ result = -EINVAL;
+ break;
+ }
+ ci.size = si->second;
+
+ resp.clones.push_back(ci);
+ }
+ if (result < 0) {
+ break;
+ }
+ if (!ctx->obc->obs.oi.is_whiteout()) {
+ ceph_assert(obs.exists);
+ clone_info ci;
+ ci.cloneid = CEPH_NOSNAP;
+
+ //Size for HEAD is oi.size
+ ci.size = oi.size;
+
+ resp.clones.push_back(ci);
+ }
+ resp.seq = ssc->snapset.seq;
+
+ resp.encode(osd_op.outdata);
+ result = 0;
+
+ ctx->delta_stats.num_rd++;
+ break;
+ }
+
+ case CEPH_OSD_OP_NOTIFY:
+ ++ctx->num_read;
+ {
+ uint32_t timeout;
+ bufferlist bl;
+
+ try {
+ uint32_t ver; // obsolete
+ decode(ver, bp);
+ decode(timeout, bp);
+ decode(bl, bp);
+ } catch (const ceph::buffer::error &e) {
+ timeout = 0;
+ }
+ tracepoint(osd, do_osd_op_pre_notify, soid.oid.name.c_str(), soid.snap.val, timeout);
+ if (!timeout)
+ timeout = cct->_conf->osd_default_notify_timeout;
+
+ notify_info_t n;
+ n.timeout = timeout;
+ n.notify_id = osd->get_next_id(get_osdmap_epoch());
+ n.cookie = op.notify.cookie;
+ n.bl = bl;
+ ctx->notifies.push_back(n);
+
+ // return our unique notify id to the client
+ encode(n.notify_id, osd_op.outdata);
+ }
+ break;
+
+ case CEPH_OSD_OP_NOTIFY_ACK:
+ ++ctx->num_read;
+ {
+ try {
+ uint64_t notify_id = 0;
+ uint64_t watch_cookie = 0;
+ decode(notify_id, bp);
+ decode(watch_cookie, bp);
+ bufferlist reply_bl;
+ if (!bp.end()) {
+ decode(reply_bl, bp);
+ }
+ tracepoint(osd, do_osd_op_pre_notify_ack, soid.oid.name.c_str(), soid.snap.val, notify_id, watch_cookie, "Y");
+ OpContext::NotifyAck ack(notify_id, watch_cookie, reply_bl);
+ ctx->notify_acks.push_back(ack);
+ } catch (const ceph::buffer::error &e) {
+ tracepoint(osd, do_osd_op_pre_notify_ack, soid.oid.name.c_str(), soid.snap.val, op.watch.cookie, 0, "N");
+ OpContext::NotifyAck ack(
+ // op.watch.cookie is actually the notify_id for historical reasons
+ op.watch.cookie
+ );
+ ctx->notify_acks.push_back(ack);
+ }
+ }
+ break;
+
+ case CEPH_OSD_OP_SETALLOCHINT:
+ ++ctx->num_write;
+ result = 0;
+ {
+ tracepoint(osd, do_osd_op_pre_setallochint, soid.oid.name.c_str(), soid.snap.val, op.alloc_hint.expected_object_size, op.alloc_hint.expected_write_size);
+ maybe_create_new_object(ctx);
+ oi.expected_object_size = op.alloc_hint.expected_object_size;
+ oi.expected_write_size = op.alloc_hint.expected_write_size;
+ oi.alloc_hint_flags = op.alloc_hint.flags;
+ t->set_alloc_hint(soid, op.alloc_hint.expected_object_size,
+ op.alloc_hint.expected_write_size,
+ op.alloc_hint.flags);
+ }
+ break;
+
+
+ // --- WRITES ---
+
+ // -- object data --
+
+ case CEPH_OSD_OP_WRITE:
+ ++ctx->num_write;
+ result = 0;
+ { // write
+ __u32 seq = oi.truncate_seq;
+ tracepoint(osd, do_osd_op_pre_write, soid.oid.name.c_str(), soid.snap.val, oi.size, seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
+ if (op.extent.length != osd_op.indata.length()) {
+ result = -EINVAL;
+ break;
+ }
+
+ if (pool.info.has_flag(pg_pool_t::FLAG_WRITE_FADVISE_DONTNEED))
+ op.flags = op.flags | CEPH_OSD_OP_FLAG_FADVISE_DONTNEED;
+
+ if (pool.info.requires_aligned_append() &&
+ (op.extent.offset % pool.info.required_alignment() != 0)) {
+ result = -EOPNOTSUPP;
+ break;
+ }
+
+ if (!obs.exists) {
+ if (pool.info.requires_aligned_append() && op.extent.offset) {
+ result = -EOPNOTSUPP;
+ break;
+ }
+ } else if (op.extent.offset != oi.size &&
+ pool.info.requires_aligned_append()) {
+ result = -EOPNOTSUPP;
+ break;
+ }
+
+ if (seq && (seq > op.extent.truncate_seq) &&
+ (op.extent.offset + op.extent.length > oi.size)) {
+ // old write, arrived after trimtrunc
+ op.extent.length = (op.extent.offset > oi.size ? 0 : oi.size - op.extent.offset);
+ dout(10) << " old truncate_seq " << op.extent.truncate_seq << " < current " << seq
+ << ", adjusting write length to " << op.extent.length << dendl;
+ bufferlist t;
+ t.substr_of(osd_op.indata, 0, op.extent.length);
+ osd_op.indata.swap(t);
+ }
+ if (op.extent.truncate_seq > seq) {
+ // write arrives before trimtrunc
+ if (obs.exists && !oi.is_whiteout()) {
+ dout(10) << " truncate_seq " << op.extent.truncate_seq << " > current " << seq
+ << ", truncating to " << op.extent.truncate_size << dendl;
+ t->truncate(soid, op.extent.truncate_size);
+ oi.truncate_seq = op.extent.truncate_seq;
+ oi.truncate_size = op.extent.truncate_size;
+ if (oi.size > op.extent.truncate_size) {
+ interval_set<uint64_t> trim;
+ trim.insert(op.extent.truncate_size,
+ oi.size - op.extent.truncate_size);
+ ctx->modified_ranges.union_of(trim);
+ ctx->clean_regions.mark_data_region_dirty(op.extent.truncate_size, oi.size - op.extent.truncate_size);
+ oi.clear_data_digest();
+ }
+ if (op.extent.truncate_size != oi.size) {
+ truncate_update_size_and_usage(ctx->delta_stats,
+ oi,
+ op.extent.truncate_size);
+ }
+ } else {
+ dout(10) << " truncate_seq " << op.extent.truncate_seq << " > current " << seq
+ << ", but object is new" << dendl;
+ oi.truncate_seq = op.extent.truncate_seq;
+ oi.truncate_size = op.extent.truncate_size;
+ }
+ }
+ result = check_offset_and_length(
+ op.extent.offset, op.extent.length,
+ static_cast<Option::size_t>(osd->osd_max_object_size), get_dpp());
+ if (result < 0)
+ break;
+
+ maybe_create_new_object(ctx);
+
+ if (op.extent.length == 0) {
+ if (op.extent.offset > oi.size) {
+ t->truncate(
+ soid, op.extent.offset);
+ truncate_update_size_and_usage(ctx->delta_stats, oi,
+ op.extent.offset);
+ } else {
+ t->nop(soid);
+ }
+ } else {
+ t->write(
+ soid, op.extent.offset, op.extent.length, osd_op.indata, op.flags);
+ }
+
+ if (op.extent.offset == 0 && op.extent.length >= oi.size
+ && !skip_data_digest) {
+ obs.oi.set_data_digest(osd_op.indata.crc32c(-1));
+ } else if (op.extent.offset == oi.size && obs.oi.is_data_digest()) {
+ if (skip_data_digest) {
+ obs.oi.clear_data_digest();
+ } else {
+ obs.oi.set_data_digest(osd_op.indata.crc32c(obs.oi.data_digest));
+ }
+ } else {
+ obs.oi.clear_data_digest();
+ }
+ write_update_size_and_usage(ctx->delta_stats, oi, ctx->modified_ranges,
+ op.extent.offset, op.extent.length);
+ ctx->clean_regions.mark_data_region_dirty(op.extent.offset, op.extent.length);
+ dout(10) << "clean_regions modified" << ctx->clean_regions << dendl;
+ }
+ break;
+
+ case CEPH_OSD_OP_WRITEFULL:
+ ++ctx->num_write;
+ result = 0;
+ { // write full object
+ tracepoint(osd, do_osd_op_pre_writefull, soid.oid.name.c_str(), soid.snap.val, oi.size, 0, op.extent.length);
+
+ if (op.extent.length != osd_op.indata.length()) {
+ result = -EINVAL;
+ break;
+ }
+ result = check_offset_and_length(
+ 0, op.extent.length,
+ static_cast<Option::size_t>(osd->osd_max_object_size), get_dpp());
+ if (result < 0)
+ break;
+
+ if (pool.info.has_flag(pg_pool_t::FLAG_WRITE_FADVISE_DONTNEED))
+ op.flags = op.flags | CEPH_OSD_OP_FLAG_FADVISE_DONTNEED;
+
+ maybe_create_new_object(ctx);
+ if (pool.info.is_erasure()) {
+ t->truncate(soid, 0);
+ } else if (obs.exists && op.extent.length < oi.size) {
+ t->truncate(soid, op.extent.length);
+ }
+ if (op.extent.length) {
+ t->write(soid, 0, op.extent.length, osd_op.indata, op.flags);
+ }
+ if (!skip_data_digest) {
+ obs.oi.set_data_digest(osd_op.indata.crc32c(-1));
+ } else {
+ obs.oi.clear_data_digest();
+ }
+ ctx->clean_regions.mark_data_region_dirty(0,
+ std::max((uint64_t)op.extent.length, oi.size));
+ write_update_size_and_usage(ctx->delta_stats, oi, ctx->modified_ranges,
+ 0, op.extent.length, true);
+ }
+ break;
+
+ case CEPH_OSD_OP_WRITESAME:
+ ++ctx->num_write;
+ tracepoint(osd, do_osd_op_pre_writesame, soid.oid.name.c_str(), soid.snap.val, oi.size, op.writesame.offset, op.writesame.length, op.writesame.data_length);
+ result = do_writesame(ctx, osd_op);
+ break;
+
+ case CEPH_OSD_OP_ROLLBACK :
+ ++ctx->num_write;
+ tracepoint(osd, do_osd_op_pre_rollback, soid.oid.name.c_str(), soid.snap.val);
+ result = _rollback_to(ctx, op);
+ break;
+
+ case CEPH_OSD_OP_ZERO:
+ tracepoint(osd, do_osd_op_pre_zero, soid.oid.name.c_str(), soid.snap.val, op.extent.offset, op.extent.length);
+ if (pool.info.requires_aligned_append()) {
+ result = -EOPNOTSUPP;
+ break;
+ }
+ ++ctx->num_write;
+ { // zero
+ result = check_offset_and_length(
+ op.extent.offset, op.extent.length,
+ static_cast<Option::size_t>(osd->osd_max_object_size), get_dpp());
+ if (result < 0)
+ break;
+
+ ceph_assert(op.extent.length);
+ if (obs.exists && !oi.is_whiteout()) {
+ t->zero(soid, op.extent.offset, op.extent.length);
+ interval_set<uint64_t> ch;
+ ch.insert(op.extent.offset, op.extent.length);
+ ctx->modified_ranges.union_of(ch);
+ ctx->clean_regions.mark_data_region_dirty(op.extent.offset, op.extent.length);
+ ctx->delta_stats.num_wr++;
+ oi.clear_data_digest();
+ } else {
+ // no-op
+ }
+ }
+ break;
+ case CEPH_OSD_OP_CREATE:
+ ++ctx->num_write;
+ result = 0;
+ {
+ tracepoint(osd, do_osd_op_pre_create, soid.oid.name.c_str(), soid.snap.val);
+ if (obs.exists && !oi.is_whiteout() &&
+ (op.flags & CEPH_OSD_OP_FLAG_EXCL)) {
+ result = -EEXIST; /* this is an exclusive create */
+ } else {
+ if (osd_op.indata.length()) {
+ auto p = osd_op.indata.cbegin();
+ string category;
+ try {
+ decode(category, p);
+ }
+ catch (ceph::buffer::error& e) {
+ result = -EINVAL;
+ goto fail;
+ }
+ // category is no longer implemented.
+ }
+ maybe_create_new_object(ctx);
+ t->nop(soid);
+ }
+ }
+ break;
+
+ case CEPH_OSD_OP_TRIMTRUNC:
+ op.extent.offset = op.extent.truncate_size;
+ // falling through
+
+ case CEPH_OSD_OP_TRUNCATE:
+ tracepoint(osd, do_osd_op_pre_truncate, soid.oid.name.c_str(), soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
+ if (pool.info.requires_aligned_append()) {
+ result = -EOPNOTSUPP;
+ break;
+ }
+ ++ctx->num_write;
+ result = 0;
+ {
+ // truncate
+ if (!obs.exists || oi.is_whiteout()) {
+ dout(10) << " object dne, truncate is a no-op" << dendl;
+ break;
+ }
+
+ result = check_offset_and_length(
+ op.extent.offset, op.extent.length,
+ static_cast<Option::size_t>(osd->osd_max_object_size), get_dpp());
+ if (result < 0)
+ break;
+
+ if (op.extent.truncate_seq) {
+ ceph_assert(op.extent.offset == op.extent.truncate_size);
+ if (op.extent.truncate_seq <= oi.truncate_seq) {
+ dout(10) << " truncate seq " << op.extent.truncate_seq << " <= current " << oi.truncate_seq
+ << ", no-op" << dendl;
+ break; // old
+ }
+ dout(10) << " truncate seq " << op.extent.truncate_seq << " > current " << oi.truncate_seq
+ << ", truncating" << dendl;
+ oi.truncate_seq = op.extent.truncate_seq;
+ oi.truncate_size = op.extent.truncate_size;
+ }
+
+ maybe_create_new_object(ctx);
+ t->truncate(soid, op.extent.offset);
+ if (oi.size > op.extent.offset) {
+ interval_set<uint64_t> trim;
+ trim.insert(op.extent.offset, oi.size-op.extent.offset);
+ ctx->modified_ranges.union_of(trim);
+ ctx->clean_regions.mark_data_region_dirty(op.extent.offset, oi.size - op.extent.offset);
+ } else if (oi.size < op.extent.offset) {
+ ctx->clean_regions.mark_data_region_dirty(oi.size, op.extent.offset - oi.size);
+ }
+ if (op.extent.offset != oi.size) {
+ truncate_update_size_and_usage(ctx->delta_stats,
+ oi,
+ op.extent.offset);
+ }
+ ctx->delta_stats.num_wr++;
+ // do not set exists, or we will break the above ZERO -> TRUNCATE munging.
+
+ oi.clear_data_digest();
+ }
+ break;
+
+ case CEPH_OSD_OP_DELETE:
+ ++ctx->num_write;
+ result = 0;
+ tracepoint(osd, do_osd_op_pre_delete, soid.oid.name.c_str(), soid.snap.val);
+ {
+ result = _delete_oid(ctx, false, ctx->ignore_cache);
+ }
+ break;
+
+ case CEPH_OSD_OP_WATCH:
+ ++ctx->num_write;
+ result = 0;
+ {
+ tracepoint(osd, do_osd_op_pre_watch, soid.oid.name.c_str(), soid.snap.val,
+ op.watch.cookie, op.watch.op);
+ if (!obs.exists) {
+ result = -ENOENT;
+ break;
+ }
+ result = 0;
+ uint64_t cookie = op.watch.cookie;
+ entity_name_t entity = ctx->reqid.name;
+ ObjectContextRef obc = ctx->obc;
+
+ dout(10) << "watch " << ceph_osd_watch_op_name(op.watch.op)
+ << ": ctx->obc=" << (void *)obc.get() << " cookie=" << cookie
+ << " oi.version=" << oi.version.version << " ctx->at_version=" << ctx->at_version << dendl;
+ dout(10) << "watch: oi.user_version=" << oi.user_version<< dendl;
+ dout(10) << "watch: peer_addr="
+ << ctx->op->get_req()->get_connection()->get_peer_addr() << dendl;
+
+ uint32_t timeout = cct->_conf->osd_client_watch_timeout;
+ if (op.watch.timeout != 0) {
+ timeout = op.watch.timeout;
+ }
+
+ watch_info_t w(cookie, timeout,
+ ctx->op->get_req()->get_connection()->get_peer_addr());
+ if (op.watch.op == CEPH_OSD_WATCH_OP_WATCH ||
+ op.watch.op == CEPH_OSD_WATCH_OP_LEGACY_WATCH) {
+ if (oi.watchers.count(make_pair(cookie, entity))) {
+ dout(10) << " found existing watch " << w << " by " << entity << dendl;
+ } else {
+ dout(10) << " registered new watch " << w << " by " << entity << dendl;
+ oi.watchers[make_pair(cookie, entity)] = w;
+ t->nop(soid); // make sure we update the object_info on disk!
+ }
+ bool will_ping = (op.watch.op == CEPH_OSD_WATCH_OP_WATCH);
+ ctx->watch_connects.push_back(make_pair(w, will_ping));
+ } else if (op.watch.op == CEPH_OSD_WATCH_OP_RECONNECT) {
+ if (!oi.watchers.count(make_pair(cookie, entity))) {
+ result = -ENOTCONN;
+ break;
+ }
+ dout(10) << " found existing watch " << w << " by " << entity << dendl;
+ ctx->watch_connects.push_back(make_pair(w, true));
+ } else if (op.watch.op == CEPH_OSD_WATCH_OP_PING) {
+ /* Note: WATCH with PING doesn't cause may_write() to return true,
+ * so if there is nothing else in the transaction, this is going
+ * to run do_osd_op_effects, but not write out a log entry */
+ if (!oi.watchers.count(make_pair(cookie, entity))) {
+ result = -ENOTCONN;
+ break;
+ }
+ map<pair<uint64_t,entity_name_t>,WatchRef>::iterator p =
+ obc->watchers.find(make_pair(cookie, entity));
+ if (p == obc->watchers.end() ||
+ !p->second->is_connected()) {
+ // client needs to reconnect
+ result = -ETIMEDOUT;
+ break;
+ }
+ dout(10) << " found existing watch " << w << " by " << entity << dendl;
+ p->second->got_ping(ceph_clock_now());
+ result = 0;
+ } else if (op.watch.op == CEPH_OSD_WATCH_OP_UNWATCH) {
+ map<pair<uint64_t, entity_name_t>, watch_info_t>::iterator oi_iter =
+ oi.watchers.find(make_pair(cookie, entity));
+ if (oi_iter != oi.watchers.end()) {
+ dout(10) << " removed watch " << oi_iter->second << " by "
+ << entity << dendl;
+ oi.watchers.erase(oi_iter);
+ t->nop(soid); // update oi on disk
+ ctx->watch_disconnects.push_back(
+ watch_disconnect_t(cookie, entity, false));
+ } else {
+ dout(10) << " can't remove: no watch by " << entity << dendl;
+ }
+ }
+ }
+ break;
+
+ case CEPH_OSD_OP_CACHE_PIN:
+ tracepoint(osd, do_osd_op_pre_cache_pin, soid.oid.name.c_str(), soid.snap.val);
+ if ((!pool.info.is_tier() ||
+ pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)) {
+ result = -EINVAL;
+ dout(10) << " pin object is only allowed on the cache tier " << dendl;
+ break;
+ }
+ ++ctx->num_write;
+ result = 0;
+ {
+ if (!obs.exists || oi.is_whiteout()) {
+ result = -ENOENT;
+ break;
+ }
+
+ if (!oi.is_cache_pinned()) {
+ oi.set_flag(object_info_t::FLAG_CACHE_PIN);
+ ctx->modify = true;
+ ctx->delta_stats.num_objects_pinned++;
+ ctx->delta_stats.num_wr++;
+ }
+ }
+ break;
+
+ case CEPH_OSD_OP_CACHE_UNPIN:
+ tracepoint(osd, do_osd_op_pre_cache_unpin, soid.oid.name.c_str(), soid.snap.val);
+ if ((!pool.info.is_tier() ||
+ pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)) {
+ result = -EINVAL;
+ dout(10) << " pin object is only allowed on the cache tier " << dendl;
+ break;
+ }
+ ++ctx->num_write;
+ result = 0;
+ {
+ if (!obs.exists || oi.is_whiteout()) {
+ result = -ENOENT;
+ break;
+ }
+
+ if (oi.is_cache_pinned()) {
+ oi.clear_flag(object_info_t::FLAG_CACHE_PIN);
+ ctx->modify = true;
+ ctx->delta_stats.num_objects_pinned--;
+ ctx->delta_stats.num_wr++;
+ }
+ }
+ break;
+
+ case CEPH_OSD_OP_SET_REDIRECT:
+ ++ctx->num_write;
+ result = 0;
+ {
+ if (pool.info.is_tier()) {
+ result = -EINVAL;
+ break;
+ }
+ if (!obs.exists) {
+ result = -ENOENT;
+ break;
+ }
+ if (get_osdmap()->require_osd_release < ceph_release_t::luminous) {
+ result = -EOPNOTSUPP;
+ break;
+ }
+
+ object_t target_name;
+ object_locator_t target_oloc;
+ snapid_t target_snapid = (uint64_t)op.copy_from.snapid;
+ version_t target_version = op.copy_from.src_version;
+ try {
+ decode(target_name, bp);
+ decode(target_oloc, bp);
+ }
+ catch (ceph::buffer::error& e) {
+ result = -EINVAL;
+ goto fail;
+ }
+ pg_t raw_pg;
+ result = get_osdmap()->object_locator_to_pg(target_name, target_oloc, raw_pg);
+ if (result < 0) {
+ dout(5) << " pool information is invalid: " << result << dendl;
+ break;
+ }
+ hobject_t target(target_name, target_oloc.key, target_snapid,
+ raw_pg.ps(), raw_pg.pool(),
+ target_oloc.nspace);
+ if (target == soid) {
+ dout(20) << " set-redirect self is invalid" << dendl;
+ result = -EINVAL;
+ break;
+ }
+
+ bool need_reference = (osd_op.op.flags & CEPH_OSD_OP_FLAG_WITH_REFERENCE);
+ bool has_reference = (oi.flags & object_info_t::FLAG_REDIRECT_HAS_REFERENCE);
+ if (has_reference) {
+ result = -EINVAL;
+ dout(5) << " the object is already a manifest " << dendl;
+ break;
+ }
+ if (op_finisher == nullptr && need_reference) {
+ // start
+ ctx->op_finishers[ctx->current_osd_subop_num].reset(
+ new SetManifestFinisher(osd_op));
+ ManifestOpRef mop = std::make_shared<ManifestOp>(new RefCountCallback(ctx, osd_op));
+ C_SetManifestRefCountDone* fin = new C_SetManifestRefCountDone(this, mop, soid);
+ ceph_tid_t tid = refcount_manifest(soid, target,
+ refcount_t::INCREMENT_REF, fin, std::nullopt);
+ mop->objecter_tid = tid;
+ manifest_ops[soid] = mop;
+ ctx->obc->start_block();
+ result = -EINPROGRESS;
+ } else {
+ // finish
+ if (op_finisher) {
+ result = op_finisher->execute();
+ ceph_assert(result == 0);
+ }
+
+ if (!oi.has_manifest() && !oi.manifest.is_redirect())
+ ctx->delta_stats.num_objects_manifest++;
+
+ oi.set_flag(object_info_t::FLAG_MANIFEST);
+ oi.manifest.redirect_target = target;
+ oi.manifest.type = object_manifest_t::TYPE_REDIRECT;
+ t->truncate(soid, 0);
+ ctx->clean_regions.mark_data_region_dirty(0, oi.size);
+ if (oi.is_omap() && pool.info.supports_omap()) {
+ t->omap_clear(soid);
+ obs.oi.clear_omap_digest();
+ obs.oi.clear_flag(object_info_t::FLAG_OMAP);
+ ctx->clean_regions.mark_omap_dirty();
+ }
+ write_update_size_and_usage(ctx->delta_stats, oi, ctx->modified_ranges,
+ 0, oi.size, false);
+ ctx->delta_stats.num_bytes -= oi.size;
+ oi.size = 0;
+ oi.new_object();
+ oi.user_version = target_version;
+ ctx->user_at_version = target_version;
+ /* rm_attrs */
+ map<string,bufferlist> rmattrs;
+ result = getattrs_maybe_cache(ctx->obc, &rmattrs);
+ if (result < 0) {
+ dout(10) << __func__ << " error: " << cpp_strerror(result) << dendl;
+ return result;
+ }
+ map<string, bufferlist>::iterator iter;
+ for (iter = rmattrs.begin(); iter != rmattrs.end(); ++iter) {
+ const string& name = iter->first;
+ t->rmattr(soid, name);
+ }
+ if (!has_reference && need_reference) {
+ oi.set_flag(object_info_t::FLAG_REDIRECT_HAS_REFERENCE);
+ }
+ dout(10) << "set-redirect oid:" << oi.soid << " user_version: " << oi.user_version << dendl;
+ if (op_finisher) {
+ ctx->op_finishers.erase(ctx->current_osd_subop_num);
+ }
+ }
+ }
+
+ break;
+
+ case CEPH_OSD_OP_SET_CHUNK:
+ ++ctx->num_write;
+ result = 0;
+ {
+ if (pool.info.is_tier()) {
+ result = -EINVAL;
+ break;
+ }
+ if (!obs.exists) {
+ result = -ENOENT;
+ break;
+ }
+ if (get_osdmap()->require_osd_release < ceph_release_t::luminous) {
+ result = -EOPNOTSUPP;
+ break;
+ }
+ if (oi.manifest.is_redirect()) {
+ result = -EINVAL;
+ goto fail;
+ }
+
+ object_locator_t tgt_oloc;
+ uint64_t src_offset, src_length, tgt_offset;
+ object_t tgt_name;
+ try {
+ decode(src_offset, bp);
+ decode(src_length, bp);
+ decode(tgt_oloc, bp);
+ decode(tgt_name, bp);
+ decode(tgt_offset, bp);
+ }
+ catch (ceph::buffer::error& e) {
+ result = -EINVAL;
+ goto fail;
+ }
+
+ if (!src_length) {
+ result = -EINVAL;
+ goto fail;
+ }
+ if (src_offset + src_length > oi.size) {
+ result = -ERANGE;
+ goto fail;
+ }
+ if (!(osd_op.op.flags & CEPH_OSD_OP_FLAG_WITH_REFERENCE)) {
+ result = -EOPNOTSUPP;
+ break;
+ }
+ if (pool.info.is_erasure()) {
+ result = -EOPNOTSUPP;
+ break;
+ }
+
+ for (auto &p : oi.manifest.chunk_map) {
+ interval_set<uint64_t> chunk;
+ chunk.insert(p.first, p.second.length);
+ if (chunk.intersects(src_offset, src_length)) {
+ dout(20) << __func__ << " overlapped !! offset: " << src_offset << " length: " << src_length
+ << " chunk_info: " << p << dendl;
+ result = -EOPNOTSUPP;
+ goto fail;
+ }
+ }
+
+ pg_t raw_pg;
+ chunk_info_t chunk_info;
+ result = get_osdmap()->object_locator_to_pg(tgt_name, tgt_oloc, raw_pg);
+ if (result < 0) {
+ dout(5) << " pool information is invalid: " << result << dendl;
+ break;
+ }
+ hobject_t target(tgt_name, tgt_oloc.key, snapid_t(),
+ raw_pg.ps(), raw_pg.pool(),
+ tgt_oloc.nspace);
+ bool has_reference = (oi.manifest.chunk_map.find(src_offset) != oi.manifest.chunk_map.end()) &&
+ (oi.manifest.chunk_map[src_offset].test_flag(chunk_info_t::FLAG_HAS_REFERENCE));
+ if (has_reference) {
+ result = -EINVAL;
+ dout(5) << " the object is already a manifest " << dendl;
+ break;
+ }
+ chunk_info.oid = target;
+ chunk_info.offset = tgt_offset;
+ chunk_info.length = src_length;
+ if (op_finisher == nullptr) {
+ // start
+ ctx->op_finishers[ctx->current_osd_subop_num].reset(
+ new SetManifestFinisher(osd_op));
+ object_manifest_t set_chunk;
+ bool need_inc_ref = false;
+ set_chunk.chunk_map[src_offset] = chunk_info;
+ need_inc_ref = inc_refcount_by_set(ctx, set_chunk, osd_op);
+ if (need_inc_ref) {
+ result = -EINPROGRESS;
+ break;
+ }
+ }
+ if (op_finisher) {
+ result = op_finisher->execute();
+ ceph_assert(result == 0);
+ }
+
+ oi.manifest.chunk_map[src_offset] = chunk_info;
+ if (!oi.has_manifest() && !oi.manifest.is_chunked())
+ ctx->delta_stats.num_objects_manifest++;
+ oi.set_flag(object_info_t::FLAG_MANIFEST);
+ oi.manifest.type = object_manifest_t::TYPE_CHUNKED;
+ if (!has_reference) {
+ oi.manifest.chunk_map[src_offset].set_flag(chunk_info_t::FLAG_HAS_REFERENCE);
+ }
+ ctx->modify = true;
+ ctx->cache_operation = true;
+
+ dout(10) << "set-chunked oid:" << oi.soid << " user_version: " << oi.user_version
+ << " chunk_info: " << chunk_info << dendl;
+ if (op_finisher) {
+ ctx->op_finishers.erase(ctx->current_osd_subop_num);
+ }
+ }
+
+ break;
+
+ case CEPH_OSD_OP_TIER_PROMOTE:
+ ++ctx->num_write;
+ result = 0;
+ {
+ if (pool.info.is_tier()) {
+ result = -EINVAL;
+ break;
+ }
+ if (!obs.exists) {
+ result = -ENOENT;
+ break;
+ }
+ if (get_osdmap()->require_osd_release < ceph_release_t::luminous) {
+ result = -EOPNOTSUPP;
+ break;
+ }
+ if (!obs.oi.has_manifest()) {
+ result = 0;
+ break;
+ }
+
+ if (op_finisher == nullptr) {
+ PromoteManifestCallback *cb;
+ object_locator_t my_oloc;
+ hobject_t src_hoid;
+
+ if (obs.oi.manifest.is_chunked()) {
+ src_hoid = obs.oi.soid;
+ } else if (obs.oi.manifest.is_redirect()) {
+ object_locator_t src_oloc(obs.oi.manifest.redirect_target);
+ my_oloc = src_oloc;
+ src_hoid = obs.oi.manifest.redirect_target;
+ } else {
+ ceph_abort_msg("unrecognized manifest type");
+ }
+ cb = new PromoteManifestCallback(ctx->obc, this, ctx);
+ ctx->op_finishers[ctx->current_osd_subop_num].reset(
+ new PromoteFinisher(cb));
+ unsigned flags = CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY |
+ CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE |
+ CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE |
+ CEPH_OSD_COPY_FROM_FLAG_RWORDERED;
+ unsigned src_fadvise_flags = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL;
+ start_copy(cb, ctx->obc, src_hoid, my_oloc, 0, flags,
+ obs.oi.soid.snap == CEPH_NOSNAP,
+ src_fadvise_flags, 0);
+
+ dout(10) << "tier-promote oid:" << oi.soid << " manifest: " << obs.oi.manifest << dendl;
+ result = -EINPROGRESS;
+ } else {
+ result = op_finisher->execute();
+ ceph_assert(result == 0);
+ ctx->op_finishers.erase(ctx->current_osd_subop_num);
+ }
+ }
+
+ break;
+
+ case CEPH_OSD_OP_TIER_FLUSH:
+ ++ctx->num_write;
+ result = 0;
+ {
+ if (pool.info.is_tier()) {
+ result = -EINVAL;
+ break;
+ }
+ if (!obs.exists) {
+ result = -ENOENT;
+ break;
+ }
+ if (get_osdmap()->require_osd_release < ceph_release_t::octopus) {
+ result = -EOPNOTSUPP;
+ break;
+ }
+ if (!obs.oi.has_manifest()) {
+ result = 0;
+ break;
+ }
+
+ if (oi.is_dirty()) {
+ result = start_flush(ctx->op, ctx->obc, true, NULL, std::nullopt);
+ if (result == -EINPROGRESS)
+ result = -EAGAIN;
+ } else {
+ result = 0;
+ }
+ }
+
+ break;
+
+ case CEPH_OSD_OP_TIER_EVICT:
+ ++ctx->num_write;
+ result = 0;
+ {
+ if (pool.info.is_tier()) {
+ result = -EINVAL;
+ break;
+ }
+ if (!obs.exists) {
+ result = -ENOENT;
+ break;
+ }
+ if (get_osdmap()->require_osd_release < ceph_release_t::octopus) {
+ result = -EOPNOTSUPP;
+ break;
+ }
+ if (!obs.oi.has_manifest()) {
+ result = -EINVAL;
+ break;
+ }
+
+ // The chunks already have a reference, so it is enough to just invoke truncate if necessary
+ uint64_t chunk_length = 0;
+ for (auto p : obs.oi.manifest.chunk_map) {
+ chunk_length += p.second.length;
+ }
+ if (chunk_length == obs.oi.size) {
+ for (auto &p : obs.oi.manifest.chunk_map) {
+ p.second.set_flag(chunk_info_t::FLAG_MISSING);
+ }
+ // punch hole
+ t->zero(soid, 0, oi.size);
+ oi.clear_data_digest();
+ ctx->delta_stats.num_wr++;
+ ctx->cache_operation = true;
+ }
+ osd->logger->inc(l_osd_tier_evict);
+ }
+
+ break;
+
+ case CEPH_OSD_OP_UNSET_MANIFEST:
+ ++ctx->num_write;
+ result = 0;
+ {
+ if (pool.info.is_tier()) {
+ result = -EINVAL;
+ break;
+ }
+ if (!obs.exists) {
+ result = -ENOENT;
+ break;
+ }
+ if (!oi.has_manifest()) {
+ result = -EOPNOTSUPP;
+ break;
+ }
+ if (get_osdmap()->require_osd_release < ceph_release_t::luminous) {
+ result = -EOPNOTSUPP;
+ break;
+ }
+
+ dec_all_refcount_manifest(oi, ctx);
+
+ oi.clear_flag(object_info_t::FLAG_MANIFEST);
+ oi.manifest = object_manifest_t();
+ ctx->delta_stats.num_objects_manifest--;
+ ctx->delta_stats.num_wr++;
+ ctx->modify = true;
+ }
+
+ break;
+
+ // -- object attrs --
+
+ case CEPH_OSD_OP_SETXATTR:
+ ++ctx->num_write;
+ result = 0;
+ {
+ if (cct->_conf->osd_max_attr_size > 0 &&
+ op.xattr.value_len > cct->_conf->osd_max_attr_size) {
+ tracepoint(osd, do_osd_op_pre_setxattr, soid.oid.name.c_str(), soid.snap.val, "???");
+ result = -EFBIG;
+ break;
+ }
+ unsigned max_name_len =
+ std::min<uint64_t>(osd->store->get_max_attr_name_length(),
+ cct->_conf->osd_max_attr_name_len);
+ if (op.xattr.name_len > max_name_len) {
+ result = -ENAMETOOLONG;
+ break;
+ }
+ maybe_create_new_object(ctx);
+ string aname;
+ bp.copy(op.xattr.name_len, aname);
+ tracepoint(osd, do_osd_op_pre_setxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
+ string name = "_" + aname;
+ bufferlist bl;
+ bp.copy(op.xattr.value_len, bl);
+ t->setattr(soid, name, bl);
+ ctx->delta_stats.num_wr++;
+ }
+ break;
+
+ case CEPH_OSD_OP_RMXATTR:
+ ++ctx->num_write;
+ result = 0;
+ {
+ string aname;
+ bp.copy(op.xattr.name_len, aname);
+ tracepoint(osd, do_osd_op_pre_rmxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
+ if (!obs.exists || oi.is_whiteout()) {
+ result = -ENOENT;
+ break;
+ }
+ string name = "_" + aname;
+ t->rmattr(soid, name);
+ ctx->delta_stats.num_wr++;
+ }
+ break;
+
+
+ // -- fancy writers --
+ case CEPH_OSD_OP_APPEND:
+ {
+ tracepoint(osd, do_osd_op_pre_append, soid.oid.name.c_str(), soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
+ // just do it inline; this works because we are happy to execute
+ // fancy op on replicas as well.
+ vector<OSDOp> nops(1);
+ OSDOp& newop = nops[0];
+ newop.op.op = CEPH_OSD_OP_WRITE;
+ newop.op.extent.offset = oi.size;
+ newop.op.extent.length = op.extent.length;
+ newop.op.extent.truncate_seq = oi.truncate_seq;
+ newop.indata = osd_op.indata;
+ result = do_osd_ops(ctx, nops);
+ osd_op.outdata = std::move(newop.outdata);
+ }
+ break;
+
+ case CEPH_OSD_OP_STARTSYNC:
+ result = 0;
+ t->nop(soid);
+ break;
+
+ // -- trivial map --
+ case CEPH_OSD_OP_TMAPGET:
+ tracepoint(osd, do_osd_op_pre_tmapget, soid.oid.name.c_str(), soid.snap.val);
+ if (pool.info.is_erasure()) {
+ result = -EOPNOTSUPP;
+ break;
+ }
+ {
+ vector<OSDOp> nops(1);
+ OSDOp& newop = nops[0];
+ newop.op.op = CEPH_OSD_OP_SYNC_READ;
+ newop.op.extent.offset = 0;
+ newop.op.extent.length = 0;
+ result = do_osd_ops(ctx, nops);
+ osd_op.outdata = std::move(newop.outdata);
+ }
+ break;
+
+ case CEPH_OSD_OP_TMAPPUT:
+ tracepoint(osd, do_osd_op_pre_tmapput, soid.oid.name.c_str(), soid.snap.val);
+ if (pool.info.is_erasure()) {
+ result = -EOPNOTSUPP;
+ break;
+ }
+ {
+ //_dout_lock.Lock();
+ //osd_op.data.hexdump(*_dout);
+ //_dout_lock.Unlock();
+
+ // verify sort order
+ bool unsorted = false;
+ if (true) {
+ bufferlist header;
+ decode(header, bp);
+ uint32_t n;
+ decode(n, bp);
+ string last_key;
+ while (n--) {
+ string key;
+ decode(key, bp);
+ dout(10) << "tmapput key " << key << dendl;
+ bufferlist val;
+ decode(val, bp);
+ if (key < last_key) {
+ dout(10) << "TMAPPUT is unordered; resorting" << dendl;
+ unsorted = true;
+ break;
+ }
+ last_key = key;
+ }
+ }
+
+ // write it
+ vector<OSDOp> nops(1);
+ OSDOp& newop = nops[0];
+ newop.op.op = CEPH_OSD_OP_WRITEFULL;
+ newop.op.extent.offset = 0;
+ newop.op.extent.length = osd_op.indata.length();
+ newop.indata = osd_op.indata;
+
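+      // if the client's tmap was unsorted, decode it into a std::map (which
+      // re-sorts the keys) and re-encode it before writing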
+ if (unsorted) {
+ bp = osd_op.indata.begin();
+ bufferlist header;
+ map<string, bufferlist> m;
+ decode(header, bp);
+ decode(m, bp);
+ ceph_assert(bp.end());
+ bufferlist newbl;
+ encode(header, newbl);
+ encode(m, newbl);
+ newop.indata = newbl;
+ }
+ result = do_osd_ops(ctx, nops);
+ ceph_assert(result == 0);
+ }
+ break;
+
+ case CEPH_OSD_OP_TMAPUP:
+ tracepoint(osd, do_osd_op_pre_tmapup, soid.oid.name.c_str(), soid.snap.val);
+ if (pool.info.is_erasure()) {
+ result = -EOPNOTSUPP;
+ break;
+ }
+ ++ctx->num_write;
+ result = do_tmapup(ctx, bp, osd_op);
+ break;
+
+ case CEPH_OSD_OP_TMAP2OMAP:
+ ++ctx->num_write;
+ tracepoint(osd, do_osd_op_pre_tmap2omap, soid.oid.name.c_str(), soid.snap.val);
+ result = do_tmap2omap(ctx, op.tmap2omap.flags);
+ break;
+
+ // OMAP Read ops
+ case CEPH_OSD_OP_OMAPGETKEYS:
+ ++ctx->num_read;
+ {
+ string start_after;
+ uint64_t max_return;
+ try {
+ decode(start_after, bp);
+ decode(max_return, bp);
+ }
+ catch (ceph::buffer::error& e) {
+ result = -EINVAL;
+ tracepoint(osd, do_osd_op_pre_omapgetkeys, soid.oid.name.c_str(), soid.snap.val, "???", 0);
+ goto fail;
+ }
+ if (max_return > cct->_conf->osd_max_omap_entries_per_request) {
+ max_return = cct->_conf->osd_max_omap_entries_per_request;
+ }
+ tracepoint(osd, do_osd_op_pre_omapgetkeys, soid.oid.name.c_str(), soid.snap.val, start_after.c_str(), max_return);
+
+ bufferlist bl;
+ uint32_t num = 0;
+ bool truncated = false;
+ if (oi.is_omap()) {
+ ObjectMap::ObjectMapIterator iter = osd->store->get_omap_iterator(
+ ch, ghobject_t(soid)
+ );
+ ceph_assert(iter);
+ iter->upper_bound(start_after);
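+	// stop once the per-request entry or byte limit is reached and mark the reply truncated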
+ for (num = 0; iter->valid(); ++num, iter->next()) {
+ if (num >= max_return ||
+ bl.length() >= cct->_conf->osd_max_omap_bytes_per_request) {
+ truncated = true;
+ break;
+ }
+ encode(iter->key(), bl);
+ }
+ } // else return empty out_set
+ encode(num, osd_op.outdata);
+ osd_op.outdata.claim_append(bl);
+ encode(truncated, osd_op.outdata);
+ ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10);
+ ctx->delta_stats.num_rd++;
+ }
+ break;
+
+ case CEPH_OSD_OP_OMAPGETVALS:
+ ++ctx->num_read;
+ {
+ string start_after;
+ uint64_t max_return;
+ string filter_prefix;
+ try {
+ decode(start_after, bp);
+ decode(max_return, bp);
+ decode(filter_prefix, bp);
+ }
+ catch (ceph::buffer::error& e) {
+ result = -EINVAL;
+ tracepoint(osd, do_osd_op_pre_omapgetvals, soid.oid.name.c_str(), soid.snap.val, "???", 0, "???");
+ goto fail;
+ }
+ if (max_return > cct->_conf->osd_max_omap_entries_per_request) {
+ max_return = cct->_conf->osd_max_omap_entries_per_request;
+ }
+ tracepoint(osd, do_osd_op_pre_omapgetvals, soid.oid.name.c_str(), soid.snap.val, start_after.c_str(), max_return, filter_prefix.c_str());
+
+ uint32_t num = 0;
+ bool truncated = false;
+ bufferlist bl;
+ if (oi.is_omap()) {
+ ObjectMap::ObjectMapIterator iter = osd->store->get_omap_iterator(
+ ch, ghobject_t(soid)
+ );
+ if (!iter) {
+ result = -ENOENT;
+ goto fail;
+ }
+ iter->upper_bound(start_after);
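+	// if the filter prefix sorts after start_after, seek directly to the first key with that prefix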
+ if (filter_prefix > start_after) iter->lower_bound(filter_prefix);
+ for (num = 0;
+ iter->valid() &&
+ iter->key().substr(0, filter_prefix.size()) == filter_prefix;
+ ++num, iter->next()) {
+ dout(20) << "Found key " << iter->key() << dendl;
+ if (num >= max_return ||
+ bl.length() >= cct->_conf->osd_max_omap_bytes_per_request) {
+ truncated = true;
+ break;
+ }
+ encode(iter->key(), bl);
+ encode(iter->value(), bl);
+ }
+ } // else return empty out_set
+ encode(num, osd_op.outdata);
+ osd_op.outdata.claim_append(bl);
+ encode(truncated, osd_op.outdata);
+ ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10);
+ ctx->delta_stats.num_rd++;
+ }
+ break;
+
+ case CEPH_OSD_OP_OMAPGETHEADER:
+ tracepoint(osd, do_osd_op_pre_omapgetheader, soid.oid.name.c_str(), soid.snap.val);
+ if (!oi.is_omap()) {
+ // return empty header
+ break;
+ }
+ ++ctx->num_read;
+ {
+ osd->store->omap_get_header(ch, ghobject_t(soid), &osd_op.outdata);
+ ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10);
+ ctx->delta_stats.num_rd++;
+ }
+ break;
+
+ case CEPH_OSD_OP_OMAPGETVALSBYKEYS:
+ ++ctx->num_read;
+ {
+ set<string> keys_to_get;
+ try {
+ decode(keys_to_get, bp);
+ }
+ catch (ceph::buffer::error& e) {
+ result = -EINVAL;
+ tracepoint(osd, do_osd_op_pre_omapgetvalsbykeys, soid.oid.name.c_str(), soid.snap.val, "???");
+ goto fail;
+ }
+ tracepoint(osd, do_osd_op_pre_omapgetvalsbykeys, soid.oid.name.c_str(), soid.snap.val, list_entries(keys_to_get).c_str());
+ map<string, bufferlist> out;
+ if (oi.is_omap()) {
+ osd->store->omap_get_values(ch, ghobject_t(soid), keys_to_get, &out);
+ } // else return empty omap entries
+ encode(out, osd_op.outdata);
+ ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10);
+ ctx->delta_stats.num_rd++;
+ }
+ break;
+
+ case CEPH_OSD_OP_OMAP_CMP:
+ ++ctx->num_read;
+ {
+ if (!obs.exists || oi.is_whiteout()) {
+ result = -ENOENT;
+ tracepoint(osd, do_osd_op_pre_omap_cmp, soid.oid.name.c_str(), soid.snap.val, "???");
+ break;
+ }
+ map<string, pair<bufferlist, int> > assertions;
+ try {
+ decode(assertions, bp);
+ }
+ catch (ceph::buffer::error& e) {
+ result = -EINVAL;
+ tracepoint(osd, do_osd_op_pre_omap_cmp, soid.oid.name.c_str(), soid.snap.val, "???");
+ goto fail;
+ }
+ tracepoint(osd, do_osd_op_pre_omap_cmp, soid.oid.name.c_str(), soid.snap.val, list_keys(assertions).c_str());
+
+ map<string, bufferlist> out;
+
+ if (oi.is_omap()) {
+ set<string> to_get;
+ for (map<string, pair<bufferlist, int> >::iterator i = assertions.begin();
+ i != assertions.end();
+ ++i)
+ to_get.insert(i->first);
+ int r = osd->store->omap_get_values(ch, ghobject_t(soid),
+ to_get, &out);
+ if (r < 0) {
+ result = r;
+ break;
+ }
+ } // else leave out empty
+
+      // Should set num_rd_kb based on the encoded length of the map
+ ctx->delta_stats.num_rd++;
+
+ int r = 0;
+ bufferlist empty;
+ for (map<string, pair<bufferlist, int> >::iterator i = assertions.begin();
+ i != assertions.end();
+ ++i) {
+ auto out_entry = out.find(i->first);
+ bufferlist &bl = (out_entry != out.end()) ?
+ out_entry->second : empty;
+ switch (i->second.second) {
+ case CEPH_OSD_CMPXATTR_OP_EQ:
+ if (!(bl == i->second.first)) {
+ r = -ECANCELED;
+ }
+ break;
+ case CEPH_OSD_CMPXATTR_OP_LT:
+ if (!(bl < i->second.first)) {
+ r = -ECANCELED;
+ }
+ break;
+ case CEPH_OSD_CMPXATTR_OP_GT:
+ if (!(bl > i->second.first)) {
+ r = -ECANCELED;
+ }
+ break;
+ default:
+ r = -EINVAL;
+ break;
+ }
+ if (r < 0)
+ break;
+ }
+ if (r < 0) {
+ result = r;
+ }
+ }
+ break;
+
+ // OMAP Write ops
+ case CEPH_OSD_OP_OMAPSETVALS:
+ if (!pool.info.supports_omap()) {
+ result = -EOPNOTSUPP;
+ tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val);
+ break;
+ }
+ ++ctx->num_write;
+ result = 0;
+ {
+ maybe_create_new_object(ctx);
+ bufferlist to_set_bl;
+ try {
+ decode_str_str_map_to_bl(bp, &to_set_bl);
+ }
+ catch (ceph::buffer::error& e) {
+ result = -EINVAL;
+ tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val);
+ goto fail;
+ }
+ tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val);
+ if (cct->_conf->subsys.should_gather<dout_subsys, 20>()) {
+ dout(20) << "setting vals: " << dendl;
+ map<string,bufferlist> to_set;
+ bufferlist::const_iterator pt = to_set_bl.begin();
+ decode(to_set, pt);
+ for (map<string, bufferlist>::iterator i = to_set.begin();
+ i != to_set.end();
+ ++i) {
+ dout(20) << "\t" << i->first << dendl;
+ }
+ }
+ t->omap_setkeys(soid, to_set_bl);
+ ctx->clean_regions.mark_omap_dirty();
+ ctx->delta_stats.num_wr++;
+ ctx->delta_stats.num_wr_kb += shift_round_up(to_set_bl.length(), 10);
+ }
+ obs.oi.set_flag(object_info_t::FLAG_OMAP);
+ obs.oi.clear_omap_digest();
+ break;
+
+ case CEPH_OSD_OP_OMAPSETHEADER:
+ tracepoint(osd, do_osd_op_pre_omapsetheader, soid.oid.name.c_str(), soid.snap.val);
+ if (!pool.info.supports_omap()) {
+ result = -EOPNOTSUPP;
+ break;
+ }
+ ++ctx->num_write;
+ result = 0;
+ {
+ maybe_create_new_object(ctx);
+ t->omap_setheader(soid, osd_op.indata);
+ ctx->clean_regions.mark_omap_dirty();
+ ctx->delta_stats.num_wr++;
+ }
+ obs.oi.set_flag(object_info_t::FLAG_OMAP);
+ obs.oi.clear_omap_digest();
+ break;
+
+ case CEPH_OSD_OP_OMAPCLEAR:
+ tracepoint(osd, do_osd_op_pre_omapclear, soid.oid.name.c_str(), soid.snap.val);
+ if (!pool.info.supports_omap()) {
+ result = -EOPNOTSUPP;
+ break;
+ }
+ ++ctx->num_write;
+ result = 0;
+ {
+ if (!obs.exists || oi.is_whiteout()) {
+ result = -ENOENT;
+ break;
+ }
+ if (oi.is_omap()) {
+ t->omap_clear(soid);
+ ctx->clean_regions.mark_omap_dirty();
+ ctx->delta_stats.num_wr++;
+ obs.oi.clear_omap_digest();
+ obs.oi.clear_flag(object_info_t::FLAG_OMAP);
+ }
+ }
+ break;
+
+ case CEPH_OSD_OP_OMAPRMKEYS:
+ if (!pool.info.supports_omap()) {
+ result = -EOPNOTSUPP;
+ tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
+ break;
+ }
+ ++ctx->num_write;
+ result = 0;
+ {
+ if (!obs.exists || oi.is_whiteout()) {
+ result = -ENOENT;
+ tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
+ break;
+ }
+ bufferlist to_rm_bl;
+ try {
+ decode_str_set_to_bl(bp, &to_rm_bl);
+ }
+ catch (ceph::buffer::error& e) {
+ result = -EINVAL;
+ tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
+ goto fail;
+ }
+ tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
+ t->omap_rmkeys(soid, to_rm_bl);
+ ctx->clean_regions.mark_omap_dirty();
+ ctx->delta_stats.num_wr++;
+ }
+ obs.oi.clear_omap_digest();
+ break;
+
+ case CEPH_OSD_OP_OMAPRMKEYRANGE:
+ tracepoint(osd, do_osd_op_pre_omaprmkeyrange, soid.oid.name.c_str(), soid.snap.val);
+ if (!pool.info.supports_omap()) {
+ result = -EOPNOTSUPP;
+ break;
+ }
+ ++ctx->num_write;
+ result = 0;
+ {
+ if (!obs.exists || oi.is_whiteout()) {
+ result = -ENOENT;
+ break;
+ }
+ std::string key_begin, key_end;
+ try {
+ decode(key_begin, bp);
+ decode(key_end, bp);
+ } catch (ceph::buffer::error& e) {
+ result = -EINVAL;
+ goto fail;
+ }
+ t->omap_rmkeyrange(soid, key_begin, key_end);
+ ctx->clean_regions.mark_omap_dirty();
+ ctx->delta_stats.num_wr++;
+ }
+ obs.oi.clear_omap_digest();
+ break;
+
+ case CEPH_OSD_OP_COPY_GET:
+ ++ctx->num_read;
+ tracepoint(osd, do_osd_op_pre_copy_get, soid.oid.name.c_str(),
+ soid.snap.val);
+ if (op_finisher == nullptr) {
+ result = do_copy_get(ctx, bp, osd_op, ctx->obc);
+ } else {
+ result = op_finisher->execute();
+ }
+ break;
+
+ case CEPH_OSD_OP_COPY_FROM:
+ case CEPH_OSD_OP_COPY_FROM2:
+ ++ctx->num_write;
+ result = 0;
+ {
+ object_t src_name;
+ object_locator_t src_oloc;
+ uint32_t truncate_seq = 0;
+ uint64_t truncate_size = 0;
+ bool have_truncate = false;
+ snapid_t src_snapid = (uint64_t)op.copy_from.snapid;
+ version_t src_version = op.copy_from.src_version;
+
+ if ((op.op == CEPH_OSD_OP_COPY_FROM2) &&
+ (op.copy_from.flags & ~CEPH_OSD_COPY_FROM_FLAGS)) {
+ dout(20) << "invalid copy-from2 flags 0x"
+ << std::hex << (int)op.copy_from.flags << std::dec << dendl;
+ result = -EINVAL;
+ break;
+ }
+ try {
+ decode(src_name, bp);
+ decode(src_oloc, bp);
+ // check if client sent us truncate_seq and truncate_size
+ if ((op.op == CEPH_OSD_OP_COPY_FROM2) &&
+ (op.copy_from.flags & CEPH_OSD_COPY_FROM_FLAG_TRUNCATE_SEQ)) {
+ decode(truncate_seq, bp);
+ decode(truncate_size, bp);
+ have_truncate = true;
+ }
+ }
+ catch (ceph::buffer::error& e) {
+ result = -EINVAL;
+ tracepoint(osd,
+ do_osd_op_pre_copy_from,
+ soid.oid.name.c_str(),
+ soid.snap.val,
+ "???",
+ 0,
+ "???",
+ "???",
+ 0,
+ src_snapid,
+ src_version);
+ goto fail;
+ }
+ tracepoint(osd,
+ do_osd_op_pre_copy_from,
+ soid.oid.name.c_str(),
+ soid.snap.val,
+ src_name.name.c_str(),
+ src_oloc.pool,
+ src_oloc.key.c_str(),
+ src_oloc.nspace.c_str(),
+ src_oloc.hash,
+ src_snapid,
+ src_version);
+ if (op_finisher == nullptr) {
+ // start
+ pg_t raw_pg;
+ get_osdmap()->object_locator_to_pg(src_name, src_oloc, raw_pg);
+ hobject_t src(src_name, src_oloc.key, src_snapid,
+ raw_pg.ps(), raw_pg.pool(),
+ src_oloc.nspace);
+ if (src == soid) {
+ dout(20) << " copy from self is invalid" << dendl;
+ result = -EINVAL;
+ break;
+ }
+ CopyFromCallback *cb = new CopyFromCallback(ctx, osd_op);
+ if (have_truncate)
+ cb->set_truncate(truncate_seq, truncate_size);
+ ctx->op_finishers[ctx->current_osd_subop_num].reset(
+ new CopyFromFinisher(cb));
+ start_copy(cb, ctx->obc, src, src_oloc, src_version,
+ op.copy_from.flags,
+ false,
+ op.copy_from.src_fadvise_flags,
+ op.flags);
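+	// the copy runs asynchronously; the op is re-executed once it completes,
+	// at which point op_finisher is set and the finish branch below runs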
+ result = -EINPROGRESS;
+ } else {
+ // finish
+ result = op_finisher->execute();
+ ceph_assert(result == 0);
+
+ // COPY_FROM cannot be executed multiple times -- it must restart
+ ctx->op_finishers.erase(ctx->current_osd_subop_num);
+ }
+ }
+ break;
+
+ default:
+ tracepoint(osd, do_osd_op_pre_unknown, soid.oid.name.c_str(), soid.snap.val, op.op, ceph_osd_op_name(op.op));
+ dout(1) << "unrecognized osd op " << op.op
+ << " " << ceph_osd_op_name(op.op)
+ << dendl;
+ result = -EOPNOTSUPP;
+ }
+
+ fail:
+ osd_op.rval = result;
+ tracepoint(osd, do_osd_op_post, soid.oid.name.c_str(), soid.snap.val, op.op, ceph_osd_op_name(op.op), op.flags, result);
+ if (result < 0 && (op.flags & CEPH_OSD_OP_FLAG_FAILOK) &&
+ result != -EAGAIN && result != -EINPROGRESS)
+ result = 0;
+
+ if (result < 0)
+ break;
+ }
+ if (result < 0) {
+ dout(10) << __func__ << " error: " << cpp_strerror(result) << dendl;
+ }
+ return result;
+}
+
+int PrimaryLogPG::_get_tmap(OpContext *ctx, bufferlist *header, bufferlist *vals)
+{
+ if (ctx->new_obs.oi.size == 0) {
+ dout(20) << "unable to get tmap for zero sized " << ctx->new_obs.oi.soid << dendl;
+ return -ENODATA;
+ }
+ vector<OSDOp> nops(1);
+ OSDOp &newop = nops[0];
+ newop.op.op = CEPH_OSD_OP_TMAPGET;
+ do_osd_ops(ctx, nops);
+ try {
+ bufferlist::const_iterator i = newop.outdata.begin();
+ decode(*header, i);
+ (*vals).substr_of(newop.outdata, i.get_off(), i.get_remaining());
+ } catch (...) {
+ dout(20) << "unsuccessful at decoding tmap for " << ctx->new_obs.oi.soid
+ << dendl;
+ return -EINVAL;
+ }
+ dout(20) << "successful at decoding tmap for " << ctx->new_obs.oi.soid
+ << dendl;
+ return 0;
+}
+
+int PrimaryLogPG::_verify_no_head_clones(const hobject_t& soid,
+ const SnapSet& ss)
+{
+ // verify that all clones have been evicted
+ dout(20) << __func__ << " verifying clones are absent "
+ << ss << dendl;
+ for (vector<snapid_t>::const_iterator p = ss.clones.begin();
+ p != ss.clones.end();
+ ++p) {
+ hobject_t clone_oid = soid;
+ clone_oid.snap = *p;
+ if (is_missing_object(clone_oid))
+ return -EBUSY;
+ ObjectContextRef clone_obc = get_object_context(clone_oid, false);
+ if (clone_obc && clone_obc->obs.exists) {
+ dout(10) << __func__ << " cannot evict head before clone "
+ << clone_oid << dendl;
+ return -EBUSY;
+ }
+ if (copy_ops.count(clone_oid)) {
+ dout(10) << __func__ << " cannot evict head, pending promote on clone "
+ << clone_oid << dendl;
+ return -EBUSY;
+ }
+ }
+ return 0;
+}
+
+inline int PrimaryLogPG::_delete_oid(
+ OpContext *ctx,
+ bool no_whiteout, // no whiteouts, no matter what.
+ bool try_no_whiteout) // try not to whiteout
+{
+ SnapSet& snapset = ctx->new_snapset;
+ ObjectState& obs = ctx->new_obs;
+ object_info_t& oi = obs.oi;
+ const hobject_t& soid = oi.soid;
+ PGTransaction* t = ctx->op_t.get();
+
+  // cache: set whiteout on delete?
+ bool whiteout = false;
+ if (pool.info.cache_mode != pg_pool_t::CACHEMODE_NONE
+ && !no_whiteout
+ && !try_no_whiteout) {
+ whiteout = true;
+ }
+
+ // in luminous or later, we can't delete the head if there are
+ // clones. we trust the caller passing no_whiteout has already
+ // verified they don't exist.
+ if (!snapset.clones.empty() ||
+ (!ctx->snapc.snaps.empty() && ctx->snapc.snaps[0] > snapset.seq)) {
+ if (no_whiteout) {
+ dout(20) << __func__ << " has or will have clones but no_whiteout=1"
+ << dendl;
+ } else {
+ dout(20) << __func__ << " has or will have clones; will whiteout"
+ << dendl;
+ whiteout = true;
+ }
+ }
+ dout(20) << __func__ << " " << soid << " whiteout=" << (int)whiteout
+ << " no_whiteout=" << (int)no_whiteout
+ << " try_no_whiteout=" << (int)try_no_whiteout
+ << dendl;
+ if (!obs.exists || (obs.oi.is_whiteout() && whiteout))
+ return -ENOENT;
+
+ t->remove(soid);
+
+ if (oi.size > 0) {
+ interval_set<uint64_t> ch;
+ ch.insert(0, oi.size);
+ ctx->modified_ranges.union_of(ch);
+ ctx->clean_regions.mark_data_region_dirty(0, oi.size);
+ }
+
+ ctx->clean_regions.mark_omap_dirty();
+ ctx->delta_stats.num_wr++;
+ if (soid.is_snap()) {
+ ceph_assert(ctx->obc->ssc->snapset.clone_overlap.count(soid.snap));
+ ctx->delta_stats.num_bytes -= ctx->obc->ssc->snapset.get_clone_bytes(soid.snap);
+ } else {
+ ctx->delta_stats.num_bytes -= oi.size;
+ }
+ oi.size = 0;
+ oi.new_object();
+
+ // disconnect all watchers
+ for (map<pair<uint64_t, entity_name_t>, watch_info_t>::iterator p =
+ oi.watchers.begin();
+ p != oi.watchers.end();
+ ++p) {
+ dout(20) << __func__ << " will disconnect watcher " << p->first << dendl;
+ ctx->watch_disconnects.push_back(
+ watch_disconnect_t(p->first.first, p->first.second, true));
+ }
+ oi.watchers.clear();
+
+ if (oi.has_manifest()) {
+ ctx->delta_stats.num_objects_manifest--;
+ dec_all_refcount_manifest(oi, ctx);
+ }
+
+ if (whiteout) {
+ dout(20) << __func__ << " setting whiteout on " << soid << dendl;
+ oi.set_flag(object_info_t::FLAG_WHITEOUT);
+ ctx->delta_stats.num_whiteouts++;
+ t->create(soid);
+ osd->logger->inc(l_osd_tier_whiteout);
+ return 0;
+ }
+
+ // delete the head
+ ctx->delta_stats.num_objects--;
+ if (soid.is_snap())
+ ctx->delta_stats.num_object_clones--;
+ if (oi.is_whiteout()) {
+ dout(20) << __func__ << " deleting whiteout on " << soid << dendl;
+ ctx->delta_stats.num_whiteouts--;
+ oi.clear_flag(object_info_t::FLAG_WHITEOUT);
+ }
+ if (oi.is_cache_pinned()) {
+ ctx->delta_stats.num_objects_pinned--;
+ }
+ obs.exists = false;
+ return 0;
+}
+
+int PrimaryLogPG::_rollback_to(OpContext *ctx, ceph_osd_op& op)
+{
+ SnapSet& snapset = ctx->new_snapset;
+ ObjectState& obs = ctx->new_obs;
+ object_info_t& oi = obs.oi;
+ const hobject_t& soid = oi.soid;
+ PGTransaction* t = ctx->op_t.get();
+ snapid_t snapid = (uint64_t)op.snap.snapid;
+ hobject_t missing_oid;
+
+ dout(10) << "_rollback_to " << soid << " snapid " << snapid << dendl;
+
+ ObjectContextRef rollback_to;
+
+ int ret = find_object_context(
+ hobject_t(soid.oid, soid.get_key(), snapid, soid.get_hash(), info.pgid.pool(),
+ soid.get_namespace()),
+ &rollback_to, false, false, &missing_oid);
+ if (ret == -EAGAIN) {
+ /* clone must be missing */
+ ceph_assert(is_degraded_or_backfilling_object(missing_oid) || is_degraded_on_async_recovery_target(missing_oid));
+    dout(20) << "_rollback_to attempted to roll back to a missing or backfilling clone "
+	     << missing_oid << " (requested snapid: " << snapid << ")" << dendl;
+ block_write_on_degraded_snap(missing_oid, ctx->op);
+ return ret;
+ }
+ {
+ ObjectContextRef promote_obc;
+ cache_result_t tier_mode_result;
+ if (obs.exists && obs.oi.has_manifest()) {
+ tier_mode_result =
+ maybe_handle_manifest_detail(
+ ctx->op,
+ true,
+ rollback_to);
+ } else {
+ tier_mode_result =
+ maybe_handle_cache_detail(
+ ctx->op,
+ true,
+ rollback_to,
+ ret,
+ missing_oid,
+ true,
+ false,
+ &promote_obc);
+ }
+ switch (tier_mode_result) {
+ case cache_result_t::NOOP:
+ break;
+ case cache_result_t::BLOCKED_PROMOTE:
+ ceph_assert(promote_obc);
+ block_write_on_snap_rollback(soid, promote_obc, ctx->op);
+ return -EAGAIN;
+ case cache_result_t::BLOCKED_FULL:
+ block_write_on_full_cache(soid, ctx->op);
+ return -EAGAIN;
+ case cache_result_t::REPLIED_WITH_EAGAIN:
+ ceph_abort_msg("this can't happen, no rollback on replica");
+ default:
+ ceph_abort_msg("must promote was set, other values are not valid");
+ return -EAGAIN;
+ }
+ }
+
+ if (ret == -ENOENT || (rollback_to && rollback_to->obs.oi.is_whiteout())) {
+ // there's no snapshot here, or there's no object.
+ // if there's no snapshot, we delete the object; otherwise, do nothing.
+ dout(20) << "_rollback_to deleting head on " << soid.oid
+ << " because got ENOENT|whiteout on find_object_context" << dendl;
+ if (ctx->obc->obs.oi.watchers.size()) {
+ // Cannot delete an object with watchers
+ ret = -EBUSY;
+ } else {
+ _delete_oid(ctx, false, false);
+ ret = 0;
+ }
+ } else if (ret) {
+ // ummm....huh? It *can't* return anything else at time of writing.
+ ceph_abort_msg("unexpected error code in _rollback_to");
+ } else { //we got our context, let's use it to do the rollback!
+ hobject_t& rollback_to_sobject = rollback_to->obs.oi.soid;
+ if (is_degraded_or_backfilling_object(rollback_to_sobject) ||
+ is_degraded_on_async_recovery_target(rollback_to_sobject)) {
+      dout(20) << "_rollback_to attempted to roll back to a degraded object "
+	       << rollback_to_sobject << " (requested snapid: " << snapid << ")" << dendl;
+ block_write_on_degraded_snap(rollback_to_sobject, ctx->op);
+ ret = -EAGAIN;
+ } else if (rollback_to->obs.oi.soid.snap == CEPH_NOSNAP) {
+ // rolling back to the head; we just need to clone it.
+ ctx->modify = true;
+ } else {
+ /* 1) Delete current head
+ * 2) Clone correct snapshot into head
+ * 3) Calculate clone_overlaps by following overlaps
+ * forward from rollback snapshot */
+ dout(10) << "_rollback_to deleting " << soid.oid
+ << " and rolling back to old snap" << dendl;
+
+ if (obs.exists) {
+ t->remove(soid);
+ }
+ t->clone(soid, rollback_to_sobject);
+ t->add_obc(rollback_to);
+
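+      // the regions of head still shared with the rollback clone are the
+      // intersection of every clone_overlap entry from that snap forward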
+ map<snapid_t, interval_set<uint64_t> >::iterator iter =
+ snapset.clone_overlap.lower_bound(snapid);
+ ceph_assert(iter != snapset.clone_overlap.end());
+ interval_set<uint64_t> overlaps = iter->second;
+ for ( ;
+ iter != snapset.clone_overlap.end();
+ ++iter)
+ overlaps.intersection_of(iter->second);
+
+ if (obs.oi.size > 0) {
+ interval_set<uint64_t> modified;
+ modified.insert(0, obs.oi.size);
+ overlaps.intersection_of(modified);
+ modified.subtract(overlaps);
+ ctx->modified_ranges.union_of(modified);
+ }
+
+ // Adjust the cached objectcontext
+ maybe_create_new_object(ctx, true);
+ ctx->delta_stats.num_bytes -= obs.oi.size;
+ ctx->delta_stats.num_bytes += rollback_to->obs.oi.size;
+ ctx->clean_regions.mark_data_region_dirty(0, std::max(obs.oi.size, rollback_to->obs.oi.size));
+ ctx->clean_regions.mark_omap_dirty();
+ obs.oi.size = rollback_to->obs.oi.size;
+ if (rollback_to->obs.oi.is_data_digest())
+ obs.oi.set_data_digest(rollback_to->obs.oi.data_digest);
+ else
+ obs.oi.clear_data_digest();
+ if (rollback_to->obs.oi.is_omap_digest())
+ obs.oi.set_omap_digest(rollback_to->obs.oi.omap_digest);
+ else
+ obs.oi.clear_omap_digest();
+
+ if (rollback_to->obs.oi.is_omap()) {
+ dout(10) << __func__ << " setting omap flag on " << obs.oi.soid << dendl;
+ obs.oi.set_flag(object_info_t::FLAG_OMAP);
+ } else {
+ dout(10) << __func__ << " clearing omap flag on " << obs.oi.soid << dendl;
+ obs.oi.clear_flag(object_info_t::FLAG_OMAP);
+ }
+ }
+ }
+ return ret;
+}
+
+void PrimaryLogPG::_make_clone(
+ OpContext *ctx,
+ PGTransaction* t,
+ ObjectContextRef obc,
+ const hobject_t& head, const hobject_t& coid,
+ object_info_t *poi)
+{
+ bufferlist bv;
+ encode(*poi, bv, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
+
+ t->clone(coid, head);
+ setattr_maybe_cache(obc, t, OI_ATTR, bv);
+ rmattr_maybe_cache(obc, t, SS_ATTR);
+}
+
+void PrimaryLogPG::make_writeable(OpContext *ctx)
+{
+ const hobject_t& soid = ctx->obs->oi.soid;
+ SnapContext& snapc = ctx->snapc;
+
+ // clone?
+ ceph_assert(soid.snap == CEPH_NOSNAP);
+ dout(20) << "make_writeable " << soid << " snapset=" << ctx->new_snapset
+ << " snapc=" << snapc << dendl;
+
+ bool was_dirty = ctx->obc->obs.oi.is_dirty();
+ if (ctx->new_obs.exists) {
+ // we will mark the object dirty
+ if (ctx->undirty && was_dirty) {
+ dout(20) << " clearing DIRTY flag" << dendl;
+ ceph_assert(ctx->new_obs.oi.is_dirty());
+ ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
+ --ctx->delta_stats.num_objects_dirty;
+ osd->logger->inc(l_osd_tier_clean);
+ } else if (!was_dirty && !ctx->undirty) {
+ dout(20) << " setting DIRTY flag" << dendl;
+ ctx->new_obs.oi.set_flag(object_info_t::FLAG_DIRTY);
+ ++ctx->delta_stats.num_objects_dirty;
+ osd->logger->inc(l_osd_tier_dirty);
+ }
+ } else {
+ if (was_dirty) {
+ dout(20) << " deletion, decrementing num_dirty and clearing flag" << dendl;
+ ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
+ --ctx->delta_stats.num_objects_dirty;
+ }
+ }
+
+ if ((ctx->new_obs.exists &&
+ ctx->new_obs.oi.is_omap()) &&
+ (!ctx->obc->obs.exists ||
+ !ctx->obc->obs.oi.is_omap())) {
+ ++ctx->delta_stats.num_objects_omap;
+ }
+ if ((!ctx->new_obs.exists ||
+ !ctx->new_obs.oi.is_omap()) &&
+ (ctx->obc->obs.exists &&
+ ctx->obc->obs.oi.is_omap())) {
+ --ctx->delta_stats.num_objects_omap;
+ }
+
+ if (ctx->new_snapset.seq > snapc.seq) {
+ dout(10) << " op snapset is old" << dendl;
+ }
+
+ if ((ctx->obs->exists && !ctx->obs->oi.is_whiteout()) && // head exist(ed)
+ snapc.snaps.size() && // there are snaps
+ !ctx->cache_operation &&
+ snapc.snaps[0] > ctx->new_snapset.seq) { // existing object is old
+ // clone
+ hobject_t coid = soid;
+ coid.snap = snapc.seq;
+
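+    // snapc.snaps is ordered newest-first; the first l entries (those newer
+    // than the snapset seq) are the snaps the new clone will record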
+ unsigned l;
+ for (l = 1;
+ l < snapc.snaps.size() && snapc.snaps[l] > ctx->new_snapset.seq;
+ l++) ;
+
+ vector<snapid_t> snaps(l);
+ for (unsigned i=0; i<l; i++)
+ snaps[i] = snapc.snaps[i];
+
+ // prepare clone
+ object_info_t static_snap_oi(coid);
+ object_info_t *snap_oi;
+ if (is_primary()) {
+ ctx->clone_obc = object_contexts.lookup_or_create(static_snap_oi.soid);
+ ctx->clone_obc->destructor_callback =
+ new C_PG_ObjectContext(this, ctx->clone_obc.get());
+ ctx->clone_obc->obs.oi = static_snap_oi;
+ ctx->clone_obc->obs.exists = true;
+ ctx->clone_obc->ssc = ctx->obc->ssc;
+ ctx->clone_obc->ssc->ref++;
+ if (pool.info.is_erasure())
+ ctx->clone_obc->attr_cache = ctx->obc->attr_cache;
+ snap_oi = &ctx->clone_obc->obs.oi;
+ if (ctx->obc->obs.oi.has_manifest()) {
+ if ((ctx->obc->obs.oi.flags & object_info_t::FLAG_REDIRECT_HAS_REFERENCE) &&
+ ctx->obc->obs.oi.manifest.is_redirect()) {
+ snap_oi->set_flag(object_info_t::FLAG_MANIFEST);
+ snap_oi->manifest.type = object_manifest_t::TYPE_REDIRECT;
+ snap_oi->manifest.redirect_target = ctx->obc->obs.oi.manifest.redirect_target;
+ } else if (ctx->obc->obs.oi.manifest.is_chunked()) {
+ snap_oi->set_flag(object_info_t::FLAG_MANIFEST);
+ snap_oi->manifest.type = object_manifest_t::TYPE_CHUNKED;
+ snap_oi->manifest.chunk_map = ctx->obc->obs.oi.manifest.chunk_map;
+ } else {
+ ceph_abort_msg("unrecognized manifest type");
+ }
+ }
+ bool got = ctx->lock_manager.get_write_greedy(
+ coid,
+ ctx->clone_obc,
+ ctx->op);
+ ceph_assert(got);
+ dout(20) << " got greedy write on clone_obc " << *ctx->clone_obc << dendl;
+ } else {
+ snap_oi = &static_snap_oi;
+ }
+ snap_oi->version = ctx->at_version;
+ snap_oi->prior_version = ctx->obs->oi.version;
+ snap_oi->copy_user_bits(ctx->obs->oi);
+
+ _make_clone(ctx, ctx->op_t.get(), ctx->clone_obc, soid, coid, snap_oi);
+
+ ctx->delta_stats.num_objects++;
+ if (snap_oi->is_dirty()) {
+ ctx->delta_stats.num_objects_dirty++;
+ osd->logger->inc(l_osd_tier_dirty);
+ }
+ if (snap_oi->is_omap())
+ ctx->delta_stats.num_objects_omap++;
+ if (snap_oi->is_cache_pinned())
+ ctx->delta_stats.num_objects_pinned++;
+ if (snap_oi->has_manifest())
+ ctx->delta_stats.num_objects_manifest++;
+ ctx->delta_stats.num_object_clones++;
+ ctx->new_snapset.clones.push_back(coid.snap);
+ ctx->new_snapset.clone_size[coid.snap] = ctx->obs->oi.size;
+ ctx->new_snapset.clone_snaps[coid.snap] = snaps;
+
+ // clone_overlap should contain an entry for each clone
+ // (an empty interval_set if there is no overlap)
+ ctx->new_snapset.clone_overlap[coid.snap];
+ if (ctx->obs->oi.size)
+ ctx->new_snapset.clone_overlap[coid.snap].insert(0, ctx->obs->oi.size);
+
+ // log clone
+ dout(10) << " cloning v " << ctx->obs->oi.version
+ << " to " << coid << " v " << ctx->at_version
+ << " snaps=" << snaps
+ << " snapset=" << ctx->new_snapset << dendl;
+ ctx->log.push_back(pg_log_entry_t(
+ pg_log_entry_t::CLONE, coid, ctx->at_version,
+ ctx->obs->oi.version,
+ ctx->obs->oi.user_version,
+ osd_reqid_t(), ctx->new_obs.oi.mtime, 0));
+ encode(snaps, ctx->log.back().snaps);
+
+ ctx->at_version.version++;
+ }
+
+ // update most recent clone_overlap and usage stats
+ if (ctx->new_snapset.clones.size() > 0) {
+    // the clone_overlap is the difference of ranges between the head and its clones.
+    // we need to check whether the most recent clone still exists: if it has
+    // been evicted it is not included in the stats, but its clone_overlap
+    // entry still exists in the snapset, so we must update the
+    // clone_overlap to keep it consistent.
+ hobject_t last_clone_oid = soid;
+ last_clone_oid.snap = ctx->new_snapset.clone_overlap.rbegin()->first;
+ interval_set<uint64_t> &newest_overlap =
+ ctx->new_snapset.clone_overlap.rbegin()->second;
+ ctx->modified_ranges.intersection_of(newest_overlap);
+ if (is_present_clone(last_clone_oid)) {
+ // modified_ranges is still in use by the clone
+ ctx->delta_stats.num_bytes += ctx->modified_ranges.size();
+ }
+ newest_overlap.subtract(ctx->modified_ranges);
+ }
+
+ if (snapc.seq > ctx->new_snapset.seq) {
+ // update snapset with latest snap context
+ ctx->new_snapset.seq = snapc.seq;
+ if (get_osdmap()->require_osd_release < ceph_release_t::octopus) {
+ ctx->new_snapset.snaps = snapc.snaps;
+ } else {
+ ctx->new_snapset.snaps.clear();
+ }
+ }
+ dout(20) << "make_writeable " << soid
+ << " done, snapset=" << ctx->new_snapset << dendl;
+}
+
+
+void PrimaryLogPG::write_update_size_and_usage(object_stat_sum_t& delta_stats, object_info_t& oi,
+ interval_set<uint64_t>& modified, uint64_t offset,
+ uint64_t length, bool write_full)
+{
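+  // record the extent touched by this write; a write-full, or a write that
+  // extends past the current end, also updates the object size and num_bytes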
+ interval_set<uint64_t> ch;
+ if (write_full) {
+ if (oi.size)
+ ch.insert(0, oi.size);
+ } else if (length)
+ ch.insert(offset, length);
+ modified.union_of(ch);
+ if (write_full ||
+ (offset + length > oi.size && length)) {
+ uint64_t new_size = offset + length;
+ delta_stats.num_bytes -= oi.size;
+ delta_stats.num_bytes += new_size;
+ oi.size = new_size;
+ }
+
+ delta_stats.num_wr++;
+ delta_stats.num_wr_kb += shift_round_up(length, 10);
+}
+
+void PrimaryLogPG::truncate_update_size_and_usage(
+ object_stat_sum_t& delta_stats,
+ object_info_t& oi,
+ uint64_t truncate_size)
+{
+ if (oi.size != truncate_size) {
+ delta_stats.num_bytes -= oi.size;
+ delta_stats.num_bytes += truncate_size;
+ oi.size = truncate_size;
+ }
+}
+
+void PrimaryLogPG::complete_disconnect_watches(
+ ObjectContextRef obc,
+ const list<watch_disconnect_t> &to_disconnect)
+{
+ for (list<watch_disconnect_t>::const_iterator i =
+ to_disconnect.begin();
+ i != to_disconnect.end();
+ ++i) {
+ pair<uint64_t, entity_name_t> watcher(i->cookie, i->name);
+ auto watchers_entry = obc->watchers.find(watcher);
+ if (watchers_entry != obc->watchers.end()) {
+ WatchRef watch = watchers_entry->second;
+ dout(10) << "do_osd_op_effects disconnect watcher " << watcher << dendl;
+ obc->watchers.erase(watcher);
+ watch->remove(i->send_disconnect);
+ } else {
+ dout(10) << "do_osd_op_effects disconnect failed to find watcher "
+ << watcher << dendl;
+ }
+ }
+}
+
+void PrimaryLogPG::do_osd_op_effects(OpContext *ctx, const ConnectionRef& conn)
+{
+ entity_name_t entity = ctx->reqid.name;
+ dout(15) << "do_osd_op_effects " << entity << " con " << conn.get() << dendl;
+
+ // disconnects first
+ complete_disconnect_watches(ctx->obc, ctx->watch_disconnects);
+
+ ceph_assert(conn);
+
+ auto session = conn->get_priv();
+ if (!session)
+ return;
+
+ for (list<pair<watch_info_t,bool> >::iterator i = ctx->watch_connects.begin();
+ i != ctx->watch_connects.end();
+ ++i) {
+ pair<uint64_t, entity_name_t> watcher(i->first.cookie, entity);
+ dout(15) << "do_osd_op_effects applying watch connect on session "
+ << session.get() << " watcher " << watcher << dendl;
+ WatchRef watch;
+ if (ctx->obc->watchers.count(watcher)) {
+ dout(15) << "do_osd_op_effects found existing watch watcher " << watcher
+ << dendl;
+ watch = ctx->obc->watchers[watcher];
+ } else {
+ dout(15) << "do_osd_op_effects new watcher " << watcher
+ << dendl;
+ watch = Watch::makeWatchRef(
+ this, osd, ctx->obc, i->first.timeout_seconds,
+ i->first.cookie, entity, conn->get_peer_addr());
+ ctx->obc->watchers.insert(
+ make_pair(
+ watcher,
+ watch));
+ }
+ watch->connect(conn, i->second);
+ }
+
+ for (list<notify_info_t>::iterator p = ctx->notifies.begin();
+ p != ctx->notifies.end();
+ ++p) {
+ dout(10) << "do_osd_op_effects, notify " << *p << dendl;
+ ConnectionRef conn(ctx->op->get_req()->get_connection());
+ NotifyRef notif(
+ Notify::makeNotifyRef(
+ conn,
+ ctx->reqid.name.num(),
+ p->bl,
+ p->timeout,
+ p->cookie,
+ p->notify_id,
+ ctx->obc->obs.oi.user_version,
+ osd));
+ for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator i =
+ ctx->obc->watchers.begin();
+ i != ctx->obc->watchers.end();
+ ++i) {
+ dout(10) << "starting notify on watch " << i->first << dendl;
+ i->second->start_notify(notif);
+ }
+ notif->init();
+ }
+
+ for (list<OpContext::NotifyAck>::iterator p = ctx->notify_acks.begin();
+ p != ctx->notify_acks.end();
+ ++p) {
+ if (p->watch_cookie)
+ dout(10) << "notify_ack " << make_pair(*(p->watch_cookie), p->notify_id) << dendl;
+ else
+ dout(10) << "notify_ack " << make_pair("NULL", p->notify_id) << dendl;
+ for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator i =
+ ctx->obc->watchers.begin();
+ i != ctx->obc->watchers.end();
+ ++i) {
+ if (i->first.second != entity) continue;
+ if (p->watch_cookie &&
+ *(p->watch_cookie) != i->first.first) continue;
+ dout(10) << "acking notify on watch " << i->first << dendl;
+ i->second->notify_ack(p->notify_id, p->reply_bl);
+ }
+ }
+}
+
+hobject_t PrimaryLogPG::generate_temp_object(const hobject_t& target)
+{
+ ostringstream ss;
+ ss << "temp_" << info.pgid << "_" << get_role()
+ << "_" << osd->monc->get_global_id() << "_" << (++temp_seq);
+ hobject_t hoid = target.make_temp_hobject(ss.str());
+ dout(20) << __func__ << " " << hoid << dendl;
+ return hoid;
+}
+
+hobject_t PrimaryLogPG::get_temp_recovery_object(
+ const hobject_t& target,
+ eversion_t version)
+{
+ ostringstream ss;
+ ss << "temp_recovering_" << info.pgid // (note this includes the shardid)
+ << "_" << version
+ << "_" << info.history.same_interval_since
+ << "_" << target.snap;
+ // pgid + version + interval + snapid is unique, and short
+ hobject_t hoid = target.make_temp_hobject(ss.str());
+ dout(20) << __func__ << " " << hoid << dendl;
+ return hoid;
+}
+
+int PrimaryLogPG::prepare_transaction(OpContext *ctx)
+{
+ ceph_assert(!ctx->ops->empty());
+
+ // valid snap context?
+ if (!ctx->snapc.is_valid()) {
+ dout(10) << " invalid snapc " << ctx->snapc << dendl;
+ return -EINVAL;
+ }
+
+ // prepare the actual mutation
+ int result = do_osd_ops(ctx, *ctx->ops);
+ if (result < 0) {
+ if (ctx->op->may_write() &&
+ get_osdmap()->require_osd_release >= ceph_release_t::kraken) {
+ // need to save the error code in the pg log, to detect dup ops,
+ // but do nothing else
+ ctx->update_log_only = true;
+ }
+ return result;
+ }
+
+ // read-op? write-op noop? done?
+ if (ctx->op_t->empty() && !ctx->modify) {
+ if (ctx->pending_async_reads.empty())
+ unstable_stats.add(ctx->delta_stats);
+ if (ctx->op->may_write() &&
+ get_osdmap()->require_osd_release >= ceph_release_t::kraken) {
+ ctx->update_log_only = true;
+ }
+ return result;
+ }
+
+ // check for full
+ if ((ctx->delta_stats.num_bytes > 0 ||
+ ctx->delta_stats.num_objects > 0) && // FIXME: keys?
+ pool.info.has_flag(pg_pool_t::FLAG_FULL)) {
+ auto m = ctx->op->get_req<MOSDOp>();
+ if (ctx->reqid.name.is_mds() || // FIXME: ignore MDS for now
+ m->has_flag(CEPH_OSD_FLAG_FULL_FORCE)) {
+ dout(20) << __func__ << " full, but proceeding due to FULL_FORCE or MDS"
+ << dendl;
+ } else if (m->has_flag(CEPH_OSD_FLAG_FULL_TRY)) {
+ // they tried, they failed.
+ dout(20) << __func__ << " full, replying to FULL_TRY op" << dendl;
+ return pool.info.has_flag(pg_pool_t::FLAG_FULL_QUOTA) ? -EDQUOT : -ENOSPC;
+ } else {
+ // drop request
+ dout(20) << __func__ << " full, dropping request (bad client)" << dendl;
+ return -EAGAIN;
+ }
+ }
+
+ const hobject_t& soid = ctx->obs->oi.soid;
+ // clone, if necessary
+ if (soid.snap == CEPH_NOSNAP)
+ make_writeable(ctx);
+
+ finish_ctx(ctx,
+ ctx->new_obs.exists ? pg_log_entry_t::MODIFY :
+ pg_log_entry_t::DELETE,
+ result);
+
+ return result;
+}
+
+void PrimaryLogPG::finish_ctx(OpContext *ctx, int log_op_type, int result)
+{
+ const hobject_t& soid = ctx->obs->oi.soid;
+ dout(20) << __func__ << " " << soid << " " << ctx
+ << " op " << pg_log_entry_t::get_op_name(log_op_type)
+ << dendl;
+ utime_t now = ceph_clock_now();
+
+#ifdef HAVE_JAEGER
+ if (ctx->op->osd_parent_span) {
+ auto finish_ctx_span = jaeger_tracing::child_span(__func__, ctx->op->osd_parent_span);
+ }
+#endif
+  // Drop the reference if a deduped chunk is modified
+ if (ctx->new_obs.oi.is_dirty() &&
+ (ctx->obs->oi.has_manifest() && ctx->obs->oi.manifest.is_chunked()) &&
+      // If a clone is being created, do not drop the reference for the manifest object
+ !ctx->delta_stats.num_object_clones &&
+ ctx->new_obs.oi.size != 0 && // missing, redirect and delete
+ !ctx->cache_operation &&
+ log_op_type != pg_log_entry_t::PROMOTE) {
+ dec_refcount_by_dirty(ctx);
+ }
+
+ // finish and log the op.
+ if (ctx->user_modify) {
+ // update the user_version for any modify ops, except for the watch op
+ ctx->user_at_version = std::max(info.last_user_version, ctx->new_obs.oi.user_version) + 1;
+ /* In order for new clients and old clients to interoperate properly
+ * when exchanging versions, we need to lower bound the user_version
+ * (which our new clients pay proper attention to)
+ * by the at_version (which is all the old clients can ever see). */
+ if (ctx->at_version.version > ctx->user_at_version)
+ ctx->user_at_version = ctx->at_version.version;
+ ctx->new_obs.oi.user_version = ctx->user_at_version;
+ }
+ ctx->bytes_written = ctx->op_t->get_bytes_written();
+
+ if (ctx->new_obs.exists) {
+ ctx->new_obs.oi.version = ctx->at_version;
+ ctx->new_obs.oi.prior_version = ctx->obs->oi.version;
+ ctx->new_obs.oi.last_reqid = ctx->reqid;
+ if (ctx->mtime != utime_t()) {
+ ctx->new_obs.oi.mtime = ctx->mtime;
+ dout(10) << " set mtime to " << ctx->new_obs.oi.mtime << dendl;
+ ctx->new_obs.oi.local_mtime = now;
+ } else {
+ dout(10) << " mtime unchanged at " << ctx->new_obs.oi.mtime << dendl;
+ }
+
+ // object_info_t
+ map <string, bufferlist> attrs;
+ bufferlist bv(sizeof(ctx->new_obs.oi));
+ encode(ctx->new_obs.oi, bv,
+ get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
+ attrs[OI_ATTR] = std::move(bv);
+
+ // snapset
+ if (soid.snap == CEPH_NOSNAP) {
+ dout(10) << " final snapset " << ctx->new_snapset
+ << " in " << soid << dendl;
+ bufferlist bss;
+ encode(ctx->new_snapset, bss);
+ attrs[SS_ATTR] = std::move(bss);
+ } else {
+ dout(10) << " no snapset (this is a clone)" << dendl;
+ }
+ ctx->op_t->setattrs(soid, attrs);
+ } else {
+ // reset cached oi
+ ctx->new_obs.oi = object_info_t(ctx->obc->obs.oi.soid);
+ }
+
+ // append to log
+ ctx->log.push_back(
+ pg_log_entry_t(log_op_type, soid, ctx->at_version,
+ ctx->obs->oi.version,
+ ctx->user_at_version, ctx->reqid,
+ ctx->mtime,
+ (ctx->op && ctx->op->allows_returnvec()) ? result : 0));
+ if (ctx->op && ctx->op->allows_returnvec()) {
+ // also the per-op values
+ ctx->log.back().set_op_returns(*ctx->ops);
+ dout(20) << __func__ << " op_returns " << ctx->log.back().op_returns
+ << dendl;
+ }
+
+ ctx->log.back().clean_regions = ctx->clean_regions;
+ dout(20) << __func__ << " object " << soid << " marks clean_regions " << ctx->log.back().clean_regions << dendl;
+
+ if (soid.snap < CEPH_NOSNAP) {
+ switch (log_op_type) {
+ case pg_log_entry_t::MODIFY:
+ case pg_log_entry_t::PROMOTE:
+ case pg_log_entry_t::CLEAN:
+ dout(20) << __func__ << " encoding snaps from " << ctx->new_snapset
+ << dendl;
+ encode(ctx->new_snapset.clone_snaps[soid.snap], ctx->log.back().snaps);
+ break;
+ default:
+ break;
+ }
+ }
+
+ if (!ctx->extra_reqids.empty()) {
+ dout(20) << __func__ << " extra_reqids " << ctx->extra_reqids << " "
+ << ctx->extra_reqid_return_codes << dendl;
+ ctx->log.back().extra_reqids.swap(ctx->extra_reqids);
+ ctx->log.back().extra_reqid_return_codes.swap(ctx->extra_reqid_return_codes);
+ }
+
+ // apply new object state.
+ ctx->obc->obs = ctx->new_obs;
+
+ if (soid.is_head() && !ctx->obc->obs.exists) {
+ ctx->obc->ssc->exists = false;
+ ctx->obc->ssc->snapset = SnapSet();
+ } else {
+ ctx->obc->ssc->exists = true;
+ ctx->obc->ssc->snapset = ctx->new_snapset;
+ }
+}
+
+void PrimaryLogPG::apply_stats(
+ const hobject_t &soid,
+ const object_stat_sum_t &delta_stats) {
+
+ recovery_state.apply_op_stats(soid, delta_stats);
+ for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
+ i != get_backfill_targets().end();
+ ++i) {
+ pg_shard_t bt = *i;
+ const pg_info_t& pinfo = recovery_state.get_peer_info(bt);
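+    // deltas for objects the backfill target does not have yet (but that fall
+    // within the range already started) are stashed and applied to the peer later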
+ if (soid > pinfo.last_backfill && soid <= last_backfill_started) {
+ pending_backfill_updates[soid].stats.add(delta_stats);
+ }
+ }
+
+ m_scrubber->stats_of_handled_objects(delta_stats, soid);
+}
+
+void PrimaryLogPG::complete_read_ctx(int result, OpContext *ctx)
+{
+ auto m = ctx->op->get_req<MOSDOp>();
+ ceph_assert(ctx->async_reads_complete());
+
+ for (auto p = ctx->ops->begin();
+ p != ctx->ops->end() && result >= 0; ++p) {
+ if (p->rval < 0 && !(p->op.flags & CEPH_OSD_OP_FLAG_FAILOK)) {
+ result = p->rval;
+ break;
+ }
+ ctx->bytes_read += p->outdata.length();
+ }
+ ctx->reply->get_header().data_off = (ctx->data_off ? *ctx->data_off : 0);
+
+ MOSDOpReply *reply = ctx->reply;
+ ctx->reply = nullptr;
+
+ if (result >= 0) {
+ if (!ctx->ignore_log_op_stats) {
+ log_op_stats(*ctx->op, ctx->bytes_written, ctx->bytes_read);
+
+ publish_stats_to_osd();
+ }
+
+ // on read, return the current object version
+ if (ctx->obs) {
+ reply->set_reply_versions(eversion_t(), ctx->obs->oi.user_version);
+ } else {
+ reply->set_reply_versions(eversion_t(), ctx->user_at_version);
+ }
+ } else if (result == -ENOENT) {
+ // on ENOENT, set a floor for what the next user version will be.
+ reply->set_enoent_reply_versions(info.last_update, info.last_user_version);
+ }
+
+ reply->set_result(result);
+ reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
+ osd->send_message_osd_client(reply, m->get_connection());
+ close_op_ctx(ctx);
+}
+
+// ========================================================================
+// copyfrom
+
+struct C_Copyfrom : public Context {
+ PrimaryLogPGRef pg;
+ hobject_t oid;
+ epoch_t last_peering_reset;
+ ceph_tid_t tid;
+ PrimaryLogPG::CopyOpRef cop; // used for keeping the cop alive
+ C_Copyfrom(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
+ const PrimaryLogPG::CopyOpRef& c)
+ : pg(p), oid(o), last_peering_reset(lpr),
+ tid(0), cop(c)
+ {}
+ void finish(int r) override {
+ if (r == -ECANCELED)
+ return;
+ std::scoped_lock l{*pg};
+ if (last_peering_reset == pg->get_last_peering_reset()) {
+ pg->process_copy_chunk(oid, tid, r);
+ cop.reset();
+ }
+ }
+};
+
+struct C_CopyFrom_AsyncReadCb : public Context {
+ OSDOp *osd_op;
+ object_copy_data_t reply_obj;
+ uint64_t features;
+ size_t len;
+ C_CopyFrom_AsyncReadCb(OSDOp *osd_op, uint64_t features) :
+ osd_op(osd_op), features(features), len(0) {}
+ void finish(int r) override {
+ osd_op->rval = r;
+ if (r < 0) {
+ return;
+ }
+
+ ceph_assert(len > 0);
+ ceph_assert(len <= reply_obj.data.length());
+ bufferlist bl;
+ bl.substr_of(reply_obj.data, 0, len);
+ reply_obj.data.swap(bl);
+ encode(reply_obj, osd_op->outdata, features);
+ }
+};
+
+struct C_CopyChunk : public Context {
+ PrimaryLogPGRef pg;
+ hobject_t oid;
+ epoch_t last_peering_reset;
+ ceph_tid_t tid;
+ PrimaryLogPG::CopyOpRef cop; // used for keeping the cop alive
+ uint64_t offset = 0;
+ C_CopyChunk(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
+ const PrimaryLogPG::CopyOpRef& c)
+ : pg(p), oid(o), last_peering_reset(lpr),
+ tid(0), cop(c)
+ {}
+ void finish(int r) override {
+ if (r == -ECANCELED)
+ return;
+ std::scoped_lock l{*pg};
+ if (last_peering_reset == pg->get_last_peering_reset()) {
+ pg->process_copy_chunk_manifest(oid, tid, r, offset);
+ cop.reset();
+ }
+ }
+};
+
+int PrimaryLogPG::do_copy_get(OpContext *ctx, bufferlist::const_iterator& bp,
+ OSDOp& osd_op, ObjectContextRef &obc)
+{
+ object_info_t& oi = obc->obs.oi;
+ hobject_t& soid = oi.soid;
+ int result = 0;
+ object_copy_cursor_t cursor;
+ uint64_t out_max;
+ try {
+ decode(cursor, bp);
+ decode(out_max, bp);
+ }
+ catch (ceph::buffer::error& e) {
+ result = -EINVAL;
+ return result;
+ }
+
+ const MOSDOp *op = reinterpret_cast<const MOSDOp*>(ctx->op->get_req());
+ uint64_t features = op->get_features();
+
+ bool async_read_started = false;
+ object_copy_data_t _reply_obj;
+ C_CopyFrom_AsyncReadCb *cb = nullptr;
+ if (pool.info.is_erasure()) {
+ cb = new C_CopyFrom_AsyncReadCb(&osd_op, features);
+ }
+ object_copy_data_t &reply_obj = cb ? cb->reply_obj : _reply_obj;
+ // size, mtime
+ reply_obj.size = oi.size;
+ reply_obj.mtime = oi.mtime;
+ ceph_assert(obc->ssc);
+ if (soid.snap < CEPH_NOSNAP) {
+ auto p = obc->ssc->snapset.clone_snaps.find(soid.snap);
+ ceph_assert(p != obc->ssc->snapset.clone_snaps.end()); // warn?
+ reply_obj.snaps = p->second;
+ } else {
+ reply_obj.snap_seq = obc->ssc->snapset.seq;
+ }
+ if (oi.is_data_digest()) {
+ reply_obj.flags |= object_copy_data_t::FLAG_DATA_DIGEST;
+ reply_obj.data_digest = oi.data_digest;
+ }
+ if (oi.is_omap_digest()) {
+ reply_obj.flags |= object_copy_data_t::FLAG_OMAP_DIGEST;
+ reply_obj.omap_digest = oi.omap_digest;
+ }
+ reply_obj.truncate_seq = oi.truncate_seq;
+ reply_obj.truncate_size = oi.truncate_size;
+
+ // attrs
+ map<string,bufferlist>& out_attrs = reply_obj.attrs;
+ if (!cursor.attr_complete) {
+ result = getattrs_maybe_cache(
+ ctx->obc,
+ &out_attrs);
+ if (result < 0) {
+ if (cb) {
+ delete cb;
+ }
+ return result;
+ }
+ cursor.attr_complete = true;
+ dout(20) << " got attrs" << dendl;
+ }
+
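+  // remaining reply-size budget for the data and omap portions of this COPY_GET step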
+ int64_t left = out_max - osd_op.outdata.length();
+
+ // data
+ bufferlist& bl = reply_obj.data;
+ if (left > 0 && !cursor.data_complete) {
+ if (cursor.data_offset < oi.size) {
+ uint64_t max_read = std::min(oi.size - cursor.data_offset, (uint64_t)left);
+ if (cb) {
+ async_read_started = true;
+ ctx->pending_async_reads.push_back(
+ make_pair(
+ boost::make_tuple(cursor.data_offset, max_read, osd_op.op.flags),
+ make_pair(&bl, cb)));
+ cb->len = max_read;
+
+ ctx->op_finishers[ctx->current_osd_subop_num].reset(
+ new ReadFinisher(osd_op));
+ result = -EINPROGRESS;
+
+ dout(10) << __func__ << ": async_read noted for " << soid << dendl;
+ } else {
+ result = pgbackend->objects_read_sync(
+ oi.soid, cursor.data_offset, max_read, osd_op.op.flags, &bl);
+ if (result < 0)
+ return result;
+ }
+ left -= max_read;
+ cursor.data_offset += max_read;
+ }
+ if (cursor.data_offset == oi.size) {
+ cursor.data_complete = true;
+ dout(20) << " got data" << dendl;
+ }
+ ceph_assert(cursor.data_offset <= oi.size);
+ }
+
+ // omap
+ uint32_t omap_keys = 0;
+ if (!pool.info.supports_omap() || !oi.is_omap()) {
+ cursor.omap_complete = true;
+ } else {
+ if (left > 0 && !cursor.omap_complete) {
+ ceph_assert(cursor.data_complete);
+ if (cursor.omap_offset.empty()) {
+ osd->store->omap_get_header(ch, ghobject_t(oi.soid),
+ &reply_obj.omap_header);
+ }
+ bufferlist omap_data;
+ ObjectMap::ObjectMapIterator iter =
+ osd->store->get_omap_iterator(ch, ghobject_t(oi.soid));
+ ceph_assert(iter);
+ iter->upper_bound(cursor.omap_offset);
+ for (; iter->valid(); iter->next()) {
+ ++omap_keys;
+ encode(iter->key(), omap_data);
+ encode(iter->value(), omap_data);
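+	// each encoded key and value carries a 4-byte length prefix; charge those against the budget too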
+ left -= iter->key().length() + 4 + iter->value().length() + 4;
+ if (left <= 0)
+ break;
+ }
+ if (omap_keys) {
+ encode(omap_keys, reply_obj.omap_data);
+ reply_obj.omap_data.claim_append(omap_data);
+ }
+ if (iter->valid()) {
+ cursor.omap_offset = iter->key();
+ } else {
+ cursor.omap_complete = true;
+ dout(20) << " got omap" << dendl;
+ }
+ }
+ }
+
+ if (cursor.is_complete()) {
+ // include reqids only in the final step. this is a bit fragile
+ // but it works...
+ recovery_state.get_pg_log().get_log().get_object_reqids(ctx->obc->obs.oi.soid, 10,
+ &reply_obj.reqids,
+ &reply_obj.reqid_return_codes);
+ dout(20) << " got reqids" << dendl;
+ }
+
+ dout(20) << " cursor.is_complete=" << cursor.is_complete()
+ << " " << out_attrs.size() << " attrs"
+ << " " << bl.length() << " bytes"
+ << " " << reply_obj.omap_header.length() << " omap header bytes"
+ << " " << reply_obj.omap_data.length() << " omap data bytes in "
+ << omap_keys << " keys"
+ << " " << reply_obj.reqids.size() << " reqids"
+ << dendl;
+ reply_obj.cursor = cursor;
+ if (!async_read_started) {
+ encode(reply_obj, osd_op.outdata, features);
+ }
+ if (cb && !async_read_started) {
+ delete cb;
+ }
+
+ if (result > 0) {
+ result = 0;
+ }
+ return result;
+}
+
+void PrimaryLogPG::fill_in_copy_get_noent(OpRequestRef& op, hobject_t oid,
+ OSDOp& osd_op)
+{
+ const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
+ uint64_t features = m->get_features();
+ object_copy_data_t reply_obj;
+
+ recovery_state.get_pg_log().get_log().get_object_reqids(oid, 10, &reply_obj.reqids,
+ &reply_obj.reqid_return_codes);
+ dout(20) << __func__ << " got reqids " << reply_obj.reqids << dendl;
+ encode(reply_obj, osd_op.outdata, features);
+ osd_op.rval = -ENOENT;
+ MOSDOpReply *reply = new MOSDOpReply(m, 0, get_osdmap_epoch(), 0, false);
+ reply->set_result(-ENOENT);
+ reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
+ osd->send_message_osd_client(reply, m->get_connection());
+}
+
+void PrimaryLogPG::start_copy(CopyCallback *cb, ObjectContextRef obc,
+ hobject_t src, object_locator_t oloc,
+ version_t version, unsigned flags,
+ bool mirror_snapset,
+ unsigned src_obj_fadvise_flags,
+ unsigned dest_obj_fadvise_flags)
+{
+ const hobject_t& dest = obc->obs.oi.soid;
+ dout(10) << __func__ << " " << dest
+ << " from " << src << " " << oloc << " v" << version
+ << " flags " << flags
+ << (mirror_snapset ? " mirror_snapset" : "")
+ << dendl;
+
+ ceph_assert(!mirror_snapset || src.snap == CEPH_NOSNAP);
+
+ // cancel a previous in-progress copy?
+ if (copy_ops.count(dest)) {
+ // FIXME: if the src etc match, we could avoid restarting from the
+ // beginning.
+ CopyOpRef cop = copy_ops[dest];
+ vector<ceph_tid_t> tids;
+ cancel_copy(cop, false, &tids);
+ osd->objecter->op_cancel(tids, -ECANCELED);
+ }
+
+ CopyOpRef cop(std::make_shared<CopyOp>(cb, obc, src, oloc, version, flags,
+ mirror_snapset, src_obj_fadvise_flags,
+ dest_obj_fadvise_flags));
+ copy_ops[dest] = cop;
+ obc->start_block();
+
+ if (!obc->obs.oi.has_manifest()) {
+ _copy_some(obc, cop);
+ } else {
+ if (obc->obs.oi.manifest.is_redirect()) {
+ _copy_some(obc, cop);
+ } else if (obc->obs.oi.manifest.is_chunked()) {
+ auto p = obc->obs.oi.manifest.chunk_map.begin();
+ _copy_some_manifest(obc, cop, p->first);
+ } else {
+ ceph_abort_msg("unrecognized manifest type");
+ }
+ }
+}
+
+void PrimaryLogPG::_copy_some(ObjectContextRef obc, CopyOpRef cop)
+{
+ dout(10) << __func__ << " " << *obc << " " << cop << dendl;
+
+ unsigned flags = 0;
+ if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_FLUSH)
+ flags |= CEPH_OSD_FLAG_FLUSH;
+ if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE)
+ flags |= CEPH_OSD_FLAG_IGNORE_CACHE;
+ if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY)
+ flags |= CEPH_OSD_FLAG_IGNORE_OVERLAY;
+ if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE)
+ flags |= CEPH_OSD_FLAG_MAP_SNAP_CLONE;
+ if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_RWORDERED)
+ flags |= CEPH_OSD_FLAG_RWORDERED;
+
+ C_GatherBuilder gather(cct);
+
+ if (cop->cursor.is_initial() && cop->mirror_snapset) {
+ // list snaps too.
+ ceph_assert(cop->src.snap == CEPH_NOSNAP);
+ ObjectOperation op;
+ op.list_snaps(&cop->results.snapset, NULL);
+ ceph_tid_t tid = osd->objecter->read(cop->src.oid, cop->oloc, op,
+ CEPH_SNAPDIR, NULL,
+ flags, gather.new_sub(), NULL);
+ cop->objecter_tid2 = tid;
+ }
+
+ ObjectOperation op;
+ if (cop->results.user_version) {
+ op.assert_version(cop->results.user_version);
+ } else {
+ // we should learn the version after the first chunk, if we didn't know
+ // it already!
+ ceph_assert(cop->cursor.is_initial());
+ }
+ op.copy_get(&cop->cursor, get_copy_chunk_size(),
+ &cop->results.object_size, &cop->results.mtime,
+ &cop->attrs, &cop->data, &cop->omap_header, &cop->omap_data,
+ &cop->results.snaps, &cop->results.snap_seq,
+ &cop->results.flags,
+ &cop->results.source_data_digest,
+ &cop->results.source_omap_digest,
+ &cop->results.reqids,
+ &cop->results.reqid_return_codes,
+ &cop->results.truncate_seq,
+ &cop->results.truncate_size,
+ &cop->rval);
+ op.set_last_op_flags(cop->src_obj_fadvise_flags);
+
+ C_Copyfrom *fin = new C_Copyfrom(this, obc->obs.oi.soid,
+ get_last_peering_reset(), cop);
+ gather.set_finisher(new C_OnFinisher(fin,
+ osd->get_objecter_finisher(get_pg_shard())));
+
+ ceph_tid_t tid = osd->objecter->read(cop->src.oid, cop->oloc, op,
+ cop->src.snap, NULL,
+ flags,
+ gather.new_sub(),
+ // discover the object version if we don't know it yet
+ cop->results.user_version ? NULL : &cop->results.user_version);
+ fin->tid = tid;
+ cop->objecter_tid = tid;
+ gather.activate();
+}
+
+void PrimaryLogPG::_copy_some_manifest(ObjectContextRef obc, CopyOpRef cop, uint64_t start_offset)
+{
+ dout(10) << __func__ << " " << *obc << " " << cop << dendl;
+
+ unsigned flags = 0;
+ if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_FLUSH)
+ flags |= CEPH_OSD_FLAG_FLUSH;
+ if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE)
+ flags |= CEPH_OSD_FLAG_IGNORE_CACHE;
+ if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY)
+ flags |= CEPH_OSD_FLAG_IGNORE_OVERLAY;
+ if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE)
+ flags |= CEPH_OSD_FLAG_MAP_SNAP_CLONE;
+ if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_RWORDERED)
+ flags |= CEPH_OSD_FLAG_RWORDERED;
+
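+  // batch consecutive chunks starting at start_offset until the accumulated
+  // size exceeds the per-iteration copy chunk budget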
+ int num_chunks = 0;
+ uint64_t last_offset = 0, chunks_size = 0;
+ object_manifest_t *manifest = &obc->obs.oi.manifest;
+ map<uint64_t, chunk_info_t>::iterator iter = manifest->chunk_map.find(start_offset);
+ for (;iter != manifest->chunk_map.end(); ++iter) {
+ num_chunks++;
+ chunks_size += iter->second.length;
+ last_offset = iter->first;
+ if (get_copy_chunk_size() < chunks_size) {
+ break;
+ }
+ }
+
+ cop->num_chunk = num_chunks;
+ cop->start_offset = start_offset;
+ cop->last_offset = last_offset;
+ dout(20) << __func__ << " oid " << obc->obs.oi.soid << " num_chunks: " << num_chunks
+ << " start_offset: " << start_offset << " chunks_size: " << chunks_size
+ << " last_offset: " << last_offset << dendl;
+
+ iter = manifest->chunk_map.find(start_offset);
+ for (;iter != manifest->chunk_map.end(); ++iter) {
+ uint64_t obj_offset = iter->first;
+ uint64_t length = manifest->chunk_map[iter->first].length;
+ hobject_t soid = manifest->chunk_map[iter->first].oid;
+ object_locator_t oloc(soid);
+ CopyCallback * cb = NULL;
+ CopyOpRef sub_cop(std::make_shared<CopyOp>(cb, ObjectContextRef(), cop->src, oloc,
+ cop->results.user_version, cop->flags, cop->mirror_snapset,
+ cop->src_obj_fadvise_flags, cop->dest_obj_fadvise_flags));
+ sub_cop->cursor.data_offset = obj_offset;
+ cop->chunk_cops[obj_offset] = sub_cop;
+
+ int s = sub_cop->chunk_ops.size();
+ sub_cop->chunk_ops.resize(s+1);
+ sub_cop->chunk_ops[s].op.op = CEPH_OSD_OP_READ;
+ sub_cop->chunk_ops[s].op.extent.offset = manifest->chunk_map[iter->first].offset;
+ sub_cop->chunk_ops[s].op.extent.length = length;
+
+ ObjectOperation op;
+ op.dup(sub_cop->chunk_ops);
+
+ if (cop->results.user_version) {
+ op.assert_version(cop->results.user_version);
+ } else {
+ // we should learn the version after the first chunk, if we didn't know
+ // it already!
+ ceph_assert(cop->cursor.is_initial());
+ }
+ op.set_last_op_flags(cop->src_obj_fadvise_flags);
+
+ C_CopyChunk *fin = new C_CopyChunk(this, obc->obs.oi.soid,
+ get_last_peering_reset(), cop);
+ fin->offset = obj_offset;
+
+ ceph_tid_t tid = osd->objecter->read(
+ soid.oid, oloc, op,
+ sub_cop->src.snap, NULL,
+ flags,
+ new C_OnFinisher(fin, osd->get_objecter_finisher(get_pg_shard())),
+ // discover the object version if we don't know it yet
+ sub_cop->results.user_version ? NULL : &sub_cop->results.user_version);
+ fin->tid = tid;
+ sub_cop->objecter_tid = tid;
+
+ dout(20) << __func__ << " tgt_oid: " << soid.oid << " tgt_offset: "
+ << manifest->chunk_map[iter->first].offset
+ << " length: " << length << " pool id: " << oloc.pool
+ << " tid: " << tid << dendl;
+
+ if (last_offset < iter->first) {
+ break;
+ }
+ }
+}
+
+void PrimaryLogPG::process_copy_chunk(hobject_t oid, ceph_tid_t tid, int r)
+{
+ dout(10) << __func__ << " " << oid << " tid " << tid
+ << " " << cpp_strerror(r) << dendl;
+ map<hobject_t,CopyOpRef>::iterator p = copy_ops.find(oid);
+ if (p == copy_ops.end()) {
+ dout(10) << __func__ << " no copy_op found" << dendl;
+ return;
+ }
+ CopyOpRef cop = p->second;
+ if (tid != cop->objecter_tid) {
+ dout(10) << __func__ << " tid " << tid << " != cop " << cop
+ << " tid " << cop->objecter_tid << dendl;
+ return;
+ }
+
+ if (cop->omap_data.length() || cop->omap_header.length())
+ cop->results.has_omap = true;
+
+ if (r >= 0 && !pool.info.supports_omap() &&
+ (cop->omap_data.length() || cop->omap_header.length())) {
+ r = -EOPNOTSUPP;
+ }
+ cop->objecter_tid = 0;
+  cop->objecter_tid2 = 0;	// assume this was ordered before us (if it happened)
+ ObjectContextRef& cobc = cop->obc;
+
+ if (r < 0)
+ goto out;
+
+ ceph_assert(cop->rval >= 0);
+
+ if (oid.snap < CEPH_NOSNAP && !cop->results.snaps.empty()) {
+ // verify snap hasn't been deleted
+ vector<snapid_t>::iterator p = cop->results.snaps.begin();
+ while (p != cop->results.snaps.end()) {
+ // make best effort to sanitize snaps/clones.
+ if (get_osdmap()->in_removed_snaps_queue(info.pgid.pgid.pool(), *p)) {
+ dout(10) << __func__ << " clone snap " << *p << " has been deleted"
+ << dendl;
+ for (vector<snapid_t>::iterator q = p + 1;
+ q != cop->results.snaps.end();
+ ++q)
+ *(q - 1) = *q;
+ cop->results.snaps.resize(cop->results.snaps.size() - 1);
+ } else {
+ ++p;
+ }
+ }
+ if (cop->results.snaps.empty()) {
+ dout(10) << __func__ << " no more snaps for " << oid << dendl;
+ r = -ENOENT;
+ goto out;
+ }
+ }
+
+ ceph_assert(cop->rval >= 0);
+
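+  // accumulate running crc32c digests over the data and omap received so far;
+  // they are compared against the source digests once the copy is complete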
+ if (!cop->temp_cursor.data_complete) {
+ cop->results.data_digest = cop->data.crc32c(cop->results.data_digest);
+ }
+ if (pool.info.supports_omap() && !cop->temp_cursor.omap_complete) {
+ if (cop->omap_header.length()) {
+ cop->results.omap_digest =
+ cop->omap_header.crc32c(cop->results.omap_digest);
+ }
+ if (cop->omap_data.length()) {
+ bufferlist keys;
+ keys.substr_of(cop->omap_data, 4, cop->omap_data.length() - 4);
+ cop->results.omap_digest = keys.crc32c(cop->results.omap_digest);
+ }
+ }
+
+ if (!cop->temp_cursor.attr_complete) {
+ for (map<string,bufferlist>::iterator p = cop->attrs.begin();
+ p != cop->attrs.end();
+ ++p) {
+ cop->results.attrs[string("_") + p->first] = p->second;
+ }
+ cop->attrs.clear();
+ }
+
+ if (!cop->cursor.is_complete()) {
+ // write out what we have so far
+ if (cop->temp_cursor.is_initial()) {
+ ceph_assert(!cop->results.started_temp_obj);
+ cop->results.started_temp_obj = true;
+ cop->results.temp_oid = generate_temp_object(oid);
+ dout(20) << __func__ << " using temp " << cop->results.temp_oid << dendl;
+ }
+ ObjectContextRef tempobc = get_object_context(cop->results.temp_oid, true);
+ OpContextUPtr ctx = simple_opc_create(tempobc);
+ if (cop->temp_cursor.is_initial()) {
+ ctx->new_temp_oid = cop->results.temp_oid;
+ }
+ _write_copy_chunk(cop, ctx->op_t.get());
+ simple_opc_submit(std::move(ctx));
+ dout(10) << __func__ << " fetching more" << dendl;
+ _copy_some(cobc, cop);
+ return;
+ }
+
+ // verify digests?
+ if (cop->results.is_data_digest() || cop->results.is_omap_digest()) {
+ dout(20) << __func__ << std::hex
+ << " got digest: rx data 0x" << cop->results.data_digest
+ << " omap 0x" << cop->results.omap_digest
+ << ", source: data 0x" << cop->results.source_data_digest
+ << " omap 0x" << cop->results.source_omap_digest
+ << std::dec
+ << " flags " << cop->results.flags
+ << dendl;
+ }
+ if (cop->results.is_data_digest() &&
+ cop->results.data_digest != cop->results.source_data_digest) {
+ derr << __func__ << std::hex << " data digest 0x" << cop->results.data_digest
+ << " != source 0x" << cop->results.source_data_digest << std::dec
+ << dendl;
+ osd->clog->error() << info.pgid << " copy from " << cop->src
+ << " to " << cop->obc->obs.oi.soid << std::hex
+ << " data digest 0x" << cop->results.data_digest
+ << " != source 0x" << cop->results.source_data_digest
+ << std::dec;
+ r = -EIO;
+ goto out;
+ }
+ if (cop->results.is_omap_digest() &&
+ cop->results.omap_digest != cop->results.source_omap_digest) {
+ derr << __func__ << std::hex
+ << " omap digest 0x" << cop->results.omap_digest
+ << " != source 0x" << cop->results.source_omap_digest
+ << std::dec << dendl;
+ osd->clog->error() << info.pgid << " copy from " << cop->src
+ << " to " << cop->obc->obs.oi.soid << std::hex
+ << " omap digest 0x" << cop->results.omap_digest
+ << " != source 0x" << cop->results.source_omap_digest
+ << std::dec;
+ r = -EIO;
+ goto out;
+ }
+ if (cct->_conf->osd_debug_inject_copyfrom_error) {
+ derr << __func__ << " injecting copyfrom failure" << dendl;
+ r = -EIO;
+ goto out;
+ }
+
+ cop->results.fill_in_final_tx = std::function<void(PGTransaction*)>(
+ [this, &cop /* avoid ref cycle */](PGTransaction *t) {
+ ObjectState& obs = cop->obc->obs;
+ if (cop->temp_cursor.is_initial()) {
+ dout(20) << "fill_in_final_tx: writing "
+ << "directly to final object" << dendl;
+ // write directly to final object
+ cop->results.temp_oid = obs.oi.soid;
+ _write_copy_chunk(cop, t);
+ } else {
+ // finish writing to temp object, then move into place
+ dout(20) << "fill_in_final_tx: writing to temp object" << dendl;
+ if (obs.oi.has_manifest() && obs.oi.manifest.is_redirect() && obs.exists) {
+	  /* In the redirect manifest case, the object already exists in the upper tier.
+	   * So, to avoid a conflict when rename() is called, remove the existing
+	   * object first.
+	   */
+ t->remove(obs.oi.soid);
+ }
+ _write_copy_chunk(cop, t);
+ t->rename(obs.oi.soid, cop->results.temp_oid);
+ }
+ t->setattrs(obs.oi.soid, cop->results.attrs);
+ });
+
+ dout(20) << __func__ << " success; committing" << dendl;
+
+ out:
+ dout(20) << __func__ << " complete r = " << cpp_strerror(r) << dendl;
+ CopyCallbackResults results(r, &cop->results);
+ cop->cb->complete(results);
+
+ copy_ops.erase(cobc->obs.oi.soid);
+ cobc->stop_block();
+
+ if (r < 0 && cop->results.started_temp_obj) {
+ dout(10) << __func__ << " deleting partial temp object "
+ << cop->results.temp_oid << dendl;
+ ObjectContextRef tempobc = get_object_context(cop->results.temp_oid, true);
+ OpContextUPtr ctx = simple_opc_create(tempobc);
+ ctx->op_t->remove(cop->results.temp_oid);
+ ctx->discard_temp_oid = cop->results.temp_oid;
+ simple_opc_submit(std::move(ctx));
+ }
+
+ // cancel and requeue proxy ops on this object
+ if (!r) {
+ cancel_and_requeue_proxy_ops(cobc->obs.oi.soid);
+ }
+
+ kick_object_context_blocked(cobc);
+}
+
+void PrimaryLogPG::process_copy_chunk_manifest(hobject_t oid, ceph_tid_t tid, int r, uint64_t offset)
+{
+ dout(10) << __func__ << " " << oid << " tid " << tid
+ << " " << cpp_strerror(r) << dendl;
+ map<hobject_t,CopyOpRef>::iterator p = copy_ops.find(oid);
+ if (p == copy_ops.end()) {
+ dout(10) << __func__ << " no copy_op found" << dendl;
+ return;
+ }
+ CopyOpRef obj_cop = p->second;
+ CopyOpRef chunk_cop = obj_cop->chunk_cops[offset];
+
+ if (tid != chunk_cop->objecter_tid) {
+ dout(10) << __func__ << " tid " << tid << " != cop " << chunk_cop
+ << " tid " << chunk_cop->objecter_tid << dendl;
+ return;
+ }
+
+ if (chunk_cop->omap_data.length() || chunk_cop->omap_header.length()) {
+ r = -EOPNOTSUPP;
+ }
+
+ chunk_cop->objecter_tid = 0;
+  chunk_cop->objecter_tid2 = 0; // assume this was ordered before us (if it happened)
+ ObjectContextRef& cobc = obj_cop->obc;
+ OSDOp &chunk_data = chunk_cop->chunk_ops[0];
+
+ if (r < 0) {
+ obj_cop->failed = true;
+ goto out;
+ }
+
+ if (obj_cop->failed) {
+ return;
+ }
+ if (!chunk_data.outdata.length()) {
+ r = -EIO;
+ obj_cop->failed = true;
+ goto out;
+ }
+
+ obj_cop->num_chunk--;
+
+  /* check whether all of the copy ops have completed */
+ if (obj_cop->num_chunk) {
+ dout(20) << __func__ << " num_chunk: " << obj_cop->num_chunk << dendl;
+ return;
+ }
+
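+  // all chunk reads for this window have completed; take a write lock and
+  // write the fetched chunks into the target object in a single transaction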
+ {
+ OpContextUPtr ctx = simple_opc_create(obj_cop->obc);
+ if (!ctx->lock_manager.take_write_lock(
+ obj_cop->obc->obs.oi.soid,
+ obj_cop->obc)) {
+ // recovery op can take read lock.
+ // so need to wait for recovery completion
+ r = -EAGAIN;
+ obj_cop->failed = true;
+ close_op_ctx(ctx.release());
+ goto out;
+ }
+ dout(20) << __func__ << " took lock on obc, " << obj_cop->obc->rwstate << dendl;
+
+ PGTransaction *t = ctx->op_t.get();
+ ObjectState& obs = ctx->new_obs;
+ for (auto p : obj_cop->chunk_cops) {
+ OSDOp &sub_chunk = p.second->chunk_ops[0];
+ t->write(cobc->obs.oi.soid,
+ p.second->cursor.data_offset,
+ sub_chunk.outdata.length(),
+ sub_chunk.outdata,
+ p.second->dest_obj_fadvise_flags);
+ dout(20) << __func__ << " offset: " << p.second->cursor.data_offset
+ << " length: " << sub_chunk.outdata.length() << dendl;
+ write_update_size_and_usage(ctx->delta_stats, obs.oi, ctx->modified_ranges,
+ p.second->cursor.data_offset, sub_chunk.outdata.length());
+ obs.oi.manifest.chunk_map[p.second->cursor.data_offset].clear_flag(chunk_info_t::FLAG_MISSING);
+ ctx->clean_regions.mark_data_region_dirty(p.second->cursor.data_offset, sub_chunk.outdata.length());
+ sub_chunk.outdata.clear();
+ }
+ obs.oi.clear_data_digest();
+ ctx->at_version = get_next_version();
+ finish_ctx(ctx.get(), pg_log_entry_t::PROMOTE);
+ simple_opc_submit(std::move(ctx));
+
+ auto p = cobc->obs.oi.manifest.chunk_map.rbegin();
+ /* check remaining work */
+ if (p != cobc->obs.oi.manifest.chunk_map.rend()) {
+ if (obj_cop->last_offset >= p->first + p->second.length) {
+ for (auto &en : cobc->obs.oi.manifest.chunk_map) {
+ if (obj_cop->last_offset < en.first) {
+ _copy_some_manifest(cobc, obj_cop, en.first);
+ return;
+ }
+ }
+ }
+ }
+ }
+
+ out:
+ dout(20) << __func__ << " complete r = " << cpp_strerror(r) << dendl;
+ CopyCallbackResults results(r, &obj_cop->results);
+ obj_cop->cb->complete(results);
+
+ copy_ops.erase(cobc->obs.oi.soid);
+ cobc->stop_block();
+
+ // cancel and requeue proxy ops on this object
+ if (!r) {
+ cancel_and_requeue_proxy_ops(cobc->obs.oi.soid);
+ }
+
+ kick_object_context_blocked(cobc);
+}
+
+void PrimaryLogPG::cancel_and_requeue_proxy_ops(hobject_t oid) {
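+  // cancel any in-flight proxy reads/writes targeting this object, then
+  // requeue the ops that were blocked behind the proxying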
+ vector<ceph_tid_t> tids;
+ for (map<ceph_tid_t, ProxyReadOpRef>::iterator it = proxyread_ops.begin();
+ it != proxyread_ops.end();) {
+ if (it->second->soid == oid) {
+ cancel_proxy_read((it++)->second, &tids);
+ } else {
+ ++it;
+ }
+ }
+ for (map<ceph_tid_t, ProxyWriteOpRef>::iterator it = proxywrite_ops.begin();
+ it != proxywrite_ops.end();) {
+ if (it->second->soid == oid) {
+ cancel_proxy_write((it++)->second, &tids);
+ } else {
+ ++it;
+ }
+ }
+ osd->objecter->op_cancel(tids, -ECANCELED);
+ kick_proxy_ops_blocked(oid);
+}
+
+void PrimaryLogPG::_write_copy_chunk(CopyOpRef cop, PGTransaction *t)
+{
+ dout(20) << __func__ << " " << cop
+ << " " << cop->attrs.size() << " attrs"
+ << " " << cop->data.length() << " bytes"
+ << " " << cop->omap_header.length() << " omap header bytes"
+ << " " << cop->omap_data.length() << " omap data bytes"
+ << dendl;
+ if (!cop->temp_cursor.attr_complete) {
+ t->create(cop->results.temp_oid);
+ }
+ if (!cop->temp_cursor.data_complete) {
+ ceph_assert(cop->data.length() + cop->temp_cursor.data_offset ==
+ cop->cursor.data_offset);
+ if (pool.info.required_alignment() &&
+ !cop->cursor.data_complete) {
+ /**
+ * Trim off the unaligned bit at the end, we'll adjust cursor.data_offset
+ * to pick it up on the next pass.
+ */
+ ceph_assert(cop->temp_cursor.data_offset %
+ pool.info.required_alignment() == 0);
+ if (cop->data.length() % pool.info.required_alignment() != 0) {
+ uint64_t to_trim =
+ cop->data.length() % pool.info.required_alignment();
+ bufferlist bl;
+ bl.substr_of(cop->data, 0, cop->data.length() - to_trim);
+ cop->data.swap(bl);
+ cop->cursor.data_offset -= to_trim;
+ ceph_assert(cop->data.length() + cop->temp_cursor.data_offset ==
+ cop->cursor.data_offset);
+ }
+ }
+ if (cop->data.length()) {
+ t->write(
+ cop->results.temp_oid,
+ cop->temp_cursor.data_offset,
+ cop->data.length(),
+ cop->data,
+ cop->dest_obj_fadvise_flags);
+ }
+ cop->data.clear();
+ }
+ if (pool.info.supports_omap()) {
+ if (!cop->temp_cursor.omap_complete) {
+ if (cop->omap_header.length()) {
+ t->omap_setheader(
+ cop->results.temp_oid,
+ cop->omap_header);
+ cop->omap_header.clear();
+ }
+ if (cop->omap_data.length()) {
+ map<string,bufferlist> omap;
+ bufferlist::const_iterator p = cop->omap_data.begin();
+ decode(omap, p);
+ t->omap_setkeys(cop->results.temp_oid, omap);
+ cop->omap_data.clear();
+ }
+ }
+ } else {
+ ceph_assert(cop->omap_header.length() == 0);
+ ceph_assert(cop->omap_data.length() == 0);
+ }
+ cop->temp_cursor = cop->cursor;
+}
+
+void PrimaryLogPG::finish_copyfrom(CopyFromCallback *cb)
+{
+ OpContext *ctx = cb->ctx;
+ dout(20) << "finish_copyfrom on " << ctx->obs->oi.soid << dendl;
+
+ ObjectState& obs = ctx->new_obs;
+ if (obs.exists) {
+ dout(20) << __func__ << ": exists, removing" << dendl;
+ ctx->op_t->remove(obs.oi.soid);
+ } else {
+ ctx->delta_stats.num_objects++;
+ obs.exists = true;
+ }
+ if (cb->is_temp_obj_used()) {
+ ctx->discard_temp_oid = cb->results->temp_oid;
+ }
+ cb->results->fill_in_final_tx(ctx->op_t.get());
+
+ // CopyFromCallback fills this in for us
+ obs.oi.user_version = ctx->user_at_version;
+
+ if (cb->results->is_data_digest()) {
+ obs.oi.set_data_digest(cb->results->data_digest);
+ } else {
+ obs.oi.clear_data_digest();
+ }
+ if (cb->results->is_omap_digest()) {
+ obs.oi.set_omap_digest(cb->results->omap_digest);
+ } else {
+ obs.oi.clear_omap_digest();
+ }
+
+ obs.oi.truncate_seq = cb->truncate_seq;
+ obs.oi.truncate_size = cb->truncate_size;
+
+ obs.oi.mtime = ceph::real_clock::to_timespec(cb->results->mtime);
+ ctx->mtime = utime_t();
+
+ ctx->extra_reqids = cb->results->reqids;
+ ctx->extra_reqid_return_codes = cb->results->reqid_return_codes;
+
+ // cache: clear whiteout?
+ if (obs.oi.is_whiteout()) {
+ dout(10) << __func__ << " clearing whiteout on " << obs.oi.soid << dendl;
+ obs.oi.clear_flag(object_info_t::FLAG_WHITEOUT);
+ --ctx->delta_stats.num_whiteouts;
+ }
+
+ if (cb->results->has_omap) {
+ dout(10) << __func__ << " setting omap flag on " << obs.oi.soid << dendl;
+ obs.oi.set_flag(object_info_t::FLAG_OMAP);
+ ctx->clean_regions.mark_omap_dirty();
+ } else {
+ dout(10) << __func__ << " clearing omap flag on " << obs.oi.soid << dendl;
+ obs.oi.clear_flag(object_info_t::FLAG_OMAP);
+ }
+
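+  // the whole object was rewritten by the copy: mark the full range modified
+  // and dirty, and adjust the byte accounting to the new size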
+ interval_set<uint64_t> ch;
+ if (obs.oi.size > 0)
+ ch.insert(0, obs.oi.size);
+ ctx->modified_ranges.union_of(ch);
+ ctx->clean_regions.mark_data_region_dirty(0, std::max(obs.oi.size, cb->get_data_size()));
+
+ if (cb->get_data_size() != obs.oi.size) {
+ ctx->delta_stats.num_bytes -= obs.oi.size;
+ obs.oi.size = cb->get_data_size();
+ ctx->delta_stats.num_bytes += obs.oi.size;
+ }
+ ctx->delta_stats.num_wr++;
+ ctx->delta_stats.num_wr_kb += shift_round_up(obs.oi.size, 10);
+
+ osd->logger->inc(l_osd_copyfrom);
+}
+
+void PrimaryLogPG::finish_promote(int r, CopyResults *results,
+ ObjectContextRef obc)
+{
+ const hobject_t& soid = obc->obs.oi.soid;
+ dout(10) << __func__ << " " << soid << " r=" << r
+ << " uv" << results->user_version << dendl;
+
+ if (r == -ECANCELED) {
+ return;
+ }
+
+ if (r != -ENOENT && soid.is_snap()) {
+ if (results->snaps.empty()) {
+ // we must have read "snap" content from the head object in the
+ // base pool. use snap_seq to construct what snaps should be
+      // for this clone (what it was before we evicted the clean clone
+ // from this pool, and what it will be when we flush and the
+ // clone eventually happens in the base pool). we want to use
+ // snaps in (results->snap_seq,soid.snap]
+ SnapSet& snapset = obc->ssc->snapset;
+ for (auto p = snapset.clone_snaps.rbegin();
+ p != snapset.clone_snaps.rend();
+ ++p) {
+ for (auto snap : p->second) {
+ if (snap > soid.snap) {
+ continue;
+ }
+ if (snap <= results->snap_seq) {
+ break;
+ }
+ results->snaps.push_back(snap);
+ }
+ }
+ }
+
+ dout(20) << __func__ << " snaps " << results->snaps << dendl;
+ filter_snapc(results->snaps);
+
+ dout(20) << __func__ << " filtered snaps " << results->snaps << dendl;
+ if (results->snaps.empty()) {
+ dout(20) << __func__
+ << " snaps are empty, clone is invalid,"
+ << " setting r to ENOENT" << dendl;
+ r = -ENOENT;
+ }
+ }
+
+ if (r < 0 && results->started_temp_obj) {
+ dout(10) << __func__ << " abort; will clean up partial work" << dendl;
+ ObjectContextRef tempobc = get_object_context(results->temp_oid, false);
+ ceph_assert(tempobc);
+ OpContextUPtr ctx = simple_opc_create(tempobc);
+ ctx->op_t->remove(results->temp_oid);
+ simple_opc_submit(std::move(ctx));
+ results->started_temp_obj = false;
+ }
+
+ if (r == -ENOENT && soid.is_snap()) {
+ dout(10) << __func__
+ << ": enoent while trying to promote clone, " << soid
+ << " must have been trimmed, removing from snapset"
+ << dendl;
+ hobject_t head(soid.get_head());
+ ObjectContextRef obc = get_object_context(head, false);
+ ceph_assert(obc);
+
+ OpContextUPtr tctx = simple_opc_create(obc);
+ tctx->at_version = get_next_version();
+ if (get_osdmap()->require_osd_release < ceph_release_t::octopus) {
+ filter_snapc(tctx->new_snapset.snaps);
+ } else {
+ tctx->new_snapset.snaps.clear();
+ }
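+    // rebuild the snapset without the trimmed clone: drop it from clones and
+    // clone_snaps, and erase its clone_overlap and clone_size entries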
+ vector<snapid_t> new_clones;
+ map<snapid_t, vector<snapid_t>> new_clone_snaps;
+ for (vector<snapid_t>::iterator i = tctx->new_snapset.clones.begin();
+ i != tctx->new_snapset.clones.end();
+ ++i) {
+ if (*i != soid.snap) {
+ new_clones.push_back(*i);
+ auto p = tctx->new_snapset.clone_snaps.find(*i);
+ if (p != tctx->new_snapset.clone_snaps.end()) {
+ new_clone_snaps[*i] = p->second;
+ }
+ }
+ }
+ tctx->new_snapset.clones.swap(new_clones);
+ tctx->new_snapset.clone_overlap.erase(soid.snap);
+ tctx->new_snapset.clone_size.erase(soid.snap);
+ tctx->new_snapset.clone_snaps.swap(new_clone_snaps);
+
+ // take RWWRITE lock for duration of our local write. ignore starvation.
+ if (!tctx->lock_manager.take_write_lock(
+ head,
+ obc)) {
+ ceph_abort_msg("problem!");
+ }
+ dout(20) << __func__ << " took lock on obc, " << obc->rwstate << dendl;
+
+ finish_ctx(tctx.get(), pg_log_entry_t::PROMOTE);
+
+ simple_opc_submit(std::move(tctx));
+ return;
+ }
+
+ bool whiteout = false;
+ if (r == -ENOENT) {
+ ceph_assert(soid.snap == CEPH_NOSNAP); // snap case is above
+ dout(10) << __func__ << " whiteout " << soid << dendl;
+ whiteout = true;
+ }
+
+ if (r < 0 && !whiteout) {
+ derr << __func__ << " unexpected promote error " << cpp_strerror(r) << dendl;
+ // pass error to everyone blocked on this object
+ // FIXME: this is pretty sloppy, but at this point we got
+ // something unexpected and don't have many other options.
+ map<hobject_t,list<OpRequestRef>>::iterator blocked_iter =
+ waiting_for_blocked_object.find(soid);
+ if (blocked_iter != waiting_for_blocked_object.end()) {
+ while (!blocked_iter->second.empty()) {
+ osd->reply_op_error(blocked_iter->second.front(), r);
+ blocked_iter->second.pop_front();
+ }
+ waiting_for_blocked_object.erase(blocked_iter);
+ }
+ return;
+ }
+
+ osd->promote_finish(results->object_size);
+
+ OpContextUPtr tctx = simple_opc_create(obc);
+ tctx->at_version = get_next_version();
+
+ if (!obc->obs.oi.has_manifest()) {
+ ++tctx->delta_stats.num_objects;
+ }
+ if (soid.snap < CEPH_NOSNAP)
+ ++tctx->delta_stats.num_object_clones;
+ tctx->new_obs.exists = true;
+
+ tctx->extra_reqids = results->reqids;
+ tctx->extra_reqid_return_codes = results->reqid_return_codes;
+
+ if (obc->obs.oi.has_manifest() && obc->obs.oi.manifest.is_redirect()) {
+ tctx->new_obs.oi.manifest.type = object_manifest_t::TYPE_NONE;
+ tctx->new_obs.oi.clear_flag(object_info_t::FLAG_REDIRECT_HAS_REFERENCE);
+ tctx->new_obs.oi.clear_flag(object_info_t::FLAG_MANIFEST);
+ tctx->new_obs.oi.manifest.redirect_target = hobject_t();
+ tctx->delta_stats.num_objects_manifest--;
+ if (obc->obs.oi.test_flag(object_info_t::FLAG_REDIRECT_HAS_REFERENCE)) {
+ dec_all_refcount_manifest(obc->obs.oi, tctx.get());
+ }
+ }
+
+ if (whiteout) {
+ // create a whiteout
+ tctx->op_t->create(soid);
+ tctx->new_obs.oi.set_flag(object_info_t::FLAG_WHITEOUT);
+ ++tctx->delta_stats.num_whiteouts;
+ dout(20) << __func__ << " creating whiteout on " << soid << dendl;
+ osd->logger->inc(l_osd_tier_whiteout);
+ } else {
+ if (results->has_omap) {
+ dout(10) << __func__ << " setting omap flag on " << soid << dendl;
+ tctx->new_obs.oi.set_flag(object_info_t::FLAG_OMAP);
+ ++tctx->delta_stats.num_objects_omap;
+ }
+
+ results->fill_in_final_tx(tctx->op_t.get());
+ if (results->started_temp_obj) {
+ tctx->discard_temp_oid = results->temp_oid;
+ }
+ tctx->new_obs.oi.size = results->object_size;
+ tctx->new_obs.oi.user_version = results->user_version;
+ tctx->new_obs.oi.mtime = ceph::real_clock::to_timespec(results->mtime);
+ tctx->mtime = utime_t();
+ if (results->is_data_digest()) {
+ tctx->new_obs.oi.set_data_digest(results->data_digest);
+ } else {
+ tctx->new_obs.oi.clear_data_digest();
+ }
+ if (results->object_size)
+ tctx->clean_regions.mark_data_region_dirty(0, results->object_size);
+ if (results->is_omap_digest()) {
+ tctx->new_obs.oi.set_omap_digest(results->omap_digest);
+ } else {
+ tctx->new_obs.oi.clear_omap_digest();
+ }
+ if (results->has_omap)
+ tctx->clean_regions.mark_omap_dirty();
+ tctx->new_obs.oi.truncate_seq = results->truncate_seq;
+ tctx->new_obs.oi.truncate_size = results->truncate_size;
+
+ if (soid.snap != CEPH_NOSNAP) {
+ ceph_assert(obc->ssc->snapset.clone_snaps.count(soid.snap));
+ ceph_assert(obc->ssc->snapset.clone_size.count(soid.snap));
+ ceph_assert(obc->ssc->snapset.clone_size[soid.snap] ==
+ results->object_size);
+ ceph_assert(obc->ssc->snapset.clone_overlap.count(soid.snap));
+
+ tctx->delta_stats.num_bytes += obc->ssc->snapset.get_clone_bytes(soid.snap);
+ } else {
+ tctx->delta_stats.num_bytes += results->object_size;
+ }
+ }
+
+ if (results->mirror_snapset) {
+ ceph_assert(tctx->new_obs.oi.soid.snap == CEPH_NOSNAP);
+ tctx->new_snapset.from_snap_set(
+ results->snapset,
+ get_osdmap()->require_osd_release < ceph_release_t::luminous);
+ }
+ dout(20) << __func__ << " new_snapset " << tctx->new_snapset << dendl;
+
+ // take RWWRITE lock for duration of our local write. ignore starvation.
+ if (!tctx->lock_manager.take_write_lock(
+ obc->obs.oi.soid,
+ obc)) {
+ ceph_abort_msg("problem!");
+ }
+ dout(20) << __func__ << " took lock on obc, " << obc->rwstate << dendl;
+
+ finish_ctx(tctx.get(), pg_log_entry_t::PROMOTE);
+
+ simple_opc_submit(std::move(tctx));
+
+ osd->logger->inc(l_osd_tier_promote);
+
+ if (agent_state &&
+ agent_state->is_idle())
+ agent_choose_mode();
+}
+
+void PrimaryLogPG::finish_promote_manifest(int r, CopyResults *results,
+ ObjectContextRef obc)
+{
+ const hobject_t& soid = obc->obs.oi.soid;
+ dout(10) << __func__ << " " << soid << " r=" << r
+ << " uv" << results->user_version << dendl;
+
+ if (r == -ECANCELED || r == -EAGAIN) {
+ return;
+ }
+
+ if (r < 0) {
+ derr << __func__ << " unexpected promote error " << cpp_strerror(r) << dendl;
+ // pass error to everyone blocked on this object
+ // FIXME: this is pretty sloppy, but at this point we got
+ // something unexpected and don't have many other options.
+ map<hobject_t,list<OpRequestRef>>::iterator blocked_iter =
+ waiting_for_blocked_object.find(soid);
+ if (blocked_iter != waiting_for_blocked_object.end()) {
+ while (!blocked_iter->second.empty()) {
+ osd->reply_op_error(blocked_iter->second.front(), r);
+ blocked_iter->second.pop_front();
+ }
+ waiting_for_blocked_object.erase(blocked_iter);
+ }
+ return;
+ }
+
+ osd->promote_finish(results->object_size);
+ osd->logger->inc(l_osd_tier_promote);
+
+ if (agent_state &&
+ agent_state->is_idle())
+ agent_choose_mode();
+}
+
+void PrimaryLogPG::cancel_copy(CopyOpRef cop, bool requeue,
+ vector<ceph_tid_t> *tids)
+{
+ dout(10) << __func__ << " " << cop->obc->obs.oi.soid
+ << " from " << cop->src << " " << cop->oloc
+ << " v" << cop->results.user_version << dendl;
+
+ // cancel objecter op, if we can
+ if (cop->objecter_tid) {
+ tids->push_back(cop->objecter_tid);
+ cop->objecter_tid = 0;
+ if (cop->objecter_tid2) {
+ tids->push_back(cop->objecter_tid2);
+ cop->objecter_tid2 = 0;
+ }
+ }
+
+ copy_ops.erase(cop->obc->obs.oi.soid);
+ cop->obc->stop_block();
+
+ kick_object_context_blocked(cop->obc);
+ cop->results.should_requeue = requeue;
+ CopyCallbackResults result(-ECANCELED, &cop->results);
+ cop->cb->complete(result);
+
+ // There may still be an objecter callback referencing this copy op.
+ // That callback will not need the obc since it's been canceled, and
+ // we need the obc reference to go away prior to flush.
+ cop->obc = ObjectContextRef();
+}
+
+void PrimaryLogPG::cancel_copy_ops(bool requeue, vector<ceph_tid_t> *tids)
+{
+ dout(10) << __func__ << dendl;
+ map<hobject_t,CopyOpRef>::iterator p = copy_ops.begin();
+ while (p != copy_ops.end()) {
+ // requeue this op? can I queue up all of them?
+ cancel_copy((p++)->second, requeue, tids);
+ }
+}
+
+
+// ========================================================================
+// flush
+//
+// Flush a dirty object in the cache tier by writing it back to the
+// base tier. The sequence looks like:
+//
+// * send a copy-from operation to the base tier to copy the current
+// version of the object
+// * base tier will pull the object via (perhaps multiple) copy-get(s)
+// * on completion, we check if the object has been modified. if so,
+// just reply with -EAGAIN.
+// * try to take a write lock so we can clear the dirty flag. if this
+// fails, wait and retry
+// * start a repop that clears the bit.
+//
+// If we have to wait, we will retry by coming back through the
+// start_flush method. We check if a flush is already in progress
+// and, if so, try to finish it by rechecking the version and trying
+// to clear the dirty bit.
+//
+// In order for the cache-flush (a write op) to not block the copy-get
+// from reading the object, the client *must* set the SKIPRWLOCKS
+// flag.
+//
+// NOTE: normally writes are strictly ordered for the client, but
+// flushes are special in that they can be reordered with respect to
+// other writes. In particular, we can't have a flush request block
+// an update to the cache pool object!
+
+struct C_Flush : public Context {
+ PrimaryLogPGRef pg;
+ hobject_t oid;
+ epoch_t last_peering_reset;
+ ceph_tid_t tid;
+ utime_t start;
+ C_Flush(PrimaryLogPG *p, hobject_t o, epoch_t lpr)
+ : pg(p), oid(o), last_peering_reset(lpr),
+ tid(0), start(ceph_clock_now())
+ {}
+ void finish(int r) override {
+ if (r == -ECANCELED)
+ return;
+ std::scoped_lock locker{*pg};
+ if (last_peering_reset == pg->get_last_peering_reset()) {
+ pg->finish_flush(oid, tid, r);
+ pg->osd->logger->tinc(l_osd_tier_flush_lat, ceph_clock_now() - start);
+ }
+ }
+};
+
+int PrimaryLogPG::start_dedup(OpRequestRef op, ObjectContextRef obc)
+{
+ const object_info_t& oi = obc->obs.oi;
+ const hobject_t& soid = oi.soid;
+
+ ceph_assert(obc->is_blocked());
+ if (oi.size == 0) {
+ // evicted
+ return 0;
+ }
+ if (pool.info.get_fingerprint_type() == pg_pool_t::TYPE_FINGERPRINT_NONE) {
+ dout(0) << " fingerprint algorithm is not set " << dendl;
+ return -EINVAL;
+ }
+ if (pool.info.get_dedup_tier() <= 0) {
+ dout(10) << " dedup tier is not set " << dendl;
+ return -EINVAL;
+ }
+
+ /*
+ * The operations to make dedup chunks are tracked by a ManifestOp.
+ * This op will be finished if all the operations are completed.
+ */
+ ManifestOpRef mop(std::make_shared<ManifestOp>(nullptr));
+
+ // cdc
+ std::map<uint64_t, bufferlist> chunks;
+ int r = do_cdc(oi, mop->new_manifest.chunk_map, chunks);
+ if (r < 0) {
+ return r;
+ }
+ if (!chunks.size()) {
+ return 0;
+ }
+
+  // The chunks issued here differ from the newly generated chunk_map,
+  // because chunks that already exist in the previous snap will not be issued.
+  // So we need two data structures: the first is the issued chunk list, to track
+  // issued operations, and the second is the new chunk_map, used to update the
+  // object's chunk_map after all operations are finished.
+ object_ref_delta_t refs;
+ ObjectContextRef obc_l, obc_g;
+ get_adjacent_clones(obc, obc_l, obc_g);
+  // skip if the same content exists in the prev snap at the same offset
+ mop->new_manifest.calc_refs_to_inc_on_set(
+ obc_l ? &(obc_l->obs.oi.manifest) : nullptr,
+ obc_g ? &(obc_g->obs.oi.manifest) : nullptr,
+ refs);
+
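+  // for each chunk that actually needs a new reference in the dedup tier,
+  // issue a CREATE_OR_GET_REF refcount op; completion is tracked per-offset
+  // via C_SetDedupChunks and mop->tids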
+ for (auto p : chunks) {
+ hobject_t target = mop->new_manifest.chunk_map[p.first].oid;
+ if (refs.find(target) == refs.end()) {
+ continue;
+ }
+ C_SetDedupChunks *fin = new C_SetDedupChunks(this, soid, get_last_peering_reset(), p.first);
+ ceph_tid_t tid = refcount_manifest(soid, target, refcount_t::CREATE_OR_GET_REF,
+ fin, move(chunks[p.first]));
+ mop->chunks[target] = make_pair(p.first, p.second.length());
+ mop->num_chunks++;
+ mop->tids[p.first] = tid;
+ fin->tid = tid;
+ dout(10) << __func__ << " oid: " << soid << " tid: " << tid
+ << " target: " << target << " offset: " << p.first
+ << " length: " << p.second.length() << dendl;
+ }
+
+ if (mop->tids.size()) {
+ manifest_ops[soid] = mop;
+ manifest_ops[soid]->op = op;
+ } else {
+ // size == 0
+ return 0;
+ }
+
+ return -EINPROGRESS;
+}
+
+int PrimaryLogPG::do_cdc(const object_info_t& oi,
+ std::map<uint64_t, chunk_info_t>& chunk_map,
+ std::map<uint64_t, bufferlist>& chunks)
+{
+ string chunk_algo = pool.info.get_dedup_chunk_algorithm_name();
+ int64_t chunk_size = pool.info.get_dedup_cdc_chunk_size();
+ uint64_t total_length = 0;
+
+ std::unique_ptr<CDC> cdc = CDC::create(chunk_algo, cbits(chunk_size)-1);
+ if (!cdc) {
+ dout(0) << __func__ << " unrecognized chunk-algorithm " << dendl;
+ return -EINVAL;
+ }
+
+ bufferlist bl;
+ /**
+   * We do not support an EC pool as the base tier of distributed dedup.
+   * Erasure-coded pools are disallowed here because they do not support objects_read_sync().
+   * Supporting them would require substantial changes to the current implementation,
+   * so we leave that as future work.
+ */
+ int r = pgbackend->objects_read_sync(
+ oi.soid, 0, oi.size, 0, &bl);
+ if (r < 0) {
+ dout(0) << __func__ << " read fail " << oi.soid
+ << " len: " << oi.size << " r: " << r << dendl;
+ return r;
+ }
+ if (bl.length() != oi.size) {
+ dout(0) << __func__ << " bl.length: " << bl.length() << " != oi.size: "
+ << oi.size << " during chunking " << dendl;
+ return -EIO;
+ }
+
+ dout(10) << __func__ << " oid: " << oi.soid << " len: " << bl.length()
+ << " oi.size: " << oi.size
+ << " chunk_size: " << chunk_size << dendl;
+
+ vector<pair<uint64_t, uint64_t>> cdc_chunks;
+ cdc->calc_chunks(bl, &cdc_chunks);
+
+ // get fingerprint
+ for (auto p : cdc_chunks) {
+ bufferlist chunk;
+ chunk.substr_of(bl, p.first, p.second);
+ auto [ret, target] = get_fpoid_from_chunk(oi.soid, chunk);
+ if (ret < 0) {
+ return ret;
+ }
+ chunks[p.first] = std::move(chunk);
+ chunk_map[p.first] = chunk_info_t(0, p.second, target);
+ total_length += p.second;
+ }
+ return total_length;
+}
+
+std::pair<int, hobject_t> PrimaryLogPG::get_fpoid_from_chunk(
+ const hobject_t soid, bufferlist& chunk)
+{
+ pg_pool_t::fingerprint_t fp_algo = pool.info.get_fingerprint_type();
+ if (fp_algo == pg_pool_t::TYPE_FINGERPRINT_NONE) {
+ return make_pair(-EINVAL, hobject_t());
+ }
+ object_t fp_oid = [&fp_algo, &chunk]() -> string {
+ switch (fp_algo) {
+ case pg_pool_t::TYPE_FINGERPRINT_SHA1:
+ return ceph::crypto::digest<ceph::crypto::SHA1>(chunk).to_str();
+ case pg_pool_t::TYPE_FINGERPRINT_SHA256:
+ return ceph::crypto::digest<ceph::crypto::SHA256>(chunk).to_str();
+ case pg_pool_t::TYPE_FINGERPRINT_SHA512:
+ return ceph::crypto::digest<ceph::crypto::SHA512>(chunk).to_str();
+ default:
+ assert(0 == "unrecognized fingerprint type");
+ return {};
+ }
+ }();
+
+ pg_t raw_pg;
+ object_locator_t oloc(soid);
+ oloc.pool = pool.info.get_dedup_tier();
+  // make sure dedup_tier is set
+ ceph_assert(oloc.pool > 0);
+ int ret = get_osdmap()->object_locator_to_pg(fp_oid, oloc, raw_pg);
+ if (ret < 0) {
+ return make_pair(ret, hobject_t());
+ }
+ hobject_t target(fp_oid, oloc.key, snapid_t(),
+ raw_pg.ps(), raw_pg.pool(),
+ oloc.nspace);
+ return make_pair(0, target);
+}
+
+int PrimaryLogPG::finish_set_dedup(hobject_t oid, int r, ceph_tid_t tid, uint64_t offset)
+{
+ dout(10) << __func__ << " " << oid << " tid " << tid
+ << " " << cpp_strerror(r) << dendl;
+ map<hobject_t,ManifestOpRef>::iterator p = manifest_ops.find(oid);
+ if (p == manifest_ops.end()) {
+ dout(10) << __func__ << " no manifest_op found" << dendl;
+ return -EINVAL;
+ }
+ ManifestOpRef mop = p->second;
+ mop->results[offset] = r;
+ if (r < 0) {
+    // if any failure occurs, record it at results[0] so the overall result reflects the failure
+ mop->results[0] = r;
+ }
+ if (mop->num_chunks != mop->results.size()) {
+    // there are still ongoing operations
+ return -EINPROGRESS;
+ }
+ ObjectContextRef obc = get_object_context(oid, false);
+ if (!obc) {
+ if (mop->op)
+ osd->reply_op_error(mop->op, -EINVAL);
+ return -EINVAL;
+ }
+ ceph_assert(obc->is_blocked());
+ obc->stop_block();
+ kick_object_context_blocked(obc);
+ if (mop->results[0] < 0) {
+    // a previous op returned a failure
+ ceph_assert(mop->num_chunks == mop->results.size());
+ manifest_ops.erase(oid);
+ osd->reply_op_error(mop->op, mop->results[0]);
+ return -EIO;
+ }
+
+ if (mop->chunks.size()) {
+ OpContextUPtr ctx = simple_opc_create(obc);
+ ceph_assert(ctx);
+ if (ctx->lock_manager.get_lock_type(
+ RWState::RWWRITE,
+ oid,
+ obc,
+ mop->op)) {
+ dout(20) << __func__ << " took write lock" << dendl;
+ } else if (mop->op) {
+ dout(10) << __func__ << " waiting on write lock " << mop->op << dendl;
+ close_op_ctx(ctx.release());
+ return -EAGAIN;
+ }
+
+ ctx->at_version = get_next_version();
+ ctx->new_obs = obc->obs;
+ ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
+
+ /*
+     * Let's assume there is a snapshotted manifest object, and we issue tier_flush() on the head:
+ * head: [0, 2) aaa <-- tier_flush()
+ * 20: [0, 2) ddd, [6, 2) bbb, [8, 2) ccc
+ *
+ * In this case, if the new chunk_map is as follows,
+ * new_chunk_map : [0, 2) ddd, [6, 2) bbb, [8, 2) ccc
+ * we should drop aaa from head by using calc_refs_to_drop_on_removal().
+     * So, the procedure is:
+ * 1. calc_refs_to_drop_on_removal()
+ * 2. register old references to drop after tier_flush() is committed
+ * 3. update new chunk_map
+ */
+
+ ObjectCleanRegions c_regions = ctx->clean_regions;
+ ObjectContextRef cobc = get_prev_clone_obc(obc);
+ c_regions.mark_fully_dirty();
+    // CDC was done on the entire range of the manifest object,
+    // so the first thing to do here is drop the references to the old chunks
+ ObjectContextRef obc_l, obc_g;
+ get_adjacent_clones(obc, obc_l, obc_g);
+ // clear all old references
+ object_ref_delta_t refs;
+ ctx->obs->oi.manifest.calc_refs_to_drop_on_removal(
+ obc_l ? &(obc_l->obs.oi.manifest) : nullptr,
+ obc_g ? &(obc_g->obs.oi.manifest) : nullptr,
+ refs);
+ if (!refs.is_empty()) {
+ ctx->register_on_commit(
+ [oid, this, refs](){
+ dec_refcount(oid, refs);
+ });
+ }
+
+ // set new references
+ ctx->new_obs.oi.manifest.chunk_map = mop->new_manifest.chunk_map;
+
+ finish_ctx(ctx.get(), pg_log_entry_t::CLEAN);
+ simple_opc_submit(std::move(ctx));
+ }
+ if (mop->op)
+ osd->reply_op_error(mop->op, r);
+
+ manifest_ops.erase(oid);
+ return 0;
+}
+
+int PrimaryLogPG::start_flush(
+ OpRequestRef op, ObjectContextRef obc,
+ bool blocking, hobject_t *pmissing,
+ std::optional<std::function<void()>> &&on_flush)
+{
+ const object_info_t& oi = obc->obs.oi;
+ const hobject_t& soid = oi.soid;
+ dout(10) << __func__ << " " << soid
+ << " v" << oi.version
+ << " uv" << oi.user_version
+ << " " << (blocking ? "blocking" : "non-blocking/best-effort")
+ << dendl;
+
+ bool preoctopus_compat =
+ get_osdmap()->require_osd_release < ceph_release_t::octopus;
+ SnapSet snapset;
+ if (preoctopus_compat) {
+ // for pre-octopus compatibility, filter SnapSet::snaps. not
+ // certain we need this, but let's be conservative.
+ snapset = obc->ssc->snapset.get_filtered(pool.info);
+ } else {
+ // NOTE: change this to a const ref when we remove this compat code
+ snapset = obc->ssc->snapset;
+ }
+
+ if (obc->obs.oi.has_manifest() && obc->obs.oi.manifest.is_chunked()) {
+    // the current dedup tier only supports blocking operations
+ if (!blocking) {
+ return -EOPNOTSUPP;
+ }
+ }
+
+  // verify there are no older, dirty clones
+ {
+ dout(20) << " snapset " << snapset << dendl;
+ vector<snapid_t>::reverse_iterator p = snapset.clones.rbegin();
+ while (p != snapset.clones.rend() && *p >= soid.snap)
+ ++p;
+ if (p != snapset.clones.rend()) {
+ hobject_t next = soid;
+ next.snap = *p;
+ ceph_assert(next.snap < soid.snap);
+ if (recovery_state.get_pg_log().get_missing().is_missing(next)) {
+ dout(10) << __func__ << " missing clone is " << next << dendl;
+ if (pmissing)
+ *pmissing = next;
+ return -ENOENT;
+ }
+ ObjectContextRef older_obc = get_object_context(next, false);
+ if (older_obc) {
+ dout(20) << __func__ << " next oldest clone is " << older_obc->obs.oi
+ << dendl;
+ if (older_obc->obs.oi.is_dirty()) {
+ dout(10) << __func__ << " next oldest clone is dirty: "
+ << older_obc->obs.oi << dendl;
+ return -EBUSY;
+ }
+ } else {
+ dout(20) << __func__ << " next oldest clone " << next
+ << " is not present; implicitly clean" << dendl;
+ }
+ } else {
+ dout(20) << __func__ << " no older clones" << dendl;
+ }
+ }
+
+ if (blocking)
+ obc->start_block();
+
+ map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(soid);
+ if (p != flush_ops.end()) {
+ FlushOpRef fop = p->second;
+ if (fop->op == op) {
+ // we couldn't take the write lock on a cache-try-flush before;
+ // now we are trying again for the lock.
+ return try_flush_mark_clean(fop);
+ }
+ if (fop->flushed_version == obc->obs.oi.user_version &&
+ (fop->blocking || !blocking)) {
+ // nonblocking can join anything
+ // blocking can only join a blocking flush
+ dout(20) << __func__ << " piggybacking on existing flush " << dendl;
+ if (op)
+ fop->dup_ops.push_back(op);
+ return -EAGAIN; // clean up this ctx; op will retry later
+ }
+
+ // cancel current flush since it will fail anyway, or because we
+ // are blocking and the existing flush is nonblocking.
+ dout(20) << __func__ << " canceling previous flush; it will fail" << dendl;
+ if (fop->op)
+ osd->reply_op_error(fop->op, -EBUSY);
+ while (!fop->dup_ops.empty()) {
+ osd->reply_op_error(fop->dup_ops.front(), -EBUSY);
+ fop->dup_ops.pop_front();
+ }
+ vector<ceph_tid_t> tids;
+ cancel_flush(fop, false, &tids);
+ osd->objecter->op_cancel(tids, -ECANCELED);
+ }
+
+ if (obc->obs.oi.has_manifest() && obc->obs.oi.manifest.is_chunked()) {
+ int r = start_dedup(op, obc);
+ if (r != -EINPROGRESS) {
+ if (blocking)
+ obc->stop_block();
+ }
+ return r;
+ }
+
+ /**
+ * In general, we need to send a delete and a copyfrom.
+ * Consider snapc 10:[10, 9, 8, 4, 3, 2]:[10(10, 9), 4(4,3,2)]
+ * where 4 is marked as clean. To flush 10, we have to:
+ * 1) delete 4:[4,3,2] -- Logically, the object does not exist after 4
+ * 2) copyfrom 8:[8,4,3,2] -- flush object after snap 8
+ *
+   * There is a complicating case.  Suppose there had been a clone 7
+   * for snaps [7, 6] which has since been trimmed because those snaps no longer exist.
+ * In the base pool, we'd have 5:[4,3,2]:[4(4,3,2)]+head. When we submit
+ * the delete, the snap will be promoted to 5, and the head will become
+ * a whiteout. When the copy-from goes through, we'll end up with
+ * 8:[8,4,3,2]:[4(4,3,2)]+head.
+ *
+ * Another complication is the case where there is an interval change
+ * after doing the delete and the flush but before marking the object
+ * clean. We'll happily delete head and then recreate it at the same
+ * sequence number, which works out ok.
+ */
+
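+  // compute the snap context to flush under (snapc) and the snap context for
+  // the preceding delete (dsnapc), based on the clones below the object being
+  // flushed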
+ SnapContext snapc, dsnapc;
+ if (snapset.seq != 0) {
+ if (soid.snap == CEPH_NOSNAP) {
+ snapc = snapset.get_ssc_as_of(snapset.seq);
+ } else {
+ snapid_t min_included_snap;
+ auto p = snapset.clone_snaps.find(soid.snap);
+ ceph_assert(p != snapset.clone_snaps.end());
+ min_included_snap = p->second.back();
+ snapc = snapset.get_ssc_as_of(min_included_snap - 1);
+ }
+
+ snapid_t prev_snapc = 0;
+ for (vector<snapid_t>::reverse_iterator citer = snapset.clones.rbegin();
+ citer != snapset.clones.rend();
+ ++citer) {
+ if (*citer < soid.snap) {
+ prev_snapc = *citer;
+ break;
+ }
+ }
+
+ dsnapc = snapset.get_ssc_as_of(prev_snapc);
+ }
+
+ object_locator_t base_oloc(soid);
+ base_oloc.pool = pool.info.tier_of;
+
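+  // if the delete's snap context is older than the flush's, send the delete
+  // first so the base pool ends up with the correct clone/whiteout structure;
+  // we rely on objecter ordering rather than a callback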
+ if (dsnapc.seq < snapc.seq) {
+ ObjectOperation o;
+ o.remove();
+ osd->objecter->mutate(
+ soid.oid,
+ base_oloc,
+ o,
+ dsnapc,
+ ceph::real_clock::from_ceph_timespec(oi.mtime),
+ (CEPH_OSD_FLAG_IGNORE_OVERLAY |
+ CEPH_OSD_FLAG_ENFORCE_SNAPC),
+ NULL /* no callback, we'll rely on the ordering w.r.t the next op */);
+ }
+
+ FlushOpRef fop(std::make_shared<FlushOp>());
+ fop->obc = obc;
+ fop->flushed_version = oi.user_version;
+ fop->blocking = blocking;
+ fop->on_flush = std::move(on_flush);
+ fop->op = op;
+
+ ObjectOperation o;
+ if (oi.is_whiteout()) {
+ fop->removal = true;
+ o.remove();
+ } else {
+ object_locator_t oloc(soid);
+ o.copy_from(soid.oid.name, soid.snap, oloc, oi.user_version,
+ CEPH_OSD_COPY_FROM_FLAG_FLUSH |
+ CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY |
+ CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE |
+ CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE,
+ LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL|LIBRADOS_OP_FLAG_FADVISE_NOCACHE);
+
+    // hint that the base tier should not cache this data after the flush
+ if (agent_state && agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL)
+ o.set_last_op_flags(LIBRADOS_OP_FLAG_FADVISE_DONTNEED);
+ }
+ C_Flush *fin = new C_Flush(this, soid, get_last_peering_reset());
+
+ ceph_tid_t tid = osd->objecter->mutate(
+ soid.oid, base_oloc, o, snapc,
+ ceph::real_clock::from_ceph_timespec(oi.mtime),
+ CEPH_OSD_FLAG_IGNORE_OVERLAY | CEPH_OSD_FLAG_ENFORCE_SNAPC,
+ new C_OnFinisher(fin,
+ osd->get_objecter_finisher(get_pg_shard())));
+ /* we're under the pg lock and fin->finish() is grabbing that */
+ fin->tid = tid;
+ fop->objecter_tid = tid;
+
+ flush_ops[soid] = fop;
+
+ recovery_state.update_stats(
+ [&oi](auto &history, auto &stats) {
+ stats.stats.sum.num_flush++;
+ stats.stats.sum.num_flush_kb += shift_round_up(oi.size, 10);
+ return false;
+ });
+ return -EINPROGRESS;
+}
+
+void PrimaryLogPG::finish_flush(hobject_t oid, ceph_tid_t tid, int r)
+{
+ dout(10) << __func__ << " " << oid << " tid " << tid
+ << " " << cpp_strerror(r) << dendl;
+ map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(oid);
+ if (p == flush_ops.end()) {
+ dout(10) << __func__ << " no flush_op found" << dendl;
+ return;
+ }
+ FlushOpRef fop = p->second;
+ if (tid != fop->objecter_tid && !fop->obc->obs.oi.has_manifest()) {
+ dout(10) << __func__ << " tid " << tid << " != fop " << fop
+ << " tid " << fop->objecter_tid << dendl;
+ return;
+ }
+ ObjectContextRef obc = fop->obc;
+ fop->objecter_tid = 0;
+
+ if (r < 0 && !(r == -ENOENT && fop->removal)) {
+ if (fop->op)
+ osd->reply_op_error(fop->op, -EBUSY);
+ if (fop->blocking) {
+ obc->stop_block();
+ kick_object_context_blocked(obc);
+ }
+
+ if (!fop->dup_ops.empty()) {
+ dout(20) << __func__ << " requeueing dups" << dendl;
+ requeue_ops(fop->dup_ops);
+ }
+ if (fop->on_flush) {
+ (*(fop->on_flush))();
+ fop->on_flush = std::nullopt;
+ }
+ flush_ops.erase(oid);
+ return;
+ }
+
+ r = try_flush_mark_clean(fop);
+ if (r == -EBUSY && fop->op) {
+ osd->reply_op_error(fop->op, r);
+ }
+}
+
+int PrimaryLogPG::try_flush_mark_clean(FlushOpRef fop)
+{
+ ObjectContextRef obc = fop->obc;
+ const hobject_t& oid = obc->obs.oi.soid;
+
+ if (fop->blocking) {
+ obc->stop_block();
+ kick_object_context_blocked(obc);
+ }
+
+ if (fop->flushed_version != obc->obs.oi.user_version ||
+ !obc->obs.exists) {
+ if (obc->obs.exists)
+ dout(10) << __func__ << " flushed_version " << fop->flushed_version
+ << " != current " << obc->obs.oi.user_version
+ << dendl;
+ else
+ dout(10) << __func__ << " object no longer exists" << dendl;
+
+ if (!fop->dup_ops.empty()) {
+ dout(20) << __func__ << " requeueing dups" << dendl;
+ requeue_ops(fop->dup_ops);
+ }
+ if (fop->on_flush) {
+ (*(fop->on_flush))();
+ fop->on_flush = std::nullopt;
+ }
+ flush_ops.erase(oid);
+ if (fop->blocking)
+ osd->logger->inc(l_osd_tier_flush_fail);
+ else
+ osd->logger->inc(l_osd_tier_try_flush_fail);
+ return -EBUSY;
+ }
+
+ if (!fop->blocking &&
+ m_scrubber->write_blocked_by_scrub(oid)) {
+ if (fop->op) {
+ dout(10) << __func__ << " blocked by scrub" << dendl;
+ requeue_op(fop->op);
+ requeue_ops(fop->dup_ops);
+ return -EAGAIN; // will retry
+ } else {
+ osd->logger->inc(l_osd_tier_try_flush_fail);
+ vector<ceph_tid_t> tids;
+ cancel_flush(fop, false, &tids);
+ osd->objecter->op_cancel(tids, -ECANCELED);
+ return -ECANCELED;
+ }
+ }
+
+ // successfully flushed, can we evict this object?
+ if (!obc->obs.oi.has_manifest() && !fop->op &&
+ agent_state && agent_state->evict_mode != TierAgentState::EVICT_MODE_IDLE &&
+ agent_maybe_evict(obc, true)) {
+ osd->logger->inc(l_osd_tier_clean);
+ if (fop->on_flush) {
+ (*(fop->on_flush))();
+ fop->on_flush = std::nullopt;
+ }
+ flush_ops.erase(oid);
+ return 0;
+ }
+
+ dout(10) << __func__ << " clearing DIRTY flag for " << oid << dendl;
+ OpContextUPtr ctx = simple_opc_create(fop->obc);
+
+ // successfully flushed; can we clear the dirty bit?
+ // try to take the lock manually, since we don't
+ // have a ctx yet.
+ if (ctx->lock_manager.get_lock_type(
+ RWState::RWWRITE,
+ oid,
+ obc,
+ fop->op)) {
+ dout(20) << __func__ << " took write lock" << dendl;
+ } else if (fop->op) {
+ dout(10) << __func__ << " waiting on write lock " << fop->op << " "
+ << fop->dup_ops << dendl;
+ // fop->op is now waiting on the lock; get fop->dup_ops to wait too.
+ for (auto op : fop->dup_ops) {
+ bool locked = ctx->lock_manager.get_lock_type(
+ RWState::RWWRITE,
+ oid,
+ obc,
+ op);
+ ceph_assert(!locked);
+ }
+ close_op_ctx(ctx.release());
+ return -EAGAIN; // will retry
+ } else {
+ dout(10) << __func__ << " failed write lock, no op; failing" << dendl;
+ close_op_ctx(ctx.release());
+ osd->logger->inc(l_osd_tier_try_flush_fail);
+ vector<ceph_tid_t> tids;
+ cancel_flush(fop, false, &tids);
+ osd->objecter->op_cancel(tids, -ECANCELED);
+ return -ECANCELED;
+ }
+
+ if (fop->on_flush) {
+ ctx->register_on_finish(*(fop->on_flush));
+ fop->on_flush = std::nullopt;
+ }
+
+ ctx->at_version = get_next_version();
+
+ ctx->new_obs = obc->obs;
+ ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
+ --ctx->delta_stats.num_objects_dirty;
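+  // for a chunked manifest object whose chunks now live in the dedup tier,
+  // drop the local copies: if every byte is covered by chunks, truncate the
+  // object and mark all chunks MISSING; otherwise just mark them CLEAN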
+ if (fop->obc->obs.oi.has_manifest()) {
+ ceph_assert(obc->obs.oi.manifest.is_chunked());
+ PGTransaction* t = ctx->op_t.get();
+ uint64_t chunks_size = 0;
+ for (auto &p : ctx->new_obs.oi.manifest.chunk_map) {
+ chunks_size += p.second.length;
+ }
+ if (ctx->new_obs.oi.is_omap() && pool.info.supports_omap()) {
+ t->omap_clear(oid);
+ ctx->new_obs.oi.clear_omap_digest();
+ ctx->new_obs.oi.clear_flag(object_info_t::FLAG_OMAP);
+ ctx->clean_regions.mark_omap_dirty();
+ }
+ if (obc->obs.oi.size == chunks_size) {
+ t->truncate(oid, 0);
+ interval_set<uint64_t> trim;
+ trim.insert(0, ctx->new_obs.oi.size);
+ ctx->modified_ranges.union_of(trim);
+ truncate_update_size_and_usage(ctx->delta_stats,
+ ctx->new_obs.oi,
+ 0);
+ ctx->clean_regions.mark_data_region_dirty(0, ctx->new_obs.oi.size);
+ ctx->new_obs.oi.new_object();
+ for (auto &p : ctx->new_obs.oi.manifest.chunk_map) {
+ p.second.set_flag(chunk_info_t::FLAG_MISSING);
+ }
+ } else {
+ for (auto &p : ctx->new_obs.oi.manifest.chunk_map) {
+ dout(20) << __func__ << " offset: " << p.second.offset
+ << " length: " << p.second.length << dendl;
+ p.second.clear_flag(chunk_info_t::FLAG_MISSING); // CLEAN
+ }
+ }
+ }
+
+ finish_ctx(ctx.get(), pg_log_entry_t::CLEAN);
+
+ osd->logger->inc(l_osd_tier_clean);
+
+ if (!fop->dup_ops.empty() || fop->op) {
+ dout(20) << __func__ << " requeueing for " << ctx->at_version << dendl;
+ list<OpRequestRef> ls;
+ if (fop->op)
+ ls.push_back(fop->op);
+ ls.splice(ls.end(), fop->dup_ops);
+ requeue_ops(ls);
+ }
+
+ simple_opc_submit(std::move(ctx));
+
+ flush_ops.erase(oid);
+
+ if (fop->blocking)
+ osd->logger->inc(l_osd_tier_flush);
+ else
+ osd->logger->inc(l_osd_tier_try_flush);
+
+ return -EINPROGRESS;
+}
+
+void PrimaryLogPG::cancel_flush(FlushOpRef fop, bool requeue,
+ vector<ceph_tid_t> *tids)
+{
+ dout(10) << __func__ << " " << fop->obc->obs.oi.soid << " tid "
+ << fop->objecter_tid << dendl;
+ if (fop->objecter_tid) {
+ tids->push_back(fop->objecter_tid);
+ fop->objecter_tid = 0;
+ }
+ if (fop->io_tids.size()) {
+ for (auto &p : fop->io_tids) {
+ tids->push_back(p.second);
+ p.second = 0;
+ }
+ }
+ if (fop->blocking && fop->obc->is_blocked()) {
+ fop->obc->stop_block();
+ kick_object_context_blocked(fop->obc);
+ }
+ if (requeue) {
+ if (fop->op)
+ requeue_op(fop->op);
+ requeue_ops(fop->dup_ops);
+ }
+ if (fop->on_flush) {
+ (*(fop->on_flush))();
+ fop->on_flush = std::nullopt;
+ }
+ flush_ops.erase(fop->obc->obs.oi.soid);
+}
+
+void PrimaryLogPG::cancel_flush_ops(bool requeue, vector<ceph_tid_t> *tids)
+{
+ dout(10) << __func__ << dendl;
+ map<hobject_t,FlushOpRef>::iterator p = flush_ops.begin();
+ while (p != flush_ops.end()) {
+ cancel_flush((p++)->second, requeue, tids);
+ }
+}
+
+bool PrimaryLogPG::is_present_clone(hobject_t coid)
+{
+ if (!pool.info.allow_incomplete_clones())
+ return true;
+ if (is_missing_object(coid))
+ return true;
+ ObjectContextRef obc = get_object_context(coid, false);
+ return obc && obc->obs.exists;
+}
+
+// ========================================================================
+// rep op gather
+
+class C_OSD_RepopCommit : public Context {
+ PrimaryLogPGRef pg;
+ boost::intrusive_ptr<PrimaryLogPG::RepGather> repop;
+public:
+ C_OSD_RepopCommit(PrimaryLogPG *pg, PrimaryLogPG::RepGather *repop)
+ : pg(pg), repop(repop) {}
+ void finish(int) override {
+ pg->repop_all_committed(repop.get());
+ }
+};
+
+void PrimaryLogPG::repop_all_committed(RepGather *repop)
+{
+ dout(10) << __func__ << ": repop tid " << repop->rep_tid << " all committed "
+ << dendl;
+ repop->all_committed = true;
+ if (!repop->rep_aborted) {
+ if (repop->v != eversion_t()) {
+ recovery_state.complete_write(repop->v, repop->pg_local_last_complete);
+ }
+ eval_repop(repop);
+ }
+}
+
+void PrimaryLogPG::op_applied(const eversion_t &applied_version)
+{
+ dout(10) << "op_applied version " << applied_version << dendl;
+ ceph_assert(applied_version != eversion_t());
+ ceph_assert(applied_version <= info.last_update);
+ recovery_state.local_write_applied(applied_version);
+
+ if (is_primary() && m_scrubber) {
+ // if there's a scrub operation waiting for the selected chunk to be fully updated -
+ // allow it to continue
+ m_scrubber->on_applied_when_primary(recovery_state.get_last_update_applied());
+ }
+}
+
+void PrimaryLogPG::eval_repop(RepGather *repop)
+{
+ #ifdef HAVE_JAEGER
+ if (repop->op->osd_parent_span) {
+ auto eval_span = jaeger_tracing::child_span(__func__, repop->op->osd_parent_span);
+ }
+ #endif
+ dout(10) << "eval_repop " << *repop
+ << (repop->op && repop->op->get_req<MOSDOp>() ? "" : " (no op)") << dendl;
+
+ // ondisk?
+ if (repop->all_committed) {
+ dout(10) << " commit: " << *repop << dendl;
+ for (auto p = repop->on_committed.begin();
+ p != repop->on_committed.end();
+ repop->on_committed.erase(p++)) {
+ (*p)();
+ }
+ // send dup commits, in order
+ auto it = waiting_for_ondisk.find(repop->v);
+ if (it != waiting_for_ondisk.end()) {
+ ceph_assert(waiting_for_ondisk.begin()->first == repop->v);
+ for (auto& i : it->second) {
+ int return_code = repop->r;
+ if (return_code >= 0) {
+ return_code = std::get<2>(i);
+ }
+ osd->reply_op_error(std::get<0>(i), return_code, repop->v,
+ std::get<1>(i), std::get<3>(i));
+ }
+ waiting_for_ondisk.erase(it);
+ }
+
+ publish_stats_to_osd();
+
+ dout(10) << " removing " << *repop << dendl;
+ ceph_assert(!repop_queue.empty());
+ dout(20) << " q front is " << *repop_queue.front() << dendl;
+ if (repop_queue.front() == repop) {
+ RepGather *to_remove = nullptr;
+ while (!repop_queue.empty() &&
+ (to_remove = repop_queue.front())->all_committed) {
+ repop_queue.pop_front();
+ for (auto p = to_remove->on_success.begin();
+ p != to_remove->on_success.end();
+ to_remove->on_success.erase(p++)) {
+ (*p)();
+ }
+ remove_repop(to_remove);
+ }
+ }
+ }
+}
+
+void PrimaryLogPG::issue_repop(RepGather *repop, OpContext *ctx)
+{
+ FUNCTRACE(cct);
+ const hobject_t& soid = ctx->obs->oi.soid;
+ dout(7) << "issue_repop rep_tid " << repop->rep_tid
+ << " o " << soid
+ << dendl;
+#ifdef HAVE_JAEGER
+ if (ctx->op->osd_parent_span) {
+ auto issue_repop_span = jaeger_tracing::child_span(__func__, ctx->op->osd_parent_span);
+ }
+#endif
+
+ repop->v = ctx->at_version;
+
+ ctx->op_t->add_obc(ctx->obc);
+ if (ctx->clone_obc) {
+ ctx->op_t->add_obc(ctx->clone_obc);
+ }
+ if (ctx->head_obc) {
+ ctx->op_t->add_obc(ctx->head_obc);
+ }
+
+ Context *on_all_commit = new C_OSD_RepopCommit(this, repop);
+ if (!(ctx->log.empty())) {
+ ceph_assert(ctx->at_version >= projected_last_update);
+ projected_last_update = ctx->at_version;
+ }
+ for (auto &&entry: ctx->log) {
+ projected_log.add(entry);
+ }
+
+ recovery_state.pre_submit_op(
+ soid,
+ ctx->log,
+ ctx->at_version);
+ pgbackend->submit_transaction(
+ soid,
+ ctx->delta_stats,
+ ctx->at_version,
+ std::move(ctx->op_t),
+ recovery_state.get_pg_trim_to(),
+ recovery_state.get_min_last_complete_ondisk(),
+ std::move(ctx->log),
+ ctx->updated_hset_history,
+ on_all_commit,
+ repop->rep_tid,
+ ctx->reqid,
+ ctx->op);
+}
+
+PrimaryLogPG::RepGather *PrimaryLogPG::new_repop(
+ OpContext *ctx, ObjectContextRef obc,
+ ceph_tid_t rep_tid)
+{
+ if (ctx->op)
+ dout(10) << "new_repop rep_tid " << rep_tid << " on " << *ctx->op->get_req() << dendl;
+ else
+ dout(10) << "new_repop rep_tid " << rep_tid << " (no op)" << dendl;
+
+ RepGather *repop = new RepGather(
+ ctx, rep_tid, info.last_complete);
+
+ repop->start = ceph_clock_now();
+
+ repop_queue.push_back(&repop->queue_item);
+ repop->get();
+
+ osd->logger->inc(l_osd_op_wip);
+
+ dout(10) << __func__ << ": " << *repop << dendl;
+ return repop;
+}
+
+boost::intrusive_ptr<PrimaryLogPG::RepGather> PrimaryLogPG::new_repop(
+ eversion_t version,
+ int r,
+ ObcLockManager &&manager,
+ OpRequestRef &&op,
+ std::optional<std::function<void(void)> > &&on_complete)
+{
+ RepGather *repop = new RepGather(
+ std::move(manager),
+ std::move(op),
+ std::move(on_complete),
+ osd->get_tid(),
+ info.last_complete,
+ r);
+ repop->v = version;
+
+ repop->start = ceph_clock_now();
+
+ repop_queue.push_back(&repop->queue_item);
+
+ osd->logger->inc(l_osd_op_wip);
+
+ dout(10) << __func__ << ": " << *repop << dendl;
+ return boost::intrusive_ptr<RepGather>(repop);
+}
+
+void PrimaryLogPG::remove_repop(RepGather *repop)
+{
+ dout(20) << __func__ << " " << *repop << dendl;
+
+ for (auto p = repop->on_finish.begin();
+ p != repop->on_finish.end();
+ repop->on_finish.erase(p++)) {
+ (*p)();
+ }
+
+ release_object_locks(
+ repop->lock_manager);
+ repop->put();
+
+ osd->logger->dec(l_osd_op_wip);
+}
+
+PrimaryLogPG::OpContextUPtr PrimaryLogPG::simple_opc_create(ObjectContextRef obc)
+{
+ dout(20) << __func__ << " " << obc->obs.oi.soid << dendl;
+ ceph_tid_t rep_tid = osd->get_tid();
+ osd_reqid_t reqid(osd->get_cluster_msgr_name(), 0, rep_tid);
+ OpContextUPtr ctx(new OpContext(OpRequestRef(), reqid, nullptr, obc, this));
+ ctx->op_t.reset(new PGTransaction());
+ ctx->mtime = ceph_clock_now();
+ return ctx;
+}
+
+void PrimaryLogPG::simple_opc_submit(OpContextUPtr ctx)
+{
+ RepGather *repop = new_repop(ctx.get(), ctx->obc, ctx->reqid.tid);
+ dout(20) << __func__ << " " << repop << dendl;
+ issue_repop(repop, ctx.get());
+ eval_repop(repop);
+ recovery_state.update_trim_to();
+ repop->put();
+}
+
+
+void PrimaryLogPG::submit_log_entries(
+ const mempool::osd_pglog::list<pg_log_entry_t> &entries,
+ ObcLockManager &&manager,
+ std::optional<std::function<void(void)> > &&_on_complete,
+ OpRequestRef op,
+ int r)
+{
+ dout(10) << __func__ << " " << entries << dendl;
+ ceph_assert(is_primary());
+
+ eversion_t version;
+ if (!entries.empty()) {
+ ceph_assert(entries.rbegin()->version >= projected_last_update);
+ version = projected_last_update = entries.rbegin()->version;
+ }
+
+ boost::intrusive_ptr<RepGather> repop;
+ std::optional<std::function<void(void)> > on_complete;
+ if (get_osdmap()->require_osd_release >= ceph_release_t::jewel) {
+ repop = new_repop(
+ version,
+ r,
+ std::move(manager),
+ std::move(op),
+ std::move(_on_complete));
+ } else {
+ on_complete = std::move(_on_complete);
+ }
+
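+  // after previously submitted writes have been ordered, persist the new log
+  // entries locally and broadcast them to the acting/recovery/backfill peers;
+  // completion is tracked per-shard in log_entry_update_waiting_on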
+ pgbackend->call_write_ordered(
+ [this, entries, repop, on_complete]() {
+ ObjectStore::Transaction t;
+ eversion_t old_last_update = info.last_update;
+ recovery_state.merge_new_log_entries(
+ entries, t, recovery_state.get_pg_trim_to(),
+ recovery_state.get_min_last_complete_ondisk());
+
+ set<pg_shard_t> waiting_on;
+ for (set<pg_shard_t>::const_iterator i = get_acting_recovery_backfill().begin();
+ i != get_acting_recovery_backfill().end();
+ ++i) {
+ pg_shard_t peer(*i);
+ if (peer == pg_whoami) continue;
+ ceph_assert(recovery_state.get_peer_missing().count(peer));
+ ceph_assert(recovery_state.has_peer_info(peer));
+ if (get_osdmap()->require_osd_release >= ceph_release_t::jewel) {
+ ceph_assert(repop);
+ MOSDPGUpdateLogMissing *m = new MOSDPGUpdateLogMissing(
+ entries,
+ spg_t(info.pgid.pgid, i->shard),
+ pg_whoami.shard,
+ get_osdmap_epoch(),
+ get_last_peering_reset(),
+ repop->rep_tid,
+ recovery_state.get_pg_trim_to(),
+ recovery_state.get_min_last_complete_ondisk());
+ osd->send_message_osd_cluster(
+ peer.osd, m, get_osdmap_epoch());
+ waiting_on.insert(peer);
+ } else {
+ MOSDPGLog *m = new MOSDPGLog(
+ peer.shard, pg_whoami.shard,
+ info.last_update.epoch,
+ info, get_last_peering_reset());
+ m->log.log = entries;
+ m->log.tail = old_last_update;
+ m->log.head = info.last_update;
+ osd->send_message_osd_cluster(
+ peer.osd, m, get_osdmap_epoch());
+ }
+ }
+ ceph_tid_t rep_tid = repop->rep_tid;
+ waiting_on.insert(pg_whoami);
+ log_entry_update_waiting_on.insert(
+ make_pair(
+ rep_tid,
+ LogUpdateCtx{std::move(repop), std::move(waiting_on)}
+ ));
+ struct OnComplete : public Context {
+ PrimaryLogPGRef pg;
+ ceph_tid_t rep_tid;
+ epoch_t epoch;
+ OnComplete(
+ PrimaryLogPGRef pg,
+ ceph_tid_t rep_tid,
+ epoch_t epoch)
+ : pg(pg), rep_tid(rep_tid), epoch(epoch) {}
+ void finish(int) override {
+ std::scoped_lock l{*pg};
+ if (!pg->pg_has_reset_since(epoch)) {
+ auto it = pg->log_entry_update_waiting_on.find(rep_tid);
+ ceph_assert(it != pg->log_entry_update_waiting_on.end());
+ auto it2 = it->second.waiting_on.find(pg->pg_whoami);
+ ceph_assert(it2 != it->second.waiting_on.end());
+ it->second.waiting_on.erase(it2);
+ if (it->second.waiting_on.empty()) {
+ pg->repop_all_committed(it->second.repop.get());
+ pg->log_entry_update_waiting_on.erase(it);
+ }
+ }
+ }
+ };
+ t.register_on_commit(
+ new OnComplete{this, rep_tid, get_osdmap_epoch()});
+ int r = osd->store->queue_transaction(ch, std::move(t), NULL);
+ ceph_assert(r == 0);
+ op_applied(info.last_update);
+ });
+
+ recovery_state.update_trim_to();
+}
+
+void PrimaryLogPG::cancel_log_updates()
+{
+ // get rid of all the LogUpdateCtx so their references to repops are
+ // dropped
+ log_entry_update_waiting_on.clear();
+}
+
+// -------------------------------------------------------
+
+void PrimaryLogPG::get_watchers(list<obj_watch_item_t> *ls)
+{
+ std::scoped_lock l{*this};
+ pair<hobject_t, ObjectContextRef> i;
+ while (object_contexts.get_next(i.first, &i)) {
+ ObjectContextRef obc(i.second);
+ get_obc_watchers(obc, *ls);
+ }
+}
+
+void PrimaryLogPG::get_obc_watchers(ObjectContextRef obc, list<obj_watch_item_t> &pg_watchers)
+{
+ for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator j =
+ obc->watchers.begin();
+ j != obc->watchers.end();
+ ++j) {
+ obj_watch_item_t owi;
+
+ owi.obj = obc->obs.oi.soid;
+ owi.wi.addr = j->second->get_peer_addr();
+ owi.wi.name = j->second->get_entity();
+ owi.wi.cookie = j->second->get_cookie();
+ owi.wi.timeout_seconds = j->second->get_timeout();
+
+ dout(30) << "watch: Found oid=" << owi.obj << " addr=" << owi.wi.addr
+ << " name=" << owi.wi.name << " cookie=" << owi.wi.cookie << dendl;
+
+ pg_watchers.push_back(owi);
+ }
+}
+
+void PrimaryLogPG::check_blocklisted_watchers()
+{
+ dout(20) << "PrimaryLogPG::check_blocklisted_watchers for pg " << get_pgid() << dendl;
+ pair<hobject_t, ObjectContextRef> i;
+ while (object_contexts.get_next(i.first, &i))
+ check_blocklisted_obc_watchers(i.second);
+}
+
+void PrimaryLogPG::check_blocklisted_obc_watchers(ObjectContextRef obc)
+{
+ dout(20) << "PrimaryLogPG::check_blocklisted_obc_watchers for obc " << obc->obs.oi.soid << dendl;
+ for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator k =
+ obc->watchers.begin();
+ k != obc->watchers.end();
+ ) {
+ // Advance the iterator now so handle_watch_timeout() can erase the element
+ map<pair<uint64_t, entity_name_t>, WatchRef>::iterator j = k++;
+ dout(30) << "watch: Found " << j->second->get_entity() << " cookie " << j->second->get_cookie() << dendl;
+ entity_addr_t ea = j->second->get_peer_addr();
+ dout(30) << "watch: Check entity_addr_t " << ea << dendl;
+ if (get_osdmap()->is_blocklisted(ea)) {
+ dout(10) << "watch: Found blocklisted watcher for " << ea << dendl;
+ ceph_assert(j->second->get_pg() == this);
+ j->second->unregister_cb();
+ handle_watch_timeout(j->second);
+ }
+ }
+}
+
+void PrimaryLogPG::populate_obc_watchers(ObjectContextRef obc)
+{
+ ceph_assert(is_primary() && is_active());
+ auto it_objects = recovery_state.get_pg_log().get_log().objects.find(obc->obs.oi.soid);
+ ceph_assert((recovering.count(obc->obs.oi.soid) ||
+ !is_missing_object(obc->obs.oi.soid)) ||
+ (it_objects != recovery_state.get_pg_log().get_log().objects.end() && // or this is a revert... see recover_primary()
+ it_objects->second->op ==
+ pg_log_entry_t::LOST_REVERT &&
+ it_objects->second->reverting_to ==
+ obc->obs.oi.version));
+
+ dout(10) << "populate_obc_watchers " << obc->obs.oi.soid << dendl;
+ ceph_assert(obc->watchers.empty());
+ // populate unconnected_watchers
+ for (map<pair<uint64_t, entity_name_t>, watch_info_t>::iterator p =
+ obc->obs.oi.watchers.begin();
+ p != obc->obs.oi.watchers.end();
+ ++p) {
+ utime_t expire = info.stats.last_became_active;
+ expire += p->second.timeout_seconds;
+ dout(10) << " unconnected watcher " << p->first << " will expire " << expire << dendl;
+ WatchRef watch(
+ Watch::makeWatchRef(
+ this, osd, obc, p->second.timeout_seconds, p->first.first,
+ p->first.second, p->second.addr));
+ watch->disconnect();
+ obc->watchers.insert(
+ make_pair(
+ make_pair(p->first.first, p->first.second),
+ watch));
+ }
+ // Look for watchers from blocklisted clients and drop
+ check_blocklisted_obc_watchers(obc);
+}
+
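+// A watch has timed out. If the object is degraded/backfilling or a scrub
+// blocks writes, the timeout is re-queued; otherwise the watcher is removed
+// from the object_info and the change is persisted via a simple MODIFY repop.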
+void PrimaryLogPG::handle_watch_timeout(WatchRef watch)
+{
+ ObjectContextRef obc = watch->get_obc(); // handle_watch_timeout owns this ref
+ dout(10) << "handle_watch_timeout obc " << obc << dendl;
+
+ if (!is_active()) {
+ dout(10) << "handle_watch_timeout not active, no-op" << dendl;
+ return;
+ }
+ if (!obc->obs.exists) {
+ dout(10) << __func__ << " object " << obc->obs.oi.soid << " dne" << dendl;
+ return;
+ }
+ if (is_degraded_or_backfilling_object(obc->obs.oi.soid)) {
+ callbacks_for_degraded_object[obc->obs.oi.soid].push_back(
+ watch->get_delayed_cb()
+ );
+ dout(10) << "handle_watch_timeout waiting for degraded on obj "
+ << obc->obs.oi.soid
+ << dendl;
+ return;
+ }
+
+ if (m_scrubber->write_blocked_by_scrub(obc->obs.oi.soid)) {
+ dout(10) << "handle_watch_timeout waiting for scrub on obj "
+ << obc->obs.oi.soid
+ << dendl;
+ m_scrubber->add_callback(
+ watch->get_delayed_cb() // retry handle_watch_timeout() once the scrub no longer blocks this object
+ );
+ return;
+ }
+
+ OpContextUPtr ctx = simple_opc_create(obc);
+ ctx->at_version = get_next_version();
+
+ object_info_t& oi = ctx->new_obs.oi;
+ oi.watchers.erase(make_pair(watch->get_cookie(),
+ watch->get_entity()));
+
+ list<watch_disconnect_t> watch_disconnects = {
+ watch_disconnect_t(watch->get_cookie(), watch->get_entity(), true)
+ };
+ ctx->register_on_success(
+ [this, obc, watch_disconnects]() {
+ complete_disconnect_watches(obc, watch_disconnects);
+ });
+
+
+ PGTransaction *t = ctx->op_t.get();
+ ctx->log.push_back(pg_log_entry_t(pg_log_entry_t::MODIFY, obc->obs.oi.soid,
+ ctx->at_version,
+ oi.version,
+ 0,
+ osd_reqid_t(), ctx->mtime, 0));
+
+ oi.prior_version = obc->obs.oi.version;
+ oi.version = ctx->at_version;
+ bufferlist bl;
+ encode(oi, bl, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
+ t->setattr(obc->obs.oi.soid, OI_ATTR, bl);
+
+ // apply new object state.
+ ctx->obc->obs = ctx->new_obs;
+
+ // no ctx->delta_stats
+ simple_opc_submit(std::move(ctx));
+}
+
+ObjectContextRef PrimaryLogPG::create_object_context(const object_info_t& oi,
+ SnapSetContext *ssc)
+{
+ ObjectContextRef obc(object_contexts.lookup_or_create(oi.soid));
+ ceph_assert(obc->destructor_callback == NULL);
+ obc->destructor_callback = new C_PG_ObjectContext(this, obc.get());
+ obc->obs.oi = oi;
+ obc->obs.exists = false;
+ obc->ssc = ssc;
+ if (ssc)
+ register_snapset_context(ssc);
+ dout(10) << "create_object_context " << (void*)obc.get() << " " << oi.soid << " " << dendl;
+ if (is_active())
+ populate_obc_watchers(obc);
+ return obc;
+}
+
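+// Look up (or, if can_create, construct) the ObjectContext for soid: hit the
+// obc cache first, otherwise rebuild it from OI_ATTR (either the supplied
+// attrs or the on-disk xattr), attach a SnapSetContext, and on erasure-coded
+// pools populate the attr cache.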
+ObjectContextRef PrimaryLogPG::get_object_context(
+ const hobject_t& soid,
+ bool can_create,
+ const map<string, bufferlist> *attrs)
+{
+ auto it_objects = recovery_state.get_pg_log().get_log().objects.find(soid);
+ ceph_assert(
+ attrs || !recovery_state.get_pg_log().get_missing().is_missing(soid) ||
+ // or this is a revert... see recover_primary()
+ (it_objects != recovery_state.get_pg_log().get_log().objects.end() &&
+ it_objects->second->op ==
+ pg_log_entry_t::LOST_REVERT));
+ ObjectContextRef obc = object_contexts.lookup(soid);
+ osd->logger->inc(l_osd_object_ctx_cache_total);
+ if (obc) {
+ osd->logger->inc(l_osd_object_ctx_cache_hit);
+ dout(10) << __func__ << ": found obc in cache: " << obc
+ << dendl;
+ } else {
+ dout(10) << __func__ << ": obc NOT found in cache: " << soid << dendl;
+ // check disk
+ bufferlist bv;
+ if (attrs) {
+ auto it_oi = attrs->find(OI_ATTR);
+ ceph_assert(it_oi != attrs->end());
+ bv = it_oi->second;
+ } else {
+ int r = pgbackend->objects_get_attr(soid, OI_ATTR, &bv);
+ if (r < 0) {
+ if (!can_create) {
+ dout(10) << __func__ << ": no obc for soid "
+ << soid << " and !can_create"
+ << dendl;
+ return ObjectContextRef(); // -ENOENT!
+ }
+
+ dout(10) << __func__ << ": no obc for soid "
+ << soid << " but can_create"
+ << dendl;
+ // new object.
+ object_info_t oi(soid);
+ SnapSetContext *ssc = get_snapset_context(
+ soid, true, 0, false);
+ ceph_assert(ssc);
+ obc = create_object_context(oi, ssc);
+ dout(10) << __func__ << ": " << obc << " " << soid
+ << " " << obc->rwstate
+ << " oi: " << obc->obs.oi
+ << " ssc: " << obc->ssc
+ << " snapset: " << obc->ssc->snapset << dendl;
+ return obc;
+ }
+ }
+
+ object_info_t oi;
+ try {
+ bufferlist::const_iterator bliter = bv.begin();
+ decode(oi, bliter);
+ } catch (...) {
+ dout(0) << __func__ << ": obc corrupt: " << soid << dendl;
+ return ObjectContextRef(); // -ENOENT!
+ }
+
+ ceph_assert(oi.soid.pool == (int64_t)info.pgid.pool());
+
+ obc = object_contexts.lookup_or_create(oi.soid);
+ obc->destructor_callback = new C_PG_ObjectContext(this, obc.get());
+ obc->obs.oi = oi;
+ obc->obs.exists = true;
+
+ obc->ssc = get_snapset_context(
+ soid, true,
+ soid.has_snapset() ? attrs : 0);
+
+ if (is_primary() && is_active())
+ populate_obc_watchers(obc);
+
+ if (pool.info.is_erasure()) {
+ if (attrs) {
+ obc->attr_cache = *attrs;
+ } else {
+ int r = pgbackend->objects_get_attrs(
+ soid,
+ &obc->attr_cache);
+ ceph_assert(r == 0);
+ }
+ }
+
+ dout(10) << __func__ << ": creating obc from disk: " << obc
+ << dendl;
+ }
+
+ // XXX: Caller doesn't expect this
+ if (obc->ssc == NULL) {
+ derr << __func__ << ": obc->ssc not available, not returning context" << dendl;
+ return ObjectContextRef(); // -ENOENT!
+ }
+
+ dout(10) << __func__ << ": " << obc << " " << soid
+ << " " << obc->rwstate
+ << " oi: " << obc->obs.oi
+ << " exists: " << (int)obc->obs.exists
+ << " ssc: " << obc->ssc
+ << " snapset: " << obc->ssc->snapset << dendl;
+ return obc;
+}
+
+void PrimaryLogPG::context_registry_on_change()
+{
+ pair<hobject_t, ObjectContextRef> i;
+ while (object_contexts.get_next(i.first, &i)) {
+ ObjectContextRef obc(i.second);
+ if (obc) {
+ for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator j =
+ obc->watchers.begin();
+ j != obc->watchers.end();
+ obc->watchers.erase(j++)) {
+ j->second->discard();
+ }
+ }
+ }
+}
+
+
+/*
+ * If we return an error, and set *pmissing, then promoting that
+ * object may help.
+ *
+ * If we return -EAGAIN, we will always set *pmissing to the missing
+ * object to wait for.
+ *
+ * If we return an error but do not set *pmissing, then we know the
+ * object does not exist.
+ */
+int PrimaryLogPG::find_object_context(const hobject_t& oid,
+ ObjectContextRef *pobc,
+ bool can_create,
+ bool map_snapid_to_clone,
+ hobject_t *pmissing)
+{
+ FUNCTRACE(cct);
+ ceph_assert(oid.pool == static_cast<int64_t>(info.pgid.pool()));
+ // want the head?
+ if (oid.snap == CEPH_NOSNAP) {
+ ObjectContextRef obc = get_object_context(oid, can_create);
+ if (!obc) {
+ if (pmissing)
+ *pmissing = oid;
+ return -ENOENT;
+ }
+ dout(10) << __func__ << " " << oid
+ << " @" << oid.snap
+ << " oi=" << obc->obs.oi
+ << dendl;
+ *pobc = obc;
+
+ return 0;
+ }
+
+ // we want a snap
+
+ hobject_t head = oid.get_head();
+ SnapSetContext *ssc = get_snapset_context(oid, can_create);
+ if (!ssc || !(ssc->exists || can_create)) {
+ dout(20) << __func__ << " " << oid << " no snapset" << dendl;
+ if (pmissing)
+ *pmissing = head; // start by getting the head
+ if (ssc)
+ put_snapset_context(ssc);
+ return -ENOENT;
+ }
+
+ if (map_snapid_to_clone) {
+ dout(10) << __func__ << " " << oid << " @" << oid.snap
+ << " snapset " << ssc->snapset
+ << " map_snapid_to_clone=true" << dendl;
+ if (oid.snap > ssc->snapset.seq) {
+ // already must be readable
+ ObjectContextRef obc = get_object_context(head, false);
+ dout(10) << __func__ << " " << oid << " @" << oid.snap
+ << " snapset " << ssc->snapset
+ << " maps to head" << dendl;
+ *pobc = obc;
+ put_snapset_context(ssc);
+ return (obc && obc->obs.exists) ? 0 : -ENOENT;
+ } else {
+ vector<snapid_t>::const_iterator citer = std::find(
+ ssc->snapset.clones.begin(),
+ ssc->snapset.clones.end(),
+ oid.snap);
+ if (citer == ssc->snapset.clones.end()) {
+ dout(10) << __func__ << " " << oid << " @" << oid.snap
+ << " snapset " << ssc->snapset
+ << " maps to nothing" << dendl;
+ put_snapset_context(ssc);
+ return -ENOENT;
+ }
+
+ dout(10) << __func__ << " " << oid << " @" << oid.snap
+ << " snapset " << ssc->snapset
+ << " maps to " << oid << dendl;
+
+ if (recovery_state.get_pg_log().get_missing().is_missing(oid)) {
+ dout(10) << __func__ << " " << oid << " @" << oid.snap
+ << " snapset " << ssc->snapset
+ << " " << oid << " is missing" << dendl;
+ if (pmissing)
+ *pmissing = oid;
+ put_snapset_context(ssc);
+ return -EAGAIN;
+ }
+
+ ObjectContextRef obc = get_object_context(oid, false);
+ if (!obc || !obc->obs.exists) {
+ dout(10) << __func__ << " " << oid << " @" << oid.snap
+ << " snapset " << ssc->snapset
+ << " " << oid << " is not present" << dendl;
+ if (pmissing)
+ *pmissing = oid;
+ put_snapset_context(ssc);
+ return -ENOENT;
+ }
+ dout(10) << __func__ << " " << oid << " @" << oid.snap
+ << " snapset " << ssc->snapset
+ << " " << oid << " HIT" << dendl;
+ *pobc = obc;
+ put_snapset_context(ssc);
+ return 0;
+ }
+ ceph_abort(); //unreachable
+ }
+
+ dout(10) << __func__ << " " << oid << " @" << oid.snap
+ << " snapset " << ssc->snapset << dendl;
+
+ // head?
+ if (oid.snap > ssc->snapset.seq) {
+ ObjectContextRef obc = get_object_context(head, false);
+ dout(10) << __func__ << " " << head
+ << " want " << oid.snap << " > snapset seq " << ssc->snapset.seq
+ << " -- HIT " << obc->obs
+ << dendl;
+ if (!obc->ssc)
+ obc->ssc = ssc;
+ else {
+ ceph_assert(ssc == obc->ssc);
+ put_snapset_context(ssc);
+ }
+ *pobc = obc;
+ return 0;
+ }
+
+ // which clone would it be?
+ unsigned k = 0;
+ while (k < ssc->snapset.clones.size() &&
+ ssc->snapset.clones[k] < oid.snap)
+ k++;
+ if (k == ssc->snapset.clones.size()) {
+ dout(10) << __func__ << " no clones with last >= oid.snap "
+ << oid.snap << " -- DNE" << dendl;
+ put_snapset_context(ssc);
+ return -ENOENT;
+ }
+ hobject_t soid(oid.oid, oid.get_key(), ssc->snapset.clones[k], oid.get_hash(),
+ info.pgid.pool(), oid.get_namespace());
+
+ if (recovery_state.get_pg_log().get_missing().is_missing(soid)) {
+ dout(20) << __func__ << " " << soid << " missing, try again later"
+ << dendl;
+ if (pmissing)
+ *pmissing = soid;
+ put_snapset_context(ssc);
+ return -EAGAIN;
+ }
+
+ ObjectContextRef obc = get_object_context(soid, false);
+ if (!obc || !obc->obs.exists) {
+ if (pmissing)
+ *pmissing = soid;
+ put_snapset_context(ssc);
+ if (is_primary()) {
+ if (is_degraded_or_backfilling_object(soid)) {
+ dout(20) << __func__ << " clone is degraded or backfilling " << soid << dendl;
+ return -EAGAIN;
+ } else if (is_degraded_on_async_recovery_target(soid)) {
+ dout(20) << __func__ << " clone is recovering " << soid << dendl;
+ return -EAGAIN;
+ } else {
+ dout(20) << __func__ << " missing clone " << soid << dendl;
+ return -ENOENT;
+ }
+ } else {
+ dout(20) << __func__ << " replica missing clone" << soid << dendl;
+ return -ENOENT;
+ }
+ }
+
+ if (!obc->ssc) {
+ obc->ssc = ssc;
+ } else {
+ ceph_assert(obc->ssc == ssc);
+ put_snapset_context(ssc);
+ }
+ ssc = 0;
+
+ // clone
+ dout(20) << __func__ << " " << soid
+ << " snapset " << obc->ssc->snapset
+ << dendl;
+ snapid_t first, last;
+ auto p = obc->ssc->snapset.clone_snaps.find(soid.snap);
+ ceph_assert(p != obc->ssc->snapset.clone_snaps.end());
+ if (p->second.empty()) {
+ dout(1) << __func__ << " " << soid << " empty snapset -- DNE" << dendl;
+ ceph_assert(!cct->_conf->osd_debug_verify_snaps);
+ return -ENOENT;
+ }
+ if (std::find(p->second.begin(), p->second.end(), oid.snap) ==
+ p->second.end()) {
+ dout(20) << __func__ << " " << soid << " clone_snaps " << p->second
+ << " does not contain " << oid.snap << " -- DNE" << dendl;
+ return -ENOENT;
+ }
+ if (get_osdmap()->in_removed_snaps_queue(info.pgid.pgid.pool(), oid.snap)) {
+ dout(20) << __func__ << " " << soid << " snap " << oid.snap
+ << " in removed_snaps_queue" << " -- DNE" << dendl;
+ return -ENOENT;
+ }
+ dout(20) << __func__ << " " << soid << " clone_snaps " << p->second
+ << " contains " << oid.snap << " -- HIT " << obc->obs << dendl;
+ *pobc = obc;
+ return 0;
+}
+
+void PrimaryLogPG::object_context_destructor_callback(ObjectContext *obc)
+{
+ if (obc->ssc)
+ put_snapset_context(obc->ssc);
+}
+
+void PrimaryLogPG::add_object_context_to_pg_stat(ObjectContextRef obc, pg_stat_t *pgstat)
+{
+ object_info_t& oi = obc->obs.oi;
+
+ dout(10) << __func__ << " " << oi.soid << dendl;
+ ceph_assert(!oi.soid.is_snapdir());
+
+ object_stat_sum_t stat;
+ stat.num_objects++;
+ if (oi.is_dirty())
+ stat.num_objects_dirty++;
+ if (oi.is_whiteout())
+ stat.num_whiteouts++;
+ if (oi.is_omap())
+ stat.num_objects_omap++;
+ if (oi.is_cache_pinned())
+ stat.num_objects_pinned++;
+ if (oi.has_manifest())
+ stat.num_objects_manifest++;
+
+ if (oi.soid.is_snap()) {
+ stat.num_object_clones++;
+
+ if (!obc->ssc)
+ obc->ssc = get_snapset_context(oi.soid, false);
+ ceph_assert(obc->ssc);
+ stat.num_bytes += obc->ssc->snapset.get_clone_bytes(oi.soid.snap);
+ } else {
+ stat.num_bytes += oi.size;
+ }
+
+ // add it in
+ pgstat->stats.sum.add(stat);
+}
+
+void PrimaryLogPG::kick_object_context_blocked(ObjectContextRef obc)
+{
+ const hobject_t& soid = obc->obs.oi.soid;
+ if (obc->is_blocked()) {
+ dout(10) << __func__ << " " << soid << " still blocked" << dendl;
+ return;
+ }
+
+ map<hobject_t, list<OpRequestRef>>::iterator p = waiting_for_blocked_object.find(soid);
+ if (p != waiting_for_blocked_object.end()) {
+ list<OpRequestRef>& ls = p->second;
+ dout(10) << __func__ << " " << soid << " requeuing " << ls.size() << " requests" << dendl;
+ requeue_ops(ls);
+ waiting_for_blocked_object.erase(p);
+ }
+
+ map<hobject_t, ObjectContextRef>::iterator i =
+ objects_blocked_on_snap_promotion.find(obc->obs.oi.soid.get_head());
+ if (i != objects_blocked_on_snap_promotion.end()) {
+ ceph_assert(i->second == obc);
+ objects_blocked_on_snap_promotion.erase(i);
+ }
+
+ if (obc->requeue_scrub_on_unblock) {
+
+ obc->requeue_scrub_on_unblock = false;
+
+ dout(20) << __func__ << " requeuing if still active: " << (is_active() ? "yes" : "no") << dendl;
+
+ // only requeue if we are still active: we may be unblocking
+ // because we are resetting for a new peering interval
+ if (is_active()) {
+ osd->queue_scrub_unblocking(this, is_scrub_blocking_ops());
+ }
+ }
+}
+
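+// Return the SnapSetContext for oid's head, taking a reference. Uses the
+// snapset_contexts cache when possible, otherwise decodes SS_ATTR from the
+// supplied attrs or from disk. Callers must release it with put_snapset_context().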
+SnapSetContext *PrimaryLogPG::get_snapset_context(
+ const hobject_t& oid,
+ bool can_create,
+ const map<string, bufferlist> *attrs,
+ bool oid_existed)
+{
+ std::lock_guard l(snapset_contexts_lock);
+ SnapSetContext *ssc;
+ map<hobject_t, SnapSetContext*>::iterator p = snapset_contexts.find(
+ oid.get_snapdir());
+ if (p != snapset_contexts.end()) {
+ if (can_create || p->second->exists) {
+ ssc = p->second;
+ } else {
+ return NULL;
+ }
+ } else {
+ bufferlist bv;
+ if (!attrs) {
+ int r = -ENOENT;
+ if (!(oid.is_head() && !oid_existed)) {
+ r = pgbackend->objects_get_attr(oid.get_head(), SS_ATTR, &bv);
+ }
+ if (r < 0 && !can_create)
+ return NULL;
+ } else {
+ auto it_ss = attrs->find(SS_ATTR);
+ ceph_assert(it_ss != attrs->end());
+ bv = it_ss->second;
+ }
+ ssc = new SnapSetContext(oid.get_snapdir());
+ _register_snapset_context(ssc);
+ if (bv.length()) {
+ bufferlist::const_iterator bvp = bv.begin();
+ try {
+ ssc->snapset.decode(bvp);
+ } catch (const ceph::buffer::error& e) {
+ dout(0) << __func__ << " Can't decode snapset: " << e.what() << dendl;
+ return NULL;
+ }
+ ssc->exists = true;
+ } else {
+ ssc->exists = false;
+ }
+ }
+ ceph_assert(ssc);
+ ssc->ref++;
+ return ssc;
+}
+
+void PrimaryLogPG::put_snapset_context(SnapSetContext *ssc)
+{
+ std::lock_guard l(snapset_contexts_lock);
+ --ssc->ref;
+ if (ssc->ref == 0) {
+ if (ssc->registered)
+ snapset_contexts.erase(ssc->oid);
+ delete ssc;
+ }
+}
+
+/*
+ * Return values:
+ * NONE - didn't pull anything
+ * YES - pulled what the caller wanted
+ * HEAD - needed to pull head first
+ */
+enum { PULL_NONE, PULL_HEAD, PULL_YES };
+
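+// Recover a single object missing on the primary. Deleted objects are
+// removed locally (and on any replica that still has them); snapped objects
+// require the head to be recovered first (PULL_HEAD); otherwise the backend
+// is asked to pull the object (PULL_YES).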
+int PrimaryLogPG::recover_missing(
+ const hobject_t &soid, eversion_t v,
+ int priority,
+ PGBackend::RecoveryHandle *h)
+{
+ if (recovery_state.get_missing_loc().is_unfound(soid)) {
+ dout(7) << __func__ << " " << soid
+ << " v " << v
+ << " but it is unfound" << dendl;
+ return PULL_NONE;
+ }
+
+ if (recovery_state.get_missing_loc().is_deleted(soid)) {
+ start_recovery_op(soid);
+ ceph_assert(!recovering.count(soid));
+ recovering.insert(make_pair(soid, ObjectContextRef()));
+ epoch_t cur_epoch = get_osdmap_epoch();
+ remove_missing_object(soid, v, new LambdaContext(
+ [=](int) {
+ std::scoped_lock locker{*this};
+ if (!pg_has_reset_since(cur_epoch)) {
+ bool object_missing = false;
+ for (const auto& shard : get_acting_recovery_backfill()) {
+ if (shard == pg_whoami)
+ continue;
+ if (recovery_state.get_peer_missing(shard).is_missing(soid)) {
+ dout(20) << __func__ << ": soid " << soid << " needs to be deleted from replica " << shard << dendl;
+ object_missing = true;
+ break;
+ }
+ }
+ if (!object_missing) {
+ object_stat_sum_t stat_diff;
+ stat_diff.num_objects_recovered = 1;
+ if (scrub_after_recovery)
+ stat_diff.num_objects_repaired = 1;
+ on_global_recover(soid, stat_diff, true);
+ } else {
+ auto recovery_handle = pgbackend->open_recovery_op();
+ pgbackend->recover_delete_object(soid, v, recovery_handle);
+ pgbackend->run_recovery_op(recovery_handle, priority);
+ }
+ }
+ }));
+ return PULL_YES;
+ }
+
+ // is this a snapped object? if so, consult the snapset.. we may not need the entire object!
+ ObjectContextRef obc;
+ ObjectContextRef head_obc;
+ if (soid.snap && soid.snap < CEPH_NOSNAP) {
+ // do we have the head?
+ hobject_t head = soid.get_head();
+ if (recovery_state.get_pg_log().get_missing().is_missing(head)) {
+ if (recovering.count(head)) {
+ dout(10) << " missing but already recovering head " << head << dendl;
+ return PULL_NONE;
+ } else {
+ int r = recover_missing(
+ head, recovery_state.get_pg_log().get_missing().get_items().find(head)->second.need, priority,
+ h);
+ if (r != PULL_NONE)
+ return PULL_HEAD;
+ return PULL_NONE;
+ }
+ }
+ head_obc = get_object_context(
+ head,
+ false,
+ 0);
+ ceph_assert(head_obc);
+ }
+ start_recovery_op(soid);
+ ceph_assert(!recovering.count(soid));
+ recovering.insert(make_pair(soid, obc));
+ int r = pgbackend->recover_object(
+ soid,
+ v,
+ head_obc,
+ obc,
+ h);
+ // This is only a pull which shouldn't return an error
+ ceph_assert(r >= 0);
+ return PULL_YES;
+}
+
+void PrimaryLogPG::remove_missing_object(const hobject_t &soid,
+ eversion_t v, Context *on_complete)
+{
+ dout(20) << __func__ << " " << soid << " " << v << dendl;
+ ceph_assert(on_complete != nullptr);
+ // delete locally
+ ObjectStore::Transaction t;
+ remove_snap_mapped_object(t, soid);
+
+ ObjectRecoveryInfo recovery_info;
+ recovery_info.soid = soid;
+ recovery_info.version = v;
+
+ epoch_t cur_epoch = get_osdmap_epoch();
+ t.register_on_complete(new LambdaContext(
+ [=](int) {
+ std::unique_lock locker{*this};
+ if (!pg_has_reset_since(cur_epoch)) {
+ ObjectStore::Transaction t2;
+ on_local_recover(soid, recovery_info, ObjectContextRef(), true, &t2);
+ t2.register_on_complete(on_complete);
+ int r = osd->store->queue_transaction(ch, std::move(t2), nullptr);
+ ceph_assert(r == 0);
+ locker.unlock();
+ } else {
+ locker.unlock();
+ on_complete->complete(-EAGAIN);
+ }
+ }));
+ int r = osd->store->queue_transaction(ch, std::move(t), nullptr);
+ ceph_assert(r == 0);
+}
+
+void PrimaryLogPG::finish_degraded_object(const hobject_t oid)
+{
+ dout(10) << __func__ << " " << oid << dendl;
+ if (callbacks_for_degraded_object.count(oid)) {
+ list<Context*> contexts;
+ contexts.swap(callbacks_for_degraded_object[oid]);
+ callbacks_for_degraded_object.erase(oid);
+ for (list<Context*>::iterator i = contexts.begin();
+ i != contexts.end();
+ ++i) {
+ (*i)->complete(0);
+ }
+ }
+ map<hobject_t, snapid_t>::iterator i = objects_blocked_on_degraded_snap.find(
+ oid.get_head());
+ if (i != objects_blocked_on_degraded_snap.end() &&
+ i->second == oid.snap)
+ objects_blocked_on_degraded_snap.erase(i);
+}
+
+void PrimaryLogPG::_committed_pushed_object(
+ epoch_t epoch, eversion_t last_complete)
+{
+ std::scoped_lock locker{*this};
+ if (!pg_has_reset_since(epoch)) {
+ recovery_state.recovery_committed_to(last_complete);
+ } else {
+ dout(10) << __func__
+ << " pg has changed, not touching last_complete_ondisk" << dendl;
+ }
+}
+
+void PrimaryLogPG::_applied_recovered_object(ObjectContextRef obc)
+{
+ dout(20) << __func__ << dendl;
+ if (obc) {
+ dout(20) << "obc = " << *obc << dendl;
+ }
+ ceph_assert(active_pushes >= 1);
+ --active_pushes;
+
+ // requeue an active chunky scrub waiting on recovery ops
+ if (!recovery_state.is_deleting() && active_pushes == 0 &&
+ m_scrubber->is_scrub_active()) {
+
+ osd->queue_scrub_pushes_update(this, is_scrub_blocking_ops());
+ }
+}
+
+void PrimaryLogPG::_applied_recovered_object_replica()
+{
+ dout(20) << __func__ << dendl;
+ ceph_assert(active_pushes >= 1);
+ --active_pushes;
+
+ // requeue an active scrub waiting on recovery ops
+ if (!recovery_state.is_deleting() && active_pushes == 0 &&
+ m_scrubber->is_scrub_active()) {
+
+ osd->queue_scrub_replica_pushes(this, m_scrubber->replica_op_priority());
+ }
+}
+
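+// A pull for soid failed on the given shards: drop its recovery state,
+// requeue any ops blocked on the recovery read lock, mark the object missing
+// on those shards, and record a primary error if we were one of the sources.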
+void PrimaryLogPG::on_failed_pull(
+ const set<pg_shard_t> &from,
+ const hobject_t &soid,
+ const eversion_t &v)
+{
+ dout(20) << __func__ << ": " << soid << dendl;
+ ceph_assert(recovering.count(soid));
+ auto obc = recovering[soid];
+ if (obc) {
+ list<OpRequestRef> blocked_ops;
+ obc->drop_recovery_read(&blocked_ops);
+ requeue_ops(blocked_ops);
+ }
+ recovering.erase(soid);
+ for (auto&& i : from) {
+ if (i != pg_whoami) { // we'll get it below in primary_error
+ recovery_state.force_object_missing(i, soid, v);
+ }
+ }
+
+ dout(0) << __func__ << " " << soid << " from shard " << from
+ << ", reps on " << recovery_state.get_missing_loc().get_locations(soid)
+ << " unfound? " << recovery_state.get_missing_loc().is_unfound(soid)
+ << dendl;
+ finish_recovery_op(soid); // close out this attempt,
+ finish_degraded_object(soid);
+
+ if (from.count(pg_whoami)) {
+ dout(0) << " primary missing oid " << soid << " version " << v << dendl;
+ primary_error(soid, v);
+ backfills_in_flight.erase(soid);
+ }
+}
+
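+// Walk the local and peer missing sets and return the newest version of oid
+// that some acting/backfill shard still has; used to pick the revert target
+// for LOST_REVERT in mark_all_unfound_lost().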
+eversion_t PrimaryLogPG::pick_newest_available(const hobject_t& oid)
+{
+ eversion_t v;
+ pg_missing_item pmi;
+ bool is_missing = recovery_state.get_pg_log().get_missing().is_missing(oid, &pmi);
+ ceph_assert(is_missing);
+ v = pmi.have;
+ dout(10) << "pick_newest_available " << oid << " " << v << " on osd." << osd->whoami << " (local)" << dendl;
+
+ ceph_assert(!get_acting_recovery_backfill().empty());
+ for (set<pg_shard_t>::iterator i = get_acting_recovery_backfill().begin();
+ i != get_acting_recovery_backfill().end();
+ ++i) {
+ if (*i == get_primary()) continue;
+ pg_shard_t peer = *i;
+ if (!recovery_state.get_peer_missing(peer).is_missing(oid)) {
+ continue;
+ }
+ eversion_t h = recovery_state.get_peer_missing(peer).get_items().at(oid).have;
+ dout(10) << "pick_newest_available " << oid << " " << h << " on osd." << peer << dendl;
+ if (h > v)
+ v = h;
+ }
+
+ dout(10) << "pick_newest_available " << oid << " " << v << " (newest)" << dendl;
+ return v;
+}
+
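+// Replica-side handler for MOSDPGUpdateLogMissing: apply the entries to our
+// log/missing set and, once the transaction commits, reply with
+// MOSDPGUpdateLogMissingReply carrying our last_complete_ondisk.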
+void PrimaryLogPG::do_update_log_missing(OpRequestRef &op)
+{
+ const MOSDPGUpdateLogMissing *m = static_cast<const MOSDPGUpdateLogMissing*>(
+ op->get_req());
+ ceph_assert(m->get_type() == MSG_OSD_PG_UPDATE_LOG_MISSING);
+ ObjectStore::Transaction t;
+ std::optional<eversion_t> op_trim_to, op_roll_forward_to;
+ if (m->pg_trim_to != eversion_t())
+ op_trim_to = m->pg_trim_to;
+ if (m->pg_roll_forward_to != eversion_t())
+ op_roll_forward_to = m->pg_roll_forward_to;
+
+ dout(20) << __func__
+ << " op_trim_to = " << op_trim_to << " op_roll_forward_to = " << op_roll_forward_to << dendl;
+
+ recovery_state.append_log_entries_update_missing(
+ m->entries, t, op_trim_to, op_roll_forward_to);
+ eversion_t new_lcod = info.last_complete;
+
+ Context *complete = new LambdaContext(
+ [=](int) {
+ const MOSDPGUpdateLogMissing *msg = static_cast<const MOSDPGUpdateLogMissing*>(
+ op->get_req());
+ std::scoped_lock locker{*this};
+ if (!pg_has_reset_since(msg->get_epoch())) {
+ update_last_complete_ondisk(new_lcod);
+ MOSDPGUpdateLogMissingReply *reply =
+ new MOSDPGUpdateLogMissingReply(
+ spg_t(info.pgid.pgid, primary_shard().shard),
+ pg_whoami.shard,
+ msg->get_epoch(),
+ msg->min_epoch,
+ msg->get_tid(),
+ new_lcod);
+ reply->set_priority(CEPH_MSG_PRIO_HIGH);
+ msg->get_connection()->send_message(reply);
+ }
+ });
+
+ if (get_osdmap()->require_osd_release >= ceph_release_t::kraken) {
+ t.register_on_commit(complete);
+ } else {
+ /* Hack to work around the fact that ReplicatedBackend sends
+ * ack+commit if commit happens first
+ *
+ * This behavior is no longer necessary, but we preserve it so old
+ * primaries can keep their repops in order */
+ if (pool.info.is_erasure()) {
+ t.register_on_complete(complete);
+ } else {
+ t.register_on_commit(complete);
+ }
+ }
+ int tr = osd->store->queue_transaction(
+ ch,
+ std::move(t),
+ nullptr);
+ ceph_assert(tr == 0);
+ op_applied(info.last_update);
+}
+
+void PrimaryLogPG::do_update_log_missing_reply(OpRequestRef &op)
+{
+ const MOSDPGUpdateLogMissingReply *m =
+ static_cast<const MOSDPGUpdateLogMissingReply*>(
+ op->get_req());
+ dout(20) << __func__ << " got reply from "
+ << m->get_from() << dendl;
+
+ auto it = log_entry_update_waiting_on.find(m->get_tid());
+ if (it != log_entry_update_waiting_on.end()) {
+ if (it->second.waiting_on.count(m->get_from())) {
+ it->second.waiting_on.erase(m->get_from());
+ if (m->last_complete_ondisk != eversion_t()) {
+ update_peer_last_complete_ondisk(m->get_from(), m->last_complete_ondisk);
+ }
+ } else {
+ osd->clog->error()
+ << info.pgid << " got reply "
+ << *m << " from shard we are not waiting for "
+ << m->get_from();
+ }
+
+ if (it->second.waiting_on.empty()) {
+ repop_all_committed(it->second.repop.get());
+ log_entry_update_waiting_on.erase(it);
+ }
+ } else {
+ osd->clog->error()
+ << info.pgid << " got reply "
+ << *m << " on unknown tid " << m->get_tid();
+ }
+}
+
+/* Mark all unfound objects as lost.
+ */
+void PrimaryLogPG::mark_all_unfound_lost(
+ int what,
+ std::function<void(int,const std::string&,bufferlist&)> on_finish)
+{
+ dout(3) << __func__ << " " << pg_log_entry_t::get_op_name(what) << dendl;
+ list<hobject_t> oids;
+
+ dout(30) << __func__ << ": log before:\n";
+ recovery_state.get_pg_log().get_log().print(*_dout);
+ *_dout << dendl;
+
+ mempool::osd_pglog::list<pg_log_entry_t> log_entries;
+
+ utime_t mtime = ceph_clock_now();
+ map<hobject_t, pg_missing_item>::const_iterator m =
+ recovery_state.get_missing_loc().get_needs_recovery().begin();
+ map<hobject_t, pg_missing_item>::const_iterator mend =
+ recovery_state.get_missing_loc().get_needs_recovery().end();
+
+ ObcLockManager manager;
+ eversion_t v = get_next_version();
+ v.epoch = get_osdmap_epoch();
+ uint64_t num_unfound = recovery_state.get_missing_loc().num_unfound();
+ while (m != mend) {
+ const hobject_t &oid(m->first);
+ if (!recovery_state.get_missing_loc().is_unfound(oid)) {
+ // We only care about unfound objects
+ ++m;
+ continue;
+ }
+
+ ObjectContextRef obc;
+ eversion_t prev;
+
+ switch (what) {
+ case pg_log_entry_t::LOST_MARK:
+ ceph_abort_msg("actually, not implemented yet!");
+ break;
+
+ case pg_log_entry_t::LOST_REVERT:
+ prev = pick_newest_available(oid);
+ if (prev > eversion_t()) {
+ // log it
+ pg_log_entry_t e(
+ pg_log_entry_t::LOST_REVERT, oid, v,
+ m->second.need, 0, osd_reqid_t(), mtime, 0);
+ e.reverting_to = prev;
+ e.mark_unrollbackable();
+ log_entries.push_back(e);
+ dout(10) << e << dendl;
+
+ // we are now missing the new version; recovery code will sort it out.
+ ++v.version;
+ ++m;
+ break;
+ }
+
+ case pg_log_entry_t::LOST_DELETE:
+ {
+ pg_log_entry_t e(pg_log_entry_t::LOST_DELETE, oid, v, m->second.need,
+ 0, osd_reqid_t(), mtime, 0);
+ if (get_osdmap()->require_osd_release >= ceph_release_t::jewel) {
+ if (pool.info.require_rollback()) {
+ e.mod_desc.try_rmobject(v.version);
+ } else {
+ e.mark_unrollbackable();
+ }
+ } // otherwise, just do what we used to do
+ dout(10) << e << dendl;
+ log_entries.push_back(e);
+ oids.push_back(oid);
+
+ // If a context was found, mark the object as deleted in case
+ // we race with a new creation. This can happen if the object
+ // was lost and the primary hit EIO.
+ obc = object_contexts.lookup(oid);
+ if (obc)
+ obc->obs.exists = false;
+
+ ++v.version;
+ ++m;
+ }
+ break;
+
+ default:
+ ceph_abort();
+ }
+ }
+
+ recovery_state.update_stats(
+ [](auto &history, auto &stats) {
+ stats.stats_invalid = true;
+ return false;
+ });
+
+ submit_log_entries(
+ log_entries,
+ std::move(manager),
+ std::optional<std::function<void(void)> >(
+ [this, oids, num_unfound, on_finish]() {
+ if (recovery_state.perform_deletes_during_peering()) {
+ for (auto oid : oids) {
+ // clear old locations - merge_new_log_entries will have
+ // handled rebuilding missing_loc for each of these
+ // objects if we have the RECOVERY_DELETES flag
+ recovery_state.object_recovered(oid, object_stat_sum_t());
+ }
+ }
+
+ if (is_recovery_unfound()) {
+ queue_peering_event(
+ PGPeeringEventRef(
+ std::make_shared<PGPeeringEvent>(
+ get_osdmap_epoch(),
+ get_osdmap_epoch(),
+ PeeringState::DoRecovery())));
+ } else if (is_backfill_unfound()) {
+ queue_peering_event(
+ PGPeeringEventRef(
+ std::make_shared<PGPeeringEvent>(
+ get_osdmap_epoch(),
+ get_osdmap_epoch(),
+ PeeringState::RequestBackfill())));
+ } else {
+ queue_recovery();
+ }
+
+ stringstream ss;
+ ss << "pg has " << num_unfound
+ << " objects unfound and apparently lost marking";
+ string rs = ss.str();
+ dout(0) << "do_command r=" << 0 << " " << rs << dendl;
+ osd->clog->info() << rs;
+ bufferlist empty;
+ on_finish(0, rs, empty);
+ }),
+ OpRequestRef());
+}
+
+void PrimaryLogPG::_split_into(pg_t child_pgid, PG *child, unsigned split_bits)
+{
+ ceph_assert(repop_queue.empty());
+}
+
+/*
+ * pg status change notification
+ */
+
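+// Abort every in-flight repop. With requeue=true the original client ops
+// (and any dup waiters in waiting_for_ondisk) are put back on the queue in
+// order; otherwise they are simply dropped.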
+void PrimaryLogPG::apply_and_flush_repops(bool requeue)
+{
+ list<OpRequestRef> rq;
+
+ // apply all repops
+ while (!repop_queue.empty()) {
+ RepGather *repop = repop_queue.front();
+ repop_queue.pop_front();
+ dout(10) << " canceling repop tid " << repop->rep_tid << dendl;
+ repop->rep_aborted = true;
+ repop->on_committed.clear();
+ repop->on_success.clear();
+
+ if (requeue) {
+ if (repop->op) {
+ dout(10) << " requeuing " << *repop->op->get_req() << dendl;
+ rq.push_back(repop->op);
+ repop->op = OpRequestRef();
+ }
+
+ // also requeue any dups, interleaved into position
+ auto p = waiting_for_ondisk.find(repop->v);
+ if (p != waiting_for_ondisk.end()) {
+ dout(10) << " also requeuing ondisk waiters " << p->second << dendl;
+ for (auto& i : p->second) {
+ rq.push_back(std::get<0>(i));
+ }
+ waiting_for_ondisk.erase(p);
+ }
+ }
+
+ remove_repop(repop);
+ }
+
+ ceph_assert(repop_queue.empty());
+
+ if (requeue) {
+ requeue_ops(rq);
+ if (!waiting_for_ondisk.empty()) {
+ for (auto& i : waiting_for_ondisk) {
+ for (auto& j : i.second) {
+ derr << __func__ << ": op " << *(std::get<0>(j)->get_req())
+ << " waiting on " << i.first << dendl;
+ }
+ }
+ ceph_assert(waiting_for_ondisk.empty());
+ }
+ }
+
+ waiting_for_ondisk.clear();
+}
+
+void PrimaryLogPG::on_flushed()
+{
+ requeue_ops(waiting_for_flush);
+ if (!is_peered() || !is_primary()) {
+ pair<hobject_t, ObjectContextRef> i;
+ while (object_contexts.get_next(i.first, &i)) {
+ derr << __func__ << ": object " << i.first << " obc still alive" << dendl;
+ }
+ ceph_assert(object_contexts.empty());
+ }
+}
+
+void PrimaryLogPG::on_removal(ObjectStore::Transaction &t)
+{
+ dout(10) << __func__ << dendl;
+
+ on_shutdown();
+
+ t.register_on_commit(new C_DeleteMore(this, get_osdmap_epoch()));
+}
+
+void PrimaryLogPG::clear_async_reads()
+{
+ dout(10) << __func__ << dendl;
+ for(auto& i : in_progress_async_reads) {
+ dout(10) << "clear ctx: "
+ << "OpRequestRef " << i.first
+ << " OpContext " << i.second
+ << dendl;
+ close_op_ctx(i.second);
+ }
+}
+
+void PrimaryLogPG::clear_cache()
+{
+ object_contexts.clear();
+}
+
+void PrimaryLogPG::on_shutdown()
+{
+ dout(10) << __func__ << dendl;
+
+ if (recovery_queued) {
+ recovery_queued = false;
+ osd->clear_queued_recovery(this);
+ }
+
+ m_scrubber->scrub_clear_state();
+
+ m_scrubber->unreg_next_scrub();
+
+ vector<ceph_tid_t> tids;
+ cancel_copy_ops(false, &tids);
+ cancel_flush_ops(false, &tids);
+ cancel_proxy_ops(false, &tids);
+ cancel_manifest_ops(false, &tids);
+ osd->objecter->op_cancel(tids, -ECANCELED);
+
+ apply_and_flush_repops(false);
+ cancel_log_updates();
+ // we must remove PGRefs, so do this here prior to release_backoffs() callers
+ clear_backoffs();
+ // clean up snap trim references
+ snap_trimmer_machine.process_event(Reset());
+
+ pgbackend->on_change();
+
+ context_registry_on_change();
+ object_contexts.clear();
+
+ clear_async_reads();
+
+ osd->remote_reserver.cancel_reservation(info.pgid);
+ osd->local_reserver.cancel_reservation(info.pgid);
+
+ clear_primary_state();
+ cancel_recovery();
+
+ if (is_primary()) {
+ osd->clear_ready_to_merge(this);
+ }
+}
+
+void PrimaryLogPG::on_activate_complete()
+{
+ check_local();
+ // waiters
+ if (!recovery_state.needs_flush()) {
+ requeue_ops(waiting_for_peered);
+ } else if (!waiting_for_peered.empty()) {
+ dout(10) << __func__ << " flushes in progress, moving "
+ << waiting_for_peered.size()
+ << " items to waiting_for_flush"
+ << dendl;
+ ceph_assert(waiting_for_flush.empty());
+ waiting_for_flush.swap(waiting_for_peered);
+ }
+
+
+ // all clean?
+ if (needs_recovery()) {
+ dout(10) << "activate not all replicas are up-to-date, queueing recovery" << dendl;
+ queue_peering_event(
+ PGPeeringEventRef(
+ std::make_shared<PGPeeringEvent>(
+ get_osdmap_epoch(),
+ get_osdmap_epoch(),
+ PeeringState::DoRecovery())));
+ } else if (needs_backfill()) {
+ dout(10) << "activate queueing backfill" << dendl;
+ queue_peering_event(
+ PGPeeringEventRef(
+ std::make_shared<PGPeeringEvent>(
+ get_osdmap_epoch(),
+ get_osdmap_epoch(),
+ PeeringState::RequestBackfill())));
+ } else {
+ dout(10) << "activate all replicas clean, no recovery" << dendl;
+ queue_peering_event(
+ PGPeeringEventRef(
+ std::make_shared<PGPeeringEvent>(
+ get_osdmap_epoch(),
+ get_osdmap_epoch(),
+ PeeringState::AllReplicasRecovered())));
+ }
+
+ publish_stats_to_osd();
+
+ if (get_backfill_targets().size()) {
+ last_backfill_started = recovery_state.earliest_backfill();
+ new_backfill = true;
+ ceph_assert(!last_backfill_started.is_max());
+ dout(5) << __func__ << ": bft=" << get_backfill_targets()
+ << " from " << last_backfill_started << dendl;
+ for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
+ i != get_backfill_targets().end();
+ ++i) {
+ dout(5) << "target shard " << *i
+ << " from " << recovery_state.get_peer_info(*i).last_backfill
+ << dendl;
+ }
+ }
+
+ hit_set_setup();
+ agent_setup();
+}
+
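+// Interval change: requeue or drop all waiting ops (depending on whether we
+// remain primary), cancel in-flight copy/flush/proxy/manifest operations,
+// flush repops and pending log updates, discard watch connections, and clear
+// the object context cache.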
+void PrimaryLogPG::on_change(ObjectStore::Transaction &t)
+{
+ dout(10) << __func__ << dendl;
+
+ if (hit_set && hit_set->insert_count() == 0) {
+ dout(20) << " discarding empty hit_set" << dendl;
+ hit_set_clear();
+ }
+
+ if (recovery_queued) {
+ recovery_queued = false;
+ osd->clear_queued_recovery(this);
+ }
+
+ // requeue everything in the reverse order they should be
+ // reexamined.
+ requeue_ops(waiting_for_peered);
+ requeue_ops(waiting_for_flush);
+ requeue_ops(waiting_for_active);
+ requeue_ops(waiting_for_readable);
+
+ vector<ceph_tid_t> tids;
+ cancel_copy_ops(is_primary(), &tids);
+ cancel_flush_ops(is_primary(), &tids);
+ cancel_proxy_ops(is_primary(), &tids);
+ cancel_manifest_ops(is_primary(), &tids);
+ osd->objecter->op_cancel(tids, -ECANCELED);
+
+ // requeue object waiters
+ for (auto& p : waiting_for_unreadable_object) {
+ release_backoffs(p.first);
+ }
+ if (is_primary()) {
+ requeue_object_waiters(waiting_for_unreadable_object);
+ } else {
+ waiting_for_unreadable_object.clear();
+ }
+ for (map<hobject_t,list<OpRequestRef>>::iterator p = waiting_for_degraded_object.begin();
+ p != waiting_for_degraded_object.end();
+ waiting_for_degraded_object.erase(p++)) {
+ release_backoffs(p->first);
+ if (is_primary())
+ requeue_ops(p->second);
+ else
+ p->second.clear();
+ finish_degraded_object(p->first);
+ }
+
+ // requeues waiting_for_scrub
+ m_scrubber->scrub_clear_state();
+
+ for (auto p = waiting_for_blocked_object.begin();
+ p != waiting_for_blocked_object.end();
+ waiting_for_blocked_object.erase(p++)) {
+ if (is_primary())
+ requeue_ops(p->second);
+ else
+ p->second.clear();
+ }
+ for (auto i = callbacks_for_degraded_object.begin();
+ i != callbacks_for_degraded_object.end();
+ ) {
+ finish_degraded_object((i++)->first);
+ }
+ ceph_assert(callbacks_for_degraded_object.empty());
+
+ if (is_primary()) {
+ requeue_ops(waiting_for_cache_not_full);
+ } else {
+ waiting_for_cache_not_full.clear();
+ }
+ objects_blocked_on_cache_full.clear();
+
+ for (list<pair<OpRequestRef, OpContext*> >::iterator i =
+ in_progress_async_reads.begin();
+ i != in_progress_async_reads.end();
+ in_progress_async_reads.erase(i++)) {
+ close_op_ctx(i->second);
+ if (is_primary())
+ requeue_op(i->first);
+ }
+
+ // this will requeue ops we were working on but didn't finish, and
+ // any dups
+ apply_and_flush_repops(is_primary());
+ cancel_log_updates();
+
+ // do this *after* apply_and_flush_repops so that we catch any newly
+ // registered watches.
+ context_registry_on_change();
+
+ pgbackend->on_change_cleanup(&t);
+ m_scrubber->cleanup_store(&t);
+ pgbackend->on_change();
+
+ // clear snap_trimmer state
+ snap_trimmer_machine.process_event(Reset());
+
+ debug_op_order.clear();
+ unstable_stats.clear();
+
+ // we don't want to cache object_contexts through the interval change
+ // NOTE: we actually assert that all currently live references are dead
+ // by the time the flush for the next interval completes.
+ object_contexts.clear();
+
+ // should have been cleared above by finishing all of the degraded objects
+ ceph_assert(objects_blocked_on_degraded_snap.empty());
+}
+
+void PrimaryLogPG::plpg_on_role_change()
+{
+ dout(10) << __func__ << dendl;
+ if (get_role() != 0 && hit_set) {
+ dout(10) << " clearing hit set" << dendl;
+ hit_set_clear();
+ }
+}
+
+void PrimaryLogPG::plpg_on_pool_change()
+{
+ dout(10) << __func__ << dendl;
+ // requeue cache full waiters just in case the cache_mode is
+ // changing away from writeback mode. note that if we are not
+ // active the normal requeuing machinery is sufficient (and properly
+ // ordered).
+ if (is_active() &&
+ pool.info.cache_mode != pg_pool_t::CACHEMODE_WRITEBACK &&
+ !waiting_for_cache_not_full.empty()) {
+ dout(10) << __func__ << " requeuing full waiters (not in writeback) "
+ << dendl;
+ requeue_ops(waiting_for_cache_not_full);
+ objects_blocked_on_cache_full.clear();
+ }
+ hit_set_setup();
+ agent_setup();
+}
+
+// clear state. called on recovery completion AND cancellation.
+void PrimaryLogPG::_clear_recovery_state()
+{
+#ifdef DEBUG_RECOVERY_OIDS
+ recovering_oids.clear();
+#endif
+ dout(15) << __func__ << " flags: " << m_planned_scrub << dendl;
+
+ last_backfill_started = hobject_t();
+ set<hobject_t>::iterator i = backfills_in_flight.begin();
+ while (i != backfills_in_flight.end()) {
+ backfills_in_flight.erase(i++);
+ }
+
+ list<OpRequestRef> blocked_ops;
+ for (map<hobject_t, ObjectContextRef>::iterator i = recovering.begin();
+ i != recovering.end();
+ recovering.erase(i++)) {
+ if (i->second) {
+ i->second->drop_recovery_read(&blocked_ops);
+ requeue_ops(blocked_ops);
+ }
+ }
+ ceph_assert(backfills_in_flight.empty());
+ pending_backfill_updates.clear();
+ ceph_assert(recovering.empty());
+ pgbackend->clear_recovery_state();
+}
+
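+// Abort an in-flight pull for soid: release the recovery read lock, requeue
+// any ops blocked behind it, and kick degraded/unreadable waiters.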
+void PrimaryLogPG::cancel_pull(const hobject_t &soid)
+{
+ dout(20) << __func__ << ": " << soid << dendl;
+ ceph_assert(recovering.count(soid));
+ ObjectContextRef obc = recovering[soid];
+ if (obc) {
+ list<OpRequestRef> blocked_ops;
+ obc->drop_recovery_read(&blocked_ops);
+ requeue_ops(blocked_ops);
+ }
+ recovering.erase(soid);
+ finish_recovery_op(soid);
+ release_backoffs(soid);
+ if (waiting_for_degraded_object.count(soid)) {
+ dout(20) << " kicking degraded waiters on " << soid << dendl;
+ requeue_ops(waiting_for_degraded_object[soid]);
+ waiting_for_degraded_object.erase(soid);
+ }
+ if (waiting_for_unreadable_object.count(soid)) {
+ dout(20) << " kicking unreadable waiters on " << soid << dendl;
+ requeue_ops(waiting_for_unreadable_object[soid]);
+ waiting_for_unreadable_object.erase(soid);
+ }
+ if (is_missing_object(soid))
+ recovery_state.set_last_requested(0);
+ finish_degraded_object(soid);
+}
+
+void PrimaryLogPG::check_recovery_sources(const OSDMapRef& osdmap)
+{
+ pgbackend->check_recovery_sources(osdmap);
+}
+
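+// Main recovery driver: recover replicas first when the primary itself is
+// clean (or everything missing locally is unfound), then the primary, then
+// backfill, and emit the appropriate peering event once nothing is left to
+// do. Returns true when no further progress can be made but unfound
+// objects remain.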
+bool PrimaryLogPG::start_recovery_ops(
+ uint64_t max,
+ ThreadPool::TPHandle &handle,
+ uint64_t *ops_started)
+{
+ uint64_t& started = *ops_started;
+ started = 0;
+ bool work_in_progress = false;
+ bool recovery_started = false;
+ ceph_assert(is_primary());
+ ceph_assert(is_peered());
+ ceph_assert(!recovery_state.is_deleting());
+
+ ceph_assert(recovery_queued);
+ recovery_queued = false;
+
+ if (!state_test(PG_STATE_RECOVERING) &&
+ !state_test(PG_STATE_BACKFILLING)) {
+ /* TODO: I think this case is broken and will make do_recovery()
+ * unhappy since we're returning false */
+ dout(10) << "recovery raced and were queued twice, ignoring!" << dendl;
+ return have_unfound();
+ }
+
+ const auto &missing = recovery_state.get_pg_log().get_missing();
+
+ uint64_t num_unfound = get_num_unfound();
+
+ if (!recovery_state.have_missing()) {
+ recovery_state.local_recovery_complete();
+ }
+
+ if (!missing.have_missing() || // Primary does not have missing
+ // or all of the missing objects are unfound.
+ recovery_state.all_missing_unfound()) {
+ // Recover the replicas.
+ started = recover_replicas(max, handle, &recovery_started);
+ }
+ if (!started) {
+ // We still have missing objects that we should grab from replicas.
+ started += recover_primary(max, handle);
+ }
+ if (!started && num_unfound != get_num_unfound()) {
+ // second chance to recovery replicas
+ started = recover_replicas(max, handle, &recovery_started);
+ }
+
+ if (started || recovery_started)
+ work_in_progress = true;
+
+ bool deferred_backfill = false;
+ if (recovering.empty() &&
+ state_test(PG_STATE_BACKFILLING) &&
+ !get_backfill_targets().empty() && started < max &&
+ missing.num_missing() == 0 &&
+ waiting_on_backfill.empty()) {
+ if (get_osdmap()->test_flag(CEPH_OSDMAP_NOBACKFILL)) {
+ dout(10) << "deferring backfill due to NOBACKFILL" << dendl;
+ deferred_backfill = true;
+ } else if (get_osdmap()->test_flag(CEPH_OSDMAP_NOREBALANCE) &&
+ !is_degraded()) {
+ dout(10) << "deferring backfill due to NOREBALANCE" << dendl;
+ deferred_backfill = true;
+ } else if (!recovery_state.is_backfill_reserved()) {
+ /* DNMNOTE I think this branch is dead */
+ dout(10) << "deferring backfill due to !backfill_reserved" << dendl;
+ if (!backfill_reserving) {
+ dout(10) << "queueing RequestBackfill" << dendl;
+ backfill_reserving = true;
+ queue_peering_event(
+ PGPeeringEventRef(
+ std::make_shared<PGPeeringEvent>(
+ get_osdmap_epoch(),
+ get_osdmap_epoch(),
+ PeeringState::RequestBackfill())));
+ }
+ deferred_backfill = true;
+ } else {
+ started += recover_backfill(max - started, handle, &work_in_progress);
+ }
+ }
+
+ dout(10) << " started " << started << dendl;
+ osd->logger->inc(l_osd_rop, started);
+
+ if (!recovering.empty() ||
+ work_in_progress || recovery_ops_active > 0 || deferred_backfill)
+ return !work_in_progress && have_unfound();
+
+ ceph_assert(recovering.empty());
+ ceph_assert(recovery_ops_active == 0);
+
+ dout(10) << __func__ << " needs_recovery: "
+ << recovery_state.get_missing_loc().get_needs_recovery()
+ << dendl;
+ dout(10) << __func__ << " missing_loc: "
+ << recovery_state.get_missing_loc().get_missing_locs()
+ << dendl;
+ int unfound = get_num_unfound();
+ if (unfound) {
+ dout(10) << " still have " << unfound << " unfound" << dendl;
+ return true;
+ }
+
+ if (missing.num_missing() > 0) {
+ // this shouldn't happen!
+ osd->clog->error() << info.pgid << " Unexpected Error: recovery ending with "
+ << missing.num_missing() << ": " << missing.get_items();
+ return false;
+ }
+
+ if (needs_recovery()) {
+ // this shouldn't happen!
+ // We already checked num_missing() so we must have missing replicas
+ osd->clog->error() << info.pgid
+ << " Unexpected Error: recovery ending with missing replicas";
+ return false;
+ }
+
+ if (state_test(PG_STATE_RECOVERING)) {
+ state_clear(PG_STATE_RECOVERING);
+ state_clear(PG_STATE_FORCED_RECOVERY);
+ if (needs_backfill()) {
+ dout(10) << "recovery done, queuing backfill" << dendl;
+ queue_peering_event(
+ PGPeeringEventRef(
+ std::make_shared<PGPeeringEvent>(
+ get_osdmap_epoch(),
+ get_osdmap_epoch(),
+ PeeringState::RequestBackfill())));
+ } else {
+ dout(10) << "recovery done, no backfill" << dendl;
+ state_clear(PG_STATE_FORCED_BACKFILL);
+ queue_peering_event(
+ PGPeeringEventRef(
+ std::make_shared<PGPeeringEvent>(
+ get_osdmap_epoch(),
+ get_osdmap_epoch(),
+ PeeringState::AllReplicasRecovered())));
+ }
+ } else { // backfilling
+ state_clear(PG_STATE_BACKFILLING);
+ state_clear(PG_STATE_FORCED_BACKFILL);
+ state_clear(PG_STATE_FORCED_RECOVERY);
+ dout(10) << "recovery done, backfill done" << dendl;
+ queue_peering_event(
+ PGPeeringEventRef(
+ std::make_shared<PGPeeringEvent>(
+ get_osdmap_epoch(),
+ get_osdmap_epoch(),
+ PeeringState::Backfilled())));
+ }
+
+ return false;
+}
+
+/**
+ * Recover objects missing on the primary, starting at most max recovery ops.
+ * Returns the number of ops started.
+ */
+uint64_t PrimaryLogPG::recover_primary(uint64_t max, ThreadPool::TPHandle &handle)
+{
+ ceph_assert(is_primary());
+
+ const auto &missing = recovery_state.get_pg_log().get_missing();
+
+ dout(10) << __func__ << " recovering " << recovering.size()
+ << " in pg,"
+ << " missing " << missing << dendl;
+
+ dout(25) << __func__ << " " << missing.get_items() << dendl;
+
+ // look at log!
+ pg_log_entry_t *latest = 0;
+ unsigned started = 0;
+ int skipped = 0;
+
+ PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
+ map<version_t, hobject_t>::const_iterator p =
+ missing.get_rmissing().lower_bound(recovery_state.get_pg_log().get_log().last_requested);
+ while (p != missing.get_rmissing().end()) {
+ handle.reset_tp_timeout();
+ hobject_t soid;
+ version_t v = p->first;
+
+ auto it_objects = recovery_state.get_pg_log().get_log().objects.find(p->second);
+ if (it_objects != recovery_state.get_pg_log().get_log().objects.end()) {
+ latest = it_objects->second;
+ ceph_assert(latest->is_update() || latest->is_delete());
+ soid = latest->soid;
+ } else {
+ latest = 0;
+ soid = p->second;
+ }
+ const pg_missing_item& item = missing.get_items().find(p->second)->second;
+ ++p;
+
+ hobject_t head = soid.get_head();
+
+ eversion_t need = item.need;
+
+ dout(10) << __func__ << " "
+ << soid << " " << item.need
+ << (missing.is_missing(soid) ? " (missing)":"")
+ << (missing.is_missing(head) ? " (missing head)":"")
+ << (recovering.count(soid) ? " (recovering)":"")
+ << (recovering.count(head) ? " (recovering head)":"")
+ << dendl;
+
+ if (latest) {
+ switch (latest->op) {
+ case pg_log_entry_t::CLONE:
+ /*
+ * Handling for this special case removed for now, until we
+ * can correctly construct an accurate SnapSet from the old
+ * one.
+ */
+ break;
+
+ case pg_log_entry_t::LOST_REVERT:
+ {
+ if (item.have == latest->reverting_to) {
+ ObjectContextRef obc = get_object_context(soid, true);
+
+ if (obc->obs.oi.version == latest->version) {
+ // I'm already reverting
+ dout(10) << " already reverting " << soid << dendl;
+ } else {
+ dout(10) << " reverting " << soid << " to " << latest->prior_version << dendl;
+ obc->obs.oi.version = latest->version;
+
+ ObjectStore::Transaction t;
+ bufferlist b2;
+ obc->obs.oi.encode(
+ b2,
+ get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
+ ceph_assert(!pool.info.require_rollback());
+ t.setattr(coll, ghobject_t(soid), OI_ATTR, b2);
+
+ recovery_state.recover_got(
+ soid,
+ latest->version,
+ false,
+ t);
+
+ ++active_pushes;
+
+ t.register_on_applied(new C_OSD_AppliedRecoveredObject(this, obc));
+ t.register_on_commit(new C_OSD_CommittedPushedObject(
+ this,
+ get_osdmap_epoch(),
+ info.last_complete));
+ osd->store->queue_transaction(ch, std::move(t));
+ continue;
+ }
+ } else {
+ /*
+ * Pull the old version of the object. Update missing_loc here to have the location
+ * of the version we want.
+ *
+ * This doesn't use the usual missing_loc paths, but that's okay:
+ * - if we have it locally, we hit the case above, and go from there.
+ * - if we don't, we always pass through this case during recovery and set up the location
+ * properly.
+ * - this way we don't need to mangle the missing code to be general about needing an old
+ * version...
+ */
+ eversion_t alternate_need = latest->reverting_to;
+ dout(10) << " need to pull prior_version " << alternate_need << " for revert " << item << dendl;
+
+ set<pg_shard_t> good_peers;
+ for (auto p = recovery_state.get_peer_missing().begin();
+ p != recovery_state.get_peer_missing().end();
+ ++p) {
+ if (p->second.is_missing(soid, need) &&
+ p->second.get_items().at(soid).have == alternate_need) {
+ good_peers.insert(p->first);
+ }
+ }
+ recovery_state.set_revert_with_targets(
+ soid,
+ good_peers);
+ dout(10) << " will pull " << alternate_need << " or " << need
+ << " from one of "
+ << recovery_state.get_missing_loc().get_locations(soid)
+ << dendl;
+ }
+ }
+ break;
+ }
+ }
+
+ if (!recovering.count(soid)) {
+ if (recovering.count(head)) {
+ ++skipped;
+ } else {
+ int r = recover_missing(
+ soid, need, get_recovery_op_priority(), h);
+ switch (r) {
+ case PULL_YES:
+ ++started;
+ break;
+ case PULL_HEAD:
+ ++started;
+ case PULL_NONE:
+ ++skipped;
+ break;
+ default:
+ ceph_abort();
+ }
+ if (started >= max)
+ break;
+ }
+ }
+
+ // only advance last_requested if we haven't skipped anything
+ if (!skipped)
+ recovery_state.set_last_requested(v);
+ }
+
+ pgbackend->run_recovery_op(h, get_recovery_op_priority());
+ return started;
+}
+
+bool PrimaryLogPG::primary_error(
+ const hobject_t& soid, eversion_t v)
+{
+ recovery_state.force_object_missing(pg_whoami, soid, v);
+ bool uhoh = recovery_state.get_missing_loc().is_unfound(soid);
+ if (uhoh)
+ osd->clog->error() << info.pgid << " missing primary copy of "
+ << soid << ", unfound";
+ else
+ osd->clog->error() << info.pgid << " missing primary copy of "
+ << soid
+ << ", will try copies on "
+ << recovery_state.get_missing_loc().get_locations(soid);
+ return uhoh;
+}
+
+int PrimaryLogPG::prep_object_replica_deletes(
+ const hobject_t& soid, eversion_t v,
+ PGBackend::RecoveryHandle *h,
+ bool *work_started)
+{
+ ceph_assert(is_primary());
+ dout(10) << __func__ << ": on " << soid << dendl;
+
+ ObjectContextRef obc = get_object_context(soid, false);
+ if (obc) {
+ if (!obc->get_recovery_read()) {
+ dout(20) << "replica delete delayed on " << soid
+ << "; could not get rw_manager lock" << dendl;
+ *work_started = true;
+ return 0;
+ } else {
+ dout(20) << "replica delete got recovery read lock on " << soid
+ << dendl;
+ }
+ }
+
+ start_recovery_op(soid);
+ ceph_assert(!recovering.count(soid));
+ if (!obc)
+ recovering.insert(make_pair(soid, ObjectContextRef()));
+ else
+ recovering.insert(make_pair(soid, obc));
+
+ pgbackend->recover_delete_object(soid, v, h);
+ return 1;
+}
+
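+// Push soid at version v to replicas that are missing it. If soid is a clone
+// whose head is also missing locally, recover the head first and retry the
+// clone in a later pass. Returns the number of recovery ops started (0 or 1).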
+int PrimaryLogPG::prep_object_replica_pushes(
+ const hobject_t& soid, eversion_t v,
+ PGBackend::RecoveryHandle *h,
+ bool *work_started)
+{
+ ceph_assert(is_primary());
+ dout(10) << __func__ << ": on " << soid << dendl;
+
+ if (soid.snap && soid.snap < CEPH_NOSNAP) {
+ // do we have the head and/or snapdir?
+ hobject_t head = soid.get_head();
+ if (recovery_state.get_pg_log().get_missing().is_missing(head)) {
+ if (recovering.count(head)) {
+ dout(10) << " missing but already recovering head " << head << dendl;
+ return 0;
+ } else {
+ int r = recover_missing(
+ head, recovery_state.get_pg_log().get_missing().get_items().find(head)->second.need,
+ get_recovery_op_priority(), h);
+ if (r != PULL_NONE)
+ return 1;
+ return 0;
+ }
+ }
+ }
+
+ // NOTE: we know we will get a valid oloc off of disk here.
+ ObjectContextRef obc = get_object_context(soid, false);
+ if (!obc) {
+ primary_error(soid, v);
+ return 0;
+ }
+
+ if (!obc->get_recovery_read()) {
+ dout(20) << "recovery delayed on " << soid
+ << "; could not get rw_manager lock" << dendl;
+ *work_started = true;
+ return 0;
+ } else {
+ dout(20) << "recovery got recovery read lock on " << soid
+ << dendl;
+ }
+
+ start_recovery_op(soid);
+ ceph_assert(!recovering.count(soid));
+ recovering.insert(make_pair(soid, obc));
+
+ int r = pgbackend->recover_object(
+ soid,
+ v,
+ ObjectContextRef(),
+ obc, // has snapset context
+ h);
+ if (r < 0) {
+ dout(0) << __func__ << " Error " << r << " on oid " << soid << dendl;
+ on_failed_pull({ pg_whoami }, soid, v);
+ return 0;
+ }
+ return 1;
+}
+
+uint64_t PrimaryLogPG::recover_replicas(uint64_t max, ThreadPool::TPHandle &handle,
+ bool *work_started)
+{
+ dout(10) << __func__ << "(" << max << ")" << dendl;
+ uint64_t started = 0;
+
+ PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
+
+ // this is FAR from an optimal recovery order. pretty lame, really.
+ ceph_assert(!get_acting_recovery_backfill().empty());
+  // choose replicas to recover: the replica with the shortest missing list
+  // goes first so we can bring it back to normal ASAP
+ std::vector<std::pair<unsigned int, pg_shard_t>> replicas_by_num_missing,
+ async_by_num_missing;
+ replicas_by_num_missing.reserve(get_acting_recovery_backfill().size() - 1);
+ for (auto &p: get_acting_recovery_backfill()) {
+ if (p == get_primary()) {
+ continue;
+ }
+ auto pm = recovery_state.get_peer_missing().find(p);
+ ceph_assert(pm != recovery_state.get_peer_missing().end());
+ auto nm = pm->second.num_missing();
+ if (nm != 0) {
+ if (is_async_recovery_target(p)) {
+ async_by_num_missing.push_back(make_pair(nm, p));
+ } else {
+ replicas_by_num_missing.push_back(make_pair(nm, p));
+ }
+ }
+ }
+ // sort by number of missing objects, in ascending order.
+ auto func = [](const std::pair<unsigned int, pg_shard_t> &lhs,
+ const std::pair<unsigned int, pg_shard_t> &rhs) {
+ return lhs.first < rhs.first;
+ };
+ // acting goes first
+ std::sort(replicas_by_num_missing.begin(), replicas_by_num_missing.end(), func);
+ // then async_recovery_targets
+ std::sort(async_by_num_missing.begin(), async_by_num_missing.end(), func);
+ replicas_by_num_missing.insert(replicas_by_num_missing.end(),
+ async_by_num_missing.begin(), async_by_num_missing.end());
+ for (auto &replica: replicas_by_num_missing) {
+ pg_shard_t &peer = replica.second;
+ ceph_assert(peer != get_primary());
+ auto pm = recovery_state.get_peer_missing().find(peer);
+ ceph_assert(pm != recovery_state.get_peer_missing().end());
+ size_t m_sz = pm->second.num_missing();
+
+ dout(10) << " peer osd." << peer << " missing " << m_sz << " objects." << dendl;
+ dout(20) << " peer osd." << peer << " missing " << pm->second.get_items() << dendl;
+
+ // oldest first!
+ const pg_missing_t &m(pm->second);
+ for (map<version_t, hobject_t>::const_iterator p = m.get_rmissing().begin();
+ p != m.get_rmissing().end() && started < max;
+ ++p) {
+ handle.reset_tp_timeout();
+ const hobject_t soid(p->second);
+
+ if (recovery_state.get_missing_loc().is_unfound(soid)) {
+ dout(10) << __func__ << ": " << soid << " still unfound" << dendl;
+ continue;
+ }
+
+ const pg_info_t &pi = recovery_state.get_peer_info(peer);
+ if (soid > pi.last_backfill) {
+ if (!recovering.count(soid)) {
+ derr << __func__ << ": object " << soid << " last_backfill "
+ << pi.last_backfill << dendl;
+ derr << __func__ << ": object added to missing set for backfill, but "
+ << "is not in recovering, error!" << dendl;
+ ceph_abort();
+ }
+ continue;
+ }
+
+ if (recovering.count(soid)) {
+ dout(10) << __func__ << ": already recovering " << soid << dendl;
+ continue;
+ }
+
+ if (recovery_state.get_missing_loc().is_deleted(soid)) {
+ dout(10) << __func__ << ": " << soid << " is a delete, removing" << dendl;
+ map<hobject_t,pg_missing_item>::const_iterator r = m.get_items().find(soid);
+ started += prep_object_replica_deletes(soid, r->second.need, h, work_started);
+ continue;
+ }
+
+ if (soid.is_snap() &&
+ recovery_state.get_pg_log().get_missing().is_missing(
+ soid.get_head())) {
+ dout(10) << __func__ << ": " << soid.get_head()
+ << " still missing on primary" << dendl;
+ continue;
+ }
+
+ if (recovery_state.get_pg_log().get_missing().is_missing(soid)) {
+ dout(10) << __func__ << ": " << soid << " still missing on primary" << dendl;
+ continue;
+ }
+
+ dout(10) << __func__ << ": recover_object_replicas(" << soid << ")" << dendl;
+ map<hobject_t,pg_missing_item>::const_iterator r = m.get_items().find(soid);
+ started += prep_object_replica_pushes(soid, r->second.need, h, work_started);
+ }
+ }
+
+ pgbackend->run_recovery_op(h, get_recovery_op_priority());
+ return started;
+}
+
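+// Return the smallest backfill-interval 'begin' across all backfill targets,
+// i.e. the position of the furthest-behind peer scan.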
+hobject_t PrimaryLogPG::earliest_peer_backfill() const
+{
+ hobject_t e = hobject_t::get_max();
+ for (const pg_shard_t& peer : get_backfill_targets()) {
+ const auto iter = peer_backfill_info.find(peer);
+ ceph_assert(iter != peer_backfill_info.end());
+ e = std::min(e, iter->second.begin);
+ }
+ return e;
+}
+
+bool PrimaryLogPG::all_peer_done() const
+{
+ // Primary hasn't got any more objects
+ ceph_assert(backfill_info.empty());
+
+ for (const pg_shard_t& bt : get_backfill_targets()) {
+ const auto piter = peer_backfill_info.find(bt);
+ ceph_assert(piter != peer_backfill_info.end());
+ const BackfillInterval& pbi = piter->second;
+ // See if peer has more to process
+ if (!pbi.extends_to_end() || !pbi.empty())
+ return false;
+ }
+ return true;
+}
+
+/**
+ * recover_backfill
+ *
+ * Invariants:
+ *
+ * backfilled: fully pushed to replica or present in replica's missing set (both
+ * our copy and theirs).
+ *
+ * All objects on a backfill_target in
+ * [MIN,peer_backfill_info[backfill_target].begin) are valid; logically-removed
+ * objects have actually been deleted and all logically-valid objects are replicated.
+ * There may be PG objects in this interval yet to be backfilled.
+ *
+ * All objects in PG in [MIN,backfill_info.begin) have been backfilled to all
+ * backfill_targets. There may be objects on backfill_target(s) yet to be deleted.
+ *
+ * For a backfill target, all objects < std::min(peer_backfill_info[target].begin,
+ * backfill_info.begin) in PG are backfilled. No deleted objects in this
+ * interval remain on the backfill target.
+ *
+ * For a backfill target, all objects <= peer_info[target].last_backfill
+ * have been backfilled to target
+ *
+ * There *MAY* be missing/outdated objects between last_backfill_started and
+ * std::min(peer_backfill_info[*].begin, backfill_info.begin) in the event that client
+ * io created objects since the last scan. For this reason, we call
+ * update_range() again before continuing backfill.
+ */
+uint64_t PrimaryLogPG::recover_backfill(
+ uint64_t max,
+ ThreadPool::TPHandle &handle, bool *work_started)
+{
+ dout(10) << __func__ << " (" << max << ")"
+ << " bft=" << get_backfill_targets()
+ << " last_backfill_started " << last_backfill_started
+ << (new_backfill ? " new_backfill":"")
+ << dendl;
+ ceph_assert(!get_backfill_targets().empty());
+
+ // Initialize from prior backfill state
+ if (new_backfill) {
+ // on_activate() was called prior to getting here
+ ceph_assert(last_backfill_started == recovery_state.earliest_backfill());
+ new_backfill = false;
+
+ // initialize BackfillIntervals
+ for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
+ i != get_backfill_targets().end();
+ ++i) {
+ peer_backfill_info[*i].reset(
+ recovery_state.get_peer_info(*i).last_backfill);
+ }
+ backfill_info.reset(last_backfill_started);
+
+ backfills_in_flight.clear();
+ pending_backfill_updates.clear();
+ }
+
+ for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
+ i != get_backfill_targets().end();
+ ++i) {
+ dout(10) << "peer osd." << *i
+ << " info " << recovery_state.get_peer_info(*i)
+ << " interval " << peer_backfill_info[*i].begin
+ << "-" << peer_backfill_info[*i].end
+ << " " << peer_backfill_info[*i].objects.size() << " objects"
+ << dendl;
+ }
+
+ // update our local interval to cope with recent changes
+ backfill_info.begin = last_backfill_started;
+ update_range(&backfill_info, handle);
+
+ unsigned ops = 0;
+ vector<boost::tuple<hobject_t, eversion_t, pg_shard_t> > to_remove;
+ set<hobject_t> add_to_stat;
+
+ for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
+ i != get_backfill_targets().end();
+ ++i) {
+ peer_backfill_info[*i].trim_to(
+ std::max(
+ recovery_state.get_peer_info(*i).last_backfill,
+ last_backfill_started));
+ }
+ backfill_info.trim_to(last_backfill_started);
+
+ PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
+ while (ops < max) {
+ if (backfill_info.begin <= earliest_peer_backfill() &&
+ !backfill_info.extends_to_end() && backfill_info.empty()) {
+ hobject_t next = backfill_info.end;
+ backfill_info.reset(next);
+ backfill_info.end = hobject_t::get_max();
+ update_range(&backfill_info, handle);
+ backfill_info.trim();
+ }
+
+ dout(20) << " my backfill interval " << backfill_info << dendl;
+
+ bool sent_scan = false;
+ for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
+ i != get_backfill_targets().end();
+ ++i) {
+ pg_shard_t bt = *i;
+ BackfillInterval& pbi = peer_backfill_info[bt];
+
+ dout(20) << " peer shard " << bt << " backfill " << pbi << dendl;
+ if (pbi.begin <= backfill_info.begin &&
+ !pbi.extends_to_end() && pbi.empty()) {
+ dout(10) << " scanning peer osd." << bt << " from " << pbi.end << dendl;
+ epoch_t e = get_osdmap_epoch();
+ MOSDPGScan *m = new MOSDPGScan(
+ MOSDPGScan::OP_SCAN_GET_DIGEST, pg_whoami, e, get_last_peering_reset(),
+ spg_t(info.pgid.pgid, bt.shard),
+ pbi.end, hobject_t());
+ osd->send_message_osd_cluster(bt.osd, m, get_osdmap_epoch());
+ ceph_assert(waiting_on_backfill.find(bt) == waiting_on_backfill.end());
+ waiting_on_backfill.insert(bt);
+ sent_scan = true;
+ }
+ }
+
+ // Count simultaneous scans as a single op and let those complete
+ if (sent_scan) {
+ ops++;
+ start_recovery_op(hobject_t::get_max()); // XXX: was pbi.end
+ break;
+ }
+
+ if (backfill_info.empty() && all_peer_done()) {
+ dout(10) << " reached end for both local and all peers" << dendl;
+ break;
+ }
+
+ // Get object within set of peers to operate on and
+ // the set of targets for which that object applies.
+ hobject_t check = earliest_peer_backfill();
+
+ if (check < backfill_info.begin) {
+
+ set<pg_shard_t> check_targets;
+ for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
+ i != get_backfill_targets().end();
+ ++i) {
+ pg_shard_t bt = *i;
+ BackfillInterval& pbi = peer_backfill_info[bt];
+ if (pbi.begin == check)
+ check_targets.insert(bt);
+ }
+ ceph_assert(!check_targets.empty());
+
+ dout(20) << " BACKFILL removing " << check
+ << " from peers " << check_targets << dendl;
+ for (set<pg_shard_t>::iterator i = check_targets.begin();
+ i != check_targets.end();
+ ++i) {
+ pg_shard_t bt = *i;
+ BackfillInterval& pbi = peer_backfill_info[bt];
+ ceph_assert(pbi.begin == check);
+
+ to_remove.push_back(boost::make_tuple(check, pbi.objects.begin()->second, bt));
+ pbi.pop_front();
+ }
+
+ last_backfill_started = check;
+
+      // Don't increment ops here: deletions are cheap and, unlike real
+      // recovery_ops, are not replied to, and we can't increment ops
+      // without requeueing ourselves for recovery.
+ } else {
+ eversion_t& obj_v = backfill_info.objects.begin()->second;
+
+ vector<pg_shard_t> need_ver_targs, missing_targs, keep_ver_targs, skip_targs;
+ for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
+ i != get_backfill_targets().end();
+ ++i) {
+ pg_shard_t bt = *i;
+ BackfillInterval& pbi = peer_backfill_info[bt];
+ // Find all check peers that have the wrong version
+ if (check == backfill_info.begin && check == pbi.begin) {
+ if (pbi.objects.begin()->second != obj_v) {
+ need_ver_targs.push_back(bt);
+ } else {
+ keep_ver_targs.push_back(bt);
+ }
+ } else {
+ const pg_info_t& pinfo = recovery_state.get_peer_info(bt);
+
+	  // Only include peers whose backfill line we've caught up to;
+	  // otherwise, they only appear to be missing this object
+	  // because their pbi.begin > backfill_info.begin.
+ if (backfill_info.begin > pinfo.last_backfill)
+ missing_targs.push_back(bt);
+ else
+ skip_targs.push_back(bt);
+ }
+ }
+
+ if (!keep_ver_targs.empty()) {
+ // These peers have version obj_v
+ dout(20) << " BACKFILL keeping " << check
+ << " with ver " << obj_v
+ << " on peers " << keep_ver_targs << dendl;
+ //assert(!waiting_for_degraded_object.count(check));
+ }
+ if (!need_ver_targs.empty() || !missing_targs.empty()) {
+ ObjectContextRef obc = get_object_context(backfill_info.begin, false);
+ ceph_assert(obc);
+ if (obc->get_recovery_read()) {
+ if (!need_ver_targs.empty()) {
+ dout(20) << " BACKFILL replacing " << check
+ << " with ver " << obj_v
+ << " to peers " << need_ver_targs << dendl;
+ }
+ if (!missing_targs.empty()) {
+ dout(20) << " BACKFILL pushing " << backfill_info.begin
+ << " with ver " << obj_v
+ << " to peers " << missing_targs << dendl;
+ }
+ vector<pg_shard_t> all_push = need_ver_targs;
+ all_push.insert(all_push.end(), missing_targs.begin(), missing_targs.end());
+
+ handle.reset_tp_timeout();
+ int r = prep_backfill_object_push(backfill_info.begin, obj_v, obc, all_push, h);
+ if (r < 0) {
+ *work_started = true;
+ dout(0) << __func__ << " Error " << r << " trying to backfill " << backfill_info.begin << dendl;
+ break;
+ }
+ ops++;
+ } else {
+ *work_started = true;
+ dout(20) << "backfill blocking on " << backfill_info.begin
+ << "; could not get rw_manager lock" << dendl;
+ break;
+ }
+ }
+ dout(20) << "need_ver_targs=" << need_ver_targs
+ << " keep_ver_targs=" << keep_ver_targs << dendl;
+ dout(20) << "backfill_targets=" << get_backfill_targets()
+ << " missing_targs=" << missing_targs
+ << " skip_targs=" << skip_targs << dendl;
+
+ last_backfill_started = backfill_info.begin;
+ add_to_stat.insert(backfill_info.begin); // XXX: Only one for all pushes?
+ backfill_info.pop_front();
+ vector<pg_shard_t> check_targets = need_ver_targs;
+ check_targets.insert(check_targets.end(), keep_ver_targs.begin(), keep_ver_targs.end());
+ for (vector<pg_shard_t>::iterator i = check_targets.begin();
+ i != check_targets.end();
+ ++i) {
+ pg_shard_t bt = *i;
+ BackfillInterval& pbi = peer_backfill_info[bt];
+ pbi.pop_front();
+ }
+ }
+ }
+
+ for (set<hobject_t>::iterator i = add_to_stat.begin();
+ i != add_to_stat.end();
+ ++i) {
+ ObjectContextRef obc = get_object_context(*i, false);
+ ceph_assert(obc);
+ pg_stat_t stat;
+ add_object_context_to_pg_stat(obc, &stat);
+ pending_backfill_updates[*i] = stat;
+ }
+ map<pg_shard_t,MOSDPGBackfillRemove*> reqs;
+ for (unsigned i = 0; i < to_remove.size(); ++i) {
+ handle.reset_tp_timeout();
+ const hobject_t& oid = to_remove[i].get<0>();
+ eversion_t v = to_remove[i].get<1>();
+ pg_shard_t peer = to_remove[i].get<2>();
+ MOSDPGBackfillRemove *m;
+ auto it = reqs.find(peer);
+ if (it != reqs.end()) {
+ m = it->second;
+ } else {
+ m = reqs[peer] = new MOSDPGBackfillRemove(
+ spg_t(info.pgid.pgid, peer.shard),
+ get_osdmap_epoch());
+ }
+ m->ls.push_back(make_pair(oid, v));
+
+ if (oid <= last_backfill_started)
+ pending_backfill_updates[oid]; // add empty stat!
+ }
+ for (auto p : reqs) {
+ osd->send_message_osd_cluster(p.first.osd, p.second,
+ get_osdmap_epoch());
+ }
+
+ pgbackend->run_recovery_op(h, get_recovery_op_priority());
+
+ hobject_t backfill_pos =
+ std::min(backfill_info.begin, earliest_peer_backfill());
+ dout(5) << "backfill_pos is " << backfill_pos << dendl;
+ for (set<hobject_t>::iterator i = backfills_in_flight.begin();
+ i != backfills_in_flight.end();
+ ++i) {
+ dout(20) << *i << " is still in flight" << dendl;
+ }
+
+ hobject_t next_backfill_to_complete = backfills_in_flight.empty() ?
+ backfill_pos : *(backfills_in_flight.begin());
+ hobject_t new_last_backfill = recovery_state.earliest_backfill();
+ dout(10) << "starting new_last_backfill at " << new_last_backfill << dendl;
+ for (map<hobject_t, pg_stat_t>::iterator i =
+ pending_backfill_updates.begin();
+ i != pending_backfill_updates.end() &&
+ i->first < next_backfill_to_complete;
+ pending_backfill_updates.erase(i++)) {
+ dout(20) << " pending_backfill_update " << i->first << dendl;
+ ceph_assert(i->first > new_last_backfill);
+ // carried from a previous round – if we are here, then we had to
+ // be requeued (by e.g. on_global_recover()) and those operations
+ // are done.
+ recovery_state.update_complete_backfill_object_stats(
+ i->first,
+ i->second);
+ new_last_backfill = i->first;
+ }
+ dout(10) << "possible new_last_backfill at " << new_last_backfill << dendl;
+
+ ceph_assert(!pending_backfill_updates.empty() ||
+ new_last_backfill == last_backfill_started);
+ if (pending_backfill_updates.empty() &&
+ backfill_pos.is_max()) {
+ ceph_assert(backfills_in_flight.empty());
+ new_last_backfill = backfill_pos;
+ last_backfill_started = backfill_pos;
+ }
+ dout(10) << "final new_last_backfill at " << new_last_backfill << dendl;
+
+ // If new_last_backfill == MAX, then we will send OP_BACKFILL_FINISH to
+ // all the backfill targets. Otherwise, we will move last_backfill up on
+  // those targets that need it and send OP_BACKFILL_PROGRESS to them.
+ for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
+ i != get_backfill_targets().end();
+ ++i) {
+ pg_shard_t bt = *i;
+ const pg_info_t& pinfo = recovery_state.get_peer_info(bt);
+
+ if (new_last_backfill > pinfo.last_backfill) {
+ recovery_state.update_peer_last_backfill(bt, new_last_backfill);
+ epoch_t e = get_osdmap_epoch();
+ MOSDPGBackfill *m = NULL;
+ if (pinfo.last_backfill.is_max()) {
+ m = new MOSDPGBackfill(
+ MOSDPGBackfill::OP_BACKFILL_FINISH,
+ e,
+ get_last_peering_reset(),
+ spg_t(info.pgid.pgid, bt.shard));
+ // Use default priority here, must match sub_op priority
+ start_recovery_op(hobject_t::get_max());
+ } else {
+ m = new MOSDPGBackfill(
+ MOSDPGBackfill::OP_BACKFILL_PROGRESS,
+ e,
+ get_last_peering_reset(),
+ spg_t(info.pgid.pgid, bt.shard));
+ // Use default priority here, must match sub_op priority
+ }
+ m->last_backfill = pinfo.last_backfill;
+ m->stats = pinfo.stats;
+ osd->send_message_osd_cluster(bt.osd, m, get_osdmap_epoch());
+ dout(10) << " peer " << bt
+ << " num_objects now " << pinfo.stats.stats.sum.num_objects
+ << " / " << info.stats.stats.sum.num_objects << dendl;
+ }
+ }
+
+ if (ops)
+ *work_started = true;
+ return ops;
+}
+
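+// Push oid at version v to the given backfill peers: track it in
+// backfills_in_flight and recovering, then hand it to the backend as a
+// recovery op. Returns the backend's result (negative on error).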
+int PrimaryLogPG::prep_backfill_object_push(
+ hobject_t oid, eversion_t v,
+ ObjectContextRef obc,
+ vector<pg_shard_t> peers,
+ PGBackend::RecoveryHandle *h)
+{
+ dout(10) << __func__ << " " << oid << " v " << v << " to peers " << peers << dendl;
+ ceph_assert(!peers.empty());
+
+ backfills_in_flight.insert(oid);
+ recovery_state.prepare_backfill_for_missing(oid, v, peers);
+
+ ceph_assert(!recovering.count(oid));
+
+ start_recovery_op(oid);
+ recovering.insert(make_pair(oid, obc));
+
+ int r = pgbackend->recover_object(
+ oid,
+ v,
+ ObjectContextRef(),
+ obc,
+ h);
+ if (r < 0) {
+ dout(0) << __func__ << " Error " << r << " on oid " << oid << dendl;
+ on_failed_pull({ pg_whoami }, oid, v);
+ }
+ return r;
+}
+
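+// Bring a BackfillInterval up to date with recent writes. If the interval
+// predates the log tail we rescan the local object range; otherwise we replay
+// pg log and projected log entries newer than bi->version onto it.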
+void PrimaryLogPG::update_range(
+ BackfillInterval *bi,
+ ThreadPool::TPHandle &handle)
+{
+ int local_min = cct->_conf->osd_backfill_scan_min;
+ int local_max = cct->_conf->osd_backfill_scan_max;
+
+ if (bi->version < info.log_tail) {
+ dout(10) << __func__<< ": bi is old, rescanning local backfill_info"
+ << dendl;
+ bi->version = info.last_update;
+ scan_range(local_min, local_max, bi, handle);
+ }
+
+ if (bi->version >= projected_last_update) {
+ dout(10) << __func__<< ": bi is current " << dendl;
+ ceph_assert(bi->version == projected_last_update);
+ } else if (bi->version >= info.log_tail) {
+ if (recovery_state.get_pg_log().get_log().empty() && projected_log.empty()) {
+ /* Because we don't move log_tail on split, the log might be
+ * empty even if log_tail != last_update. However, the only
+ * way to get here with an empty log is if log_tail is actually
+ * eversion_t(), because otherwise the entry which changed
+ * last_update since the last scan would have to be present.
+ */
+ ceph_assert(bi->version == eversion_t());
+ return;
+ }
+
+ dout(10) << __func__<< ": bi is old, (" << bi->version
+ << ") can be updated with log to projected_last_update "
+ << projected_last_update << dendl;
+
+ auto func = [&](const pg_log_entry_t &e) {
+ dout(10) << __func__ << ": updating from version " << e.version
+ << dendl;
+ const hobject_t &soid = e.soid;
+ if (soid >= bi->begin &&
+ soid < bi->end) {
+ if (e.is_update()) {
+ dout(10) << __func__ << ": " << e.soid << " updated to version "
+ << e.version << dendl;
+ bi->objects.erase(e.soid);
+ bi->objects.insert(
+ make_pair(
+ e.soid,
+ e.version));
+ } else if (e.is_delete()) {
+ dout(10) << __func__ << ": " << e.soid << " removed" << dendl;
+ bi->objects.erase(e.soid);
+ }
+ }
+ };
+ dout(10) << "scanning pg log first" << dendl;
+ recovery_state.get_pg_log().get_log().scan_log_after(bi->version, func);
+ dout(10) << "scanning projected log" << dendl;
+ projected_log.scan_log_after(bi->version, func);
+ bi->version = projected_last_update;
+ } else {
+ ceph_abort_msg("scan_range should have raised bi->version past log_tail");
+ }
+}
+
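+// Fill *bi with between min and max objects starting at bi->begin, taking
+// object versions from cached object contexts where available and from the
+// stored OI_ATTR otherwise.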
+void PrimaryLogPG::scan_range(
+ int min, int max, BackfillInterval *bi,
+ ThreadPool::TPHandle &handle)
+{
+ ceph_assert(is_locked());
+ dout(10) << "scan_range from " << bi->begin << dendl;
+ bi->clear_objects();
+
+ vector<hobject_t> ls;
+ ls.reserve(max);
+ int r = pgbackend->objects_list_partial(bi->begin, min, max, &ls, &bi->end);
+ ceph_assert(r >= 0);
+ dout(10) << " got " << ls.size() << " items, next " << bi->end << dendl;
+ dout(20) << ls << dendl;
+
+ for (vector<hobject_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
+ handle.reset_tp_timeout();
+ ObjectContextRef obc;
+ if (is_primary())
+ obc = object_contexts.lookup(*p);
+ if (obc) {
+ if (!obc->obs.exists) {
+ /* If the object does not exist here, it must have been removed
+ * between the collection_list_partial and here. This can happen
+ * for the first item in the range, which is usually last_backfill.
+ */
+ continue;
+ }
+ bi->objects[*p] = obc->obs.oi.version;
+ dout(20) << " " << *p << " " << obc->obs.oi.version << dendl;
+ } else {
+ bufferlist bl;
+ int r = pgbackend->objects_get_attr(*p, OI_ATTR, &bl);
+ /* If the object does not exist here, it must have been removed
+ * between the collection_list_partial and here. This can happen
+ * for the first item in the range, which is usually last_backfill.
+ */
+ if (r == -ENOENT)
+ continue;
+
+ ceph_assert(r >= 0);
+ object_info_t oi(bl);
+ bi->objects[*p] = oi.version;
+ dout(20) << " " << *p << " " << oi.version << dendl;
+ }
+ }
+}
+
+
+/** check_local
+ *
+ * verifies that stray objects have been deleted
+ */
+void PrimaryLogPG::check_local()
+{
+ dout(10) << __func__ << dendl;
+
+ ceph_assert(
+ info.last_update >=
+ recovery_state.get_pg_log().get_tail()); // otherwise we need some help!
+
+ if (!cct->_conf->osd_debug_verify_stray_on_activate)
+ return;
+
+ // just scan the log.
+ set<hobject_t> did;
+ for (list<pg_log_entry_t>::const_reverse_iterator p = recovery_state.get_pg_log().get_log().log.rbegin();
+ p != recovery_state.get_pg_log().get_log().log.rend();
+ ++p) {
+ if (did.count(p->soid))
+ continue;
+ did.insert(p->soid);
+
+ if (p->is_delete() && !is_missing_object(p->soid)) {
+ dout(10) << " checking " << p->soid
+ << " at " << p->version << dendl;
+ struct stat st;
+ int r = osd->store->stat(
+ ch,
+ ghobject_t(p->soid, ghobject_t::NO_GEN, pg_whoami.shard),
+ &st);
+ if (r != -ENOENT) {
+ derr << __func__ << " " << p->soid << " exists, but should have been "
+ << "deleted" << dendl;
+ ceph_abort_msg("erroneously present object");
+ }
+ } else {
+ // ignore old(+missing) objects
+ }
+ }
+}
+
+
+
+// ===========================
+// hit sets
+
+hobject_t PrimaryLogPG::get_hit_set_current_object(utime_t stamp)
+{
+ ostringstream ss;
+ ss << "hit_set_" << info.pgid.pgid << "_current_" << stamp;
+ hobject_t hoid(sobject_t(ss.str(), CEPH_NOSNAP), "",
+ info.pgid.ps(), info.pgid.pool(),
+ cct->_conf->osd_hit_set_namespace);
+ dout(20) << __func__ << " " << hoid << dendl;
+ return hoid;
+}
+
+hobject_t PrimaryLogPG::get_hit_set_archive_object(utime_t start,
+ utime_t end,
+ bool using_gmt)
+{
+ ostringstream ss;
+ ss << "hit_set_" << info.pgid.pgid << "_archive_";
+ if (using_gmt) {
+ start.gmtime(ss, true /* legacy pre-octopus form */) << "_";
+ end.gmtime(ss, true /* legacy pre-octopus form */);
+ } else {
+ start.localtime(ss, true /* legacy pre-octopus form */) << "_";
+ end.localtime(ss, true /* legacy pre-octopus form */);
+ }
+ hobject_t hoid(sobject_t(ss.str(), CEPH_NOSNAP), "",
+ info.pgid.ps(), info.pgid.pool(),
+ cct->_conf->osd_hit_set_namespace);
+ dout(20) << __func__ << " " << hoid << dendl;
+ return hoid;
+}
+
+void PrimaryLogPG::hit_set_clear()
+{
+ dout(20) << __func__ << dendl;
+ hit_set.reset();
+ hit_set_start_stamp = utime_t();
+}
+
+void PrimaryLogPG::hit_set_setup()
+{
+ if (!is_active() ||
+ !is_primary()) {
+ hit_set_clear();
+ return;
+ }
+
+ if (is_active() && is_primary() &&
+ (!pool.info.hit_set_count ||
+ !pool.info.hit_set_period ||
+ pool.info.hit_set_params.get_type() == HitSet::TYPE_NONE)) {
+ hit_set_clear();
+
+ // only primary is allowed to remove all the hit set objects
+ hit_set_remove_all();
+ return;
+ }
+
+ // FIXME: discard any previous data for now
+ hit_set_create();
+
+ // include any writes we know about from the pg log. this doesn't
+ // capture reads, but it is better than nothing!
+ hit_set_apply_log();
+}
+
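+// Delete every archived hit set object (primary only). Bails out if any
+// archive is degraded or blocked by scrub; otherwise submits a single internal
+// op that trims the whole history, then resets the recorded hit set history
+// and discards any agent-cached hit sets.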
+void PrimaryLogPG::hit_set_remove_all()
+{
+ // If any archives are degraded we skip this
+ for (auto p = info.hit_set.history.begin();
+ p != info.hit_set.history.end();
+ ++p) {
+ hobject_t aoid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
+
+ // Once we hit a degraded object just skip
+ if (is_degraded_or_backfilling_object(aoid))
+ return;
+ if (m_scrubber->write_blocked_by_scrub(aoid))
+ return;
+ }
+
+ if (!info.hit_set.history.empty()) {
+ auto p = info.hit_set.history.rbegin();
+ ceph_assert(p != info.hit_set.history.rend());
+ hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
+ ceph_assert(!is_degraded_or_backfilling_object(oid));
+ ObjectContextRef obc = get_object_context(oid, false);
+ ceph_assert(obc);
+
+ OpContextUPtr ctx = simple_opc_create(obc);
+ ctx->at_version = get_next_version();
+ ctx->updated_hset_history = info.hit_set;
+ utime_t now = ceph_clock_now();
+ ctx->mtime = now;
+ hit_set_trim(ctx, 0);
+ simple_opc_submit(std::move(ctx));
+ }
+
+ recovery_state.update_hset(pg_hit_set_history_t());
+ if (agent_state) {
+ agent_state->discard_hit_sets();
+ }
+}
+
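+// Allocate a fresh in-memory HitSet from the pool's parameters. For bloom
+// filters, the false-positive rate is spread across hit_set_count intervals
+// and target_size is estimated from the previous interval when not configured.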
+void PrimaryLogPG::hit_set_create()
+{
+ utime_t now = ceph_clock_now();
+ // make a copy of the params to modify
+ HitSet::Params params(pool.info.hit_set_params);
+
+ dout(20) << __func__ << " " << params << dendl;
+ if (pool.info.hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
+ BloomHitSet::Params *p =
+ static_cast<BloomHitSet::Params*>(params.impl.get());
+
+ // convert false positive rate so it holds up across the full period
+ p->set_fpp(p->get_fpp() / pool.info.hit_set_count);
+ if (p->get_fpp() <= 0.0)
+ p->set_fpp(.01); // fpp cannot be zero!
+
+    // if we don't have a specified size, estimate the target size based on
+    // the previous bin!
+ if (p->target_size == 0 && hit_set) {
+ utime_t dur = now - hit_set_start_stamp;
+ unsigned unique = hit_set->approx_unique_insert_count();
+ dout(20) << __func__ << " previous set had approx " << unique
+ << " unique items over " << dur << " seconds" << dendl;
+ p->target_size = (double)unique * (double)pool.info.hit_set_period
+ / (double)dur;
+ }
+ if (p->target_size <
+ static_cast<uint64_t>(cct->_conf->osd_hit_set_min_size))
+ p->target_size = cct->_conf->osd_hit_set_min_size;
+
+ if (p->target_size
+ > static_cast<uint64_t>(cct->_conf->osd_hit_set_max_size))
+ p->target_size = cct->_conf->osd_hit_set_max_size;
+
+ p->seed = now.sec();
+
+ dout(10) << __func__ << " target_size " << p->target_size
+ << " fpp " << p->get_fpp() << dendl;
+ }
+ hit_set.reset(new HitSet(params));
+ hit_set_start_stamp = now;
+}
+
+/**
+ * apply log entries to set
+ *
+ * this would only happen after peering, to at least capture writes
+ * during an interval that was potentially lost.
+ */
+bool PrimaryLogPG::hit_set_apply_log()
+{
+ if (!hit_set)
+ return false;
+
+ eversion_t to = info.last_update;
+ eversion_t from = info.hit_set.current_last_update;
+ if (to <= from) {
+ dout(20) << __func__ << " no update" << dendl;
+ return false;
+ }
+
+ dout(20) << __func__ << " " << to << " .. " << info.last_update << dendl;
+ list<pg_log_entry_t>::const_reverse_iterator p =
+ recovery_state.get_pg_log().get_log().log.rbegin();
+ while (p != recovery_state.get_pg_log().get_log().log.rend() && p->version > to)
+ ++p;
+ while (p != recovery_state.get_pg_log().get_log().log.rend() && p->version > from) {
+ hit_set->insert(p->soid);
+ ++p;
+ }
+
+ return true;
+}
+
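+// Seal the current in-memory hit set and persist it as a new archive object
+// via an internal op, appending it to the hit set history and trimming the
+// history back to pool.info.hit_set_count entries. Backs off if an archive is
+// degraded or blocked by scrub, or if backfill could overlap the hit_set
+// objects.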
+void PrimaryLogPG::hit_set_persist()
+{
+ dout(10) << __func__ << dendl;
+ bufferlist bl;
+ unsigned max = pool.info.hit_set_count;
+
+ utime_t now = ceph_clock_now();
+ hobject_t oid;
+
+  // If any archives are degraded we skip this persist request
+  // (account for the additional entry being added below)
+ for (auto p = info.hit_set.history.begin();
+ p != info.hit_set.history.end();
+ ++p) {
+ hobject_t aoid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
+
+ // Once we hit a degraded object just skip further trim
+ if (is_degraded_or_backfilling_object(aoid))
+ return;
+ if (m_scrubber->write_blocked_by_scrub(aoid))
+ return;
+ }
+
+ // If backfill is in progress and we could possibly overlap with the
+ // hit_set_* objects, back off. Since these all have
+ // hobject_t::hash set to pgid.ps(), and those sort first, we can
+ // look just at that. This is necessary because our transactions
+ // may include a modify of the new hit_set *and* a delete of the
+ // old one, and this may span the backfill boundary.
+ for (set<pg_shard_t>::const_iterator p = get_backfill_targets().begin();
+ p != get_backfill_targets().end();
+ ++p) {
+ const pg_info_t& pi = recovery_state.get_peer_info(*p);
+ if (pi.last_backfill == hobject_t() ||
+ pi.last_backfill.get_hash() == info.pgid.ps()) {
+ dout(10) << __func__ << " backfill target osd." << *p
+ << " last_backfill has not progressed past pgid ps"
+ << dendl;
+ return;
+ }
+ }
+
+
+ pg_hit_set_info_t new_hset = pg_hit_set_info_t(pool.info.use_gmt_hitset);
+ new_hset.begin = hit_set_start_stamp;
+ new_hset.end = now;
+ oid = get_hit_set_archive_object(
+ new_hset.begin,
+ new_hset.end,
+ new_hset.using_gmt);
+
+ // If the current object is degraded we skip this persist request
+ if (m_scrubber->write_blocked_by_scrub(oid))
+ return;
+
+ hit_set->seal();
+ encode(*hit_set, bl);
+ dout(20) << __func__ << " archive " << oid << dendl;
+
+ if (agent_state) {
+ agent_state->add_hit_set(new_hset.begin, hit_set);
+ uint32_t size = agent_state->hit_set_map.size();
+ if (size >= pool.info.hit_set_count) {
+ size = pool.info.hit_set_count > 0 ? pool.info.hit_set_count - 1: 0;
+ }
+ hit_set_in_memory_trim(size);
+ }
+
+ ObjectContextRef obc = get_object_context(oid, true);
+ OpContextUPtr ctx = simple_opc_create(obc);
+
+ ctx->at_version = get_next_version();
+ ctx->updated_hset_history = info.hit_set;
+ pg_hit_set_history_t &updated_hit_set_hist = *(ctx->updated_hset_history);
+
+ updated_hit_set_hist.current_last_update = info.last_update;
+ new_hset.version = ctx->at_version;
+
+ updated_hit_set_hist.history.push_back(new_hset);
+ hit_set_create();
+
+ // fabricate an object_info_t and SnapSet
+ obc->obs.oi.version = ctx->at_version;
+ obc->obs.oi.mtime = now;
+ obc->obs.oi.size = bl.length();
+ obc->obs.exists = true;
+ obc->obs.oi.set_data_digest(bl.crc32c(-1));
+
+ ctx->new_obs = obc->obs;
+
+ ctx->new_snapset = obc->ssc->snapset;
+
+ ctx->delta_stats.num_objects++;
+ ctx->delta_stats.num_objects_hit_set_archive++;
+
+ ctx->delta_stats.num_bytes += bl.length();
+ ctx->delta_stats.num_bytes_hit_set_archive += bl.length();
+
+ bufferlist bss;
+ encode(ctx->new_snapset, bss);
+ bufferlist boi(sizeof(ctx->new_obs.oi));
+ encode(ctx->new_obs.oi, boi,
+ get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
+
+ ctx->op_t->create(oid);
+ if (bl.length()) {
+ ctx->op_t->write(oid, 0, bl.length(), bl, 0);
+ write_update_size_and_usage(ctx->delta_stats, obc->obs.oi, ctx->modified_ranges,
+ 0, bl.length());
+ ctx->clean_regions.mark_data_region_dirty(0, bl.length());
+ }
+ map <string, bufferlist> attrs;
+ attrs[OI_ATTR] = std::move(boi);
+ attrs[SS_ATTR] = std::move(bss);
+ setattrs_maybe_cache(ctx->obc, ctx->op_t.get(), attrs);
+ ctx->log.push_back(
+ pg_log_entry_t(
+ pg_log_entry_t::MODIFY,
+ oid,
+ ctx->at_version,
+ eversion_t(),
+ 0,
+ osd_reqid_t(),
+ ctx->mtime,
+ 0)
+ );
+ ctx->log.back().clean_regions = ctx->clean_regions;
+
+ hit_set_trim(ctx, max);
+
+ simple_opc_submit(std::move(ctx));
+}
+
+void PrimaryLogPG::hit_set_trim(OpContextUPtr &ctx, unsigned max)
+{
+ ceph_assert(ctx->updated_hset_history);
+ pg_hit_set_history_t &updated_hit_set_hist =
+ *(ctx->updated_hset_history);
+ for (unsigned num = updated_hit_set_hist.history.size(); num > max; --num) {
+ list<pg_hit_set_info_t>::iterator p = updated_hit_set_hist.history.begin();
+ ceph_assert(p != updated_hit_set_hist.history.end());
+ hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
+
+ ceph_assert(!is_degraded_or_backfilling_object(oid));
+
+ dout(20) << __func__ << " removing " << oid << dendl;
+ ++ctx->at_version.version;
+ ctx->log.push_back(
+ pg_log_entry_t(pg_log_entry_t::DELETE,
+ oid,
+ ctx->at_version,
+ p->version,
+ 0,
+ osd_reqid_t(),
+ ctx->mtime,
+ 0));
+
+ ctx->op_t->remove(oid);
+ updated_hit_set_hist.history.pop_front();
+
+ ObjectContextRef obc = get_object_context(oid, false);
+ ceph_assert(obc);
+ --ctx->delta_stats.num_objects;
+ --ctx->delta_stats.num_objects_hit_set_archive;
+ ctx->delta_stats.num_bytes -= obc->obs.oi.size;
+ ctx->delta_stats.num_bytes_hit_set_archive -= obc->obs.oi.size;
+ }
+}
+
+void PrimaryLogPG::hit_set_in_memory_trim(uint32_t max_in_memory)
+{
+ while (agent_state->hit_set_map.size() > max_in_memory) {
+ agent_state->remove_oldest_hit_set();
+ }
+}
+
+
+// =======================================
+// cache agent
+
+void PrimaryLogPG::agent_setup()
+{
+ ceph_assert(is_locked());
+ if (!is_active() ||
+ !is_primary() ||
+ state_test(PG_STATE_PREMERGE) ||
+ pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE ||
+ pool.info.tier_of < 0 ||
+ !get_osdmap()->have_pg_pool(pool.info.tier_of)) {
+ agent_clear();
+ return;
+ }
+ if (!agent_state) {
+ agent_state.reset(new TierAgentState);
+
+ // choose random starting position
+ agent_state->position = hobject_t();
+ agent_state->position.pool = info.pgid.pool();
+ agent_state->position.set_hash(pool.info.get_random_pg_position(
+ info.pgid.pgid,
+ rand()));
+ agent_state->start = agent_state->position;
+
+ dout(10) << __func__ << " allocated new state, position "
+ << agent_state->position << dendl;
+ } else {
+ dout(10) << __func__ << " keeping existing state" << dendl;
+ }
+
+ if (info.stats.stats_invalid) {
+ osd->clog->warn() << "pg " << info.pgid << " has invalid (post-split) stats; must scrub before tier agent can activate";
+ }
+
+ agent_choose_mode();
+}
+
+void PrimaryLogPG::agent_clear()
+{
+ agent_stop();
+ agent_state.reset(NULL);
+}
+
+// Return false if no objects have been operated on since the start of the object hash space
+bool PrimaryLogPG::agent_work(int start_max, int agent_flush_quota)
+{
+ std::scoped_lock locker{*this};
+ if (!agent_state) {
+ dout(10) << __func__ << " no agent state, stopping" << dendl;
+ return true;
+ }
+
+ ceph_assert(!recovery_state.is_deleting());
+
+ if (agent_state->is_idle()) {
+ dout(10) << __func__ << " idle, stopping" << dendl;
+ return true;
+ }
+
+ osd->logger->inc(l_osd_agent_wake);
+
+ dout(10) << __func__
+ << " max " << start_max
+ << ", flush " << agent_state->get_flush_mode_name()
+ << ", evict " << agent_state->get_evict_mode_name()
+ << ", pos " << agent_state->position
+ << dendl;
+ ceph_assert(is_primary());
+ ceph_assert(is_active());
+
+ agent_load_hit_sets();
+
+ const pg_pool_t *base_pool = get_osdmap()->get_pg_pool(pool.info.tier_of);
+ ceph_assert(base_pool);
+
+ int ls_min = 1;
+ int ls_max = cct->_conf->osd_pool_default_cache_max_evict_check_size;
+
+ // list some objects. this conveniently lists clones (oldest to
+ // newest) before heads... the same order we want to flush in.
+ //
+ // NOTE: do not flush the Sequencer. we will assume that the
+ // listing we get back is imprecise.
+ vector<hobject_t> ls;
+ hobject_t next;
+ int r = pgbackend->objects_list_partial(agent_state->position, ls_min, ls_max,
+ &ls, &next);
+ ceph_assert(r >= 0);
+ dout(20) << __func__ << " got " << ls.size() << " objects" << dendl;
+ int started = 0;
+ for (vector<hobject_t>::iterator p = ls.begin();
+ p != ls.end();
+ ++p) {
+ if (p->nspace == cct->_conf->osd_hit_set_namespace) {
+ dout(20) << __func__ << " skip (hit set) " << *p << dendl;
+ osd->logger->inc(l_osd_agent_skip);
+ continue;
+ }
+ if (is_degraded_or_backfilling_object(*p)) {
+ dout(20) << __func__ << " skip (degraded) " << *p << dendl;
+ osd->logger->inc(l_osd_agent_skip);
+ continue;
+ }
+ if (is_missing_object(p->get_head())) {
+ dout(20) << __func__ << " skip (missing head) " << *p << dendl;
+ osd->logger->inc(l_osd_agent_skip);
+ continue;
+ }
+ ObjectContextRef obc = get_object_context(*p, false, NULL);
+ if (!obc) {
+ // we didn't flush; we may miss something here.
+ dout(20) << __func__ << " skip (no obc) " << *p << dendl;
+ osd->logger->inc(l_osd_agent_skip);
+ continue;
+ }
+ if (!obc->obs.exists) {
+ dout(20) << __func__ << " skip (dne) " << obc->obs.oi.soid << dendl;
+ osd->logger->inc(l_osd_agent_skip);
+ continue;
+ }
+ if (m_scrubber->range_intersects_scrub(obc->obs.oi.soid,
+ obc->obs.oi.soid.get_head())) {
+ dout(20) << __func__ << " skip (scrubbing) " << obc->obs.oi << dendl;
+ osd->logger->inc(l_osd_agent_skip);
+ continue;
+ }
+ if (obc->is_blocked()) {
+ dout(20) << __func__ << " skip (blocked) " << obc->obs.oi << dendl;
+ osd->logger->inc(l_osd_agent_skip);
+ continue;
+ }
+ if (obc->is_request_pending()) {
+ dout(20) << __func__ << " skip (request pending) " << obc->obs.oi << dendl;
+ osd->logger->inc(l_osd_agent_skip);
+ continue;
+ }
+
+ // be careful flushing omap to an EC pool.
+ if (!base_pool->supports_omap() &&
+ obc->obs.oi.is_omap()) {
+ dout(20) << __func__ << " skip (omap to EC) " << obc->obs.oi << dendl;
+ osd->logger->inc(l_osd_agent_skip);
+ continue;
+ }
+
+ if (agent_state->evict_mode != TierAgentState::EVICT_MODE_IDLE &&
+ agent_maybe_evict(obc, false))
+ ++started;
+ else if (agent_state->flush_mode != TierAgentState::FLUSH_MODE_IDLE &&
+ agent_flush_quota > 0 && agent_maybe_flush(obc)) {
+ ++started;
+ --agent_flush_quota;
+ }
+ if (started >= start_max) {
+ // If finishing early, set "next" to the next object
+ if (++p != ls.end())
+ next = *p;
+ break;
+ }
+ }
+
+ if (++agent_state->hist_age > cct->_conf->osd_agent_hist_halflife) {
+ dout(20) << __func__ << " resetting atime and temp histograms" << dendl;
+ agent_state->hist_age = 0;
+ agent_state->temp_hist.decay();
+ }
+
+ // Total objects operated on so far
+ int total_started = agent_state->started + started;
+ bool need_delay = false;
+
+ dout(20) << __func__ << " start pos " << agent_state->position
+ << " next start pos " << next
+ << " started " << total_started << dendl;
+
+ // See if we've made a full pass over the object hash space
+ // This might check at most ls_max objects a second time to notice that
+  // we've checked every object at least once.
+ if (agent_state->position < agent_state->start &&
+ next >= agent_state->start) {
+ dout(20) << __func__ << " wrap around " << agent_state->start << dendl;
+ if (total_started == 0)
+ need_delay = true;
+ else
+ total_started = 0;
+ agent_state->start = next;
+ }
+ agent_state->started = total_started;
+
+ // See if we are starting from beginning
+ if (next.is_max())
+ agent_state->position = hobject_t();
+ else
+ agent_state->position = next;
+
+ // Discard old in memory HitSets
+ hit_set_in_memory_trim(pool.info.hit_set_count);
+
+ if (need_delay) {
+ ceph_assert(agent_state->delaying == false);
+ agent_delay();
+ return false;
+ }
+ agent_choose_mode();
+ return true;
+}
+
+void PrimaryLogPG::agent_load_hit_sets()
+{
+ if (agent_state->evict_mode == TierAgentState::EVICT_MODE_IDLE) {
+ return;
+ }
+
+ if (agent_state->hit_set_map.size() < info.hit_set.history.size()) {
+ dout(10) << __func__ << dendl;
+ for (auto p = info.hit_set.history.begin();
+ p != info.hit_set.history.end(); ++p) {
+ if (agent_state->hit_set_map.count(p->begin.sec()) == 0) {
+ dout(10) << __func__ << " loading " << p->begin << "-"
+ << p->end << dendl;
+ if (!pool.info.is_replicated()) {
+ // FIXME: EC not supported here yet
+ derr << __func__ << " on non-replicated pool" << dendl;
+ break;
+ }
+
+ hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
+ if (is_unreadable_object(oid)) {
+ dout(10) << __func__ << " unreadable " << oid << ", waiting" << dendl;
+ break;
+ }
+
+ ObjectContextRef obc = get_object_context(oid, false);
+ if (!obc) {
+ derr << __func__ << ": could not load hitset " << oid << dendl;
+ break;
+ }
+
+ bufferlist bl;
+ {
+ int r = osd->store->read(ch, ghobject_t(oid), 0, 0, bl);
+ ceph_assert(r >= 0);
+ }
+ HitSetRef hs(new HitSet);
+ bufferlist::const_iterator pbl = bl.begin();
+ decode(*hs, pbl);
+ agent_state->add_hit_set(p->begin.sec(), hs);
+ }
+ }
+ }
+}
+
+bool PrimaryLogPG::agent_maybe_flush(ObjectContextRef& obc)
+{
+ if (!obc->obs.oi.is_dirty()) {
+ dout(20) << __func__ << " skip (clean) " << obc->obs.oi << dendl;
+ osd->logger->inc(l_osd_agent_skip);
+ return false;
+ }
+ if (obc->obs.oi.is_cache_pinned()) {
+ dout(20) << __func__ << " skip (cache_pinned) " << obc->obs.oi << dendl;
+ osd->logger->inc(l_osd_agent_skip);
+ return false;
+ }
+
+ utime_t now = ceph_clock_now();
+ utime_t ob_local_mtime;
+ if (obc->obs.oi.local_mtime != utime_t()) {
+ ob_local_mtime = obc->obs.oi.local_mtime;
+ } else {
+ ob_local_mtime = obc->obs.oi.mtime;
+ }
+ bool evict_mode_full =
+ (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL);
+ if (!evict_mode_full &&
+ obc->obs.oi.soid.snap == CEPH_NOSNAP && // snaps immutable; don't delay
+ (ob_local_mtime + utime_t(pool.info.cache_min_flush_age, 0) > now)) {
+ dout(20) << __func__ << " skip (too young) " << obc->obs.oi << dendl;
+ osd->logger->inc(l_osd_agent_skip);
+ return false;
+ }
+
+ if (osd->agent_is_active_oid(obc->obs.oi.soid)) {
+ dout(20) << __func__ << " skip (flushing) " << obc->obs.oi << dendl;
+ osd->logger->inc(l_osd_agent_skip);
+ return false;
+ }
+
+ dout(10) << __func__ << " flushing " << obc->obs.oi << dendl;
+
+ // FIXME: flush anything dirty, regardless of what distribution of
+ // ages we expect.
+
+ hobject_t oid = obc->obs.oi.soid;
+ osd->agent_start_op(oid);
+ // no need to capture a pg ref, can't outlive fop or ctx
+ std::function<void()> on_flush = [this, oid]() {
+ osd->agent_finish_op(oid);
+ };
+
+ int result = start_flush(
+ OpRequestRef(), obc, false, NULL,
+ on_flush);
+ if (result != -EINPROGRESS) {
+ on_flush();
+ dout(10) << __func__ << " start_flush() failed " << obc->obs.oi
+ << " with " << result << dendl;
+ osd->logger->inc(l_osd_agent_skip);
+ return false;
+ }
+
+ osd->logger->inc(l_osd_agent_flush);
+ return true;
+}
+
+bool PrimaryLogPG::agent_maybe_evict(ObjectContextRef& obc, bool after_flush)
+{
+ const hobject_t& soid = obc->obs.oi.soid;
+ if (!after_flush && obc->obs.oi.is_dirty()) {
+ dout(20) << __func__ << " skip (dirty) " << obc->obs.oi << dendl;
+ return false;
+ }
+ // This is already checked by agent_work() which passes after_flush = false
+ if (after_flush && m_scrubber->range_intersects_scrub(soid, soid.get_head())) {
+ dout(20) << __func__ << " skip (scrubbing) " << obc->obs.oi << dendl;
+ return false;
+ }
+ if (!obc->obs.oi.watchers.empty()) {
+ dout(20) << __func__ << " skip (watchers) " << obc->obs.oi << dendl;
+ return false;
+ }
+ if (obc->is_blocked()) {
+ dout(20) << __func__ << " skip (blocked) " << obc->obs.oi << dendl;
+ return false;
+ }
+ if (obc->obs.oi.is_cache_pinned()) {
+ dout(20) << __func__ << " skip (cache_pinned) " << obc->obs.oi << dendl;
+ return false;
+ }
+
+ if (soid.snap == CEPH_NOSNAP) {
+ int result = _verify_no_head_clones(soid, obc->ssc->snapset);
+ if (result < 0) {
+ dout(20) << __func__ << " skip (clones) " << obc->obs.oi << dendl;
+ return false;
+ }
+ }
+
+ if (agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL) {
+    // is this object older than cache_min_evict_age?
+ utime_t now = ceph_clock_now();
+ utime_t ob_local_mtime;
+ if (obc->obs.oi.local_mtime != utime_t()) {
+ ob_local_mtime = obc->obs.oi.local_mtime;
+ } else {
+ ob_local_mtime = obc->obs.oi.mtime;
+ }
+ if (ob_local_mtime + utime_t(pool.info.cache_min_evict_age, 0) > now) {
+ dout(20) << __func__ << " skip (too young) " << obc->obs.oi << dendl;
+ osd->logger->inc(l_osd_agent_skip);
+ return false;
+ }
+ // is this object old and/or cold enough?
+ int temp = 0;
+ uint64_t temp_upper = 0, temp_lower = 0;
+ if (hit_set)
+ agent_estimate_temp(soid, &temp);
+ agent_state->temp_hist.add(temp);
+ agent_state->temp_hist.get_position_micro(temp, &temp_lower, &temp_upper);
+
+ dout(20) << __func__
+ << " temp " << temp
+ << " pos " << temp_lower << "-" << temp_upper
+ << ", evict_effort " << agent_state->evict_effort
+ << dendl;
+ dout(30) << "agent_state:\n";
+ Formatter *f = Formatter::create("");
+ f->open_object_section("agent_state");
+ agent_state->dump(f);
+ f->close_section();
+ f->flush(*_dout);
+ delete f;
+ *_dout << dendl;
+
+ if (1000000 - temp_upper >= agent_state->evict_effort)
+ return false;
+ }
+
+ dout(10) << __func__ << " evicting " << obc->obs.oi << dendl;
+ OpContextUPtr ctx = simple_opc_create(obc);
+
+ auto null_op_req = OpRequestRef();
+ if (!ctx->lock_manager.get_lock_type(
+ RWState::RWWRITE,
+ obc->obs.oi.soid,
+ obc,
+ null_op_req)) {
+ close_op_ctx(ctx.release());
+ dout(20) << __func__ << " skip (cannot get lock) " << obc->obs.oi << dendl;
+ return false;
+ }
+
+ osd->agent_start_evict_op();
+ ctx->register_on_finish(
+ [this]() {
+ osd->agent_finish_evict_op();
+ });
+
+ ctx->at_version = get_next_version();
+ ceph_assert(ctx->new_obs.exists);
+ int r = _delete_oid(ctx.get(), true, false);
+ if (obc->obs.oi.is_omap())
+ ctx->delta_stats.num_objects_omap--;
+ ctx->delta_stats.num_evict++;
+ ctx->delta_stats.num_evict_kb += shift_round_up(obc->obs.oi.size, 10);
+ if (obc->obs.oi.is_dirty())
+ --ctx->delta_stats.num_objects_dirty;
+ ceph_assert(r == 0);
+ finish_ctx(ctx.get(), pg_log_entry_t::DELETE);
+ simple_opc_submit(std::move(ctx));
+ osd->logger->inc(l_osd_tier_evict);
+ osd->logger->inc(l_osd_agent_evict);
+ return true;
+}
+
+void PrimaryLogPG::agent_stop()
+{
+ dout(20) << __func__ << dendl;
+ if (agent_state && !agent_state->is_idle()) {
+ agent_state->evict_mode = TierAgentState::EVICT_MODE_IDLE;
+ agent_state->flush_mode = TierAgentState::FLUSH_MODE_IDLE;
+ osd->agent_disable_pg(this, agent_state->evict_effort);
+ }
+}
+
+void PrimaryLogPG::agent_delay()
+{
+ dout(20) << __func__ << dendl;
+ if (agent_state && !agent_state->is_idle()) {
+ ceph_assert(agent_state->delaying == false);
+ agent_state->delaying = true;
+ osd->agent_disable_pg(this, agent_state->evict_effort);
+ }
+}
+
+void PrimaryLogPG::agent_choose_mode_restart()
+{
+ dout(20) << __func__ << dendl;
+ std::scoped_lock locker{*this};
+ if (agent_state && agent_state->delaying) {
+ agent_state->delaying = false;
+ agent_choose_mode(true);
+ }
+}
+
+bool PrimaryLogPG::agent_choose_mode(bool restart, OpRequestRef op)
+{
+ bool requeued = false;
+ // Let delay play out
+ if (agent_state->delaying) {
+ dout(20) << __func__ << " " << this << " delaying, ignored" << dendl;
+ return requeued;
+ }
+
+ TierAgentState::flush_mode_t flush_mode = TierAgentState::FLUSH_MODE_IDLE;
+ TierAgentState::evict_mode_t evict_mode = TierAgentState::EVICT_MODE_IDLE;
+ unsigned evict_effort = 0;
+
+ if (info.stats.stats_invalid) {
+ // idle; stats can't be trusted until we scrub.
+ dout(20) << __func__ << " stats invalid (post-split), idle" << dendl;
+ goto skip_calc;
+ }
+
+ {
+ uint64_t divisor = pool.info.get_pg_num_divisor(info.pgid.pgid);
+ ceph_assert(divisor > 0);
+
+ // adjust (effective) user objects down based on the number
+ // of HitSet objects, which should not count toward our total since
+ // they cannot be flushed.
+ uint64_t unflushable = info.stats.stats.sum.num_objects_hit_set_archive;
+
+ // also exclude omap objects if ec backing pool
+ const pg_pool_t *base_pool = get_osdmap()->get_pg_pool(pool.info.tier_of);
+ ceph_assert(base_pool);
+ if (!base_pool->supports_omap())
+ unflushable += info.stats.stats.sum.num_objects_omap;
+
+ uint64_t num_user_objects = info.stats.stats.sum.num_objects;
+ if (num_user_objects > unflushable)
+ num_user_objects -= unflushable;
+ else
+ num_user_objects = 0;
+
+ uint64_t num_user_bytes = info.stats.stats.sum.num_bytes;
+ uint64_t unflushable_bytes = info.stats.stats.sum.num_bytes_hit_set_archive;
+ num_user_bytes -= unflushable_bytes;
+ uint64_t num_overhead_bytes = osd->store->estimate_objects_overhead(num_user_objects);
+ num_user_bytes += num_overhead_bytes;
+
+ // also reduce the num_dirty by num_objects_omap
+ int64_t num_dirty = info.stats.stats.sum.num_objects_dirty;
+ if (!base_pool->supports_omap()) {
+ if (num_dirty > info.stats.stats.sum.num_objects_omap)
+ num_dirty -= info.stats.stats.sum.num_objects_omap;
+ else
+ num_dirty = 0;
+ }
+
+ dout(10) << __func__
+ << " flush_mode: "
+ << TierAgentState::get_flush_mode_name(agent_state->flush_mode)
+ << " evict_mode: "
+ << TierAgentState::get_evict_mode_name(agent_state->evict_mode)
+ << " num_objects: " << info.stats.stats.sum.num_objects
+ << " num_bytes: " << info.stats.stats.sum.num_bytes
+ << " num_objects_dirty: " << info.stats.stats.sum.num_objects_dirty
+ << " num_objects_omap: " << info.stats.stats.sum.num_objects_omap
+ << " num_dirty: " << num_dirty
+ << " num_user_objects: " << num_user_objects
+ << " num_user_bytes: " << num_user_bytes
+ << " num_overhead_bytes: " << num_overhead_bytes
+ << " pool.info.target_max_bytes: " << pool.info.target_max_bytes
+ << " pool.info.target_max_objects: " << pool.info.target_max_objects
+ << dendl;
+
+ // get dirty, full ratios
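+    // (ratios are in millionths, measured against this PG's 1/divisor share
+    // of the pool's target_max_bytes / target_max_objects)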
+ uint64_t dirty_micro = 0;
+ uint64_t full_micro = 0;
+ if (pool.info.target_max_bytes && num_user_objects > 0) {
+ uint64_t avg_size = num_user_bytes / num_user_objects;
+ dirty_micro =
+ num_dirty * avg_size * 1000000 /
+ std::max<uint64_t>(pool.info.target_max_bytes / divisor, 1);
+ full_micro =
+ num_user_objects * avg_size * 1000000 /
+ std::max<uint64_t>(pool.info.target_max_bytes / divisor, 1);
+ }
+ if (pool.info.target_max_objects > 0) {
+ uint64_t dirty_objects_micro =
+ num_dirty * 1000000 /
+ std::max<uint64_t>(pool.info.target_max_objects / divisor, 1);
+ if (dirty_objects_micro > dirty_micro)
+ dirty_micro = dirty_objects_micro;
+ uint64_t full_objects_micro =
+ num_user_objects * 1000000 /
+ std::max<uint64_t>(pool.info.target_max_objects / divisor, 1);
+ if (full_objects_micro > full_micro)
+ full_micro = full_objects_micro;
+ }
+ dout(20) << __func__ << " dirty " << ((float)dirty_micro / 1000000.0)
+ << " full " << ((float)full_micro / 1000000.0)
+ << dendl;
+
+ // flush mode
+ uint64_t flush_target = pool.info.cache_target_dirty_ratio_micro;
+ uint64_t flush_high_target = pool.info.cache_target_dirty_high_ratio_micro;
+ uint64_t flush_slop = (float)flush_target * cct->_conf->osd_agent_slop;
+ if (restart || agent_state->flush_mode == TierAgentState::FLUSH_MODE_IDLE) {
+ flush_target += flush_slop;
+ flush_high_target += flush_slop;
+ } else {
+ flush_target -= std::min(flush_target, flush_slop);
+ flush_high_target -= std::min(flush_high_target, flush_slop);
+ }
+
+ if (dirty_micro > flush_high_target) {
+ flush_mode = TierAgentState::FLUSH_MODE_HIGH;
+ } else if (dirty_micro > flush_target || (!flush_target && num_dirty > 0)) {
+ flush_mode = TierAgentState::FLUSH_MODE_LOW;
+ }
+
+ // evict mode
+ uint64_t evict_target = pool.info.cache_target_full_ratio_micro;
+ uint64_t evict_slop = (float)evict_target * cct->_conf->osd_agent_slop;
+ if (restart || agent_state->evict_mode == TierAgentState::EVICT_MODE_IDLE)
+ evict_target += evict_slop;
+ else
+ evict_target -= std::min(evict_target, evict_slop);
+
+ if (full_micro > 1000000) {
+ // evict anything clean
+ evict_mode = TierAgentState::EVICT_MODE_FULL;
+ evict_effort = 1000000;
+ } else if (full_micro > evict_target) {
+      // set effort in [0..1] range based on where we are between
+      // evict_target and completely full
+ evict_mode = TierAgentState::EVICT_MODE_SOME;
+ uint64_t over = full_micro - evict_target;
+ uint64_t span = 1000000 - evict_target;
+ evict_effort = std::max(over * 1000000 / span,
+ uint64_t(1000000.0 *
+ cct->_conf->osd_agent_min_evict_effort));
+
+ // quantize effort to avoid too much reordering in the agent_queue.
+ uint64_t inc = cct->_conf->osd_agent_quantize_effort * 1000000;
+ ceph_assert(inc > 0);
+ uint64_t was = evict_effort;
+ evict_effort -= evict_effort % inc;
+ if (evict_effort < inc)
+ evict_effort = inc;
+ ceph_assert(evict_effort >= inc && evict_effort <= 1000000);
+ dout(30) << __func__ << " evict_effort " << was << " quantized by " << inc << " to " << evict_effort << dendl;
+ }
+ }
+
+ skip_calc:
+ bool old_idle = agent_state->is_idle();
+ if (flush_mode != agent_state->flush_mode) {
+ dout(5) << __func__ << " flush_mode "
+ << TierAgentState::get_flush_mode_name(agent_state->flush_mode)
+ << " -> "
+ << TierAgentState::get_flush_mode_name(flush_mode)
+ << dendl;
+ recovery_state.update_stats(
+ [=](auto &history, auto &stats) {
+ if (flush_mode == TierAgentState::FLUSH_MODE_HIGH) {
+ osd->agent_inc_high_count();
+ stats.stats.sum.num_flush_mode_high = 1;
+ } else if (flush_mode == TierAgentState::FLUSH_MODE_LOW) {
+ stats.stats.sum.num_flush_mode_low = 1;
+ }
+ if (agent_state->flush_mode == TierAgentState::FLUSH_MODE_HIGH) {
+ osd->agent_dec_high_count();
+ stats.stats.sum.num_flush_mode_high = 0;
+ } else if (agent_state->flush_mode == TierAgentState::FLUSH_MODE_LOW) {
+ stats.stats.sum.num_flush_mode_low = 0;
+ }
+ return false;
+ });
+ agent_state->flush_mode = flush_mode;
+ }
+ if (evict_mode != agent_state->evict_mode) {
+ dout(5) << __func__ << " evict_mode "
+ << TierAgentState::get_evict_mode_name(agent_state->evict_mode)
+ << " -> "
+ << TierAgentState::get_evict_mode_name(evict_mode)
+ << dendl;
+ if (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL &&
+ is_active()) {
+ if (op)
+ requeue_op(op);
+ requeue_ops(waiting_for_flush);
+ requeue_ops(waiting_for_active);
+ requeue_ops(waiting_for_readable);
+ requeue_ops(waiting_for_scrub);
+ requeue_ops(waiting_for_cache_not_full);
+ objects_blocked_on_cache_full.clear();
+ requeued = true;
+ }
+ recovery_state.update_stats(
+ [=](auto &history, auto &stats) {
+ if (evict_mode == TierAgentState::EVICT_MODE_SOME) {
+ stats.stats.sum.num_evict_mode_some = 1;
+ } else if (evict_mode == TierAgentState::EVICT_MODE_FULL) {
+ stats.stats.sum.num_evict_mode_full = 1;
+ }
+ if (agent_state->evict_mode == TierAgentState::EVICT_MODE_SOME) {
+ stats.stats.sum.num_evict_mode_some = 0;
+ } else if (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
+ stats.stats.sum.num_evict_mode_full = 0;
+ }
+ return false;
+ });
+ agent_state->evict_mode = evict_mode;
+ }
+ uint64_t old_effort = agent_state->evict_effort;
+ if (evict_effort != agent_state->evict_effort) {
+ dout(5) << __func__ << " evict_effort "
+ << ((float)agent_state->evict_effort / 1000000.0)
+ << " -> "
+ << ((float)evict_effort / 1000000.0)
+ << dendl;
+ agent_state->evict_effort = evict_effort;
+ }
+
+ // NOTE: we are using evict_effort as a proxy for *all* agent effort
+ // (including flush). This is probably fine (they should be
+ // correlated) but it is not precisely correct.
+ if (agent_state->is_idle()) {
+ if (!restart && !old_idle) {
+ osd->agent_disable_pg(this, old_effort);
+ }
+ } else {
+ if (restart || old_idle) {
+ osd->agent_enable_pg(this, agent_state->evict_effort);
+ } else if (old_effort != agent_state->evict_effort) {
+ osd->agent_adjust_pg(this, old_effort, agent_state->evict_effort);
+ }
+ }
+ return requeued;
+}
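+
+// Editor's note: an illustrative, self-contained sketch (not part of this
+// change) of the evict-effort arithmetic computed above, so the mapping from
+// cache fullness to effort is easier to follow.  The parameters below stand
+// in for pool.info.cache_target_full_ratio_micro, osd_agent_min_evict_effort
+// and osd_agent_quantize_effort.
+namespace {
+[[maybe_unused]] uint64_t sketch_evict_effort(
+  uint64_t full_micro,     // cache fullness in millionths
+  uint64_t evict_target,   // target fullness in millionths (slop already applied)
+  double min_effort,       // osd_agent_min_evict_effort
+  double quantize)         // osd_agent_quantize_effort
+{
+  if (full_micro > 1000000)
+    return 1000000;                    // over-full: EVICT_MODE_FULL, max effort
+  if (full_micro <= evict_target)
+    return 0;                          // at or below target: no eviction
+  uint64_t over = full_micro - evict_target;
+  uint64_t span = 1000000 - evict_target;
+  uint64_t effort = std::max(over * 1000000 / span,
+                             uint64_t(1000000.0 * min_effort));
+  // quantize to limit reordering in the agent_queue; the real code asserts
+  // the increment is positive, this sketch simply clamps it
+  uint64_t inc = std::max<uint64_t>(1, uint64_t(quantize * 1000000));
+  effort -= effort % inc;
+  return std::max(effort, inc);
+}
+} // anonymous namespace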
+
+void PrimaryLogPG::agent_estimate_temp(const hobject_t& oid, int *temp)
+{
+ ceph_assert(hit_set);
+ ceph_assert(temp);
+ *temp = 0;
+ if (hit_set->contains(oid))
+ *temp = 1000000;
+ unsigned i = 0;
+ int last_n = pool.info.hit_set_search_last_n;
+ for (map<time_t,HitSetRef>::reverse_iterator p =
+ agent_state->hit_set_map.rbegin(); last_n > 0 &&
+ p != agent_state->hit_set_map.rend(); ++p, ++i) {
+ if (p->second->contains(oid)) {
+ *temp += pool.info.get_grade(i);
+ --last_n;
+ }
+ }
+}
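+
+// Editor's note: an illustrative sketch (not part of this change) of the
+// temperature estimate above.  The current HitSet contributes a fixed
+// 1000000; each of the most recent archived HitSets that contains the object
+// adds a per-generation grade.  grades[i] stands in for
+// pool.info.get_grade(i) and last_n for hit_set_search_last_n.
+namespace {
+[[maybe_unused]] int sketch_estimate_temp(
+  bool in_current_hitset,
+  const std::vector<bool>& in_archived,   // newest archived HitSet first
+  const std::vector<int>& grades,
+  int last_n)
+{
+  int temp = in_current_hitset ? 1000000 : 0;
+  for (size_t i = 0; i < in_archived.size() && last_n > 0; ++i) {
+    if (in_archived[i]) {
+      temp += i < grades.size() ? grades[i] : 0;
+      --last_n;
+    }
+  }
+  return temp;
+}
+} // anonymous namespace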
+
+// Dup op detection
+
+bool PrimaryLogPG::already_complete(eversion_t v)
+{
+ dout(20) << __func__ << ": " << v << dendl;
+ for (xlist<RepGather*>::iterator i = repop_queue.begin();
+ !i.end();
+ ++i) {
+ dout(20) << __func__ << ": " << **i << dendl;
+ // skip copy from temp object ops
+ if ((*i)->v == eversion_t()) {
+ dout(20) << __func__ << ": " << **i
+ << " version is empty" << dendl;
+ continue;
+ }
+ if ((*i)->v > v) {
+ dout(20) << __func__ << ": " << **i
+ << " (*i)->v past v" << dendl;
+ break;
+ }
+ if (!(*i)->all_committed) {
+ dout(20) << __func__ << ": " << **i
+ << " not committed, returning false"
+ << dendl;
+ return false;
+ }
+ }
+ dout(20) << __func__ << ": returning true" << dendl;
+ return true;
+}
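+
+// Editor's note: an illustrative sketch (not part of this change) of what
+// already_complete() answers: walking the repop queue in submission order,
+// is every update at or below version v committed everywhere?  SketchRepop
+// is a hypothetical stand-in for RepGather.
+namespace {
+struct SketchRepop { uint64_t version; bool all_committed; };
+[[maybe_unused]] bool sketch_already_complete(
+  const std::vector<SketchRepop>& queue, uint64_t v)
+{
+  for (const auto& r : queue) {
+    if (r.version == 0)
+      continue;            // no version recorded (e.g. copy-from-temp ops): skip
+    if (r.version > v)
+      break;               // queue is version-ordered; nothing older remains
+    if (!r.all_committed)
+      return false;        // an update at or below v is still in flight
+  }
+  return true;
+}
+} // anonymous namespace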
+
+
+// ==========================================================================================
+// SCRUB
+
+void PrimaryLogPG::do_replica_scrub_map(OpRequestRef op)
+{
+ dout(15) << __func__ << " is scrub active? " << m_scrubber->is_scrub_active() << dendl;
+ op->mark_started();
+
+ if (!m_scrubber->is_scrub_active()) {
+ dout(10) << __func__ << " scrub isn't active" << dendl;
+ return;
+ }
+ m_scrubber->map_from_replica(op);
+}
+
+bool PrimaryLogPG::_range_available_for_scrub(const hobject_t& begin,
+ const hobject_t& end)
+{
+ pair<hobject_t, ObjectContextRef> next;
+ next.second = object_contexts.lookup(begin);
+ next.first = begin;
+ bool more = true;
+ while (more && next.first < end) {
+ if (next.second && next.second->is_blocked()) {
+ next.second->requeue_scrub_on_unblock = true;
+ dout(10) << __func__ << ": scrub delayed, "
+ << next.first << " is blocked"
+ << dendl;
+ return false;
+ }
+ more = object_contexts.get_next(next.first, &next);
+ }
+ return true;
+}
+
+
+int PrimaryLogPG::rep_repair_primary_object(const hobject_t& soid, OpContext *ctx)
+{
+ OpRequestRef op = ctx->op;
+ // Only supports replicated pools
+ ceph_assert(!pool.info.is_erasure());
+ ceph_assert(is_primary());
+
+ dout(10) << __func__ << " " << soid
+ << " peers osd.{" << get_acting_recovery_backfill() << "}" << dendl;
+
+ if (!is_clean()) {
+ block_for_clean(soid, op);
+ return -EAGAIN;
+ }
+
+ ceph_assert(!recovery_state.get_pg_log().get_missing().is_missing(soid));
+ auto& oi = ctx->new_obs.oi;
+ eversion_t v = oi.version;
+
+ if (primary_error(soid, v)) {
+ dout(0) << __func__ << " No other replicas available for " << soid << dendl;
+ // XXX: If we knew that there is no down osd which could include this
+ // object, it would be nice if we could return EIO here.
+ // If a "never fail" flag was available, that could be used
+ // for rbd to NOT return EIO until object marked lost.
+
+ // Drop through to save this op in case an osd comes up with the object.
+ }
+
+ // Restart the op after object becomes readable again
+ waiting_for_unreadable_object[soid].push_back(op);
+ op->mark_delayed("waiting for missing object");
+
+ ceph_assert(is_clean());
+ state_set(PG_STATE_REPAIR);
+ state_clear(PG_STATE_CLEAN);
+ queue_peering_event(
+ PGPeeringEventRef(
+ std::make_shared<PGPeeringEvent>(
+ get_osdmap_epoch(),
+ get_osdmap_epoch(),
+ PeeringState::DoRecovery())));
+
+ return -EAGAIN;
+}
+
+/*---SnapTrimmer Logging---*/
+#undef dout_prefix
+#define dout_prefix pg->gen_prefix(*_dout)
+
+void PrimaryLogPG::SnapTrimmer::log_enter(const char *state_name)
+{
+ ldout(pg->cct, 20) << "enter " << state_name << dendl;
+}
+
+void PrimaryLogPG::SnapTrimmer::log_exit(const char *state_name, utime_t enter_time)
+{
+ ldout(pg->cct, 20) << "exit " << state_name << dendl;
+}
+
+bool PrimaryLogPG::SnapTrimmer::permit_trim() {
+ return
+ pg->is_clean() &&
+ !pg->is_scrub_queued_or_active() &&
+ !pg->snap_trimq.empty();
+}
+
+/*---SnapTrimmer states---*/
+#undef dout_prefix
+#define dout_prefix (context< SnapTrimmer >().pg->gen_prefix(*_dout) \
+ << "SnapTrimmer state<" << get_state_name() << ">: ")
+
+/* NotTrimming */
+PrimaryLogPG::NotTrimming::NotTrimming(my_context ctx)
+ : my_base(ctx),
+ NamedState(nullptr, "NotTrimming")
+{
+ context< SnapTrimmer >().log_enter(state_name);
+}
+
+void PrimaryLogPG::NotTrimming::exit()
+{
+ context< SnapTrimmer >().log_exit(state_name, enter_time);
+}
+
+boost::statechart::result PrimaryLogPG::NotTrimming::react(const KickTrim&)
+{
+ PrimaryLogPG *pg = context< SnapTrimmer >().pg;
+ ldout(pg->cct, 10) << "NotTrimming react KickTrim" << dendl;
+
+ if (!(pg->is_primary() && pg->is_active())) {
+ ldout(pg->cct, 10) << "NotTrimming not primary or active" << dendl;
+ return discard_event();
+ }
+ if (!pg->is_clean() ||
+ pg->snap_trimq.empty()) {
+ ldout(pg->cct, 10) << "NotTrimming not clean or nothing to trim" << dendl;
+ return discard_event();
+ }
+ if (pg->is_scrub_queued_or_active()) {
+ ldout(pg->cct, 10) << " scrubbing, will requeue snap_trimmer after" << dendl;
+ return transit< WaitScrub >();
+ } else {
+ return transit< Trimming >();
+ }
+}
+
+boost::statechart::result PrimaryLogPG::WaitReservation::react(const SnapTrimReserved&)
+{
+ PrimaryLogPG *pg = context< SnapTrimmer >().pg;
+ ldout(pg->cct, 10) << "WaitReservation react SnapTrimReserved" << dendl;
+
+ pending = nullptr;
+ if (!context< SnapTrimmer >().can_trim()) {
+ post_event(KickTrim());
+ return transit< NotTrimming >();
+ }
+
+ context<Trimming>().snap_to_trim = pg->snap_trimq.range_start();
+  ldout(pg->cct, 10) << "WaitReservation: trimming "
+ << pg->snap_trimq.range_start()
+ << dendl;
+ return transit< AwaitAsyncWork >();
+}
+
+/* AwaitAsyncWork */
+PrimaryLogPG::AwaitAsyncWork::AwaitAsyncWork(my_context ctx)
+ : my_base(ctx),
+ NamedState(nullptr, "Trimming/AwaitAsyncWork")
+{
+ auto *pg = context< SnapTrimmer >().pg;
+ context< SnapTrimmer >().log_enter(state_name);
+ context< SnapTrimmer >().pg->osd->queue_for_snap_trim(pg);
+ pg->state_set(PG_STATE_SNAPTRIM);
+ pg->state_clear(PG_STATE_SNAPTRIM_ERROR);
+ pg->publish_stats_to_osd();
+}
+
+boost::statechart::result PrimaryLogPG::AwaitAsyncWork::react(const DoSnapWork&)
+{
+ PrimaryLogPGRef pg = context< SnapTrimmer >().pg;
+ snapid_t snap_to_trim = context<Trimming>().snap_to_trim;
+ auto &in_flight = context<Trimming>().in_flight;
+ ceph_assert(in_flight.empty());
+
+ ceph_assert(pg->is_primary() && pg->is_active());
+ if (!context< SnapTrimmer >().can_trim()) {
+ ldout(pg->cct, 10) << "something changed, reverting to NotTrimming" << dendl;
+ post_event(KickTrim());
+ return transit< NotTrimming >();
+ }
+
+ ldout(pg->cct, 10) << "AwaitAsyncWork: trimming snap " << snap_to_trim << dendl;
+
+ vector<hobject_t> to_trim;
+ unsigned max = pg->cct->_conf->osd_pg_max_concurrent_snap_trims;
+ // we need to look for at least 1 snaptrim, otherwise we'll misinterpret
+ // the ENOENT below and erase snap_to_trim.
+ ceph_assert(max > 0);
+ to_trim.reserve(max);
+ int r = pg->snap_mapper.get_next_objects_to_trim(
+ snap_to_trim,
+ max,
+ &to_trim);
+ if (r != 0 && r != -ENOENT) {
+ lderr(pg->cct) << "get_next_objects_to_trim returned "
+ << cpp_strerror(r) << dendl;
+ ceph_abort_msg("get_next_objects_to_trim returned an invalid code");
+ } else if (r == -ENOENT) {
+ // Done!
+ ldout(pg->cct, 10) << "got ENOENT" << dendl;
+
+ pg->snap_trimq.erase(snap_to_trim);
+
+ if (pg->snap_trimq_repeat.count(snap_to_trim)) {
+ ldout(pg->cct, 10) << " removing from snap_trimq_repeat" << dendl;
+ pg->snap_trimq_repeat.erase(snap_to_trim);
+ } else {
+ ldout(pg->cct, 10) << "adding snap " << snap_to_trim
+ << " to purged_snaps"
+ << dendl;
+ ObjectStore::Transaction t;
+ pg->recovery_state.adjust_purged_snaps(
+ [snap_to_trim](auto &purged_snaps) {
+ purged_snaps.insert(snap_to_trim);
+ });
+ pg->write_if_dirty(t);
+
+ ldout(pg->cct, 10) << "purged_snaps now "
+ << pg->info.purged_snaps << ", snap_trimq now "
+ << pg->snap_trimq << dendl;
+
+ int tr = pg->osd->store->queue_transaction(pg->ch, std::move(t), NULL);
+ ceph_assert(tr == 0);
+
+ pg->recovery_state.share_pg_info();
+ }
+ post_event(KickTrim());
+ return transit< NotTrimming >();
+ }
+ ceph_assert(!to_trim.empty());
+
+ for (auto &&object: to_trim) {
+ // Get next
+ ldout(pg->cct, 10) << "AwaitAsyncWork react trimming " << object << dendl;
+ OpContextUPtr ctx;
+ int error = pg->trim_object(in_flight.empty(), object, snap_to_trim, &ctx);
+ if (error) {
+ if (error == -ENOLCK) {
+ ldout(pg->cct, 10) << "could not get write lock on obj "
+ << object << dendl;
+ } else {
+ pg->state_set(PG_STATE_SNAPTRIM_ERROR);
+ ldout(pg->cct, 10) << "Snaptrim error=" << error << dendl;
+ }
+ if (!in_flight.empty()) {
+ ldout(pg->cct, 10) << "letting the ones we already started finish" << dendl;
+ return transit< WaitRepops >();
+ }
+ if (error == -ENOLCK) {
+ ldout(pg->cct, 10) << "waiting for it to clear"
+ << dendl;
+ return transit< WaitRWLock >();
+ } else {
+ return transit< NotTrimming >();
+ }
+ }
+
+ in_flight.insert(object);
+ ctx->register_on_success(
+ [pg, object, &in_flight]() {
+ ceph_assert(in_flight.find(object) != in_flight.end());
+ in_flight.erase(object);
+ if (in_flight.empty()) {
+ if (pg->state_test(PG_STATE_SNAPTRIM_ERROR)) {
+ pg->snap_trimmer_machine.process_event(Reset());
+ } else {
+ pg->snap_trimmer_machine.process_event(RepopsComplete());
+ }
+ }
+ });
+
+ pg->simple_opc_submit(std::move(ctx));
+ }
+
+ return transit< WaitRepops >();
+}
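+
+// Editor's note: an illustrative sketch (not part of this change) of the
+// bookkeeping performed above when the snap mapper reports ENOENT, i.e. the
+// snap has no objects left to trim.  Plain std::set<uint64_t> stands in for
+// the interval sets used by the PG.
+namespace {
+[[maybe_unused]] void sketch_finish_snap(
+  uint64_t snap,
+  std::set<uint64_t>& snap_trimq,
+  std::set<uint64_t>& snap_trimq_repeat,
+  std::set<uint64_t>& purged_snaps)
+{
+  snap_trimq.erase(snap);
+  if (snap_trimq_repeat.erase(snap) == 0) {
+    // only snaps that are not queued for another pass are recorded as purged
+    purged_snaps.insert(snap);
+  }
+}
+} // anonymous namespace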
+
+void PrimaryLogPG::setattr_maybe_cache(
+ ObjectContextRef obc,
+ PGTransaction *t,
+ const string &key,
+ bufferlist &val)
+{
+ t->setattr(obc->obs.oi.soid, key, val);
+}
+
+void PrimaryLogPG::setattrs_maybe_cache(
+ ObjectContextRef obc,
+ PGTransaction *t,
+ map<string, bufferlist> &attrs)
+{
+ t->setattrs(obc->obs.oi.soid, attrs);
+}
+
+void PrimaryLogPG::rmattr_maybe_cache(
+ ObjectContextRef obc,
+ PGTransaction *t,
+ const string &key)
+{
+ t->rmattr(obc->obs.oi.soid, key);
+}
+
+int PrimaryLogPG::getattr_maybe_cache(
+ ObjectContextRef obc,
+ const string &key,
+ bufferlist *val)
+{
+ if (pool.info.is_erasure()) {
+ map<string, bufferlist>::iterator i = obc->attr_cache.find(key);
+ if (i != obc->attr_cache.end()) {
+ if (val)
+ *val = i->second;
+ return 0;
+ } else {
+ return -ENODATA;
+ }
+ }
+ return pgbackend->objects_get_attr(obc->obs.oi.soid, key, val);
+}
+
+int PrimaryLogPG::getattrs_maybe_cache(
+ ObjectContextRef obc,
+ map<string, bufferlist> *out)
+{
+ int r = 0;
+ ceph_assert(out);
+ if (pool.info.is_erasure()) {
+ *out = obc->attr_cache;
+ } else {
+ r = pgbackend->objects_get_attrs(obc->obs.oi.soid, out);
+ }
+ map<string, bufferlist> tmp;
+ for (map<string, bufferlist>::iterator i = out->begin();
+ i != out->end();
+ ++i) {
+ if (i->first.size() > 1 && i->first[0] == '_')
+ tmp[i->first.substr(1, i->first.size())] = std::move(i->second);
+ }
+ tmp.swap(*out);
+ return r;
+}
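+
+// Editor's note: an illustrative sketch (not part of this change) of the
+// transformation applied above: user xattrs are stored with a leading '_'
+// prefix, and only those are returned, with the prefix stripped.
+namespace {
+[[maybe_unused]] std::map<std::string, std::string>
+sketch_strip_user_attr_prefix(std::map<std::string, std::string> raw)
+{
+  std::map<std::string, std::string> out;
+  for (auto& [key, value] : raw) {
+    if (key.size() > 1 && key[0] == '_')
+      out[key.substr(1)] = std::move(value);   // "_foo" -> "foo"
+    // keys without the '_' prefix are internal and are not exposed
+  }
+  return out;
+}
+} // anonymous namespace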
+
+bool PrimaryLogPG::check_failsafe_full() {
+ return osd->check_failsafe_full(get_dpp());
+}
+
+bool PrimaryLogPG::maybe_preempt_replica_scrub(const hobject_t& oid)
+{
+ return m_scrubber->write_blocked_by_scrub(oid);
+}
+
+void intrusive_ptr_add_ref(PrimaryLogPG *pg) { pg->get("intptr"); }
+void intrusive_ptr_release(PrimaryLogPG *pg) { pg->put("intptr"); }
+
+#ifdef PG_DEBUG_REFS
+uint64_t get_with_id(PrimaryLogPG *pg) { return pg->get_with_id(); }
+void put_with_id(PrimaryLogPG *pg, uint64_t id) { return pg->put_with_id(id); }
+#endif
+
+void intrusive_ptr_add_ref(PrimaryLogPG::RepGather *repop) { repop->get(); }
+void intrusive_ptr_release(PrimaryLogPG::RepGather *repop) { repop->put(); }
diff --git a/src/osd/PrimaryLogPG.h b/src/osd/PrimaryLogPG.h
new file mode 100644
index 000000000..68cdec24e
--- /dev/null
+++ b/src/osd/PrimaryLogPG.h
@@ -0,0 +1,1969 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ * Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com>
+ *
+ * Author: Loic Dachary <loic@dachary.org>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_REPLICATEDPG_H
+#define CEPH_REPLICATEDPG_H
+
+#include <boost/tuple/tuple.hpp>
+#include "include/ceph_assert.h"
+#include "DynamicPerfStats.h"
+#include "OSD.h"
+#include "PG.h"
+#include "Watch.h"
+#include "TierAgentState.h"
+#include "messages/MOSDOpReply.h"
+#include "common/Checksummer.h"
+#include "common/sharedptr_registry.hpp"
+#include "common/shared_cache.hpp"
+#include "ReplicatedBackend.h"
+#include "PGTransaction.h"
+#include "cls/cas/cls_cas_ops.h"
+
+class CopyFromCallback;
+class PromoteCallback;
+struct RefCountCallback;
+
+class PrimaryLogPG;
+class PGLSFilter;
+class HitSet;
+struct TierAgentState;
+class OSDService;
+
+void intrusive_ptr_add_ref(PrimaryLogPG *pg);
+void intrusive_ptr_release(PrimaryLogPG *pg);
+uint64_t get_with_id(PrimaryLogPG *pg);
+void put_with_id(PrimaryLogPG *pg, uint64_t id);
+
+#ifdef PG_DEBUG_REFS
+ typedef TrackedIntPtr<PrimaryLogPG> PrimaryLogPGRef;
+#else
+ typedef boost::intrusive_ptr<PrimaryLogPG> PrimaryLogPGRef;
+#endif
+
+struct inconsistent_snapset_wrapper;
+
+class PrimaryLogPG : public PG, public PGBackend::Listener {
+ friend class OSD;
+ friend class Watch;
+ friend class PrimaryLogScrub;
+
+public:
+ MEMPOOL_CLASS_HELPERS();
+
+ /*
+ * state associated with a copy operation
+ */
+ struct OpContext;
+ class CopyCallback;
+
+ /**
+ * CopyResults stores the object metadata of interest to a copy initiator.
+ */
+ struct CopyResults {
+ ceph::real_time mtime; ///< the copy source's mtime
+ uint64_t object_size; ///< the copied object's size
+ bool started_temp_obj; ///< true if the callback needs to delete temp object
+ hobject_t temp_oid; ///< temp object (if any)
+
+ /**
+ * Function to fill in transaction; if non-empty the callback
+ * must execute it before any other accesses to the object
+ * (in order to complete the copy).
+ */
+ std::function<void(PGTransaction *)> fill_in_final_tx;
+
+ version_t user_version; ///< The copy source's user version
+ bool should_requeue; ///< op should be requeued on cancel
+ std::vector<snapid_t> snaps; ///< src's snaps (if clone)
+ snapid_t snap_seq; ///< src's snap_seq (if head)
+ librados::snap_set_t snapset; ///< src snapset (if head)
+ bool mirror_snapset;
+ bool has_omap;
+ uint32_t flags; // object_copy_data_t::FLAG_*
+ uint32_t source_data_digest, source_omap_digest;
+ uint32_t data_digest, omap_digest;
+ mempool::osd_pglog::vector<std::pair<osd_reqid_t, version_t> > reqids; // [(reqid, user_version)]
+    mempool::osd_pglog::map<uint32_t, int> reqid_return_codes; // maps reqids by index to error code
+ std::map<std::string, ceph::buffer::list> attrs; // xattrs
+ uint64_t truncate_seq;
+ uint64_t truncate_size;
+ bool is_data_digest() {
+ return flags & object_copy_data_t::FLAG_DATA_DIGEST;
+ }
+ bool is_omap_digest() {
+ return flags & object_copy_data_t::FLAG_OMAP_DIGEST;
+ }
+ CopyResults()
+ : object_size(0), started_temp_obj(false),
+ user_version(0),
+ should_requeue(false), mirror_snapset(false),
+ has_omap(false),
+ flags(0),
+ source_data_digest(-1), source_omap_digest(-1),
+ data_digest(-1), omap_digest(-1),
+ truncate_seq(0), truncate_size(0)
+ {}
+ };
+
+ struct CopyOp;
+ typedef std::shared_ptr<CopyOp> CopyOpRef;
+
+ struct CopyOp {
+ CopyCallback *cb;
+ ObjectContextRef obc;
+ hobject_t src;
+ object_locator_t oloc;
+ unsigned flags;
+ bool mirror_snapset;
+
+ CopyResults results;
+
+ ceph_tid_t objecter_tid;
+ ceph_tid_t objecter_tid2;
+
+ object_copy_cursor_t cursor;
+ std::map<std::string,ceph::buffer::list> attrs;
+ ceph::buffer::list data;
+ ceph::buffer::list omap_header;
+ ceph::buffer::list omap_data;
+ int rval;
+
+ object_copy_cursor_t temp_cursor;
+
+ /*
+     * For CopyOp the process is:
+     * step 1: read the data (attrs/omap/data) from the source object
+     * step 2: apply that data (i.e. create the new object from it)
+     * src_obj_fadvise_flags is used in step 1;
+     * dest_obj_fadvise_flags is used in step 2
+ */
+ unsigned src_obj_fadvise_flags;
+ unsigned dest_obj_fadvise_flags;
+
+ std::map<uint64_t, CopyOpRef> chunk_cops;
+ int num_chunk;
+ bool failed;
+ uint64_t start_offset = 0;
+ uint64_t last_offset = 0;
+ std::vector<OSDOp> chunk_ops;
+
+ CopyOp(CopyCallback *cb_, ObjectContextRef _obc, hobject_t s,
+ object_locator_t l,
+ version_t v,
+ unsigned f,
+ bool ms,
+ unsigned src_obj_fadvise_flags,
+ unsigned dest_obj_fadvise_flags)
+ : cb(cb_), obc(_obc), src(s), oloc(l), flags(f),
+ mirror_snapset(ms),
+ objecter_tid(0),
+ objecter_tid2(0),
+ rval(-1),
+ src_obj_fadvise_flags(src_obj_fadvise_flags),
+ dest_obj_fadvise_flags(dest_obj_fadvise_flags),
+ num_chunk(0),
+ failed(false)
+ {
+ results.user_version = v;
+ results.mirror_snapset = mirror_snapset;
+ }
+ };
+
+ /**
+ * The CopyCallback class defines an interface for completions to the
+ * copy_start code. Users of the copy infrastructure must implement
+ * one and give an instance of the class to start_copy.
+ *
+ * The implementer is responsible for making sure that the CopyCallback
+ * can associate itself with the correct copy operation.
+ */
+ typedef boost::tuple<int, CopyResults*> CopyCallbackResults;
+
+ friend class CopyFromCallback;
+ friend struct CopyFromFinisher;
+ friend class PromoteCallback;
+ friend struct PromoteFinisher;
+
+ struct ProxyReadOp {
+ OpRequestRef op;
+ hobject_t soid;
+ ceph_tid_t objecter_tid;
+ std::vector<OSDOp> &ops;
+ version_t user_version;
+ int data_offset;
+ bool canceled; ///< true if canceled
+
+ ProxyReadOp(OpRequestRef _op, hobject_t oid, std::vector<OSDOp>& _ops)
+ : op(_op), soid(oid),
+ objecter_tid(0), ops(_ops),
+ user_version(0), data_offset(0),
+ canceled(false) { }
+ };
+ typedef std::shared_ptr<ProxyReadOp> ProxyReadOpRef;
+
+ struct ProxyWriteOp {
+ OpContext *ctx;
+ OpRequestRef op;
+ hobject_t soid;
+ ceph_tid_t objecter_tid;
+ std::vector<OSDOp> &ops;
+ version_t user_version;
+ bool sent_reply;
+ utime_t mtime;
+ bool canceled;
+ osd_reqid_t reqid;
+
+ ProxyWriteOp(OpRequestRef _op, hobject_t oid, std::vector<OSDOp>& _ops, osd_reqid_t _reqid)
+ : ctx(NULL), op(_op), soid(oid),
+ objecter_tid(0), ops(_ops),
+ user_version(0), sent_reply(false),
+ canceled(false),
+ reqid(_reqid) { }
+ };
+ typedef std::shared_ptr<ProxyWriteOp> ProxyWriteOpRef;
+
+ struct FlushOp {
+ ObjectContextRef obc; ///< obc we are flushing
+ OpRequestRef op; ///< initiating op
+ std::list<OpRequestRef> dup_ops; ///< bandwagon jumpers
+ version_t flushed_version; ///< user version we are flushing
+ ceph_tid_t objecter_tid; ///< copy-from request tid
+ int rval; ///< copy-from result
+ bool blocking; ///< whether we are blocking updates
+ bool removal; ///< we are removing the backend object
+ std::optional<std::function<void()>> on_flush; ///< callback, may be null
+ // for chunked object
+ std::map<uint64_t, int> io_results;
+ std::map<uint64_t, ceph_tid_t> io_tids;
+ uint64_t chunks;
+
+ FlushOp()
+ : flushed_version(0), objecter_tid(0), rval(0),
+ blocking(false), removal(false), chunks(0) {}
+ ~FlushOp() { ceph_assert(!on_flush); }
+ };
+ typedef std::shared_ptr<FlushOp> FlushOpRef;
+
+ friend struct RefCountCallback;
+ struct ManifestOp {
+ RefCountCallback *cb;
+ ceph_tid_t objecter_tid;
+ OpRequestRef op;
+ std::map<uint64_t, int> results;
+ std::map<uint64_t, ceph_tid_t> tids;
+ std::map<hobject_t, pair<uint64_t, uint64_t>> chunks;
+ uint64_t num_chunks = 0;
+ object_manifest_t new_manifest;
+
+
+ ManifestOp(RefCountCallback* cb)
+ : cb(cb), objecter_tid(0) {}
+ };
+ typedef std::shared_ptr<ManifestOp> ManifestOpRef;
+ std::map<hobject_t, ManifestOpRef> manifest_ops;
+
+ boost::scoped_ptr<PGBackend> pgbackend;
+ PGBackend *get_pgbackend() override {
+ return pgbackend.get();
+ }
+
+ const PGBackend *get_pgbackend() const override {
+ return pgbackend.get();
+ }
+
+ /// Listener methods
+ DoutPrefixProvider *get_dpp() override {
+ return this;
+ }
+
+ void on_local_recover(
+ const hobject_t &oid,
+ const ObjectRecoveryInfo &recovery_info,
+ ObjectContextRef obc,
+ bool is_delete,
+ ObjectStore::Transaction *t
+ ) override;
+ void on_peer_recover(
+ pg_shard_t peer,
+ const hobject_t &oid,
+ const ObjectRecoveryInfo &recovery_info
+ ) override {
+ recovery_state.on_peer_recover(peer, oid, recovery_info.version);
+ }
+ void begin_peer_recover(
+ pg_shard_t peer,
+ const hobject_t oid) override {
+ recovery_state.begin_peer_recover(peer, oid);
+ }
+ void on_global_recover(
+ const hobject_t &oid,
+ const object_stat_sum_t &stat_diff,
+ bool is_delete) override;
+ void on_failed_pull(
+ const std::set<pg_shard_t> &from,
+ const hobject_t &soid,
+ const eversion_t &version) override;
+ void cancel_pull(const hobject_t &soid) override;
+ void apply_stats(
+ const hobject_t &soid,
+ const object_stat_sum_t &delta_stats) override;
+
+ bool primary_error(const hobject_t& soid, eversion_t v);
+
+ void remove_missing_object(const hobject_t &oid,
+ eversion_t v,
+ Context *on_complete) override;
+
+ template<class T> class BlessedGenContext;
+ template<class T> class UnlockedBlessedGenContext;
+ class BlessedContext;
+ Context *bless_context(Context *c) override;
+
+ GenContext<ThreadPool::TPHandle&> *bless_gencontext(
+ GenContext<ThreadPool::TPHandle&> *c) override;
+ GenContext<ThreadPool::TPHandle&> *bless_unlocked_gencontext(
+ GenContext<ThreadPool::TPHandle&> *c) override;
+
+ void send_message(int to_osd, Message *m) override {
+ osd->send_message_osd_cluster(to_osd, m, get_osdmap_epoch());
+ }
+ void queue_transaction(ObjectStore::Transaction&& t,
+ OpRequestRef op) override {
+ osd->store->queue_transaction(ch, std::move(t), op);
+ }
+ void queue_transactions(std::vector<ObjectStore::Transaction>& tls,
+ OpRequestRef op) override {
+ osd->store->queue_transactions(ch, tls, op, NULL);
+ }
+ epoch_t get_interval_start_epoch() const override {
+ return info.history.same_interval_since;
+ }
+ epoch_t get_last_peering_reset_epoch() const override {
+ return get_last_peering_reset();
+ }
+ const std::set<pg_shard_t> &get_acting_recovery_backfill_shards() const override {
+ return get_acting_recovery_backfill();
+ }
+ const std::set<pg_shard_t> &get_acting_shards() const override {
+ return recovery_state.get_actingset();
+ }
+ const std::set<pg_shard_t> &get_backfill_shards() const override {
+ return get_backfill_targets();
+ }
+
+ std::ostream& gen_dbg_prefix(std::ostream& out) const override {
+ return gen_prefix(out);
+ }
+
+ const HobjToShardSetMapping& get_missing_loc_shards() const override
+ {
+ return recovery_state.get_missing_loc().get_missing_locs();
+ }
+ const std::map<pg_shard_t, pg_missing_t> &get_shard_missing() const override {
+ return recovery_state.get_peer_missing();
+ }
+ using PGBackend::Listener::get_shard_missing;
+ const std::map<pg_shard_t, pg_info_t> &get_shard_info() const override {
+ return recovery_state.get_peer_info();
+ }
+ using PGBackend::Listener::get_shard_info;
+ const pg_missing_tracker_t &get_local_missing() const override {
+ return recovery_state.get_pg_log().get_missing();
+ }
+ const PGLog &get_log() const override {
+ return recovery_state.get_pg_log();
+ }
+ void add_local_next_event(const pg_log_entry_t& e) override {
+ recovery_state.add_local_next_event(e);
+ }
+ bool pgb_is_primary() const override {
+ return is_primary();
+ }
+ const OSDMapRef& pgb_get_osdmap() const override final {
+ return get_osdmap();
+ }
+ epoch_t pgb_get_osdmap_epoch() const override final {
+ return get_osdmap_epoch();
+ }
+ const pg_info_t &get_info() const override {
+ return info;
+ }
+ const pg_pool_t &get_pool() const override {
+ return pool.info;
+ }
+
+ ObjectContextRef get_obc(
+ const hobject_t &hoid,
+ const std::map<std::string, ceph::buffer::list> &attrs) override {
+ return get_object_context(hoid, true, &attrs);
+ }
+
+ bool try_lock_for_read(
+ const hobject_t &hoid,
+ ObcLockManager &manager) override {
+ if (is_missing_object(hoid))
+ return false;
+ auto obc = get_object_context(hoid, false, nullptr);
+ if (!obc)
+ return false;
+ return manager.try_get_read_lock(hoid, obc);
+ }
+
+ void release_locks(ObcLockManager &manager) override {
+ release_object_locks(manager);
+ }
+
+ bool pg_is_repair() override {
+ return is_repair();
+ }
+ void inc_osd_stat_repaired() override {
+ osd->inc_osd_stat_repaired();
+ }
+ bool pg_is_remote_backfilling() override {
+ return is_remote_backfilling();
+ }
+ void pg_add_local_num_bytes(int64_t num_bytes) override {
+ add_local_num_bytes(num_bytes);
+ }
+ void pg_sub_local_num_bytes(int64_t num_bytes) override {
+ sub_local_num_bytes(num_bytes);
+ }
+ void pg_add_num_bytes(int64_t num_bytes) override {
+ add_num_bytes(num_bytes);
+ }
+ void pg_sub_num_bytes(int64_t num_bytes) override {
+ sub_num_bytes(num_bytes);
+ }
+
+ void pgb_set_object_snap_mapping(
+ const hobject_t &soid,
+ const std::set<snapid_t> &snaps,
+ ObjectStore::Transaction *t) override {
+ return update_object_snap_mapping(t, soid, snaps);
+ }
+ void pgb_clear_object_snap_mapping(
+ const hobject_t &soid,
+ ObjectStore::Transaction *t) override {
+ return clear_object_snap_mapping(t, soid);
+ }
+
+ void log_operation(
+ std::vector<pg_log_entry_t>&& logv,
+ const std::optional<pg_hit_set_history_t> &hset_history,
+ const eversion_t &trim_to,
+ const eversion_t &roll_forward_to,
+ const eversion_t &min_last_complete_ondisk,
+ bool transaction_applied,
+ ObjectStore::Transaction &t,
+ bool async = false) override {
+ if (is_primary()) {
+ ceph_assert(trim_to <= recovery_state.get_last_update_ondisk());
+ }
+ if (hset_history) {
+ recovery_state.update_hset(*hset_history);
+ }
+ if (transaction_applied) {
+ update_snap_map(logv, t);
+ }
+ auto last = logv.rbegin();
+ if (is_primary() && last != logv.rend()) {
+ projected_log.skip_can_rollback_to_to_head();
+ projected_log.trim(cct, last->version, nullptr, nullptr, nullptr);
+ }
+ if (!is_primary() && !is_ec_pg()) {
+ replica_clear_repop_obc(logv, t);
+ }
+ recovery_state.append_log(
+ std::move(logv), trim_to, roll_forward_to, min_last_complete_ondisk,
+ t, transaction_applied, async);
+ }
+
+ void replica_clear_repop_obc(
+ const std::vector<pg_log_entry_t> &logv,
+ ObjectStore::Transaction &t);
+
+ void op_applied(const eversion_t &applied_version) override;
+
+ bool should_send_op(
+ pg_shard_t peer,
+ const hobject_t &hoid) override;
+
+ bool pg_is_undersized() const override {
+ return is_undersized();
+ }
+
+ bool pg_is_repair() const override {
+ return is_repair();
+ }
+
+ void update_peer_last_complete_ondisk(
+ pg_shard_t fromosd,
+ eversion_t lcod) override {
+ recovery_state.update_peer_last_complete_ondisk(fromosd, lcod);
+ }
+
+ void update_last_complete_ondisk(
+ eversion_t lcod) override {
+ recovery_state.update_last_complete_ondisk(lcod);
+ }
+
+ void update_stats(
+ const pg_stat_t &stat) override {
+ recovery_state.update_stats(
+ [&stat](auto &history, auto &stats) {
+ stats = stat;
+ return false;
+ });
+ }
+
+ void schedule_recovery_work(
+ GenContext<ThreadPool::TPHandle&> *c) override;
+
+ pg_shard_t whoami_shard() const override {
+ return pg_whoami;
+ }
+ spg_t primary_spg_t() const override {
+ return spg_t(info.pgid.pgid, get_primary().shard);
+ }
+ pg_shard_t primary_shard() const override {
+ return get_primary();
+ }
+ uint64_t min_peer_features() const override {
+ return recovery_state.get_min_peer_features();
+ }
+ uint64_t min_upacting_features() const override {
+ return recovery_state.get_min_upacting_features();
+ }
+ void send_message_osd_cluster(
+ int peer, Message *m, epoch_t from_epoch) override {
+ osd->send_message_osd_cluster(peer, m, from_epoch);
+ }
+ void send_message_osd_cluster(
+ std::vector<std::pair<int, Message*>>& messages, epoch_t from_epoch) override {
+ osd->send_message_osd_cluster(messages, from_epoch);
+ }
+ void send_message_osd_cluster(
+ MessageRef m, Connection *con) override {
+ osd->send_message_osd_cluster(std::move(m), con);
+ }
+ void send_message_osd_cluster(
+ Message *m, const ConnectionRef& con) override {
+ osd->send_message_osd_cluster(m, con);
+ }
+ ConnectionRef get_con_osd_cluster(int peer, epoch_t from_epoch) override;
+ entity_name_t get_cluster_msgr_name() override {
+ return osd->get_cluster_msgr_name();
+ }
+
+ PerfCounters *get_logger() override;
+
+ ceph_tid_t get_tid() override { return osd->get_tid(); }
+
+ OstreamTemp clog_error() override { return osd->clog->error(); }
+ OstreamTemp clog_warn() override { return osd->clog->warn(); }
+
+ /**
+ * a scrub-map arrived from a replica
+ */
+ void do_replica_scrub_map(OpRequestRef op);
+
+ struct watch_disconnect_t {
+ uint64_t cookie;
+ entity_name_t name;
+ bool send_disconnect;
+ watch_disconnect_t(uint64_t c, entity_name_t n, bool sd)
+ : cookie(c), name(n), send_disconnect(sd) {}
+ };
+ void complete_disconnect_watches(
+ ObjectContextRef obc,
+ const std::list<watch_disconnect_t> &to_disconnect);
+
+ struct OpFinisher {
+ virtual ~OpFinisher() {
+ }
+
+ virtual int execute() = 0;
+ };
+
+ /*
+ * Capture all object state associated with an in-progress read or write.
+ */
+ struct OpContext {
+ OpRequestRef op;
+ osd_reqid_t reqid;
+ std::vector<OSDOp> *ops;
+
+ const ObjectState *obs; // Old objectstate
+ const SnapSet *snapset; // Old snapset
+
+ ObjectState new_obs; // resulting ObjectState
+ SnapSet new_snapset; // resulting SnapSet (in case of a write)
+ //pg_stat_t new_stats; // resulting Stats
+ object_stat_sum_t delta_stats;
+
+ bool modify; // (force) modification (even if op_t is empty)
+ bool user_modify; // user-visible modification
+ bool undirty; // user explicitly un-dirtying this object
+ bool cache_operation; ///< true if this is a cache eviction
+    bool ignore_cache;               ///< true if IGNORE_CACHE flag is set
+ bool ignore_log_op_stats; // don't log op stats
+ bool update_log_only; ///< this is a write that returned an error - just record in pg log for dup detection
+ ObjectCleanRegions clean_regions;
+
+ // side effects
+ std::list<std::pair<watch_info_t,bool> > watch_connects; ///< new watch + will_ping flag
+ std::list<watch_disconnect_t> watch_disconnects; ///< old watch + send_discon
+ std::list<notify_info_t> notifies;
+ struct NotifyAck {
+ std::optional<uint64_t> watch_cookie;
+ uint64_t notify_id;
+ ceph::buffer::list reply_bl;
+ explicit NotifyAck(uint64_t notify_id) : notify_id(notify_id) {}
+ NotifyAck(uint64_t notify_id, uint64_t cookie, ceph::buffer::list& rbl)
+ : watch_cookie(cookie), notify_id(notify_id) {
+ reply_bl = std::move(rbl);
+ }
+ };
+ std::list<NotifyAck> notify_acks;
+
+ uint64_t bytes_written, bytes_read;
+
+ utime_t mtime;
+ SnapContext snapc; // writer snap context
+ eversion_t at_version; // pg's current version pointer
+ version_t user_at_version; // pg's current user version pointer
+
+ /// index of the current subop - only valid inside of do_osd_ops()
+ int current_osd_subop_num;
+ /// total number of subops processed in this context for cls_cxx_subop_version()
+ int processed_subop_count = 0;
+
+ PGTransactionUPtr op_t;
+ std::vector<pg_log_entry_t> log;
+ std::optional<pg_hit_set_history_t> updated_hset_history;
+
+ interval_set<uint64_t> modified_ranges;
+ ObjectContextRef obc;
+ ObjectContextRef clone_obc; // if we created a clone
+ ObjectContextRef head_obc; // if we also update snapset (see trim_object)
+
+ // FIXME: we may want to kill this msgr hint off at some point!
+ std::optional<int> data_off = std::nullopt;
+
+ MOSDOpReply *reply;
+
+ PrimaryLogPG *pg;
+
+ int num_read; ///< count read ops
+ int num_write; ///< count update ops
+
+ mempool::osd_pglog::vector<std::pair<osd_reqid_t, version_t> > extra_reqids;
+ mempool::osd_pglog::map<uint32_t, int> extra_reqid_return_codes;
+
+ hobject_t new_temp_oid, discard_temp_oid; ///< temp objects we should start/stop tracking
+
+ std::list<std::function<void()>> on_applied;
+ std::list<std::function<void()>> on_committed;
+ std::list<std::function<void()>> on_finish;
+ std::list<std::function<void()>> on_success;
+ template <typename F>
+ void register_on_finish(F &&f) {
+ on_finish.emplace_back(std::forward<F>(f));
+ }
+ template <typename F>
+ void register_on_success(F &&f) {
+ on_success.emplace_back(std::forward<F>(f));
+ }
+ template <typename F>
+ void register_on_applied(F &&f) {
+ on_applied.emplace_back(std::forward<F>(f));
+ }
+ template <typename F>
+ void register_on_commit(F &&f) {
+ on_committed.emplace_back(std::forward<F>(f));
+ }
+
+ bool sent_reply = false;
+
+ // pending async reads <off, len, op_flags> -> <outbl, outr>
+ std::list<std::pair<boost::tuple<uint64_t, uint64_t, unsigned>,
+ std::pair<ceph::buffer::list*, Context*> > > pending_async_reads;
+ int inflightreads;
+ friend struct OnReadComplete;
+ void start_async_reads(PrimaryLogPG *pg);
+ void finish_read(PrimaryLogPG *pg);
+ bool async_reads_complete() {
+ return inflightreads == 0;
+ }
+
+ RWState::State lock_type;
+ ObcLockManager lock_manager;
+
+ std::map<int, std::unique_ptr<OpFinisher>> op_finishers;
+
+ OpContext(const OpContext& other);
+ const OpContext& operator=(const OpContext& other);
+
+ OpContext(OpRequestRef _op, osd_reqid_t _reqid, std::vector<OSDOp>* _ops,
+ ObjectContextRef& obc,
+ PrimaryLogPG *_pg) :
+ op(_op), reqid(_reqid), ops(_ops),
+ obs(&obc->obs),
+ snapset(0),
+ new_obs(obs->oi, obs->exists),
+ modify(false), user_modify(false), undirty(false), cache_operation(false),
+ ignore_cache(false), ignore_log_op_stats(false), update_log_only(false),
+ bytes_written(0), bytes_read(0), user_at_version(0),
+ current_osd_subop_num(0),
+ obc(obc),
+ reply(NULL), pg(_pg),
+ num_read(0),
+ num_write(0),
+ sent_reply(false),
+ inflightreads(0),
+ lock_type(RWState::RWNONE) {
+ if (obc->ssc) {
+ new_snapset = obc->ssc->snapset;
+ snapset = &obc->ssc->snapset;
+ }
+ }
+ OpContext(OpRequestRef _op, osd_reqid_t _reqid,
+ std::vector<OSDOp>* _ops, PrimaryLogPG *_pg) :
+ op(_op), reqid(_reqid), ops(_ops), obs(NULL), snapset(0),
+ modify(false), user_modify(false), undirty(false), cache_operation(false),
+ ignore_cache(false), ignore_log_op_stats(false), update_log_only(false),
+ bytes_written(0), bytes_read(0), user_at_version(0),
+ current_osd_subop_num(0),
+ reply(NULL), pg(_pg),
+ num_read(0),
+ num_write(0),
+ inflightreads(0),
+ lock_type(RWState::RWNONE) {}
+ void reset_obs(ObjectContextRef obc) {
+ new_obs = ObjectState(obc->obs.oi, obc->obs.exists);
+ if (obc->ssc) {
+ new_snapset = obc->ssc->snapset;
+ snapset = &obc->ssc->snapset;
+ }
+ }
+ ~OpContext() {
+ ceph_assert(!op_t);
+ if (reply)
+ reply->put();
+ for (std::list<std::pair<boost::tuple<uint64_t, uint64_t, unsigned>,
+ std::pair<ceph::buffer::list*, Context*> > >::iterator i =
+ pending_async_reads.begin();
+ i != pending_async_reads.end();
+ pending_async_reads.erase(i++)) {
+ delete i->second.second;
+ }
+ }
+ uint64_t get_features() {
+ if (op && op->get_req()) {
+ return op->get_req()->get_connection()->get_features();
+ }
+ return -1ull;
+ }
+ };
+ using OpContextUPtr = std::unique_ptr<OpContext>;
+ friend struct OpContext;
+
+ /*
+ * State on the PG primary associated with the replicated mutation
+ */
+ class RepGather {
+ public:
+ hobject_t hoid;
+ OpRequestRef op;
+ xlist<RepGather*>::item queue_item;
+ int nref;
+
+ eversion_t v;
+ int r = 0;
+
+ ceph_tid_t rep_tid;
+
+ bool rep_aborted;
+ bool all_committed;
+
+ utime_t start;
+
+ eversion_t pg_local_last_complete;
+
+ ObcLockManager lock_manager;
+
+ std::list<std::function<void()>> on_committed;
+ std::list<std::function<void()>> on_success;
+ std::list<std::function<void()>> on_finish;
+
+ RepGather(
+ OpContext *c, ceph_tid_t rt,
+ eversion_t lc) :
+ hoid(c->obc->obs.oi.soid),
+ op(c->op),
+ queue_item(this),
+ nref(1),
+ rep_tid(rt),
+ rep_aborted(false),
+ all_committed(false),
+ pg_local_last_complete(lc),
+ lock_manager(std::move(c->lock_manager)),
+ on_committed(std::move(c->on_committed)),
+ on_success(std::move(c->on_success)),
+ on_finish(std::move(c->on_finish)) {}
+
+ RepGather(
+ ObcLockManager &&manager,
+ OpRequestRef &&o,
+ std::optional<std::function<void(void)> > &&on_complete,
+ ceph_tid_t rt,
+ eversion_t lc,
+ int r) :
+ op(o),
+ queue_item(this),
+ nref(1),
+ r(r),
+ rep_tid(rt),
+ rep_aborted(false),
+ all_committed(false),
+ pg_local_last_complete(lc),
+ lock_manager(std::move(manager)) {
+ if (on_complete) {
+ on_success.push_back(std::move(*on_complete));
+ }
+ }
+
+ RepGather *get() {
+ nref++;
+ return this;
+ }
+ void put() {
+ ceph_assert(nref > 0);
+ if (--nref == 0) {
+ delete this;
+ //generic_dout(0) << "deleting " << this << dendl;
+ }
+ }
+ };
+
+
+protected:
+
+ /**
+ * Grabs locks for OpContext, should be cleaned up in close_op_ctx
+ *
+ * @param ctx [in,out] ctx to get locks for
+ * @return true on success, false if we are queued
+ */
+ bool get_rw_locks(bool write_ordered, OpContext *ctx) {
+    /* If head_obc is set, then !obc->obs.exists and we always take the
+     * snapdir lock *before* the head lock.  Since all callers (read or
+     * write) do this, getting the first lock guarantees we will also get
+     * the second.
+     */
+ if (write_ordered && ctx->op->may_read()) {
+ ctx->lock_type = RWState::RWEXCL;
+ } else if (write_ordered) {
+ ctx->lock_type = RWState::RWWRITE;
+ } else {
+ ceph_assert(ctx->op->may_read());
+ ctx->lock_type = RWState::RWREAD;
+ }
+
+ if (ctx->head_obc) {
+ ceph_assert(!ctx->obc->obs.exists);
+ if (!ctx->lock_manager.get_lock_type(
+ ctx->lock_type,
+ ctx->head_obc->obs.oi.soid,
+ ctx->head_obc,
+ ctx->op)) {
+ ctx->lock_type = RWState::RWNONE;
+ return false;
+ }
+ }
+ if (ctx->lock_manager.get_lock_type(
+ ctx->lock_type,
+ ctx->obc->obs.oi.soid,
+ ctx->obc,
+ ctx->op)) {
+ return true;
+ } else {
+ ceph_assert(!ctx->head_obc);
+ ctx->lock_type = RWState::RWNONE;
+ return false;
+ }
+ }
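+
+  // Editor's note: an illustrative sketch (not part of this change) of the
+  // lock-type selection performed at the top of get_rw_locks(); the real code
+  // additionally asserts that a non-write-ordered op may read.
+  static RWState::State sketch_lock_type_for(bool write_ordered, bool may_read) {
+    if (write_ordered && may_read)
+      return RWState::RWEXCL;    // read-modify-write: exclusive
+    if (write_ordered)
+      return RWState::RWWRITE;   // pure write
+    return RWState::RWREAD;      // pure read
+  }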
+
+ /**
+ * Cleans up OpContext
+ *
+ * @param ctx [in] ctx to clean up
+ */
+ void close_op_ctx(OpContext *ctx);
+
+ /**
+ * Releases locks
+ *
+ * @param manager [in] manager with locks to release
+ *
+ * (moved to .cc due to scrubber access)
+ */
+ void release_object_locks(ObcLockManager &lock_manager);
+
+ // replica ops
+ // [primary|tail]
+ xlist<RepGather*> repop_queue;
+
+ friend class C_OSD_RepopCommit;
+ void repop_all_committed(RepGather *repop);
+ void eval_repop(RepGather*);
+ void issue_repop(RepGather *repop, OpContext *ctx);
+ RepGather *new_repop(
+ OpContext *ctx,
+ ObjectContextRef obc,
+ ceph_tid_t rep_tid);
+ boost::intrusive_ptr<RepGather> new_repop(
+ eversion_t version,
+ int r,
+ ObcLockManager &&manager,
+ OpRequestRef &&op,
+ std::optional<std::function<void(void)> > &&on_complete);
+ void remove_repop(RepGather *repop);
+
+ OpContextUPtr simple_opc_create(ObjectContextRef obc);
+ void simple_opc_submit(OpContextUPtr ctx);
+
+ /**
+ * Merge entries atomically into all acting_recovery_backfill osds
+ * adjusting missing and recovery state as necessary.
+ *
+ * Also used to store error log entries for dup detection.
+ */
+ void submit_log_entries(
+ const mempool::osd_pglog::list<pg_log_entry_t> &entries,
+ ObcLockManager &&manager,
+ std::optional<std::function<void(void)> > &&on_complete,
+ OpRequestRef op = OpRequestRef(),
+ int r = 0);
+ struct LogUpdateCtx {
+ boost::intrusive_ptr<RepGather> repop;
+ std::set<pg_shard_t> waiting_on;
+ };
+ void cancel_log_updates();
+ std::map<ceph_tid_t, LogUpdateCtx> log_entry_update_waiting_on;
+
+
+ // hot/cold tracking
+ HitSetRef hit_set; ///< currently accumulating HitSet
+ utime_t hit_set_start_stamp; ///< time the current HitSet started recording
+
+
+ void hit_set_clear(); ///< discard any HitSet state
+ void hit_set_setup(); ///< initialize HitSet state
+ void hit_set_create(); ///< create a new HitSet
+ void hit_set_persist(); ///< persist hit info
+ bool hit_set_apply_log(); ///< apply log entries to update in-memory HitSet
+ void hit_set_trim(OpContextUPtr &ctx, unsigned max); ///< discard old HitSets
+ void hit_set_in_memory_trim(uint32_t max_in_memory); ///< discard old in memory HitSets
+ void hit_set_remove_all();
+
+ hobject_t get_hit_set_current_object(utime_t stamp);
+ hobject_t get_hit_set_archive_object(utime_t start,
+ utime_t end,
+ bool using_gmt);
+
+ // agent
+ boost::scoped_ptr<TierAgentState> agent_state;
+
+ void agent_setup(); ///< initialize agent state
+ bool agent_work(int max) override ///< entry point to do some agent work
+ {
+ return agent_work(max, max);
+ }
+ bool agent_work(int max, int agent_flush_quota) override;
+ bool agent_maybe_flush(ObjectContextRef& obc); ///< maybe flush
+ bool agent_maybe_evict(ObjectContextRef& obc, bool after_flush); ///< maybe evict
+
+ void agent_load_hit_sets(); ///< load HitSets, if needed
+
+ /// estimate object atime and temperature
+ ///
+ /// @param oid [in] object name
+  /// @param temperature [out] relative temperature (considers both access time and frequency)
+ void agent_estimate_temp(const hobject_t& oid, int *temperature);
+
+ /// stop the agent
+ void agent_stop() override;
+ void agent_delay() override;
+
+ /// clear agent state
+ void agent_clear() override;
+
+ /// choose (new) agent mode(s), returns true if op is requeued
+ bool agent_choose_mode(bool restart = false, OpRequestRef op = OpRequestRef());
+ void agent_choose_mode_restart() override;
+
+ /// true if we can send an ondisk/commit for v
+ bool already_complete(eversion_t v);
+
+ // projected object info
+ SharedLRU<hobject_t, ObjectContext> object_contexts;
+ // std::map from oid.snapdir() to SnapSetContext *
+ std::map<hobject_t, SnapSetContext*> snapset_contexts;
+ ceph::mutex snapset_contexts_lock =
+ ceph::make_mutex("PrimaryLogPG::snapset_contexts_lock");
+
+ // debug order that client ops are applied
+ std::map<hobject_t, std::map<client_t, ceph_tid_t>> debug_op_order;
+
+ void populate_obc_watchers(ObjectContextRef obc);
+ void check_blocklisted_obc_watchers(ObjectContextRef obc);
+ void check_blocklisted_watchers() override;
+ void get_watchers(std::list<obj_watch_item_t> *ls) override;
+ void get_obc_watchers(ObjectContextRef obc, std::list<obj_watch_item_t> &pg_watchers);
+public:
+ void handle_watch_timeout(WatchRef watch);
+protected:
+
+ ObjectContextRef create_object_context(const object_info_t& oi, SnapSetContext *ssc);
+ ObjectContextRef get_object_context(
+ const hobject_t& soid,
+ bool can_create,
+ const std::map<std::string, ceph::buffer::list> *attrs = 0
+ );
+
+ void context_registry_on_change();
+ void object_context_destructor_callback(ObjectContext *obc);
+ class C_PG_ObjectContext;
+
+ int find_object_context(const hobject_t& oid,
+ ObjectContextRef *pobc,
+ bool can_create,
+ bool map_snapid_to_clone=false,
+ hobject_t *missing_oid=NULL);
+
+ void add_object_context_to_pg_stat(ObjectContextRef obc, pg_stat_t *stat);
+
+ void get_src_oloc(const object_t& oid, const object_locator_t& oloc, object_locator_t& src_oloc);
+
+ SnapSetContext *get_snapset_context(
+ const hobject_t& oid,
+ bool can_create,
+ const std::map<std::string, ceph::buffer::list> *attrs = 0,
+    bool oid_existed = true // indicates whether this oid existed in the backend
+ );
+ void register_snapset_context(SnapSetContext *ssc) {
+ std::lock_guard l(snapset_contexts_lock);
+ _register_snapset_context(ssc);
+ }
+ void _register_snapset_context(SnapSetContext *ssc) {
+ ceph_assert(ceph_mutex_is_locked(snapset_contexts_lock));
+ if (!ssc->registered) {
+ ceph_assert(snapset_contexts.count(ssc->oid) == 0);
+ ssc->registered = true;
+ snapset_contexts[ssc->oid] = ssc;
+ }
+ }
+ void put_snapset_context(SnapSetContext *ssc);
+
+ std::map<hobject_t, ObjectContextRef> recovering;
+
+ /*
+ * Backfill
+ *
+ * peer_info[backfill_target].last_backfill == info.last_backfill on the peer.
+ *
+ * objects prior to peer_info[backfill_target].last_backfill
+ * - are on the peer
+ * - are included in the peer stats
+ *
+ * objects \in (last_backfill, last_backfill_started]
+ * - are on the peer or are in backfills_in_flight
+ * - are not included in pg stats (yet)
+ * - have their stats in pending_backfill_updates on the primary
+ */
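+  /*
+   * Editor's note, a concrete hypothetical example of the invariants above:
+   * with last_backfill = B and last_backfill_started = C (B <= C), an object
+   * at a position <= B is already on the peer and included in its stats; an
+   * object in (B, C] is either on the peer or listed in backfills_in_flight,
+   * and its stat delta is parked in pending_backfill_updates; objects beyond
+   * C have not been visited by this backfill pass yet.
+   */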
+ std::set<hobject_t> backfills_in_flight;
+ std::map<hobject_t, pg_stat_t> pending_backfill_updates;
+
+ void dump_recovery_info(ceph::Formatter *f) const override {
+ f->open_array_section("waiting_on_backfill");
+ for (std::set<pg_shard_t>::const_iterator p = waiting_on_backfill.begin();
+ p != waiting_on_backfill.end(); ++p)
+ f->dump_stream("osd") << *p;
+ f->close_section();
+ f->dump_stream("last_backfill_started") << last_backfill_started;
+ {
+ f->open_object_section("backfill_info");
+ backfill_info.dump(f);
+ f->close_section();
+ }
+ {
+ f->open_array_section("peer_backfill_info");
+ for (std::map<pg_shard_t, BackfillInterval>::const_iterator pbi =
+ peer_backfill_info.begin();
+ pbi != peer_backfill_info.end(); ++pbi) {
+ f->dump_stream("osd") << pbi->first;
+ f->open_object_section("BackfillInterval");
+ pbi->second.dump(f);
+ f->close_section();
+ }
+ f->close_section();
+ }
+ {
+ f->open_array_section("backfills_in_flight");
+ for (std::set<hobject_t>::const_iterator i = backfills_in_flight.begin();
+ i != backfills_in_flight.end();
+ ++i) {
+ f->dump_stream("object") << *i;
+ }
+ f->close_section();
+ }
+ {
+ f->open_array_section("recovering");
+ for (std::map<hobject_t, ObjectContextRef>::const_iterator i = recovering.begin();
+ i != recovering.end();
+ ++i) {
+ f->dump_stream("object") << i->first;
+ }
+ f->close_section();
+ }
+ {
+ f->open_object_section("pg_backend");
+ pgbackend->dump_recovery_info(f);
+ f->close_section();
+ }
+ }
+
+ /// last backfill operation started
+ hobject_t last_backfill_started;
+ bool new_backfill;
+
+ int prep_object_replica_pushes(const hobject_t& soid, eversion_t v,
+ PGBackend::RecoveryHandle *h,
+ bool *work_started);
+ int prep_object_replica_deletes(const hobject_t& soid, eversion_t v,
+ PGBackend::RecoveryHandle *h,
+ bool *work_started);
+
+ void finish_degraded_object(const hobject_t oid);
+
+ // Cancels/resets pulls from peer
+  void check_recovery_sources(const OSDMapRef& map) override;
+
+ int recover_missing(
+ const hobject_t& oid,
+ eversion_t v,
+ int priority,
+ PGBackend::RecoveryHandle *h);
+
+ // low level ops
+
+ void _make_clone(
+ OpContext *ctx,
+ PGTransaction* t,
+ ObjectContextRef obc,
+ const hobject_t& head, const hobject_t& coid,
+ object_info_t *poi);
+ void execute_ctx(OpContext *ctx);
+ void finish_ctx(OpContext *ctx, int log_op_type, int result=0);
+ void reply_ctx(OpContext *ctx, int err);
+ void make_writeable(OpContext *ctx);
+ void log_op_stats(const OpRequest& op, uint64_t inb, uint64_t outb);
+
+ void write_update_size_and_usage(object_stat_sum_t& stats, object_info_t& oi,
+ interval_set<uint64_t>& modified, uint64_t offset,
+ uint64_t length, bool write_full=false);
+ inline void truncate_update_size_and_usage(
+ object_stat_sum_t& delta_stats,
+ object_info_t& oi,
+ uint64_t truncate_size);
+
+ enum class cache_result_t {
+ NOOP,
+ BLOCKED_FULL,
+ BLOCKED_PROMOTE,
+ HANDLED_PROXY,
+ HANDLED_REDIRECT,
+ REPLIED_WITH_EAGAIN,
+ BLOCKED_RECOVERY,
+ };
+ cache_result_t maybe_handle_cache_detail(OpRequestRef op,
+ bool write_ordered,
+ ObjectContextRef obc, int r,
+ hobject_t missing_oid,
+ bool must_promote,
+ bool in_hit_set,
+ ObjectContextRef *promote_obc);
+ cache_result_t maybe_handle_manifest_detail(OpRequestRef op,
+ bool write_ordered,
+ ObjectContextRef obc);
+ bool maybe_handle_manifest(OpRequestRef op,
+ bool write_ordered,
+ ObjectContextRef obc) {
+ return cache_result_t::NOOP != maybe_handle_manifest_detail(
+ op,
+ write_ordered,
+ obc);
+ }
+
+ /**
+ * This helper function is called from do_op if the ObjectContext lookup fails.
+ * @returns true if the caching code is handling the Op, false otherwise.
+ */
+ bool maybe_handle_cache(OpRequestRef op,
+ bool write_ordered,
+ ObjectContextRef obc, int r,
+ const hobject_t& missing_oid,
+ bool must_promote,
+ bool in_hit_set = false) {
+ return cache_result_t::NOOP != maybe_handle_cache_detail(
+ op,
+ write_ordered,
+ obc,
+ r,
+ missing_oid,
+ must_promote,
+ in_hit_set,
+ nullptr);
+ }
+
+ /**
+ * This helper function checks if a promotion is needed.
+ */
+ bool maybe_promote(ObjectContextRef obc,
+ const hobject_t& missing_oid,
+ const object_locator_t& oloc,
+ bool in_hit_set,
+ uint32_t recency,
+ OpRequestRef promote_op,
+ ObjectContextRef *promote_obc = nullptr);
+ /**
+ * This helper function tells the client to redirect their request elsewhere.
+ */
+ void do_cache_redirect(OpRequestRef op);
+ /**
+ * This function attempts to start a promote. Either it succeeds,
+   * or places op on a wait list. If op is null, failure means that
+ * this is a noop. If a future user wants to be able to distinguish
+ * these cases, a return value should be added.
+ */
+ void promote_object(
+ ObjectContextRef obc, ///< [optional] obc
+ const hobject_t& missing_object, ///< oid (if !obc)
+ const object_locator_t& oloc, ///< locator for obc|oid
+ OpRequestRef op, ///< [optional] client op
+ ObjectContextRef *promote_obc = nullptr ///< [optional] new obc for object
+ );
+
+ int prepare_transaction(OpContext *ctx);
+ std::list<std::pair<OpRequestRef, OpContext*> > in_progress_async_reads;
+ void complete_read_ctx(int result, OpContext *ctx);
+
+ // pg on-disk content
+ void check_local() override;
+
+ void _clear_recovery_state() override;
+
+ bool start_recovery_ops(
+ uint64_t max,
+ ThreadPool::TPHandle &handle, uint64_t *started) override;
+
+ uint64_t recover_primary(uint64_t max, ThreadPool::TPHandle &handle);
+ uint64_t recover_replicas(uint64_t max, ThreadPool::TPHandle &handle,
+ bool *recovery_started);
+ hobject_t earliest_peer_backfill() const;
+ bool all_peer_done() const;
+ /**
+   * @param work_started will be set to true if recover_backfill got anywhere
+ * @returns the number of operations started
+ */
+ uint64_t recover_backfill(uint64_t max, ThreadPool::TPHandle &handle,
+ bool *work_started);
+
+ /**
+ * scan a (hash) range of objects in the current pg
+ *
+ * @min return at least this many items, unless we are done
+ * @max return no more than this many items
+ * @bi.begin first item should be >= this value
+   * @bi [out] resulting map of objects to eversion_t's
+ */
+ void scan_range(
+ int min, int max, BackfillInterval *bi,
+ ThreadPool::TPHandle &handle
+ );
+
+ /// Update a hash range to reflect changes since the last scan
+ void update_range(
+ BackfillInterval *bi, ///< [in,out] interval to update
+ ThreadPool::TPHandle &handle ///< [in] tp handle
+ );
+
+ int prep_backfill_object_push(
+ hobject_t oid, eversion_t v, ObjectContextRef obc,
+ std::vector<pg_shard_t> peers,
+ PGBackend::RecoveryHandle *h);
+ void send_remove_op(const hobject_t& oid, eversion_t v, pg_shard_t peer);
+
+
+ class C_OSD_AppliedRecoveredObject;
+ class C_OSD_CommittedPushedObject;
+ class C_OSD_AppliedRecoveredObjectReplica;
+
+ void _applied_recovered_object(ObjectContextRef obc);
+ void _applied_recovered_object_replica();
+ void _committed_pushed_object(epoch_t epoch, eversion_t lc);
+ void recover_got(hobject_t oid, eversion_t v);
+
+ // -- copyfrom --
+ std::map<hobject_t, CopyOpRef> copy_ops;
+
+ int do_copy_get(OpContext *ctx, ceph::buffer::list::const_iterator& bp, OSDOp& op,
+ ObjectContextRef& obc);
+ int finish_copy_get();
+
+ void fill_in_copy_get_noent(OpRequestRef& op, hobject_t oid,
+ OSDOp& osd_op);
+
+ /**
+ * To copy an object, call start_copy.
+ *
+ * @param cb: The CopyCallback to be activated when the copy is complete
+ * @param obc: The ObjectContext we are copying into
+ * @param src: The source object
+ * @param oloc: the source object locator
+ * @param version: the version of the source object to copy (0 for any)
+ */
+ void start_copy(CopyCallback *cb, ObjectContextRef obc, hobject_t src,
+ object_locator_t oloc, version_t version, unsigned flags,
+ bool mirror_snapset, unsigned src_obj_fadvise_flags,
+ unsigned dest_obj_fadvise_flags);
+ void process_copy_chunk(hobject_t oid, ceph_tid_t tid, int r);
+ void _write_copy_chunk(CopyOpRef cop, PGTransaction *t);
+ uint64_t get_copy_chunk_size() const {
+ uint64_t size = cct->_conf->osd_copyfrom_max_chunk;
+ if (pool.info.required_alignment()) {
+ uint64_t alignment = pool.info.required_alignment();
+ if (size % alignment) {
+ size += alignment - (size % alignment);
+ }
+ }
+ return size;
+ }
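+
+  // Editor's note: an illustrative sketch (not part of this change) of the
+  // rounding step above: the configured copy chunk size is rounded up to the
+  // next multiple of the pool's required alignment.
+  static uint64_t sketch_round_up_to_alignment(uint64_t size, uint64_t alignment) {
+    if (alignment && size % alignment)
+      size += alignment - (size % alignment);
+    return size;
+  }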
+ void _copy_some(ObjectContextRef obc, CopyOpRef cop);
+ void finish_copyfrom(CopyFromCallback *cb);
+ void finish_promote(int r, CopyResults *results, ObjectContextRef obc);
+ void cancel_copy(CopyOpRef cop, bool requeue, std::vector<ceph_tid_t> *tids);
+ void cancel_copy_ops(bool requeue, std::vector<ceph_tid_t> *tids);
+
+ friend struct C_Copyfrom;
+
+ // -- flush --
+ std::map<hobject_t, FlushOpRef> flush_ops;
+
+ /// start_flush takes ownership of on_flush iff ret == -EINPROGRESS
+ int start_flush(
+ OpRequestRef op, ObjectContextRef obc,
+ bool blocking, hobject_t *pmissing,
+ std::optional<std::function<void()>> &&on_flush);
+ void finish_flush(hobject_t oid, ceph_tid_t tid, int r);
+ int try_flush_mark_clean(FlushOpRef fop);
+ void cancel_flush(FlushOpRef fop, bool requeue, std::vector<ceph_tid_t> *tids);
+ void cancel_flush_ops(bool requeue, std::vector<ceph_tid_t> *tids);
+
+  /// @return false if the clone has been evicted
+ bool is_present_clone(hobject_t coid);
+
+ friend struct C_Flush;
+
+ // -- scrub --
+ bool _range_available_for_scrub(
+ const hobject_t &begin, const hobject_t &end) override;
+
+ void _split_into(pg_t child_pgid, PG *child,
+ unsigned split_bits) override;
+ void apply_and_flush_repops(bool requeue);
+
+ int do_xattr_cmp_u64(int op, __u64 v1, ceph::buffer::list& xattr);
+ int do_xattr_cmp_str(int op, std::string& v1s, ceph::buffer::list& xattr);
+
+ // -- checksum --
+ int do_checksum(OpContext *ctx, OSDOp& osd_op, ceph::buffer::list::const_iterator *bl_it);
+ int finish_checksum(OSDOp& osd_op, Checksummer::CSumType csum_type,
+ ceph::buffer::list::const_iterator *init_value_bl_it,
+ const ceph::buffer::list &read_bl);
+
+ friend struct C_ChecksumRead;
+
+ int do_extent_cmp(OpContext *ctx, OSDOp& osd_op);
+ int finish_extent_cmp(OSDOp& osd_op, const ceph::buffer::list &read_bl);
+
+ friend struct C_ExtentCmpRead;
+
+ int do_read(OpContext *ctx, OSDOp& osd_op);
+ int do_sparse_read(OpContext *ctx, OSDOp& osd_op);
+ int do_writesame(OpContext *ctx, OSDOp& osd_op);
+
+ bool pgls_filter(const PGLSFilter& filter, const hobject_t& sobj);
+
+ std::pair<int, std::unique_ptr<const PGLSFilter>> get_pgls_filter(
+ ceph::buffer::list::const_iterator& iter);
+
+ std::map<hobject_t, std::list<OpRequestRef>> in_progress_proxy_ops;
+ void kick_proxy_ops_blocked(hobject_t& soid);
+ void cancel_proxy_ops(bool requeue, std::vector<ceph_tid_t> *tids);
+
+ // -- proxyread --
+ std::map<ceph_tid_t, ProxyReadOpRef> proxyread_ops;
+
+ void do_proxy_read(OpRequestRef op, ObjectContextRef obc = NULL);
+ void finish_proxy_read(hobject_t oid, ceph_tid_t tid, int r);
+ void cancel_proxy_read(ProxyReadOpRef prdop, std::vector<ceph_tid_t> *tids);
+
+ friend struct C_ProxyRead;
+
+ // -- proxywrite --
+ std::map<ceph_tid_t, ProxyWriteOpRef> proxywrite_ops;
+
+ void do_proxy_write(OpRequestRef op, ObjectContextRef obc = NULL);
+ void finish_proxy_write(hobject_t oid, ceph_tid_t tid, int r);
+ void cancel_proxy_write(ProxyWriteOpRef pwop, std::vector<ceph_tid_t> *tids);
+
+ friend struct C_ProxyWrite_Commit;
+
+ // -- chunkop --
+ enum class refcount_t {
+ INCREMENT_REF,
+ DECREMENT_REF,
+ CREATE_OR_GET_REF,
+ };
+ void do_proxy_chunked_op(OpRequestRef op, const hobject_t& missing_oid,
+ ObjectContextRef obc, bool write_ordered);
+ void do_proxy_chunked_read(OpRequestRef op, ObjectContextRef obc, int op_index,
+ uint64_t chunk_index, uint64_t req_offset, uint64_t req_length,
+ uint64_t req_total_len, bool write_ordered);
+ bool can_proxy_chunked_read(OpRequestRef op, ObjectContextRef obc);
+ void _copy_some_manifest(ObjectContextRef obc, CopyOpRef cop, uint64_t start_offset);
+ void process_copy_chunk_manifest(hobject_t oid, ceph_tid_t tid, int r, uint64_t offset);
+ void finish_promote_manifest(int r, CopyResults *results, ObjectContextRef obc);
+ void cancel_and_requeue_proxy_ops(hobject_t oid);
+  void cancel_manifest_ops(bool requeue, std::vector<ceph_tid_t> *tids);
+ ceph_tid_t refcount_manifest(hobject_t src_soid, hobject_t tgt_soid, refcount_t type,
+ Context *cb, std::optional<bufferlist> chunk);
+ void dec_all_refcount_manifest(const object_info_t& oi, OpContext* ctx);
+ void dec_refcount(const hobject_t& soid, const object_ref_delta_t& refs);
+ void dec_refcount_by_dirty(OpContext* ctx);
+ ObjectContextRef get_prev_clone_obc(ObjectContextRef obc);
+ bool recover_adjacent_clones(ObjectContextRef obc, OpRequestRef op);
+ void get_adjacent_clones(ObjectContextRef src_obc,
+ ObjectContextRef& _l, ObjectContextRef& _g);
+ bool inc_refcount_by_set(OpContext* ctx, object_manifest_t& tgt,
+ OSDOp& osd_op);
+ int do_cdc(const object_info_t& oi, std::map<uint64_t, chunk_info_t>& chunk_map,
+ std::map<uint64_t, bufferlist>& chunks);
+ int start_dedup(OpRequestRef op, ObjectContextRef obc);
+ std::pair<int, hobject_t> get_fpoid_from_chunk(const hobject_t soid, bufferlist& chunk);
+ int finish_set_dedup(hobject_t oid, int r, ceph_tid_t tid, uint64_t offset);
+
+ friend struct C_ProxyChunkRead;
+ friend class PromoteManifestCallback;
+ friend struct C_CopyChunk;
+ friend struct RefCountCallback;
+ friend struct C_SetDedupChunks;
+
+public:
+ PrimaryLogPG(OSDService *o, OSDMapRef curmap,
+ const PGPool &_pool,
+ const std::map<std::string,std::string>& ec_profile,
+ spg_t p);
+ ~PrimaryLogPG() override;
+
+ void do_command(
+ const std::string_view& prefix,
+ const cmdmap_t& cmdmap,
+ const ceph::buffer::list& idata,
+ std::function<void(int,const std::string&,ceph::buffer::list&)> on_finish) override;
+
+ void clear_cache() override;
+ int get_cache_obj_count() override {
+ return object_contexts.get_count();
+ }
+ unsigned get_pg_shard() const {
+ return info.pgid.hash_to_shard(osd->get_num_shards());
+ }
+ void do_request(
+ OpRequestRef& op,
+ ThreadPool::TPHandle &handle) override;
+ void do_op(OpRequestRef& op);
+ void record_write_error(OpRequestRef op, const hobject_t &soid,
+ MOSDOpReply *orig_reply, int r,
+ OpContext *ctx_for_op_returns=nullptr);
+ void do_pg_op(OpRequestRef op);
+ void do_scan(
+ OpRequestRef op,
+ ThreadPool::TPHandle &handle);
+ void do_backfill(OpRequestRef op);
+ void do_backfill_remove(OpRequestRef op);
+
+ void handle_backoff(OpRequestRef& op);
+
+ int trim_object(bool first, const hobject_t &coid, snapid_t snap_to_trim,
+ OpContextUPtr *ctxp);
+ void snap_trimmer(epoch_t e) override;
+ void kick_snap_trim() override;
+ void snap_trimmer_scrub_complete() override;
+ int do_osd_ops(OpContext *ctx, std::vector<OSDOp>& ops);
+
+ int _get_tmap(OpContext *ctx, ceph::buffer::list *header, ceph::buffer::list *vals);
+ int do_tmap2omap(OpContext *ctx, unsigned flags);
+ int do_tmapup(OpContext *ctx, ceph::buffer::list::const_iterator& bp, OSDOp& osd_op);
+ int do_tmapup_slow(OpContext *ctx, ceph::buffer::list::const_iterator& bp, OSDOp& osd_op, ceph::buffer::list& bl);
+
+ void do_osd_op_effects(OpContext *ctx, const ConnectionRef& conn);
+private:
+ int do_scrub_ls(const MOSDOp *op, OSDOp *osd_op);
+ bool check_src_targ(const hobject_t& soid, const hobject_t& toid) const;
+
+ uint64_t temp_seq; ///< last id for naming temp objects
+ /// generate a new temp object name
+ hobject_t generate_temp_object(const hobject_t& target);
+ /// generate a new temp object name (for recovery)
+ hobject_t get_temp_recovery_object(const hobject_t& target,
+ eversion_t version) override;
+ int get_recovery_op_priority() const {
+ int64_t pri = 0;
+ pool.info.opts.get(pool_opts_t::RECOVERY_OP_PRIORITY, &pri);
+ return pri > 0 ? pri : cct->_conf->osd_recovery_op_priority;
+ }
+
+public:
+ coll_t get_coll() {
+ return coll;
+ }
+ void split_colls(
+ spg_t child,
+ int split_bits,
+ int seed,
+ const pg_pool_t *pool,
+ ObjectStore::Transaction &t) override {
+ coll_t target = coll_t(child);
+ create_pg_collection(t, child, split_bits);
+ t.split_collection(
+ coll,
+ split_bits,
+ seed,
+ target);
+ init_pg_ondisk(t, child, pool);
+ }
+private:
+
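+  // Events consumed by the snap-trim state machine defined below. KickTrim
+  // nudges the machine to (re)consider trimming and Reset drops it back to
+  // NotTrimming; the remaining events signal that the reservation was
+  // granted, a blocking write was released, in-flight repops committed, the
+  // scrub or trim-sleep timer finished, or (DoSnapWork) that trim work
+  // should be performed.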
+ struct DoSnapWork : boost::statechart::event< DoSnapWork > {
+ DoSnapWork() : boost::statechart::event < DoSnapWork >() {}
+ };
+ struct KickTrim : boost::statechart::event< KickTrim > {
+ KickTrim() : boost::statechart::event < KickTrim >() {}
+ };
+ struct RepopsComplete : boost::statechart::event< RepopsComplete > {
+ RepopsComplete() : boost::statechart::event < RepopsComplete >() {}
+ };
+ struct ScrubComplete : boost::statechart::event< ScrubComplete > {
+ ScrubComplete() : boost::statechart::event < ScrubComplete >() {}
+ };
+ struct TrimWriteUnblocked : boost::statechart::event< TrimWriteUnblocked > {
+ TrimWriteUnblocked() : boost::statechart::event < TrimWriteUnblocked >() {}
+ };
+ struct Reset : boost::statechart::event< Reset > {
+ Reset() : boost::statechart::event< Reset >() {}
+ };
+ struct SnapTrimReserved : boost::statechart::event< SnapTrimReserved > {
+ SnapTrimReserved() : boost::statechart::event< SnapTrimReserved >() {}
+ };
+ struct SnapTrimTimerReady : boost::statechart::event< SnapTrimTimerReady > {
+ SnapTrimTimerReady() : boost::statechart::event< SnapTrimTimerReady >() {}
+ };
+
+ struct NotTrimming;
+ struct SnapTrimmer : public boost::statechart::state_machine< SnapTrimmer, NotTrimming > {
+ PrimaryLogPG *pg;
+ explicit SnapTrimmer(PrimaryLogPG *pg) : pg(pg) {}
+ void log_enter(const char *state_name);
+ void log_exit(const char *state_name, utime_t duration);
+ bool permit_trim();
+ bool can_trim() {
+ return
+ permit_trim() &&
+ !pg->get_osdmap()->test_flag(CEPH_OSDMAP_NOSNAPTRIM);
+ }
+ } snap_trimmer_machine;
+
+ struct WaitReservation;
+ struct Trimming : boost::statechart::state< Trimming, SnapTrimmer, WaitReservation >, NamedState {
+ typedef boost::mpl::list <
+ boost::statechart::custom_reaction< KickTrim >,
+ boost::statechart::transition< Reset, NotTrimming >
+ > reactions;
+
+ std::set<hobject_t> in_flight;
+ snapid_t snap_to_trim;
+
+ explicit Trimming(my_context ctx)
+ : my_base(ctx),
+ NamedState(nullptr, "Trimming") {
+ context< SnapTrimmer >().log_enter(state_name);
+ ceph_assert(context< SnapTrimmer >().permit_trim());
+ ceph_assert(in_flight.empty());
+ }
+ void exit() {
+ context< SnapTrimmer >().log_exit(state_name, enter_time);
+ auto *pg = context< SnapTrimmer >().pg;
+ pg->osd->snap_reserver.cancel_reservation(pg->get_pgid());
+ pg->state_clear(PG_STATE_SNAPTRIM);
+ pg->publish_stats_to_osd();
+ }
+ boost::statechart::result react(const KickTrim&) {
+ return discard_event();
+ }
+ };
+
+ /* SnapTrimmerStates */
+ struct WaitTrimTimer : boost::statechart::state< WaitTrimTimer, Trimming >, NamedState {
+ typedef boost::mpl::list <
+ boost::statechart::custom_reaction< SnapTrimTimerReady >
+ > reactions;
+ Context *wakeup = nullptr;
+ explicit WaitTrimTimer(my_context ctx)
+ : my_base(ctx),
+ NamedState(nullptr, "Trimming/WaitTrimTimer") {
+ context< SnapTrimmer >().log_enter(state_name);
+ ceph_assert(context<Trimming>().in_flight.empty());
+ struct OnTimer : Context {
+ PrimaryLogPGRef pg;
+ epoch_t epoch;
+ OnTimer(PrimaryLogPGRef pg, epoch_t epoch) : pg(pg), epoch(epoch) {}
+ void finish(int) override {
+ pg->lock();
+ if (!pg->pg_has_reset_since(epoch))
+ pg->snap_trimmer_machine.process_event(SnapTrimTimerReady());
+ pg->unlock();
+ }
+ };
+ auto *pg = context< SnapTrimmer >().pg;
+ float osd_snap_trim_sleep = pg->osd->osd->get_osd_snap_trim_sleep();
+ if (osd_snap_trim_sleep > 0) {
+ std::lock_guard l(pg->osd->sleep_lock);
+ wakeup = pg->osd->sleep_timer.add_event_after(
+ osd_snap_trim_sleep,
+ new OnTimer{pg, pg->get_osdmap_epoch()});
+ } else {
+ post_event(SnapTrimTimerReady());
+ }
+ }
+ void exit() {
+ context< SnapTrimmer >().log_exit(state_name, enter_time);
+ auto *pg = context< SnapTrimmer >().pg;
+ if (wakeup) {
+ std::lock_guard l(pg->osd->sleep_lock);
+ pg->osd->sleep_timer.cancel_event(wakeup);
+ wakeup = nullptr;
+ }
+ }
+ boost::statechart::result react(const SnapTrimTimerReady &) {
+ wakeup = nullptr;
+ if (!context< SnapTrimmer >().can_trim()) {
+ post_event(KickTrim());
+ return transit< NotTrimming >();
+ } else {
+ return transit< AwaitAsyncWork >();
+ }
+ }
+ };
+
+ struct WaitRWLock : boost::statechart::state< WaitRWLock, Trimming >, NamedState {
+ typedef boost::mpl::list <
+ boost::statechart::custom_reaction< TrimWriteUnblocked >
+ > reactions;
+ explicit WaitRWLock(my_context ctx)
+ : my_base(ctx),
+ NamedState(nullptr, "Trimming/WaitRWLock") {
+ context< SnapTrimmer >().log_enter(state_name);
+ ceph_assert(context<Trimming>().in_flight.empty());
+ }
+ void exit() {
+ context< SnapTrimmer >().log_exit(state_name, enter_time);
+ }
+ boost::statechart::result react(const TrimWriteUnblocked&) {
+ if (!context< SnapTrimmer >().can_trim()) {
+ post_event(KickTrim());
+ return transit< NotTrimming >();
+ } else {
+ return transit< AwaitAsyncWork >();
+ }
+ }
+ };
+
+ struct WaitRepops : boost::statechart::state< WaitRepops, Trimming >, NamedState {
+ typedef boost::mpl::list <
+ boost::statechart::custom_reaction< RepopsComplete >
+ > reactions;
+ explicit WaitRepops(my_context ctx)
+ : my_base(ctx),
+ NamedState(nullptr, "Trimming/WaitRepops") {
+ context< SnapTrimmer >().log_enter(state_name);
+ ceph_assert(!context<Trimming>().in_flight.empty());
+ }
+ void exit() {
+ context< SnapTrimmer >().log_exit(state_name, enter_time);
+ }
+ boost::statechart::result react(const RepopsComplete&) {
+ if (!context< SnapTrimmer >().can_trim()) {
+ post_event(KickTrim());
+ return transit< NotTrimming >();
+ } else {
+ return transit< WaitTrimTimer >();
+ }
+ }
+ };
+
+ struct AwaitAsyncWork : boost::statechart::state< AwaitAsyncWork, Trimming >, NamedState {
+ typedef boost::mpl::list <
+ boost::statechart::custom_reaction< DoSnapWork >
+ > reactions;
+ explicit AwaitAsyncWork(my_context ctx);
+ void exit() {
+ context< SnapTrimmer >().log_exit(state_name, enter_time);
+ }
+ boost::statechart::result react(const DoSnapWork&);
+ };
+
+ struct WaitReservation : boost::statechart::state< WaitReservation, Trimming >, NamedState {
+ /* WaitReservation is a sub-state of trimming simply so that exiting Trimming
+ * always cancels the reservation */
+ typedef boost::mpl::list <
+ boost::statechart::custom_reaction< SnapTrimReserved >
+ > reactions;
+ struct ReservationCB : public Context {
+ PrimaryLogPGRef pg;
+ bool canceled;
+ explicit ReservationCB(PrimaryLogPG *pg) : pg(pg), canceled(false) {}
+ void finish(int) override {
+ pg->lock();
+ if (!canceled)
+ pg->snap_trimmer_machine.process_event(SnapTrimReserved());
+ pg->unlock();
+ }
+ void cancel() {
+ ceph_assert(pg->is_locked());
+ ceph_assert(!canceled);
+ canceled = true;
+ }
+ };
+ ReservationCB *pending = nullptr;
+
+ explicit WaitReservation(my_context ctx)
+ : my_base(ctx),
+ NamedState(nullptr, "Trimming/WaitReservation") {
+ context< SnapTrimmer >().log_enter(state_name);
+ ceph_assert(context<Trimming>().in_flight.empty());
+ auto *pg = context< SnapTrimmer >().pg;
+ pending = new ReservationCB(pg);
+ pg->osd->snap_reserver.request_reservation(
+ pg->get_pgid(),
+ pending,
+ 0);
+ pg->state_set(PG_STATE_SNAPTRIM_WAIT);
+ pg->publish_stats_to_osd();
+ }
+ boost::statechart::result react(const SnapTrimReserved&);
+ void exit() {
+ context< SnapTrimmer >().log_exit(state_name, enter_time);
+ if (pending)
+ pending->cancel();
+ pending = nullptr;
+ auto *pg = context< SnapTrimmer >().pg;
+ pg->state_clear(PG_STATE_SNAPTRIM_WAIT);
+ pg->state_clear(PG_STATE_SNAPTRIM_ERROR);
+ pg->publish_stats_to_osd();
+ }
+ };
+
+ struct WaitScrub : boost::statechart::state< WaitScrub, SnapTrimmer >, NamedState {
+ typedef boost::mpl::list <
+ boost::statechart::custom_reaction< ScrubComplete >,
+ boost::statechart::custom_reaction< KickTrim >,
+ boost::statechart::transition< Reset, NotTrimming >
+ > reactions;
+ explicit WaitScrub(my_context ctx)
+ : my_base(ctx),
+ NamedState(nullptr, "Trimming/WaitScrub") {
+ context< SnapTrimmer >().log_enter(state_name);
+ }
+ void exit() {
+ context< SnapTrimmer >().log_exit(state_name, enter_time);
+ }
+ boost::statechart::result react(const ScrubComplete&) {
+ post_event(KickTrim());
+ return transit< NotTrimming >();
+ }
+ boost::statechart::result react(const KickTrim&) {
+ return discard_event();
+ }
+ };
+
+ struct NotTrimming : boost::statechart::state< NotTrimming, SnapTrimmer >, NamedState {
+ typedef boost::mpl::list <
+ boost::statechart::custom_reaction< KickTrim >,
+ boost::statechart::transition< Reset, NotTrimming >
+ > reactions;
+ explicit NotTrimming(my_context ctx);
+ void exit();
+ boost::statechart::result react(const KickTrim&);
+ };
+
+ int _verify_no_head_clones(const hobject_t& soid,
+ const SnapSet& ss);
+  // ensure the object exists for this op: create it (in the transaction,
+  // unless ignore_transaction is set) if it is absent, or clear an existing
+  // whiteout; otherwise this is a no-op.
+ void maybe_create_new_object(OpContext *ctx, bool ignore_transaction=false);
+ int _delete_oid(OpContext *ctx, bool no_whiteout, bool try_no_whiteout);
+ int _rollback_to(OpContext *ctx, ceph_osd_op& op);
+public:
+ bool is_missing_object(const hobject_t& oid) const;
+ bool is_unreadable_object(const hobject_t &oid) const {
+ return is_missing_object(oid) ||
+ !recovery_state.get_missing_loc().readable_with_acting(
+ oid, get_actingset());
+ }
+ void maybe_kick_recovery(const hobject_t &soid);
+ void wait_for_unreadable_object(const hobject_t& oid, OpRequestRef op);
+
+ int get_manifest_ref_count(ObjectContextRef obc, std::string& fp_oid, OpRequestRef op);
+
+ bool check_laggy(OpRequestRef& op);
+ bool check_laggy_requeue(OpRequestRef& op);
+ void recheck_readable() override;
+
+ bool is_backfill_target(pg_shard_t osd) const {
+ return recovery_state.is_backfill_target(osd);
+ }
+ const std::set<pg_shard_t> &get_backfill_targets() const {
+ return recovery_state.get_backfill_targets();
+ }
+ bool is_async_recovery_target(pg_shard_t peer) const {
+ return recovery_state.is_async_recovery_target(peer);
+ }
+ const std::set<pg_shard_t> &get_async_recovery_targets() const {
+ return recovery_state.get_async_recovery_targets();
+ }
+ bool is_degraded_or_backfilling_object(const hobject_t& oid);
+ bool is_degraded_on_async_recovery_target(const hobject_t& soid);
+ void wait_for_degraded_object(const hobject_t& oid, OpRequestRef op);
+
+ void block_write_on_full_cache(
+ const hobject_t& oid, OpRequestRef op);
+ void block_for_clean(
+ const hobject_t& oid, OpRequestRef op);
+ void block_write_on_snap_rollback(
+ const hobject_t& oid, ObjectContextRef obc, OpRequestRef op);
+ void block_write_on_degraded_snap(const hobject_t& oid, OpRequestRef op);
+
+ bool maybe_await_blocked_head(const hobject_t &soid, OpRequestRef op);
+ void wait_for_blocked_object(const hobject_t& soid, OpRequestRef op);
+ void kick_object_context_blocked(ObjectContextRef obc);
+
+ void maybe_force_recovery();
+
+ void mark_all_unfound_lost(
+ int what,
+ std::function<void(int,const std::string&,ceph::buffer::list&)> on_finish);
+ eversion_t pick_newest_available(const hobject_t& oid);
+
+ void do_update_log_missing(
+ OpRequestRef &op);
+
+ void do_update_log_missing_reply(
+ OpRequestRef &op);
+
+ void plpg_on_role_change() override;
+ void plpg_on_pool_change() override;
+ void clear_async_reads();
+ void on_change(ObjectStore::Transaction &t) override;
+ void on_activate_complete() override;
+ void on_flushed() override;
+ void on_removal(ObjectStore::Transaction &t) override;
+ void on_shutdown() override;
+ bool check_failsafe_full() override;
+ bool maybe_preempt_replica_scrub(const hobject_t& oid) override;
+ int rep_repair_primary_object(const hobject_t& soid, OpContext *ctx);
+
+ // attr cache handling
+ void setattr_maybe_cache(
+ ObjectContextRef obc,
+ PGTransaction *t,
+ const std::string &key,
+ ceph::buffer::list &val);
+ void setattrs_maybe_cache(
+ ObjectContextRef obc,
+ PGTransaction *t,
+ std::map<std::string, ceph::buffer::list> &attrs);
+ void rmattr_maybe_cache(
+ ObjectContextRef obc,
+ PGTransaction *t,
+ const std::string &key);
+ int getattr_maybe_cache(
+ ObjectContextRef obc,
+ const std::string &key,
+ ceph::buffer::list *val);
+ int getattrs_maybe_cache(
+ ObjectContextRef obc,
+ std::map<std::string, ceph::buffer::list> *out);
+
+public:
+ void set_dynamic_perf_stats_queries(
+ const std::list<OSDPerfMetricQuery> &queries) override;
+ void get_dynamic_perf_stats(DynamicPerfStats *stats) override;
+
+private:
+ DynamicPerfStats m_dynamic_perf_stats;
+};
+
+inline ostream& operator<<(ostream& out, const PrimaryLogPG::RepGather& repop)
+{
+ out << "repgather(" << &repop
+ << " " << repop.v
+ << " rep_tid=" << repop.rep_tid
+ << " committed?=" << repop.all_committed
+ << " r=" << repop.r
+ << ")";
+ return out;
+}
+
+inline ostream& operator<<(ostream& out,
+ const PrimaryLogPG::ProxyWriteOpRef& pwop)
+{
+ out << "proxywrite(" << &pwop
+ << " " << pwop->user_version
+ << " pwop_tid=" << pwop->objecter_tid;
+ if (pwop->ctx->op)
+ out << " op=" << *(pwop->ctx->op->get_req());
+ out << ")";
+ return out;
+}
+
+void intrusive_ptr_add_ref(PrimaryLogPG::RepGather *repop);
+void intrusive_ptr_release(PrimaryLogPG::RepGather *repop);
+
+
+#endif
diff --git a/src/osd/PrimaryLogScrub.cc b/src/osd/PrimaryLogScrub.cc
new file mode 100644
index 000000000..8cf76dd1d
--- /dev/null
+++ b/src/osd/PrimaryLogScrub.cc
@@ -0,0 +1,589 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "PrimaryLogScrub.h"
+
+#include "common/scrub_types.h"
+
+#include "PeeringState.h"
+#include "PrimaryLogPG.h"
+#include "scrub_machine.h"
+
+#define dout_context (m_osds->cct)
+#define dout_subsys ceph_subsys_osd
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, this)
+
+template <class T>
+static ostream& _prefix(std::ostream* _dout, T* t)
+{
+ return t->gen_prefix(*_dout);
+}
+
+using namespace Scrub;
+using Scrub::ScrubMachine;
+
+bool PrimaryLogScrub::get_store_errors(const scrub_ls_arg_t& arg,
+ scrub_ls_result_t& res_inout) const
+{
+ if (!m_store) {
+ return false;
+ }
+
+ if (arg.get_snapsets) {
+ res_inout.vals =
+ m_store->get_snap_errors(m_pg->get_pgid().pool(), arg.start_after, arg.max_return);
+ } else {
+ res_inout.vals = m_store->get_object_errors(m_pg->get_pgid().pool(), arg.start_after,
+ arg.max_return);
+ }
+ return true;
+}
+
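+// Compare the stats accumulated while scrubbing against the PG's recorded
+// stats; on a mismatch log a stat-mismatch error and, when repairing,
+// replace the recorded stats with the scrub-derived values.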
+void PrimaryLogScrub::_scrub_finish()
+{
+ auto& info = m_pg->info; ///< a temporary alias
+
+ dout(10) << __func__
+ << " info stats: " << (info.stats.stats_invalid ? "invalid" : "valid")
+ << dendl;
+
+ if (info.stats.stats_invalid) {
+ m_pl_pg->recovery_state.update_stats([=](auto& history, auto& stats) {
+ stats.stats = m_scrub_cstat;
+ stats.stats_invalid = false;
+ return false;
+ });
+
+ if (m_pl_pg->agent_state)
+ m_pl_pg->agent_choose_mode();
+ }
+
+ dout(10) << m_mode_desc << " got " << m_scrub_cstat.sum.num_objects << "/"
+ << info.stats.stats.sum.num_objects << " objects, "
+ << m_scrub_cstat.sum.num_object_clones << "/"
+ << info.stats.stats.sum.num_object_clones << " clones, "
+ << m_scrub_cstat.sum.num_objects_dirty << "/"
+ << info.stats.stats.sum.num_objects_dirty << " dirty, "
+ << m_scrub_cstat.sum.num_objects_omap << "/"
+ << info.stats.stats.sum.num_objects_omap << " omap, "
+ << m_scrub_cstat.sum.num_objects_pinned << "/"
+ << info.stats.stats.sum.num_objects_pinned << " pinned, "
+ << m_scrub_cstat.sum.num_objects_hit_set_archive << "/"
+ << info.stats.stats.sum.num_objects_hit_set_archive << " hit_set_archive, "
+ << m_scrub_cstat.sum.num_bytes << "/" << info.stats.stats.sum.num_bytes
+ << " bytes, " << m_scrub_cstat.sum.num_objects_manifest << "/"
+ << info.stats.stats.sum.num_objects_manifest << " manifest objects, "
+ << m_scrub_cstat.sum.num_bytes_hit_set_archive << "/"
+ << info.stats.stats.sum.num_bytes_hit_set_archive << " hit_set_archive bytes."
+ << dendl;
+
+ if (m_scrub_cstat.sum.num_objects != info.stats.stats.sum.num_objects ||
+ m_scrub_cstat.sum.num_object_clones != info.stats.stats.sum.num_object_clones ||
+ (m_scrub_cstat.sum.num_objects_dirty != info.stats.stats.sum.num_objects_dirty &&
+ !info.stats.dirty_stats_invalid) ||
+ (m_scrub_cstat.sum.num_objects_omap != info.stats.stats.sum.num_objects_omap &&
+ !info.stats.omap_stats_invalid) ||
+ (m_scrub_cstat.sum.num_objects_pinned != info.stats.stats.sum.num_objects_pinned &&
+ !info.stats.pin_stats_invalid) ||
+ (m_scrub_cstat.sum.num_objects_hit_set_archive !=
+ info.stats.stats.sum.num_objects_hit_set_archive &&
+ !info.stats.hitset_stats_invalid) ||
+ (m_scrub_cstat.sum.num_bytes_hit_set_archive !=
+ info.stats.stats.sum.num_bytes_hit_set_archive &&
+ !info.stats.hitset_bytes_stats_invalid) ||
+ (m_scrub_cstat.sum.num_objects_manifest !=
+ info.stats.stats.sum.num_objects_manifest &&
+ !info.stats.manifest_stats_invalid) ||
+ m_scrub_cstat.sum.num_whiteouts != info.stats.stats.sum.num_whiteouts ||
+ m_scrub_cstat.sum.num_bytes != info.stats.stats.sum.num_bytes) {
+ m_osds->clog->error() << info.pgid << " " << m_mode_desc << " : stat mismatch, got "
+ << m_scrub_cstat.sum.num_objects << "/"
+ << info.stats.stats.sum.num_objects << " objects, "
+ << m_scrub_cstat.sum.num_object_clones << "/"
+ << info.stats.stats.sum.num_object_clones << " clones, "
+ << m_scrub_cstat.sum.num_objects_dirty << "/"
+ << info.stats.stats.sum.num_objects_dirty << " dirty, "
+ << m_scrub_cstat.sum.num_objects_omap << "/"
+ << info.stats.stats.sum.num_objects_omap << " omap, "
+ << m_scrub_cstat.sum.num_objects_pinned << "/"
+ << info.stats.stats.sum.num_objects_pinned << " pinned, "
+ << m_scrub_cstat.sum.num_objects_hit_set_archive << "/"
+ << info.stats.stats.sum.num_objects_hit_set_archive
+ << " hit_set_archive, " << m_scrub_cstat.sum.num_whiteouts
+ << "/" << info.stats.stats.sum.num_whiteouts << " whiteouts, "
+ << m_scrub_cstat.sum.num_bytes << "/"
+ << info.stats.stats.sum.num_bytes << " bytes, "
+ << m_scrub_cstat.sum.num_objects_manifest << "/"
+ << info.stats.stats.sum.num_objects_manifest
+ << " manifest objects, "
+ << m_scrub_cstat.sum.num_bytes_hit_set_archive << "/"
+ << info.stats.stats.sum.num_bytes_hit_set_archive
+ << " hit_set_archive bytes.";
+ ++m_shallow_errors;
+
+ if (m_is_repair) {
+ ++m_fixed_count;
+ m_pl_pg->recovery_state.update_stats([this](auto& history, auto& stats) {
+ stats.stats = m_scrub_cstat;
+ stats.dirty_stats_invalid = false;
+ stats.omap_stats_invalid = false;
+ stats.hitset_stats_invalid = false;
+ stats.hitset_bytes_stats_invalid = false;
+ stats.pin_stats_invalid = false;
+ stats.manifest_stats_invalid = false;
+ return false;
+ });
+ m_pl_pg->publish_stats_to_osd();
+ m_pl_pg->recovery_state.share_pg_info();
+ }
+ }
+ // Clear object context cache to get repair information
+ if (m_is_repair)
+ m_pl_pg->object_contexts.clear();
+}
+
+static bool doing_clones(const std::optional<SnapSet>& snapset,
+ const vector<snapid_t>::reverse_iterator& curclone)
+{
+ return snapset && curclone != snapset->clones.rend();
+}
+
+void PrimaryLogScrub::log_missing(int missing,
+ const std::optional<hobject_t>& head,
+ LogChannelRef clog,
+ const spg_t& pgid,
+ const char* func,
+ bool allow_incomplete_clones)
+{
+ ceph_assert(head);
+ if (allow_incomplete_clones) {
+ dout(20) << func << " " << m_mode_desc << " " << pgid << " " << *head << " skipped "
+ << missing << " clone(s) in cache tier" << dendl;
+ } else {
+ clog->info() << m_mode_desc << " " << pgid << " " << *head << " : " << missing
+ << " missing clone(s)";
+ }
+}
+
+int PrimaryLogScrub::process_clones_to(const std::optional<hobject_t>& head,
+ const std::optional<SnapSet>& snapset,
+ LogChannelRef clog,
+ const spg_t& pgid,
+ bool allow_incomplete_clones,
+ std::optional<snapid_t> target,
+ vector<snapid_t>::reverse_iterator* curclone,
+ inconsistent_snapset_wrapper& e)
+{
+ ceph_assert(head);
+ ceph_assert(snapset);
+ int missing_count = 0;
+
+ // NOTE: clones are in descending order, thus **curclone > target test here
+ hobject_t next_clone(*head);
+ while (doing_clones(snapset, *curclone) && (!target || **curclone > *target)) {
+
+ ++missing_count;
+ // it is okay to be missing one or more clones in a cache tier.
+ // skip higher-numbered clones in the list.
+ if (!allow_incomplete_clones) {
+ next_clone.snap = **curclone;
+ clog->error() << m_mode_desc << " " << pgid << " " << *head << " : expected clone "
+ << next_clone << " " << m_missing << " missing";
+ ++m_shallow_errors;
+ e.set_clone_missing(next_clone.snap);
+ }
+ // Clones are descending
+ ++(*curclone);
+ }
+ return missing_count;
+}
+
+/*
+ * Validate consistency of the object info and snap sets.
+ *
+ * We are sort of comparing 2 lists. The main loop is on objmap.objects. But
+ * the comparison of the objects is against multiple snapset.clones. There are
+ * multiple clone lists and in between lists we expect head.
+ *
+ * Example
+ *
+ * objects expected
+ * ======= =======
+ * obj1 snap 1 head, unexpected obj1 snap 1
+ * obj2 head head, match
+ * [SnapSet clones 6 4 2 1]
+ * obj2 snap 7 obj2 snap 6, unexpected obj2 snap 7
+ * obj2 snap 6 obj2 snap 6, match
+ * obj2 snap 4 obj2 snap 4, match
+ * obj3 head obj2 snap 2 (expected), obj2 snap 1 (expected), match
+ * [Snapset clones 3 1]
+ * obj3 snap 3 obj3 snap 3 match
+ * obj3 snap 1 obj3 snap 1 match
+ * obj4 head head, match
+ * [Snapset clones 4]
+ * EOL obj4 snap 4, (expected)
+ */
+void PrimaryLogScrub::scrub_snapshot_metadata(ScrubMap& scrubmap,
+ const missing_map_t& missing_digest)
+{
+ dout(10) << __func__ << " num stat obj " << m_pl_pg->info.stats.stats.sum.num_objects
+ << dendl;
+
+ auto& info = m_pl_pg->info;
+ const PGPool& pool = m_pl_pg->pool;
+ bool allow_incomplete_clones = pool.info.allow_incomplete_clones();
+
+ std::optional<snapid_t> all_clones; // Unspecified snapid_t or std::nullopt
+
+ // traverse in reverse order.
+ std::optional<hobject_t> head;
+  std::optional<SnapSet> snapset; // if snapset is initialized, head (above) is too
+ vector<snapid_t>::reverse_iterator curclone; // Defined only if snapset initialized
+ int missing = 0;
+ inconsistent_snapset_wrapper soid_error, head_error;
+ int soid_error_count = 0;
+
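+  // Walk the scrub map in reverse so that, for each object, the head (and
+  // its SnapSet) is visited before its clones and can be used to validate
+  // them.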
+ for (auto p = scrubmap.objects.rbegin(); p != scrubmap.objects.rend(); ++p) {
+
+ const hobject_t& soid = p->first;
+ ceph_assert(!soid.is_snapdir());
+ soid_error = inconsistent_snapset_wrapper{soid};
+ object_stat_sum_t stat;
+ std::optional<object_info_t> oi;
+
+ stat.num_objects++;
+
+ if (soid.nspace == m_pl_pg->cct->_conf->osd_hit_set_namespace)
+ stat.num_objects_hit_set_archive++;
+
+ if (soid.is_snap()) {
+ // it's a clone
+ stat.num_object_clones++;
+ }
+
+ // basic checks.
+ if (p->second.attrs.count(OI_ATTR) == 0) {
+ oi = std::nullopt;
+ m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid << " : no '"
+ << OI_ATTR << "' attr";
+ ++m_shallow_errors;
+ soid_error.set_info_missing();
+ } else {
+ bufferlist bv;
+ bv.push_back(p->second.attrs[OI_ATTR]);
+ try {
+ oi = object_info_t(); // Initialize optional<> before decode into it
+ oi->decode(bv);
+ } catch (ceph::buffer::error& e) {
+ oi = std::nullopt;
+ m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid
+ << " : can't decode '" << OI_ATTR << "' attr " << e.what();
+ ++m_shallow_errors;
+ soid_error.set_info_corrupted();
+        soid_error.set_info_missing(); // effectively not available either
+ }
+ }
+
+ if (oi) {
+ if (m_pl_pg->pgbackend->be_get_ondisk_size(oi->size) != p->second.size) {
+ m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid
+ << " : on disk size (" << p->second.size
+ << ") does not match object info size (" << oi->size
+ << ") adjusted for ondisk to ("
+ << m_pl_pg->pgbackend->be_get_ondisk_size(oi->size) << ")";
+ soid_error.set_size_mismatch();
+ ++m_shallow_errors;
+ }
+
+ dout(20) << m_mode_desc << " " << soid << " " << *oi << dendl;
+
+ // A clone num_bytes will be added later when we have snapset
+ if (!soid.is_snap()) {
+ stat.num_bytes += oi->size;
+ }
+ if (soid.nspace == m_pl_pg->cct->_conf->osd_hit_set_namespace)
+ stat.num_bytes_hit_set_archive += oi->size;
+
+ if (oi->is_dirty())
+ ++stat.num_objects_dirty;
+ if (oi->is_whiteout())
+ ++stat.num_whiteouts;
+ if (oi->is_omap())
+ ++stat.num_objects_omap;
+ if (oi->is_cache_pinned())
+ ++stat.num_objects_pinned;
+ if (oi->has_manifest())
+ ++stat.num_objects_manifest;
+ }
+
+ // Check for any problems while processing clones
+ if (doing_clones(snapset, curclone)) {
+ std::optional<snapid_t> target;
+ // Expecting an object with snap for current head
+ if (soid.has_snapset() || soid.get_head() != head->get_head()) {
+
+ dout(10) << __func__ << " " << m_mode_desc << " " << info.pgid << " new object " << soid
+ << " while processing " << *head << dendl;
+
+ target = all_clones;
+ } else {
+ ceph_assert(soid.is_snap());
+ target = soid.snap;
+ }
+
+ // Log any clones we were expecting to be there up to target
+    // This will set missing, but will be a no-op if soid.snap == **curclone.
+ missing +=
+ process_clones_to(head, snapset, m_osds->clog, info.pgid,
+ allow_incomplete_clones, target, &curclone, head_error);
+ }
+
+ bool expected;
+ // Check doing_clones() again in case we ran process_clones_to()
+ if (doing_clones(snapset, curclone)) {
+ // A head would have processed all clones above
+ // or all greater than *curclone.
+ ceph_assert(soid.is_snap() && *curclone <= soid.snap);
+
+ // After processing above clone snap should match the expected curclone
+ expected = (*curclone == soid.snap);
+ } else {
+ // If we aren't doing clones any longer, then expecting head
+ expected = soid.has_snapset();
+ }
+ if (!expected) {
+ // If we couldn't read the head's snapset, just ignore clones
+ if (head && !snapset) {
+ m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid
+ << " : clone ignored due to missing snapset";
+ } else {
+ m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid
+ << " : is an unexpected clone";
+ }
+ ++m_shallow_errors;
+ soid_error.set_headless();
+ m_store->add_snap_error(pool.id, soid_error);
+ ++soid_error_count;
+ if (head && soid.get_head() == head->get_head())
+ head_error.set_clone(soid.snap);
+ continue;
+ }
+
+ // new snapset?
+ if (soid.has_snapset()) {
+
+ if (missing) {
+ log_missing(missing, head, m_osds->clog, info.pgid, __func__,
+ pool.info.allow_incomplete_clones());
+ }
+
+ // Save previous head error information
+ if (head && (head_error.errors || soid_error_count))
+ m_store->add_snap_error(pool.id, head_error);
+ // Set this as a new head object
+ head = soid;
+ missing = 0;
+ head_error = soid_error;
+ soid_error_count = 0;
+
+ dout(20) << __func__ << " " << m_mode_desc << " new head " << head << dendl;
+
+ if (p->second.attrs.count(SS_ATTR) == 0) {
+ m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid << " : no '"
+ << SS_ATTR << "' attr";
+ ++m_shallow_errors;
+ snapset = std::nullopt;
+ head_error.set_snapset_missing();
+ } else {
+ bufferlist bl;
+ bl.push_back(p->second.attrs[SS_ATTR]);
+ auto blp = bl.cbegin();
+ try {
+ snapset = SnapSet(); // Initialize optional<> before decoding into it
+ decode(*snapset, blp);
+ head_error.ss_bl.push_back(p->second.attrs[SS_ATTR]);
+ } catch (ceph::buffer::error& e) {
+ snapset = std::nullopt;
+ m_osds->clog->error()
+ << m_mode_desc << " " << info.pgid << " " << soid << " : can't decode '" << SS_ATTR
+ << "' attr " << e.what();
+ ++m_shallow_errors;
+ head_error.set_snapset_corrupted();
+ }
+ }
+
+ if (snapset) {
+ // what will be next?
+ curclone = snapset->clones.rbegin();
+
+ if (!snapset->clones.empty()) {
+ dout(20) << " snapset " << *snapset << dendl;
+ if (snapset->seq == 0) {
+ m_osds->clog->error()
+ << m_mode_desc << " " << info.pgid << " " << soid << " : snaps.seq not set";
+ ++m_shallow_errors;
+ head_error.set_snapset_error();
+ }
+ }
+ }
+ } else {
+ ceph_assert(soid.is_snap());
+ ceph_assert(head);
+ ceph_assert(snapset);
+ ceph_assert(soid.snap == *curclone);
+
+ dout(20) << __func__ << " " << m_mode_desc << " matched clone " << soid << dendl;
+
+ if (snapset->clone_size.count(soid.snap) == 0) {
+ m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid
+ << " : is missing in clone_size";
+ ++m_shallow_errors;
+ soid_error.set_size_mismatch();
+ } else {
+ if (oi && oi->size != snapset->clone_size[soid.snap]) {
+ m_osds->clog->error()
+ << m_mode_desc << " " << info.pgid << " " << soid << " : size " << oi->size
+ << " != clone_size " << snapset->clone_size[*curclone];
+ ++m_shallow_errors;
+ soid_error.set_size_mismatch();
+ }
+
+ if (snapset->clone_overlap.count(soid.snap) == 0) {
+ m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid
+ << " : is missing in clone_overlap";
+ ++m_shallow_errors;
+ soid_error.set_size_mismatch();
+ } else {
+ // This checking is based on get_clone_bytes(). The first 2 asserts
+ // can't happen because we know we have a clone_size and
+ // a clone_overlap. Now we check that the interval_set won't
+ // cause the last assert.
+ uint64_t size = snapset->clone_size.find(soid.snap)->second;
+ const interval_set<uint64_t>& overlap =
+ snapset->clone_overlap.find(soid.snap)->second;
+ bool bad_interval_set = false;
+ for (interval_set<uint64_t>::const_iterator i = overlap.begin();
+ i != overlap.end(); ++i) {
+ if (size < i.get_len()) {
+ bad_interval_set = true;
+ break;
+ }
+ size -= i.get_len();
+ }
+
+ if (bad_interval_set) {
+ m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid
+ << " : bad interval_set in clone_overlap";
+ ++m_shallow_errors;
+ soid_error.set_size_mismatch();
+ } else {
+ stat.num_bytes += snapset->get_clone_bytes(soid.snap);
+ }
+ }
+ }
+
+ // what's next?
+ ++curclone;
+ if (soid_error.errors) {
+ m_store->add_snap_error(pool.id, soid_error);
+ ++soid_error_count;
+ }
+ }
+ m_scrub_cstat.add(stat);
+ }
+
+ if (doing_clones(snapset, curclone)) {
+ dout(10) << __func__ << " " << m_mode_desc << " " << info.pgid
+ << " No more objects while processing " << *head << dendl;
+
+ missing +=
+ process_clones_to(head, snapset, m_osds->clog, info.pgid,
+ allow_incomplete_clones, all_clones, &curclone, head_error);
+ }
+
+  // Missing clones may have been found by the check above, or even earlier,
+  // before we dropped out of the loop for the last head.
+ if (missing) {
+ log_missing(missing, head, m_osds->clog, info.pgid, __func__,
+ allow_incomplete_clones);
+ }
+ if (head && (head_error.errors || soid_error_count))
+ m_store->add_snap_error(pool.id, head_error);
+
+ dout(20) << __func__ << " - " << missing << " (" << missing_digest.size() << ") missing"
+ << dendl;
+ for (auto p = missing_digest.begin(); p != missing_digest.end(); ++p) {
+
+ ceph_assert(!p->first.is_snapdir());
+ dout(10) << __func__ << " recording digests for " << p->first << dendl;
+
+ ObjectContextRef obc = m_pl_pg->get_object_context(p->first, false);
+ if (!obc) {
+ m_osds->clog->error() << info.pgid << " " << m_mode_desc
+ << " cannot get object context for object " << p->first;
+ continue;
+ }
+ if (obc->obs.oi.soid != p->first) {
+ m_osds->clog->error() << info.pgid << " " << m_mode_desc << " " << p->first
+ << " : object has a valid oi attr with a mismatched name, "
+ << " obc->obs.oi.soid: " << obc->obs.oi.soid;
+ continue;
+ }
+ PrimaryLogPG::OpContextUPtr ctx = m_pl_pg->simple_opc_create(obc);
+ ctx->at_version = m_pl_pg->get_next_version();
+ ctx->mtime = utime_t(); // do not update mtime
+ if (p->second.first) {
+ ctx->new_obs.oi.set_data_digest(*p->second.first);
+ } else {
+ ctx->new_obs.oi.clear_data_digest();
+ }
+ if (p->second.second) {
+ ctx->new_obs.oi.set_omap_digest(*p->second.second);
+ } else {
+ ctx->new_obs.oi.clear_omap_digest();
+ }
+ m_pl_pg->finish_ctx(ctx.get(), pg_log_entry_t::MODIFY);
+
+ ++num_digest_updates_pending;
+ ctx->register_on_success([this]() {
+ if ((num_digest_updates_pending >= 1) &&
+ (--num_digest_updates_pending == 0)) {
+ m_osds->queue_scrub_digest_update(m_pl_pg, m_pl_pg->is_scrub_blocking_ops());
+ }
+ });
+
+ m_pl_pg->simple_opc_submit(std::move(ctx));
+ }
+
+ dout(10) << __func__ << " (" << m_mode_desc << ") finish" << dendl;
+}
+
+PrimaryLogScrub::PrimaryLogScrub(PrimaryLogPG* pg) : PgScrubber{pg}, m_pl_pg{pg} {}
+
+void PrimaryLogScrub::_scrub_clear_state()
+{
+ dout(15) << __func__ << dendl;
+ m_scrub_cstat = object_stat_collection_t();
+}
+
+void PrimaryLogScrub::stats_of_handled_objects(const object_stat_sum_t& delta_stats,
+ const hobject_t& soid)
+{
+ // We scrub objects in hobject_t order, so objects before m_start have already been
+ // scrubbed and their stats have already been added to the scrubber. Objects after that
+ // point haven't been included in the scrubber's stats accounting yet, so they will be
+ // included when the scrubber gets to that object.
+ dout(15) << __func__ << " soid: " << soid << " scrub is active? " << is_scrub_active()
+ << dendl;
+ if (is_primary() && is_scrub_active()) {
+ if (soid < m_start) {
+ dout(20) << __func__ << " " << soid << " < [" << m_start << "," << m_end << ")"
+ << dendl;
+ m_scrub_cstat.add(delta_stats);
+ } else {
+ dout(20) << __func__ << " " << soid << " >= [" << m_start << "," << m_end << ")"
+ << dendl;
+ }
+ }
+}
diff --git a/src/osd/PrimaryLogScrub.h b/src/osd/PrimaryLogScrub.h
new file mode 100644
index 000000000..78353d6db
--- /dev/null
+++ b/src/osd/PrimaryLogScrub.h
@@ -0,0 +1,71 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#pragma once
+
+// the './' includes are marked this way to affect clang-format
+#include "./pg_scrubber.h"
+
+#include <iostream>
+#include <sstream>
+#include <vector>
+
+#include "debug.h"
+
+#include "common/errno.h"
+#include "common/scrub_types.h"
+#include "messages/MOSDOp.h"
+#include "messages/MOSDRepScrub.h"
+#include "messages/MOSDRepScrubMap.h"
+#include "messages/MOSDScrub.h"
+#include "messages/MOSDScrubReserve.h"
+
+#include "OSD.h"
+#include "scrub_machine.h"
+
+class PrimaryLogPG;
+
+/**
+ * The derivative of PgScrubber that is used by PrimaryLogPG.
+ */
+class PrimaryLogScrub : public PgScrubber {
+ public:
+ explicit PrimaryLogScrub(PrimaryLogPG* pg);
+
+ void _scrub_finish() final;
+
+ bool get_store_errors(const scrub_ls_arg_t& arg,
+ scrub_ls_result_t& res_inout) const final;
+
+ void stats_of_handled_objects(const object_stat_sum_t& delta_stats,
+ const hobject_t& soid) final;
+
+ private:
+ // we know our PG is actually a PrimaryLogPG. Let's alias the pointer to that object:
+ PrimaryLogPG* const m_pl_pg;
+
+ /**
+ * Validate consistency of the object info and snap sets.
+ */
+ void scrub_snapshot_metadata(ScrubMap& map, const missing_map_t& missing_digest) final;
+
+ void log_missing(int missing,
+ const std::optional<hobject_t>& head,
+ LogChannelRef clog,
+ const spg_t& pgid,
+ const char* func,
+ bool allow_incomplete_clones);
+
+ int process_clones_to(const std::optional<hobject_t>& head,
+ const std::optional<SnapSet>& snapset,
+ LogChannelRef clog,
+ const spg_t& pgid,
+ bool allow_incomplete_clones,
+ std::optional<snapid_t> target,
+ std::vector<snapid_t>::reverse_iterator* curclone,
+ inconsistent_snapset_wrapper& snap_error);
+
+
+ // handle our part in stats collection
+ object_stat_collection_t m_scrub_cstat;
+ void _scrub_clear_state() final; // which just clears the stats
+};
diff --git a/src/osd/ReplicatedBackend.cc b/src/osd/ReplicatedBackend.cc
new file mode 100644
index 000000000..1468764c3
--- /dev/null
+++ b/src/osd/ReplicatedBackend.cc
@@ -0,0 +1,2425 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Inktank Storage, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+#include "common/errno.h"
+#include "ReplicatedBackend.h"
+#include "messages/MOSDOp.h"
+#include "messages/MOSDRepOp.h"
+#include "messages/MOSDRepOpReply.h"
+#include "messages/MOSDPGPush.h"
+#include "messages/MOSDPGPull.h"
+#include "messages/MOSDPGPushReply.h"
+#include "common/EventTrace.h"
+#include "include/random.h"
+#include "include/util.h"
+#include "OSD.h"
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_osd
+#define DOUT_PREFIX_ARGS this
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, this)
+static ostream& _prefix(std::ostream *_dout, ReplicatedBackend *pgb) {
+ return pgb->get_parent()->gen_dbg_prefix(*_dout);
+}
+
+using std::list;
+using std::make_pair;
+using std::map;
+using std::ostringstream;
+using std::set;
+using std::pair;
+using std::string;
+using std::unique_ptr;
+using std::vector;
+
+using ceph::bufferhash;
+using ceph::bufferlist;
+using ceph::decode;
+using ceph::encode;
+
+namespace {
+class PG_SendMessageOnConn: public Context {
+ PGBackend::Listener *pg;
+ Message *reply;
+ ConnectionRef conn;
+ public:
+ PG_SendMessageOnConn(
+ PGBackend::Listener *pg,
+ Message *reply,
+ ConnectionRef conn) : pg(pg), reply(reply), conn(conn) {}
+ void finish(int) override {
+ pg->send_message_osd_cluster(MessageRef(reply, false), conn.get());
+ }
+};
+
+class PG_RecoveryQueueAsync : public Context {
+ PGBackend::Listener *pg;
+ unique_ptr<GenContext<ThreadPool::TPHandle&>> c;
+ public:
+ PG_RecoveryQueueAsync(
+ PGBackend::Listener *pg,
+ GenContext<ThreadPool::TPHandle&> *c) : pg(pg), c(c) {}
+ void finish(int) override {
+ pg->schedule_recovery_work(c.release());
+ }
+};
+}
+
+struct ReplicatedBackend::C_OSD_RepModifyCommit : public Context {
+ ReplicatedBackend *pg;
+ RepModifyRef rm;
+ C_OSD_RepModifyCommit(ReplicatedBackend *pg, RepModifyRef r)
+ : pg(pg), rm(r) {}
+ void finish(int r) override {
+ pg->repop_commit(rm);
+ }
+};
+
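+// Account a completed replica sub-op (write, push or pull) in the OSD perf
+// counters, with latency measured from the request's receive stamp.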
+static void log_subop_stats(
+ PerfCounters *logger,
+ OpRequestRef op, int subop)
+{
+ utime_t latency = ceph_clock_now();
+ latency -= op->get_req()->get_recv_stamp();
+
+
+ logger->inc(l_osd_sop);
+ logger->tinc(l_osd_sop_lat, latency);
+ logger->inc(subop);
+
+ if (subop != l_osd_sop_pull) {
+ uint64_t inb = op->get_req()->get_data().length();
+ logger->inc(l_osd_sop_inb, inb);
+ if (subop == l_osd_sop_w) {
+ logger->inc(l_osd_sop_w_inb, inb);
+ logger->tinc(l_osd_sop_w_lat, latency);
+ } else if (subop == l_osd_sop_push) {
+ logger->inc(l_osd_sop_push_inb, inb);
+ logger->tinc(l_osd_sop_push_lat, latency);
+ } else
+ ceph_abort_msg("no support subop");
+ } else {
+ logger->tinc(l_osd_sop_pull_lat, latency);
+ }
+}
+
+ReplicatedBackend::ReplicatedBackend(
+ PGBackend::Listener *pg,
+ const coll_t &coll,
+ ObjectStore::CollectionHandle &c,
+ ObjectStore *store,
+ CephContext *cct) :
+ PGBackend(cct, pg, store, coll, c) {}
+
+void ReplicatedBackend::run_recovery_op(
+ PGBackend::RecoveryHandle *_h,
+ int priority)
+{
+ RPGHandle *h = static_cast<RPGHandle *>(_h);
+ send_pushes(priority, h->pushes);
+ send_pulls(priority, h->pulls);
+ send_recovery_deletes(priority, h->deletes);
+ delete h;
+}
+
+int ReplicatedBackend::recover_object(
+ const hobject_t &hoid,
+ eversion_t v,
+ ObjectContextRef head,
+ ObjectContextRef obc,
+ RecoveryHandle *_h
+ )
+{
+ dout(10) << __func__ << ": " << hoid << dendl;
+ RPGHandle *h = static_cast<RPGHandle *>(_h);
+ if (get_parent()->get_local_missing().is_missing(hoid)) {
+ ceph_assert(!obc);
+ // pull
+ prepare_pull(
+ v,
+ hoid,
+ head,
+ h);
+ } else {
+ ceph_assert(obc);
+ int started = start_pushes(
+ hoid,
+ obc,
+ h);
+ if (started < 0) {
+ pushing[hoid].clear();
+ return started;
+ }
+ }
+ return 0;
+}
+
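+// Cancel and reset any in-flight pulls whose source OSD is marked down in
+// the new osdmap.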
+void ReplicatedBackend::check_recovery_sources(const OSDMapRef& osdmap)
+{
+ for(map<pg_shard_t, set<hobject_t> >::iterator i = pull_from_peer.begin();
+ i != pull_from_peer.end();
+ ) {
+ if (osdmap->is_down(i->first.osd)) {
+ dout(10) << "check_recovery_sources resetting pulls from osd." << i->first
+ << ", osdmap has it marked down" << dendl;
+ for (set<hobject_t>::iterator j = i->second.begin();
+ j != i->second.end();
+ ++j) {
+ get_parent()->cancel_pull(*j);
+ clear_pull(pulling.find(*j), false);
+ }
+ pull_from_peer.erase(i++);
+ } else {
+ ++i;
+ }
+ }
+}
+
+bool ReplicatedBackend::can_handle_while_inactive(OpRequestRef op)
+{
+ dout(10) << __func__ << ": " << op << dendl;
+ switch (op->get_req()->get_type()) {
+ case MSG_OSD_PG_PULL:
+ return true;
+ default:
+ return false;
+ }
+}
+
+bool ReplicatedBackend::_handle_message(
+ OpRequestRef op
+ )
+{
+ dout(10) << __func__ << ": " << op << dendl;
+ switch (op->get_req()->get_type()) {
+ case MSG_OSD_PG_PUSH:
+ do_push(op);
+ return true;
+
+ case MSG_OSD_PG_PULL:
+ do_pull(op);
+ return true;
+
+ case MSG_OSD_PG_PUSH_REPLY:
+ do_push_reply(op);
+ return true;
+
+ case MSG_OSD_REPOP: {
+ do_repop(op);
+ return true;
+ }
+
+ case MSG_OSD_REPOPREPLY: {
+ do_repop_reply(op);
+ return true;
+ }
+
+ default:
+ break;
+ }
+ return false;
+}
+
+void ReplicatedBackend::clear_recovery_state()
+{
+ // clear pushing/pulling maps
+ for (auto &&i: pushing) {
+ for (auto &&j: i.second) {
+ get_parent()->release_locks(j.second.lock_manager);
+ }
+ }
+ pushing.clear();
+
+ for (auto &&i: pulling) {
+ get_parent()->release_locks(i.second.lock_manager);
+ }
+ pulling.clear();
+ pull_from_peer.clear();
+}
+
+void ReplicatedBackend::on_change()
+{
+ dout(10) << __func__ << dendl;
+ for (auto& op : in_progress_ops) {
+ delete op.second->on_commit;
+ op.second->on_commit = nullptr;
+ }
+ in_progress_ops.clear();
+ clear_recovery_state();
+}
+
+int ReplicatedBackend::objects_read_sync(
+ const hobject_t &hoid,
+ uint64_t off,
+ uint64_t len,
+ uint32_t op_flags,
+ bufferlist *bl)
+{
+ return store->read(ch, ghobject_t(hoid), off, len, *bl, op_flags);
+}
+
+int ReplicatedBackend::objects_readv_sync(
+ const hobject_t &hoid,
+ map<uint64_t, uint64_t>&& m,
+ uint32_t op_flags,
+ bufferlist *bl)
+{
+ interval_set<uint64_t> im(std::move(m));
+ auto r = store->readv(ch, ghobject_t(hoid), im, *bl, op_flags);
+ if (r >= 0) {
+ m = std::move(im).detach();
+ }
+ return r;
+}
+
+void ReplicatedBackend::objects_read_async(
+ const hobject_t &hoid,
+ const list<pair<boost::tuple<uint64_t, uint64_t, uint32_t>,
+ pair<bufferlist*, Context*> > > &to_read,
+ Context *on_complete,
+ bool fast_read)
+{
+ ceph_abort_msg("async read is not used by replica pool");
+}
+
+class C_OSD_OnOpCommit : public Context {
+ ReplicatedBackend *pg;
+ ceph::ref_t<ReplicatedBackend::InProgressOp> op;
+public:
+ C_OSD_OnOpCommit(ReplicatedBackend *pg, ceph::ref_t<ReplicatedBackend::InProgressOp> op)
+ : pg(pg), op(std::move(op)) {}
+ void finish(int) override {
+ pg->op_commit(op);
+ }
+};
+
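+// Flatten a PGTransaction into a raw ObjectStore::Transaction: mark each log
+// entry unrollbackable, translate the per-object operations, and report which
+// temp objects were created or removed along the way.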
+void generate_transaction(
+ PGTransactionUPtr &pgt,
+ const coll_t &coll,
+ vector<pg_log_entry_t> &log_entries,
+ ObjectStore::Transaction *t,
+ set<hobject_t> *added,
+ set<hobject_t> *removed,
+ const ceph_release_t require_osd_release = ceph_release_t::unknown )
+{
+ ceph_assert(t);
+ ceph_assert(added);
+ ceph_assert(removed);
+
+ for (auto &&le: log_entries) {
+ le.mark_unrollbackable();
+ auto oiter = pgt->op_map.find(le.soid);
+ if (oiter != pgt->op_map.end() && oiter->second.updated_snaps) {
+ bufferlist bl(oiter->second.updated_snaps->second.size() * 8 + 8);
+ encode(oiter->second.updated_snaps->second, bl);
+ le.snaps.swap(bl);
+ le.snaps.reassign_to_mempool(mempool::mempool_osd_pglog);
+ }
+ }
+
+ pgt->safe_create_traverse(
+ [&](pair<const hobject_t, PGTransaction::ObjectOperation> &obj_op) {
+ const hobject_t &oid = obj_op.first;
+ const ghobject_t goid =
+ ghobject_t(oid, ghobject_t::NO_GEN, shard_id_t::NO_SHARD);
+ const PGTransaction::ObjectOperation &op = obj_op.second;
+
+ if (oid.is_temp()) {
+ if (op.is_fresh_object()) {
+ added->insert(oid);
+ } else if (op.is_delete()) {
+ removed->insert(oid);
+ }
+ }
+
+ if (op.delete_first) {
+ t->remove(coll, goid);
+ }
+
+ match(
+ op.init_type,
+ [&](const PGTransaction::ObjectOperation::Init::None &) {
+ },
+ [&](const PGTransaction::ObjectOperation::Init::Create &op) {
+ if (require_osd_release >= ceph_release_t::octopus) {
+ t->create(coll, goid);
+ } else {
+ t->touch(coll, goid);
+ }
+ },
+ [&](const PGTransaction::ObjectOperation::Init::Clone &op) {
+ t->clone(
+ coll,
+ ghobject_t(
+ op.source, ghobject_t::NO_GEN, shard_id_t::NO_SHARD),
+ goid);
+ },
+ [&](const PGTransaction::ObjectOperation::Init::Rename &op) {
+ ceph_assert(op.source.is_temp());
+ t->collection_move_rename(
+ coll,
+ ghobject_t(
+ op.source, ghobject_t::NO_GEN, shard_id_t::NO_SHARD),
+ coll,
+ goid);
+ });
+
+ if (op.truncate) {
+ t->truncate(coll, goid, op.truncate->first);
+ if (op.truncate->first != op.truncate->second)
+ t->truncate(coll, goid, op.truncate->second);
+ }
+
+ if (!op.attr_updates.empty()) {
+ map<string, bufferlist> attrs;
+ for (auto &&p: op.attr_updates) {
+ if (p.second)
+ attrs[p.first] = *(p.second);
+ else
+ t->rmattr(coll, goid, p.first);
+ }
+ t->setattrs(coll, goid, attrs);
+ }
+
+ if (op.clear_omap)
+ t->omap_clear(coll, goid);
+ if (op.omap_header)
+ t->omap_setheader(coll, goid, *(op.omap_header));
+
+ for (auto &&up: op.omap_updates) {
+ using UpdateType = PGTransaction::ObjectOperation::OmapUpdateType;
+ switch (up.first) {
+ case UpdateType::Remove:
+ t->omap_rmkeys(coll, goid, up.second);
+ break;
+ case UpdateType::Insert:
+ t->omap_setkeys(coll, goid, up.second);
+ break;
+ case UpdateType::RemoveRange:
+ t->omap_rmkeyrange(coll, goid, up.second);
+ break;
+ }
+ }
+
+ // updated_snaps doesn't matter since we marked unrollbackable
+
+ if (op.alloc_hint) {
+ auto &hint = *(op.alloc_hint);
+ t->set_alloc_hint(
+ coll,
+ goid,
+ hint.expected_object_size,
+ hint.expected_write_size,
+ hint.flags);
+ }
+
+ for (auto &&extent: op.buffer_updates) {
+ using BufferUpdate = PGTransaction::ObjectOperation::BufferUpdate;
+ match(
+ extent.get_val(),
+ [&](const BufferUpdate::Write &op) {
+ t->write(
+ coll,
+ goid,
+ extent.get_off(),
+ extent.get_len(),
+ op.buffer,
+ op.fadvise_flags);
+ },
+ [&](const BufferUpdate::Zero &op) {
+ t->zero(
+ coll,
+ goid,
+ extent.get_off(),
+ extent.get_len());
+ },
+ [&](const BufferUpdate::CloneRange &op) {
+ ceph_assert(op.len == extent.get_len());
+ t->clone_range(
+ coll,
+ ghobject_t(op.from, ghobject_t::NO_GEN, shard_id_t::NO_SHARD),
+ goid,
+ op.offset,
+ extent.get_len(),
+ extent.get_off());
+ });
+ }
+ });
+}
+
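+// Primary-side write path: flatten the PGTransaction, replicate it to the
+// acting/recovery-backfill shards via issue_op(), queue the local transaction,
+// and complete on_all_commit once every shard (including this one) commits.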
+void ReplicatedBackend::submit_transaction(
+ const hobject_t &soid,
+ const object_stat_sum_t &delta_stats,
+ const eversion_t &at_version,
+ PGTransactionUPtr &&_t,
+ const eversion_t &trim_to,
+ const eversion_t &min_last_complete_ondisk,
+ vector<pg_log_entry_t>&& _log_entries,
+ std::optional<pg_hit_set_history_t> &hset_history,
+ Context *on_all_commit,
+ ceph_tid_t tid,
+ osd_reqid_t reqid,
+ OpRequestRef orig_op)
+{
+ parent->apply_stats(
+ soid,
+ delta_stats);
+
+ vector<pg_log_entry_t> log_entries(_log_entries);
+ ObjectStore::Transaction op_t;
+ PGTransactionUPtr t(std::move(_t));
+ set<hobject_t> added, removed;
+ generate_transaction(
+ t,
+ coll,
+ log_entries,
+ &op_t,
+ &added,
+ &removed,
+ get_osdmap()->require_osd_release);
+ ceph_assert(added.size() <= 1);
+ ceph_assert(removed.size() <= 1);
+
+ auto insert_res = in_progress_ops.insert(
+ make_pair(
+ tid,
+ ceph::make_ref<InProgressOp>(
+ tid, on_all_commit,
+ orig_op, at_version)
+ )
+ );
+ ceph_assert(insert_res.second);
+ InProgressOp &op = *insert_res.first->second;
+
+#ifdef HAVE_JAEGER
+ auto rep_sub_trans = jaeger_tracing::child_span("ReplicatedBackend::submit_transaction", orig_op->osd_parent_span);
+#endif
+ op.waiting_for_commit.insert(
+ parent->get_acting_recovery_backfill_shards().begin(),
+ parent->get_acting_recovery_backfill_shards().end());
+
+ issue_op(
+ soid,
+ at_version,
+ tid,
+ reqid,
+ trim_to,
+ min_last_complete_ondisk,
+ added.size() ? *(added.begin()) : hobject_t(),
+ removed.size() ? *(removed.begin()) : hobject_t(),
+ log_entries,
+ hset_history,
+ &op,
+ op_t);
+
+ add_temp_objs(added);
+ clear_temp_objs(removed);
+
+ parent->log_operation(
+ std::move(log_entries),
+ hset_history,
+ trim_to,
+ at_version,
+ min_last_complete_ondisk,
+ true,
+ op_t);
+
+ op_t.register_on_commit(
+ parent->bless_context(
+ new C_OSD_OnOpCommit(this, &op)));
+
+ vector<ObjectStore::Transaction> tls;
+ tls.push_back(std::move(op_t));
+
+ parent->queue_transactions(tls, op.op);
+ if (at_version != eversion_t()) {
+ parent->op_applied(at_version);
+ }
+}
+
+void ReplicatedBackend::op_commit(const ceph::ref_t<InProgressOp>& op)
+{
+ if (op->on_commit == nullptr) {
+ // aborted
+ return;
+ }
+
+ FUNCTRACE(cct);
+ OID_EVENT_TRACE_WITH_MSG((op && op->op) ? op->op->get_req() : NULL, "OP_COMMIT_BEGIN", true);
+ dout(10) << __func__ << ": " << op->tid << dendl;
+ if (op->op) {
+ op->op->mark_event("op_commit");
+ op->op->pg_trace.event("op commit");
+ }
+
+ op->waiting_for_commit.erase(get_parent()->whoami_shard());
+
+ if (op->waiting_for_commit.empty()) {
+ op->on_commit->complete(0);
+ op->on_commit = 0;
+ in_progress_ops.erase(op->tid);
+ }
+}
+
+void ReplicatedBackend::do_repop_reply(OpRequestRef op)
+{
+ static_cast<MOSDRepOpReply*>(op->get_nonconst_req())->finish_decode();
+ auto r = op->get_req<MOSDRepOpReply>();
+ ceph_assert(r->get_header().type == MSG_OSD_REPOPREPLY);
+
+ op->mark_started();
+
+ // must be replication.
+ ceph_tid_t rep_tid = r->get_tid();
+ pg_shard_t from = r->from;
+
+ auto iter = in_progress_ops.find(rep_tid);
+ if (iter != in_progress_ops.end()) {
+ InProgressOp &ip_op = *iter->second;
+ const MOSDOp *m = nullptr;
+ if (ip_op.op)
+ m = ip_op.op->get_req<MOSDOp>();
+
+ if (m)
+ dout(7) << __func__ << ": tid " << ip_op.tid << " op " //<< *m
+ << " ack_type " << (int)r->ack_type
+ << " from " << from
+ << dendl;
+ else
+ dout(7) << __func__ << ": tid " << ip_op.tid << " (no op) "
+ << " ack_type " << (int)r->ack_type
+ << " from " << from
+ << dendl;
+
+ // oh, good.
+
+ if (r->ack_type & CEPH_OSD_FLAG_ONDISK) {
+ ceph_assert(ip_op.waiting_for_commit.count(from));
+ ip_op.waiting_for_commit.erase(from);
+ if (ip_op.op) {
+ ip_op.op->mark_event("sub_op_commit_rec");
+ ip_op.op->pg_trace.event("sub_op_commit_rec");
+ }
+ } else {
+ // legacy peer; ignore
+ }
+
+ parent->update_peer_last_complete_ondisk(
+ from,
+ r->get_last_complete_ondisk());
+
+ if (ip_op.waiting_for_commit.empty() &&
+ ip_op.on_commit) {
+ ip_op.on_commit->complete(0);
+ ip_op.on_commit = 0;
+ in_progress_ops.erase(iter);
+ }
+ }
+}
+
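+// Deep-scrub a single object incrementally: hash its data in
+// osd_deep_scrub_stride-sized reads, then the omap header and keys, returning
+// -EINPROGRESS until the whole object has been digested.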
+int ReplicatedBackend::be_deep_scrub(
+ const hobject_t &poid,
+ ScrubMap &map,
+ ScrubMapBuilder &pos,
+ ScrubMap::object &o)
+{
+ dout(10) << __func__ << " " << poid << " pos " << pos << dendl;
+ int r;
+ uint32_t fadvise_flags = CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL |
+ CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
+ CEPH_OSD_OP_FLAG_BYPASS_CLEAN_CACHE;
+
+ utime_t sleeptime;
+ sleeptime.set_from_double(cct->_conf->osd_debug_deep_scrub_sleep);
+ if (sleeptime != utime_t()) {
+ lgeneric_derr(cct) << __func__ << " sleeping for " << sleeptime << dendl;
+ sleeptime.sleep();
+ }
+
+ ceph_assert(poid == pos.ls[pos.pos]);
+ if (!pos.data_done()) {
+ if (pos.data_pos == 0) {
+ pos.data_hash = bufferhash(-1);
+ }
+
+ bufferlist bl;
+ r = store->read(
+ ch,
+ ghobject_t(
+ poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
+ pos.data_pos,
+ cct->_conf->osd_deep_scrub_stride, bl,
+ fadvise_flags);
+ if (r < 0) {
+ dout(20) << __func__ << " " << poid << " got "
+ << r << " on read, read_error" << dendl;
+ o.read_error = true;
+ return 0;
+ }
+ if (r > 0) {
+ pos.data_hash << bl;
+ }
+ pos.data_pos += r;
+ if (r == cct->_conf->osd_deep_scrub_stride) {
+ dout(20) << __func__ << " " << poid << " more data, digest so far 0x"
+ << std::hex << pos.data_hash.digest() << std::dec << dendl;
+ return -EINPROGRESS;
+ }
+ // done with bytes
+ pos.data_pos = -1;
+ o.digest = pos.data_hash.digest();
+ o.digest_present = true;
+ dout(20) << __func__ << " " << poid << " done with data, digest 0x"
+ << std::hex << o.digest << std::dec << dendl;
+ }
+
+ // omap header
+ if (pos.omap_pos.empty()) {
+ pos.omap_hash = bufferhash(-1);
+
+ bufferlist hdrbl;
+ r = store->omap_get_header(
+ ch,
+ ghobject_t(
+ poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
+ &hdrbl, true);
+ if (r == -EIO) {
+ dout(20) << __func__ << " " << poid << " got "
+ << r << " on omap header read, read_error" << dendl;
+ o.read_error = true;
+ return 0;
+ }
+ if (r == 0 && hdrbl.length()) {
+ bool encoded = false;
+ dout(25) << "CRC header " << cleanbin(hdrbl, encoded, true) << dendl;
+ pos.omap_hash << hdrbl;
+ }
+ }
+
+ // omap
+ ObjectMap::ObjectMapIterator iter = store->get_omap_iterator(
+ ch,
+ ghobject_t(
+ poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
+ ceph_assert(iter);
+ if (pos.omap_pos.length()) {
+ iter->lower_bound(pos.omap_pos);
+ } else {
+ iter->seek_to_first();
+ }
+ int max = g_conf()->osd_deep_scrub_keys;
+ while (iter->status() == 0 && iter->valid()) {
+ pos.omap_bytes += iter->value().length();
+ ++pos.omap_keys;
+ --max;
+ // fixme: we can do this more efficiently.
+ bufferlist bl;
+ encode(iter->key(), bl);
+ encode(iter->value(), bl);
+ pos.omap_hash << bl;
+
+ iter->next();
+
+ if (iter->valid() && max == 0) {
+ pos.omap_pos = iter->key();
+ return -EINPROGRESS;
+ }
+ if (iter->status() < 0) {
+ dout(25) << __func__ << " " << poid
+ << " on omap scan, db status error" << dendl;
+ o.read_error = true;
+ return 0;
+ }
+ }
+
+ if (pos.omap_keys > cct->_conf->
+ osd_deep_scrub_large_omap_object_key_threshold ||
+ pos.omap_bytes > cct->_conf->
+ osd_deep_scrub_large_omap_object_value_sum_threshold) {
+ dout(25) << __func__ << " " << poid
+ << " large omap object detected. Object has " << pos.omap_keys
+ << " keys and size " << pos.omap_bytes << " bytes" << dendl;
+ o.large_omap_object_found = true;
+ o.large_omap_object_key_count = pos.omap_keys;
+ o.large_omap_object_value_size = pos.omap_bytes;
+ map.has_large_omap_object_errors = true;
+ }
+
+ o.omap_digest = pos.omap_hash.digest();
+ o.omap_digest_present = true;
+ dout(20) << __func__ << " done with " << poid << " omap_digest "
+ << std::hex << o.omap_digest << std::dec << dendl;
+
+ // Sum up omap usage
+ if (pos.omap_keys > 0 || pos.omap_bytes > 0) {
+ dout(25) << __func__ << " adding " << pos.omap_keys << " keys and "
+ << pos.omap_bytes << " bytes to pg_stats sums" << dendl;
+ map.has_omap_keys = true;
+ o.object_omap_bytes = pos.omap_bytes;
+ o.object_omap_keys = pos.omap_keys;
+ }
+
+ // done!
+ return 0;
+}
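
be_deep_scrub reads the object in osd_deep_scrub_stride-sized chunks, folds each chunk into a running hash, and returns -EINPROGRESS so the scheduler can requeue the scrub and resume from the saved ScrubMapBuilder position. A minimal sketch of that resume-by-cursor pattern over an in-memory object, with a toy checksum standing in for bufferhash/CRC32C (names and the return code below are illustrative only):

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

constexpr int RC_EINPROGRESS = -1;       // illustrative stand-in for -EINPROGRESS

struct ScrubPos {
  size_t data_pos = 0;                   // next offset to read
  uint32_t hash = 0;                     // running checksum (toy; the real code uses CRC32C)
};

// Scrub one stride; return RC_EINPROGRESS while more data remains, 0 when done.
int deep_scrub_step(const std::vector<uint8_t>& obj, size_t stride, ScrubPos& pos) {
  size_t n = std::min(stride, obj.size() - pos.data_pos);
  for (size_t i = 0; i < n; ++i)
    pos.hash = pos.hash * 131 + obj[pos.data_pos + i];   // toy rolling hash
  pos.data_pos += n;
  if (n == stride && pos.data_pos < obj.size())
    return RC_EINPROGRESS;               // not finished; caller requeues and resumes later
  return 0;
}

int main() {
  std::vector<uint8_t> obj(10000, 0xab);
  ScrubPos pos;
  int steps = 1;
  while (deep_scrub_step(obj, 4096, pos) == RC_EINPROGRESS)
    ++steps;                             // a real scrubber would yield to other work here
  std::cout << "scrubbed in " << steps << " steps, digest 0x"
            << std::hex << pos.hash << "\n";
}
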
+
+void ReplicatedBackend::_do_push(OpRequestRef op)
+{
+ auto m = op->get_req<MOSDPGPush>();
+ ceph_assert(m->get_type() == MSG_OSD_PG_PUSH);
+ pg_shard_t from = m->from;
+
+ op->mark_started();
+
+ vector<PushReplyOp> replies;
+ ObjectStore::Transaction t;
+ if (get_parent()->check_failsafe_full()) {
+ dout(10) << __func__ << " Out of space (failsafe) processing push request." << dendl;
+ ceph_abort();
+ }
+ for (vector<PushOp>::const_iterator i = m->pushes.begin();
+ i != m->pushes.end();
+ ++i) {
+ replies.push_back(PushReplyOp());
+ handle_push(from, *i, &(replies.back()), &t, m->is_repair);
+ }
+
+ MOSDPGPushReply *reply = new MOSDPGPushReply;
+ reply->from = get_parent()->whoami_shard();
+ reply->set_priority(m->get_priority());
+ reply->pgid = get_info().pgid;
+ reply->map_epoch = m->map_epoch;
+ reply->min_epoch = m->min_epoch;
+ reply->replies.swap(replies);
+ reply->compute_cost(cct);
+
+ t.register_on_complete(
+ new PG_SendMessageOnConn(
+ get_parent(), reply, m->get_connection()));
+
+ get_parent()->queue_transaction(std::move(t));
+}
+
+struct C_ReplicatedBackend_OnPullComplete : GenContext<ThreadPool::TPHandle&> {
+ ReplicatedBackend *bc;
+ list<ReplicatedBackend::pull_complete_info> to_continue;
+ int priority;
+ C_ReplicatedBackend_OnPullComplete(ReplicatedBackend *bc, int priority)
+ : bc(bc), priority(priority) {}
+
+ void finish(ThreadPool::TPHandle &handle) override {
+ ReplicatedBackend::RPGHandle *h = bc->_open_recovery_op();
+ for (auto &&i: to_continue) {
+ auto j = bc->pulling.find(i.hoid);
+ ceph_assert(j != bc->pulling.end());
+ ObjectContextRef obc = j->second.obc;
+ bc->clear_pull(j, false /* already did it */);
+ int started = bc->start_pushes(i.hoid, obc, h);
+ if (started < 0) {
+ bc->pushing[i.hoid].clear();
+ bc->get_parent()->on_failed_pull(
+ { bc->get_parent()->whoami_shard() },
+ i.hoid, obc->obs.oi.version);
+ } else if (!started) {
+ bc->get_parent()->on_global_recover(
+ i.hoid, i.stat, false);
+ }
+ handle.reset_tp_timeout();
+ }
+ bc->run_recovery_op(h, priority);
+ }
+};
+
+void ReplicatedBackend::_do_pull_response(OpRequestRef op)
+{
+ auto m = op->get_req<MOSDPGPush>();
+ ceph_assert(m->get_type() == MSG_OSD_PG_PUSH);
+ pg_shard_t from = m->from;
+
+ op->mark_started();
+
+ vector<PullOp> replies(1);
+ if (get_parent()->check_failsafe_full()) {
+ dout(10) << __func__ << " Out of space (failsafe) processing pull response (push)." << dendl;
+ ceph_abort();
+ }
+
+ ObjectStore::Transaction t;
+ list<pull_complete_info> to_continue;
+ for (vector<PushOp>::const_iterator i = m->pushes.begin();
+ i != m->pushes.end();
+ ++i) {
+ bool more = handle_pull_response(from, *i, &(replies.back()), &to_continue, &t);
+ if (more)
+ replies.push_back(PullOp());
+ }
+ if (!to_continue.empty()) {
+ C_ReplicatedBackend_OnPullComplete *c =
+ new C_ReplicatedBackend_OnPullComplete(
+ this,
+ m->get_priority());
+ c->to_continue.swap(to_continue);
+ t.register_on_complete(
+ new PG_RecoveryQueueAsync(
+ get_parent(),
+ get_parent()->bless_unlocked_gencontext(c)));
+ }
+ replies.erase(replies.end() - 1);
+
+ if (replies.size()) {
+ MOSDPGPull *reply = new MOSDPGPull;
+ reply->from = parent->whoami_shard();
+ reply->set_priority(m->get_priority());
+ reply->pgid = get_info().pgid;
+ reply->map_epoch = m->map_epoch;
+ reply->min_epoch = m->min_epoch;
+ reply->set_pulls(std::move(replies));
+ reply->compute_cost(cct);
+
+ t.register_on_complete(
+ new PG_SendMessageOnConn(
+ get_parent(), reply, m->get_connection()));
+ }
+
+ get_parent()->queue_transaction(std::move(t));
+}
+
+void ReplicatedBackend::do_pull(OpRequestRef op)
+{
+ MOSDPGPull *m = static_cast<MOSDPGPull *>(op->get_nonconst_req());
+ ceph_assert(m->get_type() == MSG_OSD_PG_PULL);
+ pg_shard_t from = m->from;
+
+ map<pg_shard_t, vector<PushOp> > replies;
+ for (auto& i : m->take_pulls()) {
+ replies[from].push_back(PushOp());
+ handle_pull(from, i, &(replies[from].back()));
+ }
+ send_pushes(m->get_priority(), replies);
+}
+
+void ReplicatedBackend::do_push_reply(OpRequestRef op)
+{
+ auto m = op->get_req<MOSDPGPushReply>();
+ ceph_assert(m->get_type() == MSG_OSD_PG_PUSH_REPLY);
+ pg_shard_t from = m->from;
+
+ vector<PushOp> replies(1);
+ for (vector<PushReplyOp>::const_iterator i = m->replies.begin();
+ i != m->replies.end();
+ ++i) {
+ bool more = handle_push_reply(from, *i, &(replies.back()));
+ if (more)
+ replies.push_back(PushOp());
+ }
+ replies.erase(replies.end() - 1);
+
+ map<pg_shard_t, vector<PushOp> > _replies;
+ _replies[from].swap(replies);
+ send_pushes(m->get_priority(), _replies);
+}
+
+Message * ReplicatedBackend::generate_subop(
+ const hobject_t &soid,
+ const eversion_t &at_version,
+ ceph_tid_t tid,
+ osd_reqid_t reqid,
+ eversion_t pg_trim_to,
+ eversion_t min_last_complete_ondisk,
+ hobject_t new_temp_oid,
+ hobject_t discard_temp_oid,
+ const bufferlist &log_entries,
+ std::optional<pg_hit_set_history_t> &hset_hist,
+ ObjectStore::Transaction &op_t,
+ pg_shard_t peer,
+ const pg_info_t &pinfo)
+{
+ int acks_wanted = CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK;
+ // forward the write/update/whatever
+ MOSDRepOp *wr = new MOSDRepOp(
+ reqid, parent->whoami_shard(),
+ spg_t(get_info().pgid.pgid, peer.shard),
+ soid, acks_wanted,
+ get_osdmap_epoch(),
+ parent->get_last_peering_reset_epoch(),
+ tid, at_version);
+
+ // ship resulting transaction, log entries, and pg_stats
+ if (!parent->should_send_op(peer, soid)) {
+ ObjectStore::Transaction t;
+ encode(t, wr->get_data());
+ } else {
+ encode(op_t, wr->get_data());
+ wr->get_header().data_off = op_t.get_data_alignment();
+ }
+
+ wr->logbl = log_entries;
+
+ if (pinfo.is_incomplete())
+ wr->pg_stats = pinfo.stats; // reflects backfill progress
+ else
+ wr->pg_stats = get_info().stats;
+
+ wr->pg_trim_to = pg_trim_to;
+
+ if (HAVE_FEATURE(parent->min_peer_features(), OSD_REPOP_MLCOD)) {
+ wr->min_last_complete_ondisk = min_last_complete_ondisk;
+ } else {
+ /* Some replicas need this field to be at_version. New replicas
+ * will ignore it */
+ wr->set_rollback_to(at_version);
+ }
+
+ wr->new_temp_oid = new_temp_oid;
+ wr->discard_temp_oid = discard_temp_oid;
+ wr->updated_hit_set_history = hset_hist;
+ return wr;
+}
+
+void ReplicatedBackend::issue_op(
+ const hobject_t &soid,
+ const eversion_t &at_version,
+ ceph_tid_t tid,
+ osd_reqid_t reqid,
+ eversion_t pg_trim_to,
+ eversion_t min_last_complete_ondisk,
+ hobject_t new_temp_oid,
+ hobject_t discard_temp_oid,
+ const vector<pg_log_entry_t> &log_entries,
+ std::optional<pg_hit_set_history_t> &hset_hist,
+ InProgressOp *op,
+ ObjectStore::Transaction &op_t)
+{
+ if (parent->get_acting_recovery_backfill_shards().size() > 1) {
+ if (op->op) {
+ op->op->pg_trace.event("issue replication ops");
+ ostringstream ss;
+ set<pg_shard_t> replicas = parent->get_acting_recovery_backfill_shards();
+ replicas.erase(parent->whoami_shard());
+ ss << "waiting for subops from " << replicas;
+ op->op->mark_sub_op_sent(ss.str());
+ }
+
+ // avoid doing the same work in generate_subop
+ bufferlist logs;
+ encode(log_entries, logs);
+
+ for (const auto& shard : get_parent()->get_acting_recovery_backfill_shards()) {
+ if (shard == parent->whoami_shard()) continue;
+ const pg_info_t &pinfo = parent->get_shard_info().find(shard)->second;
+
+ Message *wr;
+ wr = generate_subop(
+ soid,
+ at_version,
+ tid,
+ reqid,
+ pg_trim_to,
+ min_last_complete_ondisk,
+ new_temp_oid,
+ discard_temp_oid,
+ logs,
+ hset_hist,
+ op_t,
+ shard,
+ pinfo);
+ if (op->op && op->op->pg_trace)
+ wr->trace.init("replicated op", nullptr, &op->op->pg_trace);
+ get_parent()->send_message_osd_cluster(
+ shard.osd, wr, get_osdmap_epoch());
+ }
+ }
+}
+
+// sub op modify
+void ReplicatedBackend::do_repop(OpRequestRef op)
+{
+ static_cast<MOSDRepOp*>(op->get_nonconst_req())->finish_decode();
+ auto m = op->get_req<MOSDRepOp>();
+ int msg_type = m->get_type();
+ ceph_assert(MSG_OSD_REPOP == msg_type);
+
+ const hobject_t& soid = m->poid;
+
+ dout(10) << __func__ << " " << soid
+ << " v " << m->version
+ << (m->logbl.length() ? " (transaction)" : " (parallel exec)")
+ << " " << m->logbl.length()
+ << dendl;
+
+#ifdef HAVE_JAEGER
+ auto do_repop_span = jaeger_tracing::child_span(__func__, op->osd_parent_span);
+#endif
+
+ // sanity checks
+ ceph_assert(m->map_epoch >= get_info().history.same_interval_since);
+
+ dout(30) << __func__ << " missing before " << get_parent()->get_log().get_missing().get_items() << dendl;
+ parent->maybe_preempt_replica_scrub(soid);
+
+ int ackerosd = m->get_source().num();
+
+ op->mark_started();
+
+ RepModifyRef rm(std::make_shared<RepModify>());
+ rm->op = op;
+ rm->ackerosd = ackerosd;
+ rm->last_complete = get_info().last_complete;
+ rm->epoch_started = get_osdmap_epoch();
+
+ ceph_assert(m->logbl.length());
+ // shipped transaction and log entries
+ vector<pg_log_entry_t> log;
+
+ auto p = const_cast<bufferlist&>(m->get_data()).cbegin();
+ decode(rm->opt, p);
+
+ if (m->new_temp_oid != hobject_t()) {
+ dout(20) << __func__ << " start tracking temp " << m->new_temp_oid << dendl;
+ add_temp_obj(m->new_temp_oid);
+ }
+ if (m->discard_temp_oid != hobject_t()) {
+ dout(20) << __func__ << " stop tracking temp " << m->discard_temp_oid << dendl;
+ if (rm->opt.empty()) {
+ dout(10) << __func__ << ": removing object " << m->discard_temp_oid
+ << " since we won't get the transaction" << dendl;
+ rm->localt.remove(coll, ghobject_t(m->discard_temp_oid));
+ }
+ clear_temp_obj(m->discard_temp_oid);
+ }
+
+ p = const_cast<bufferlist&>(m->logbl).begin();
+ decode(log, p);
+ rm->opt.set_fadvise_flag(CEPH_OSD_OP_FLAG_FADVISE_DONTNEED);
+
+ bool update_snaps = false;
+ if (!rm->opt.empty()) {
+ // If the opt is non-empty, we infer we are before
+ // last_backfill (according to the primary, not our
+ // not-quite-accurate value), and should update the
+ // collections now. Otherwise, we do it later on push.
+ update_snaps = true;
+ }
+
+ // flag set to true during async recovery
+ bool async = false;
+ pg_missing_tracker_t pmissing = get_parent()->get_local_missing();
+ if (pmissing.is_missing(soid)) {
+ async = true;
+ dout(30) << __func__ << " is_missing " << pmissing.is_missing(soid) << dendl;
+ for (auto &&e: log) {
+ dout(30) << " add_next_event entry " << e << dendl;
+ get_parent()->add_local_next_event(e);
+ dout(30) << " entry is_delete " << e.is_delete() << dendl;
+ }
+ }
+
+ parent->update_stats(m->pg_stats);
+ parent->log_operation(
+ std::move(log),
+ m->updated_hit_set_history,
+ m->pg_trim_to,
+ m->version, /* Replicated PGs don't have rollback info */
+ m->min_last_complete_ondisk,
+ update_snaps,
+ rm->localt,
+ async);
+
+ rm->opt.register_on_commit(
+ parent->bless_context(
+ new C_OSD_RepModifyCommit(this, rm)));
+ vector<ObjectStore::Transaction> tls;
+ tls.reserve(2);
+ tls.push_back(std::move(rm->localt));
+ tls.push_back(std::move(rm->opt));
+ parent->queue_transactions(tls, op);
+ // op is cleaned up by oncommit/onapply when both are executed
+ dout(30) << __func__ << " missing after" << get_parent()->get_log().get_missing().get_items() << dendl;
+}
+
+void ReplicatedBackend::repop_commit(RepModifyRef rm)
+{
+ rm->op->mark_commit_sent();
+ rm->op->pg_trace.event("sub_op_commit");
+ rm->committed = true;
+
+ // send commit.
+ auto m = rm->op->get_req<MOSDRepOp>();
+ ceph_assert(m->get_type() == MSG_OSD_REPOP);
+ dout(10) << __func__ << " on op " << *m
+ << ", sending commit to osd." << rm->ackerosd
+ << dendl;
+ ceph_assert(get_osdmap()->is_up(rm->ackerosd));
+
+ get_parent()->update_last_complete_ondisk(rm->last_complete);
+
+ MOSDRepOpReply *reply = new MOSDRepOpReply(
+ m,
+ get_parent()->whoami_shard(),
+ 0, get_osdmap_epoch(), m->get_min_epoch(), CEPH_OSD_FLAG_ONDISK);
+ reply->set_last_complete_ondisk(rm->last_complete);
+ reply->set_priority(CEPH_MSG_PRIO_HIGH); // this better match ack priority!
+ reply->trace = rm->op->pg_trace;
+ get_parent()->send_message_osd_cluster(
+ rm->ackerosd, reply, get_osdmap_epoch());
+
+ log_subop_stats(get_parent()->get_logger(), rm->op, l_osd_sop_w);
+}
+
+
+// ===========================================================
+
+void ReplicatedBackend::calc_head_subsets(
+ ObjectContextRef obc, SnapSet& snapset, const hobject_t& head,
+ const pg_missing_t& missing,
+ const hobject_t &last_backfill,
+ interval_set<uint64_t>& data_subset,
+ map<hobject_t, interval_set<uint64_t>>& clone_subsets,
+ ObcLockManager &manager)
+{
+ dout(10) << "calc_head_subsets " << head
+ << " clone_overlap " << snapset.clone_overlap << dendl;
+
+ uint64_t size = obc->obs.oi.size;
+ if (size)
+ data_subset.insert(0, size);
+
+ if (HAVE_FEATURE(parent->min_peer_features(), SERVER_OCTOPUS)) {
+ const auto it = missing.get_items().find(head);
+ assert(it != missing.get_items().end());
+ data_subset.intersection_of(it->second.clean_regions.get_dirty_regions());
+ dout(10) << "calc_head_subsets " << head
+ << " data_subset " << data_subset << dendl;
+ }
+
+ if (get_parent()->get_pool().allow_incomplete_clones()) {
+ dout(10) << __func__ << ": caching (was) enabled, skipping clone subsets" << dendl;
+ return;
+ }
+
+ if (!cct->_conf->osd_recover_clone_overlap) {
+ dout(10) << "calc_head_subsets " << head << " -- osd_recover_clone_overlap disabled" << dendl;
+ return;
+ }
+
+
+ interval_set<uint64_t> cloning;
+ interval_set<uint64_t> prev;
+ hobject_t c = head;
+ if (size)
+ prev.insert(0, size);
+
+ for (int j=snapset.clones.size()-1; j>=0; j--) {
+ c.snap = snapset.clones[j];
+ prev.intersection_of(snapset.clone_overlap[snapset.clones[j]]);
+ if (!missing.is_missing(c) &&
+ c < last_backfill &&
+ get_parent()->try_lock_for_read(c, manager)) {
+ dout(10) << "calc_head_subsets " << head << " has prev " << c
+ << " overlap " << prev << dendl;
+ cloning = prev;
+ break;
+ }
+ dout(10) << "calc_head_subsets " << head << " does not have prev " << c
+ << " overlap " << prev << dendl;
+ }
+
+ cloning.intersection_of(data_subset);
+ if (cloning.empty()) {
+ dout(10) << "skipping clone, nothing needs to clone" << dendl;
+ return;
+ }
+
+ if (cloning.num_intervals() > g_conf().get_val<uint64_t>("osd_recover_clone_overlap_limit")) {
+ dout(10) << "skipping clone, too many holes" << dendl;
+ get_parent()->release_locks(manager);
+ clone_subsets.clear();
+ cloning.clear();
+ return;
+ }
+
+ // what's left for us to push?
+ clone_subsets[c] = cloning;
+ data_subset.subtract(cloning);
+
+ dout(10) << "calc_head_subsets " << head
+ << " data_subset " << data_subset
+ << " clone_subsets " << clone_subsets << dendl;
+}
+
+void ReplicatedBackend::calc_clone_subsets(
+ SnapSet& snapset, const hobject_t& soid,
+ const pg_missing_t& missing,
+ const hobject_t &last_backfill,
+ interval_set<uint64_t>& data_subset,
+ map<hobject_t, interval_set<uint64_t>>& clone_subsets,
+ ObcLockManager &manager)
+{
+ dout(10) << "calc_clone_subsets " << soid
+ << " clone_overlap " << snapset.clone_overlap << dendl;
+
+ uint64_t size = snapset.clone_size[soid.snap];
+ if (size)
+ data_subset.insert(0, size);
+
+ if (get_parent()->get_pool().allow_incomplete_clones()) {
+ dout(10) << __func__ << ": caching (was) enabled, skipping clone subsets" << dendl;
+ return;
+ }
+
+ if (!cct->_conf->osd_recover_clone_overlap) {
+ dout(10) << "calc_clone_subsets " << soid << " -- osd_recover_clone_overlap disabled" << dendl;
+ return;
+ }
+
+ unsigned i;
+ for (i=0; i < snapset.clones.size(); i++)
+ if (snapset.clones[i] == soid.snap)
+ break;
+
+ // any overlap with next older clone?
+ interval_set<uint64_t> cloning;
+ interval_set<uint64_t> prev;
+ if (size)
+ prev.insert(0, size);
+ for (int j=i-1; j>=0; j--) {
+ hobject_t c = soid;
+ c.snap = snapset.clones[j];
+ prev.intersection_of(snapset.clone_overlap[snapset.clones[j]]);
+ if (!missing.is_missing(c) &&
+ c < last_backfill &&
+ get_parent()->try_lock_for_read(c, manager)) {
+ dout(10) << "calc_clone_subsets " << soid << " has prev " << c
+ << " overlap " << prev << dendl;
+ clone_subsets[c] = prev;
+ cloning.union_of(prev);
+ break;
+ }
+ dout(10) << "calc_clone_subsets " << soid << " does not have prev " << c
+ << " overlap " << prev << dendl;
+ }
+
+ // overlap with next newest?
+ interval_set<uint64_t> next;
+ if (size)
+ next.insert(0, size);
+ for (unsigned j=i+1; j<snapset.clones.size(); j++) {
+ hobject_t c = soid;
+ c.snap = snapset.clones[j];
+ next.intersection_of(snapset.clone_overlap[snapset.clones[j-1]]);
+ if (!missing.is_missing(c) &&
+ c < last_backfill &&
+ get_parent()->try_lock_for_read(c, manager)) {
+ dout(10) << "calc_clone_subsets " << soid << " has next " << c
+ << " overlap " << next << dendl;
+ clone_subsets[c] = next;
+ cloning.union_of(next);
+ break;
+ }
+ dout(10) << "calc_clone_subsets " << soid << " does not have next " << c
+ << " overlap " << next << dendl;
+ }
+
+ if (cloning.num_intervals() > g_conf().get_val<uint64_t>("osd_recover_clone_overlap_limit")) {
+ dout(10) << "skipping clone, too many holes" << dendl;
+ get_parent()->release_locks(manager);
+ clone_subsets.clear();
+ cloning.clear();
+ }
+
+
+ // what's left for us to push?
+ data_subset.subtract(cloning);
+
+ dout(10) << "calc_clone_subsets " << soid
+ << " data_subset " << data_subset
+ << " clone_subsets " << clone_subsets << dendl;
+}
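
calc_clone_subsets is interval arithmetic: the candidate overlap shrinks by intersection with each clone's recorded clone_overlap, and whatever can be cloned from a local neighbour is subtracted from the set that must actually be pushed. A small sketch of those two operations with a plain vector of half-open ranges standing in for interval_set<uint64_t> (ranges assumed sorted and non-overlapping; this is not Ceph code):

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <utility>
#include <vector>

using Range = std::pair<uint64_t, uint64_t>;     // half-open [start, end)
using Ranges = std::vector<Range>;

// Intersection of two sorted, non-overlapping range lists.
Ranges intersect(const Ranges& a, const Ranges& b) {
  Ranges out;
  size_t i = 0, j = 0;
  while (i < a.size() && j < b.size()) {
    uint64_t lo = std::max(a[i].first, b[j].first);
    uint64_t hi = std::min(a[i].second, b[j].second);
    if (lo < hi) out.push_back({lo, hi});
    (a[i].second < b[j].second) ? ++i : ++j;
  }
  return out;
}

// a minus b: the part of 'a' not covered by 'b' (b assumed sorted).
Ranges subtract(const Ranges& a, const Ranges& b) {
  Ranges out;
  for (auto [s, e] : a) {
    uint64_t cur = s;
    for (auto [bs, be] : b) {
      if (be <= cur || bs >= e) continue;
      if (bs > cur) out.push_back({cur, bs});
      cur = std::max(cur, be);
    }
    if (cur < e) out.push_back({cur, e});
  }
  return out;
}

int main() {
  Ranges data_subset   = {{0, 4096}};                        // whole object needs recovery
  Ranges clone_overlap = {{0, 1024}, {2048, 3072}};          // ranges shared with a local clone
  Ranges cloneable = intersect(data_subset, clone_overlap);  // can be clone_range'd locally
  Ranges to_push   = subtract(data_subset, cloneable);       // must be pushed over the wire
  for (auto [s, e] : to_push)
    std::cout << "push " << s << "~" << e - s << "\n";       // 1024~1024 and 3072~1024
}
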
+
+void ReplicatedBackend::prepare_pull(
+ eversion_t v,
+ const hobject_t& soid,
+ ObjectContextRef headctx,
+ RPGHandle *h)
+{
+ const auto missing_iter = get_parent()->get_local_missing().get_items().find(soid);
+ ceph_assert(missing_iter != get_parent()->get_local_missing().get_items().end());
+ eversion_t _v = missing_iter->second.need;
+ ceph_assert(_v == v);
+ const map<hobject_t, set<pg_shard_t>> &missing_loc(
+ get_parent()->get_missing_loc_shards());
+ const map<pg_shard_t, pg_missing_t > &peer_missing(
+ get_parent()->get_shard_missing());
+ map<hobject_t, set<pg_shard_t>>::const_iterator q = missing_loc.find(soid);
+ ceph_assert(q != missing_loc.end());
+ ceph_assert(!q->second.empty());
+
+ // pick a pullee
+ auto p = q->second.end();
+ if (cct->_conf->osd_debug_feed_pullee >= 0) {
+ for (auto it = q->second.begin(); it != q->second.end(); it++) {
+ if (it->osd == cct->_conf->osd_debug_feed_pullee) {
+ p = it;
+ break;
+ }
+ }
+ }
+ if (p == q->second.end()) {
+ // probably because the user fed a wrong pullee
+ p = q->second.begin();
+ std::advance(p,
+ ceph::util::generate_random_number<int>(0,
+ q->second.size() - 1));
+ }
+ ceph_assert(get_osdmap()->is_up(p->osd));
+ pg_shard_t fromshard = *p;
+
+ dout(7) << "pull " << soid
+ << " v " << v
+ << " on osds " << q->second
+ << " from osd." << fromshard
+ << dendl;
+
+ ceph_assert(peer_missing.count(fromshard));
+ const pg_missing_t &pmissing = peer_missing.find(fromshard)->second;
+ if (pmissing.is_missing(soid, v)) {
+ ceph_assert(pmissing.get_items().find(soid)->second.have != v);
+ dout(10) << "pulling soid " << soid << " from osd " << fromshard
+ << " at version " << pmissing.get_items().find(soid)->second.have
+ << " rather than at version " << v << dendl;
+ v = pmissing.get_items().find(soid)->second.have;
+ ceph_assert(get_parent()->get_log().get_log().objects.count(soid) &&
+ (get_parent()->get_log().get_log().objects.find(soid)->second->op ==
+ pg_log_entry_t::LOST_REVERT) &&
+ (get_parent()->get_log().get_log().objects.find(
+ soid)->second->reverting_to ==
+ v));
+ }
+
+ ObjectRecoveryInfo recovery_info;
+ ObcLockManager lock_manager;
+
+ if (soid.is_snap()) {
+ ceph_assert(!get_parent()->get_local_missing().is_missing(soid.get_head()));
+ ceph_assert(headctx);
+ // check snapset
+ SnapSetContext *ssc = headctx->ssc;
+ ceph_assert(ssc);
+ dout(10) << " snapset " << ssc->snapset << dendl;
+ recovery_info.ss = ssc->snapset;
+ calc_clone_subsets(
+ ssc->snapset, soid, get_parent()->get_local_missing(),
+ get_info().last_backfill,
+ recovery_info.copy_subset,
+ recovery_info.clone_subset,
+ lock_manager);
+ // FIXME: this may overestimate if we are pulling multiple clones in parallel...
+ dout(10) << " pulling " << recovery_info << dendl;
+
+ ceph_assert(ssc->snapset.clone_size.count(soid.snap));
+ recovery_info.size = ssc->snapset.clone_size[soid.snap];
+ recovery_info.object_exist = missing_iter->second.clean_regions.object_is_exist();
+ } else {
+ // pulling head or unversioned object.
+ // always pull the whole thing.
+ recovery_info.copy_subset.insert(0, (uint64_t)-1);
+ if (HAVE_FEATURE(parent->min_peer_features(), SERVER_OCTOPUS))
+ recovery_info.copy_subset.intersection_of(missing_iter->second.clean_regions.get_dirty_regions());
+ recovery_info.size = ((uint64_t)-1);
+ recovery_info.object_exist = missing_iter->second.clean_regions.object_is_exist();
+ }
+
+ h->pulls[fromshard].push_back(PullOp());
+ PullOp &op = h->pulls[fromshard].back();
+ op.soid = soid;
+
+ op.recovery_info = recovery_info;
+ op.recovery_info.soid = soid;
+ op.recovery_info.version = v;
+ op.recovery_progress.data_complete = false;
+ op.recovery_progress.omap_complete = !missing_iter->second.clean_regions.omap_is_dirty()
+ && HAVE_FEATURE(parent->min_peer_features(), SERVER_OCTOPUS);
+ op.recovery_progress.data_recovered_to = 0;
+ op.recovery_progress.first = true;
+
+ ceph_assert(!pulling.count(soid));
+ pull_from_peer[fromshard].insert(soid);
+ PullInfo &pi = pulling[soid];
+ pi.from = fromshard;
+ pi.soid = soid;
+ pi.head_ctx = headctx;
+ pi.recovery_info = op.recovery_info;
+ pi.recovery_progress = op.recovery_progress;
+ pi.cache_dont_need = h->cache_dont_need;
+ pi.lock_manager = std::move(lock_manager);
+}
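
prepare_pull selects a single source for the pull: the osd_debug_feed_pullee override if it names one of the known locations, otherwise a uniformly random candidate. A tiny sketch of that selection, with std::mt19937 in place of ceph::util::generate_random_number (helper names are illustrative):

#include <iostream>
#include <iterator>
#include <optional>
#include <random>
#include <set>

// Pick a pull source: honour 'preferred' if it is a known location, else pick at random.
int pick_pullee(const std::set<int>& candidates, std::optional<int> preferred) {
  if (preferred && candidates.count(*preferred))
    return *preferred;                           // debug override matched a real location
  static std::mt19937 rng{std::random_device{}()};
  std::uniform_int_distribution<size_t> dist(0, candidates.size() - 1);
  auto it = candidates.begin();
  std::advance(it, dist(rng));
  return *it;
}

int main() {
  std::set<int> locations = {3, 7, 11};          // shards known to hold the object
  std::cout << "pull from osd." << pick_pullee(locations, std::nullopt) << "\n";
  std::cout << "pull from osd." << pick_pullee(locations, 7) << "\n";   // always 7
}
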
+
+/*
+ * intelligently push an object to a replica. make use of existing
+ * clones/heads and dup data ranges where possible.
+ */
+int ReplicatedBackend::prep_push_to_replica(
+ ObjectContextRef obc, const hobject_t& soid, pg_shard_t peer,
+ PushOp *pop, bool cache_dont_need)
+{
+ const object_info_t& oi = obc->obs.oi;
+ uint64_t size = obc->obs.oi.size;
+
+ dout(10) << __func__ << ": " << soid << " v" << oi.version
+ << " size " << size << " to osd." << peer << dendl;
+
+ map<hobject_t, interval_set<uint64_t>> clone_subsets;
+ interval_set<uint64_t> data_subset;
+
+ ObcLockManager lock_manager;
+ // are we doing a clone on the replica?
+ if (soid.snap && soid.snap < CEPH_NOSNAP) {
+ hobject_t head = soid;
+ head.snap = CEPH_NOSNAP;
+
+ // try to base the push off of clones that precede/succeed poid
+ // we need the head (and current SnapSet) locally to do that.
+ if (get_parent()->get_local_missing().is_missing(head)) {
+ dout(15) << "push_to_replica missing head " << head << ", pushing raw clone" << dendl;
+ return prep_push(obc, soid, peer, pop, cache_dont_need);
+ }
+
+ SnapSetContext *ssc = obc->ssc;
+ ceph_assert(ssc);
+ dout(15) << "push_to_replica snapset is " << ssc->snapset << dendl;
+ pop->recovery_info.ss = ssc->snapset;
+ map<pg_shard_t, pg_missing_t>::const_iterator pm =
+ get_parent()->get_shard_missing().find(peer);
+ ceph_assert(pm != get_parent()->get_shard_missing().end());
+ map<pg_shard_t, pg_info_t>::const_iterator pi =
+ get_parent()->get_shard_info().find(peer);
+ ceph_assert(pi != get_parent()->get_shard_info().end());
+ calc_clone_subsets(
+ ssc->snapset, soid,
+ pm->second,
+ pi->second.last_backfill,
+ data_subset, clone_subsets,
+ lock_manager);
+ } else if (soid.snap == CEPH_NOSNAP) {
+ // pushing head or unversioned object.
+ // base this partially on the replica's clones?
+ SnapSetContext *ssc = obc->ssc;
+ ceph_assert(ssc);
+ dout(15) << "push_to_replica snapset is " << ssc->snapset << dendl;
+ calc_head_subsets(
+ obc,
+ ssc->snapset, soid, get_parent()->get_shard_missing().find(peer)->second,
+ get_parent()->get_shard_info().find(peer)->second.last_backfill,
+ data_subset, clone_subsets,
+ lock_manager);
+ }
+
+ return prep_push(
+ obc,
+ soid,
+ peer,
+ oi.version,
+ data_subset,
+ clone_subsets,
+ pop,
+ cache_dont_need,
+ std::move(lock_manager));
+}
+
+int ReplicatedBackend::prep_push(ObjectContextRef obc,
+ const hobject_t& soid, pg_shard_t peer,
+ PushOp *pop, bool cache_dont_need)
+{
+ interval_set<uint64_t> data_subset;
+ if (obc->obs.oi.size)
+ data_subset.insert(0, obc->obs.oi.size);
+ map<hobject_t, interval_set<uint64_t>> clone_subsets;
+
+ return prep_push(obc, soid, peer,
+ obc->obs.oi.version, data_subset, clone_subsets,
+ pop, cache_dont_need, ObcLockManager());
+}
+
+int ReplicatedBackend::prep_push(
+ ObjectContextRef obc,
+ const hobject_t& soid, pg_shard_t peer,
+ eversion_t version,
+ interval_set<uint64_t> &data_subset,
+ map<hobject_t, interval_set<uint64_t>>& clone_subsets,
+ PushOp *pop,
+ bool cache_dont_need,
+ ObcLockManager &&lock_manager)
+{
+ get_parent()->begin_peer_recover(peer, soid);
+ const auto pmissing_iter = get_parent()->get_shard_missing().find(peer);
+ const auto missing_iter = pmissing_iter->second.get_items().find(soid);
+ assert(missing_iter != pmissing_iter->second.get_items().end());
+ // take note.
+ PushInfo &pi = pushing[soid][peer];
+ pi.obc = obc;
+ pi.recovery_info.size = obc->obs.oi.size;
+ pi.recovery_info.copy_subset = data_subset;
+ pi.recovery_info.clone_subset = clone_subsets;
+ pi.recovery_info.soid = soid;
+ pi.recovery_info.oi = obc->obs.oi;
+ pi.recovery_info.ss = pop->recovery_info.ss;
+ pi.recovery_info.version = version;
+ pi.recovery_info.object_exist = missing_iter->second.clean_regions.object_is_exist();
+ pi.recovery_progress.omap_complete = !missing_iter->second.clean_regions.omap_is_dirty() &&
+ HAVE_FEATURE(parent->min_peer_features(), SERVER_OCTOPUS);
+ pi.lock_manager = std::move(lock_manager);
+
+ ObjectRecoveryProgress new_progress;
+ int r = build_push_op(pi.recovery_info,
+ pi.recovery_progress,
+ &new_progress,
+ pop,
+ &(pi.stat), cache_dont_need);
+ if (r < 0)
+ return r;
+ pi.recovery_progress = new_progress;
+ return 0;
+}
+
+void ReplicatedBackend::submit_push_data(
+ const ObjectRecoveryInfo &recovery_info,
+ bool first,
+ bool complete,
+ bool clear_omap,
+ bool cache_dont_need,
+ interval_set<uint64_t> &data_zeros,
+ const interval_set<uint64_t> &intervals_included,
+ bufferlist data_included,
+ bufferlist omap_header,
+ const map<string, bufferlist> &attrs,
+ const map<string, bufferlist> &omap_entries,
+ ObjectStore::Transaction *t)
+{
+ hobject_t target_oid;
+ if (first && complete) {
+ target_oid = recovery_info.soid;
+ } else {
+ target_oid = get_parent()->get_temp_recovery_object(recovery_info.soid,
+ recovery_info.version);
+ if (first) {
+ dout(10) << __func__ << ": Adding oid "
+ << target_oid << " in the temp collection" << dendl;
+ add_temp_obj(target_oid);
+ }
+ }
+
+ if (first) {
+ if (!complete) {
+ t->remove(coll, ghobject_t(target_oid));
+ t->touch(coll, ghobject_t(target_oid));
+ bufferlist bv = attrs.at(OI_ATTR);
+ object_info_t oi(bv);
+ t->set_alloc_hint(coll, ghobject_t(target_oid),
+ oi.expected_object_size,
+ oi.expected_write_size,
+ oi.alloc_hint_flags);
+ } else {
+ if (!recovery_info.object_exist) {
+ t->remove(coll, ghobject_t(target_oid));
+ t->touch(coll, ghobject_t(target_oid));
+ bufferlist bv = attrs.at(OI_ATTR);
+ object_info_t oi(bv);
+ t->set_alloc_hint(coll, ghobject_t(target_oid),
+ oi.expected_object_size,
+ oi.expected_write_size,
+ oi.alloc_hint_flags);
+ }
+ // remove xattrs and update them later if overwriting the original object
+ t->rmattrs(coll, ghobject_t(target_oid));
+ // if the omap needs updating, clear the previous content first
+ if (clear_omap)
+ t->omap_clear(coll, ghobject_t(target_oid));
+ }
+
+ t->truncate(coll, ghobject_t(target_oid), recovery_info.size);
+ if (omap_header.length())
+ t->omap_setheader(coll, ghobject_t(target_oid), omap_header);
+
+ struct stat st;
+ int r = store->stat(ch, ghobject_t(recovery_info.soid), &st);
+ if (get_parent()->pg_is_remote_backfilling()) {
+ uint64_t size = 0;
+ if (r == 0)
+ size = st.st_size;
+ // Don't need to do anything if object is still the same size
+ if (size != recovery_info.oi.size) {
+ get_parent()->pg_add_local_num_bytes((int64_t)recovery_info.oi.size - (int64_t)size);
+ get_parent()->pg_add_num_bytes((int64_t)recovery_info.oi.size - (int64_t)size);
+ dout(10) << __func__ << " " << recovery_info.soid
+ << " backfill size " << recovery_info.oi.size
+ << " previous size " << size
+ << " net size " << recovery_info.oi.size - size
+ << dendl;
+ }
+ }
+ if (!complete) {
+ // clone overlapping content from the local object
+ if (recovery_info.object_exist) {
+ assert(r == 0);
+ uint64_t local_size = std::min(recovery_info.size, (uint64_t)st.st_size);
+ interval_set<uint64_t> local_intervals_included, local_intervals_excluded;
+ if (local_size) {
+ local_intervals_included.insert(0, local_size);
+ local_intervals_excluded.intersection_of(local_intervals_included, recovery_info.copy_subset);
+ local_intervals_included.subtract(local_intervals_excluded);
+ }
+ for (interval_set<uint64_t>::const_iterator q = local_intervals_included.begin();
+ q != local_intervals_included.end();
+ ++q) {
+ dout(15) << " clone_range " << recovery_info.soid << " "
+ << q.get_start() << "~" << q.get_len() << dendl;
+ t->clone_range(coll, ghobject_t(recovery_info.soid), ghobject_t(target_oid),
+ q.get_start(), q.get_len(), q.get_start());
+ }
+ }
+ }
+ }
+ uint64_t off = 0;
+ uint32_t fadvise_flags = CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL;
+ if (cache_dont_need)
+ fadvise_flags |= CEPH_OSD_OP_FLAG_FADVISE_DONTNEED;
+ // Punch zeros for data, if fiemap indicates nothing but it is marked dirty
+ if (data_zeros.size() > 0) {
+ data_zeros.intersection_of(recovery_info.copy_subset);
+ assert(intervals_included.subset_of(data_zeros));
+ data_zeros.subtract(intervals_included);
+
+ dout(20) << __func__ <<" recovering object " << recovery_info.soid
+ << " copy_subset: " << recovery_info.copy_subset
+ << " intervals_included: " << intervals_included
+ << " data_zeros: " << data_zeros << dendl;
+
+ for (auto p = data_zeros.begin(); p != data_zeros.end(); ++p)
+ t->zero(coll, ghobject_t(target_oid), p.get_start(), p.get_len());
+ }
+ for (interval_set<uint64_t>::const_iterator p = intervals_included.begin();
+ p != intervals_included.end();
+ ++p) {
+ bufferlist bit;
+ bit.substr_of(data_included, off, p.get_len());
+ t->write(coll, ghobject_t(target_oid),
+ p.get_start(), p.get_len(), bit, fadvise_flags);
+ off += p.get_len();
+ }
+
+ if (!omap_entries.empty())
+ t->omap_setkeys(coll, ghobject_t(target_oid), omap_entries);
+ if (!attrs.empty())
+ t->setattrs(coll, ghobject_t(target_oid), attrs);
+
+ if (complete) {
+ if (!first) {
+ dout(10) << __func__ << ": Removing oid "
+ << target_oid << " from the temp collection" << dendl;
+ clear_temp_obj(target_oid);
+ t->remove(coll, ghobject_t(recovery_info.soid));
+ t->collection_move_rename(coll, ghobject_t(target_oid),
+ coll, ghobject_t(recovery_info.soid));
+ }
+
+ submit_push_complete(recovery_info, t);
+
+ }
+}
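
submit_push_data stages a multi-chunk recovery into a temporary object and only renames it over the real object once the final chunk lands, so a half-transferred object is never visible under its real name. A minimal sketch of that staging pattern against a toy in-memory store (a stand-in for ObjectStore::Transaction; names are illustrative):

#include <iostream>
#include <map>
#include <string>
#include <utility>

// Toy object store: name -> data.
using Store = std::map<std::string, std::string>;

std::string temp_name(const std::string& oid) { return ".recovery_tmp_" + oid; }

// Apply one recovery chunk. Writes go to a temp object unless the whole object
// arrives in a single chunk; the temp object is renamed into place at the end.
void push_chunk(Store& store, const std::string& oid,
                bool first, bool complete, const std::string& chunk) {
  const bool single_chunk = first && complete;
  const std::string target = single_chunk ? oid : temp_name(oid);
  if (first)
    store[target].clear();                       // start from a clean slate
  store[target] += chunk;
  if (complete && !single_chunk) {
    store[oid] = std::move(store[temp_name(oid)]);   // "collection_move_rename"
    store.erase(temp_name(oid));
  }
}

int main() {
  Store s;
  push_chunk(s, "objA", true,  false, "hello ");  // staged under the temp name
  push_chunk(s, "objA", false, true,  "world");   // final chunk: renamed into place
  std::cout << "objA = " << s["objA"] << "\n";    // "hello world"
}
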
+
+void ReplicatedBackend::submit_push_complete(
+ const ObjectRecoveryInfo &recovery_info,
+ ObjectStore::Transaction *t)
+{
+ for (map<hobject_t, interval_set<uint64_t>>::const_iterator p =
+ recovery_info.clone_subset.begin();
+ p != recovery_info.clone_subset.end();
+ ++p) {
+ for (interval_set<uint64_t>::const_iterator q = p->second.begin();
+ q != p->second.end();
+ ++q) {
+ dout(15) << " clone_range " << p->first << " "
+ << q.get_start() << "~" << q.get_len() << dendl;
+ t->clone_range(coll, ghobject_t(p->first), ghobject_t(recovery_info.soid),
+ q.get_start(), q.get_len(), q.get_start());
+ }
+ }
+}
+
+ObjectRecoveryInfo ReplicatedBackend::recalc_subsets(
+ const ObjectRecoveryInfo& recovery_info,
+ SnapSetContext *ssc,
+ ObcLockManager &manager)
+{
+ if (!recovery_info.soid.snap || recovery_info.soid.snap >= CEPH_NOSNAP)
+ return recovery_info;
+ ObjectRecoveryInfo new_info = recovery_info;
+ new_info.copy_subset.clear();
+ new_info.clone_subset.clear();
+ ceph_assert(ssc);
+ get_parent()->release_locks(manager); // might already have locks
+ calc_clone_subsets(
+ ssc->snapset, new_info.soid, get_parent()->get_local_missing(),
+ get_info().last_backfill,
+ new_info.copy_subset, new_info.clone_subset,
+ manager);
+ return new_info;
+}
+
+bool ReplicatedBackend::handle_pull_response(
+ pg_shard_t from, const PushOp &pop, PullOp *response,
+ list<pull_complete_info> *to_continue,
+ ObjectStore::Transaction *t)
+{
+ interval_set<uint64_t> data_included = pop.data_included;
+ bufferlist data;
+ data = pop.data;
+ dout(10) << "handle_pull_response "
+ << pop.recovery_info
+ << pop.after_progress
+ << " data.size() is " << data.length()
+ << " data_included: " << data_included
+ << dendl;
+ if (pop.version == eversion_t()) {
+ // replica doesn't have it!
+ _failed_pull(from, pop.soid);
+ return false;
+ }
+
+ const hobject_t &hoid = pop.soid;
+ ceph_assert((data_included.empty() && data.length() == 0) ||
+ (!data_included.empty() && data.length() > 0));
+
+ auto piter = pulling.find(hoid);
+ if (piter == pulling.end()) {
+ return false;
+ }
+
+ PullInfo &pi = piter->second;
+ if (pi.recovery_info.size == (uint64_t(-1))) {
+ pi.recovery_info.size = pop.recovery_info.size;
+ pi.recovery_info.copy_subset.intersection_of(
+ pop.recovery_info.copy_subset);
+ }
+ // If the primary doesn't have the object info and didn't know the version
+ if (pi.recovery_info.version == eversion_t()) {
+ pi.recovery_info.version = pop.version;
+ }
+
+ bool first = pi.recovery_progress.first;
+ if (first) {
+ // attrs only reference the origin bufferlist (decoded from the
+ // MOSDPGPush message), whose size is much greater than the attrs
+ // needed for recovery. If the obc caches them (get_obc may cache the
+ // attrs), the whole origin bufferlist cannot be freed until the obc
+ // is evicted from the obc cache. So rebuild the bufferlists before
+ // caching them.
+ auto attrset = pop.attrset;
+ for (auto& a : attrset) {
+ a.second.rebuild();
+ }
+ pi.obc = get_parent()->get_obc(pi.recovery_info.soid, attrset);
+ if (attrset.find(SS_ATTR) != attrset.end()) {
+ bufferlist ssbv = attrset.at(SS_ATTR);
+ SnapSet ss(ssbv);
+ assert(!pi.obc->ssc->exists || ss.seq == pi.obc->ssc->snapset.seq);
+ }
+ pi.recovery_info.oi = pi.obc->obs.oi;
+ pi.recovery_info = recalc_subsets(
+ pi.recovery_info,
+ pi.obc->ssc,
+ pi.lock_manager);
+ }
+
+
+ interval_set<uint64_t> usable_intervals;
+ bufferlist usable_data;
+ trim_pushed_data(pi.recovery_info.copy_subset,
+ data_included,
+ data,
+ &usable_intervals,
+ &usable_data);
+ data_included = usable_intervals;
+ data = std::move(usable_data);
+
+
+ pi.recovery_progress = pop.after_progress;
+
+ dout(10) << "new recovery_info " << pi.recovery_info
+ << ", new progress " << pi.recovery_progress
+ << dendl;
+ interval_set<uint64_t> data_zeros;
+ uint64_t z_offset = pop.before_progress.data_recovered_to;
+ uint64_t z_length = pop.after_progress.data_recovered_to - pop.before_progress.data_recovered_to;
+ if (z_length)
+ data_zeros.insert(z_offset, z_length);
+ bool complete = pi.is_complete();
+ bool clear_omap = !pop.before_progress.omap_complete;
+
+ submit_push_data(pi.recovery_info,
+ first,
+ complete,
+ clear_omap,
+ pi.cache_dont_need,
+ data_zeros,
+ data_included,
+ data,
+ pop.omap_header,
+ pop.attrset,
+ pop.omap_entries,
+ t);
+
+ pi.stat.num_keys_recovered += pop.omap_entries.size();
+ pi.stat.num_bytes_recovered += data.length();
+ get_parent()->get_logger()->inc(l_osd_rbytes, pop.omap_entries.size() + data.length());
+
+ if (complete) {
+ pi.stat.num_objects_recovered++;
+ // XXX: This could overcount if regular recovery is needed right after a repair
+ if (get_parent()->pg_is_repair()) {
+ pi.stat.num_objects_repaired++;
+ get_parent()->inc_osd_stat_repaired();
+ }
+ clear_pull_from(piter);
+ to_continue->push_back({hoid, pi.stat});
+ get_parent()->on_local_recover(
+ hoid, pi.recovery_info, pi.obc, false, t);
+ return false;
+ } else {
+ response->soid = pop.soid;
+ response->recovery_info = pi.recovery_info;
+ response->recovery_progress = pi.recovery_progress;
+ return true;
+ }
+}
+
+void ReplicatedBackend::handle_push(
+ pg_shard_t from, const PushOp &pop, PushReplyOp *response,
+ ObjectStore::Transaction *t, bool is_repair)
+{
+ dout(10) << "handle_push "
+ << pop.recovery_info
+ << pop.after_progress
+ << dendl;
+ bufferlist data;
+ data = pop.data;
+ bool first = pop.before_progress.first;
+ bool complete = pop.after_progress.data_complete &&
+ pop.after_progress.omap_complete;
+ bool clear_omap = !pop.before_progress.omap_complete;
+ interval_set<uint64_t> data_zeros;
+ uint64_t z_offset = pop.before_progress.data_recovered_to;
+ uint64_t z_length = pop.after_progress.data_recovered_to - pop.before_progress.data_recovered_to;
+ if (z_length)
+ data_zeros.insert(z_offset, z_length);
+ response->soid = pop.recovery_info.soid;
+
+ submit_push_data(pop.recovery_info,
+ first,
+ complete,
+ clear_omap,
+ true, // must be a replica
+ data_zeros,
+ pop.data_included,
+ data,
+ pop.omap_header,
+ pop.attrset,
+ pop.omap_entries,
+ t);
+
+ if (complete) {
+ if (is_repair) {
+ get_parent()->inc_osd_stat_repaired();
+ dout(20) << __func__ << " repair complete" << dendl;
+ }
+ get_parent()->on_local_recover(
+ pop.recovery_info.soid,
+ pop.recovery_info,
+ ObjectContextRef(), // ok, is replica
+ false,
+ t);
+ }
+}
+
+void ReplicatedBackend::send_pushes(int prio, map<pg_shard_t, vector<PushOp> > &pushes)
+{
+ for (map<pg_shard_t, vector<PushOp> >::iterator i = pushes.begin();
+ i != pushes.end();
+ ++i) {
+ ConnectionRef con = get_parent()->get_con_osd_cluster(
+ i->first.osd,
+ get_osdmap_epoch());
+ if (!con)
+ continue;
+ vector<PushOp>::iterator j = i->second.begin();
+ while (j != i->second.end()) {
+ uint64_t cost = 0;
+ uint64_t pushes = 0;
+ MOSDPGPush *msg = new MOSDPGPush();
+ msg->from = get_parent()->whoami_shard();
+ msg->pgid = get_parent()->primary_spg_t();
+ msg->map_epoch = get_osdmap_epoch();
+ msg->min_epoch = get_parent()->get_last_peering_reset_epoch();
+ msg->set_priority(prio);
+ msg->is_repair = get_parent()->pg_is_repair();
+ for (;
+ (j != i->second.end() &&
+ cost < cct->_conf->osd_max_push_cost &&
+ pushes < cct->_conf->osd_max_push_objects) ;
+ ++j) {
+ dout(20) << __func__ << ": sending push " << *j
+ << " to osd." << i->first << dendl;
+ cost += j->cost(cct);
+ pushes += 1;
+ msg->pushes.push_back(*j);
+ }
+ msg->set_cost(cost);
+ get_parent()->send_message_osd_cluster(msg, con);
+ }
+ }
+}
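
send_pushes greedily packs PushOps into each MOSDPGPush until either a cost budget (osd_max_push_cost) or an object-count budget (osd_max_push_objects) is hit, then starts a new message; the first op of a batch is always taken. A standalone sketch of that batching loop, with the two budgets as plain parameters (not Ceph code):

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <utility>
#include <vector>

struct PushItem { int id; uint64_t cost; };

// Split 'items' into batches, each under max_cost and max_items.
// The first item of a batch is always taken, mirroring the OSD loop above.
std::vector<std::vector<PushItem>> batch_pushes(const std::vector<PushItem>& items,
                                                uint64_t max_cost, size_t max_items) {
  std::vector<std::vector<PushItem>> batches;
  size_t i = 0;
  while (i < items.size()) {
    std::vector<PushItem> batch;
    uint64_t cost = 0;
    while (i < items.size() && cost < max_cost && batch.size() < max_items) {
      cost += items[i].cost;                     // budgets are checked before adding
      batch.push_back(items[i++]);
    }
    batches.push_back(std::move(batch));
  }
  return batches;
}

int main() {
  std::vector<PushItem> pushes = {{1, 40}, {2, 40}, {3, 40}, {4, 5}, {5, 5}};
  auto batches = batch_pushes(pushes, /*max_cost=*/64, /*max_items=*/8);
  for (size_t b = 0; b < batches.size(); ++b) {
    std::cout << "msg " << b << ":";
    for (const auto& p : batches[b]) std::cout << " push" << p.id;
    std::cout << "\n";
  }
}
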
+
+void ReplicatedBackend::send_pulls(int prio, map<pg_shard_t, vector<PullOp> > &pulls)
+{
+ for (map<pg_shard_t, vector<PullOp> >::iterator i = pulls.begin();
+ i != pulls.end();
+ ++i) {
+ ConnectionRef con = get_parent()->get_con_osd_cluster(
+ i->first.osd,
+ get_osdmap_epoch());
+ if (!con)
+ continue;
+ dout(20) << __func__ << ": sending pulls " << i->second
+ << " to osd." << i->first << dendl;
+ MOSDPGPull *msg = new MOSDPGPull();
+ msg->from = parent->whoami_shard();
+ msg->set_priority(prio);
+ msg->pgid = get_parent()->primary_spg_t();
+ msg->map_epoch = get_osdmap_epoch();
+ msg->min_epoch = get_parent()->get_last_peering_reset_epoch();
+ msg->set_pulls(std::move(i->second));
+ msg->compute_cost(cct);
+ get_parent()->send_message_osd_cluster(msg, con);
+ }
+}
+
+int ReplicatedBackend::build_push_op(const ObjectRecoveryInfo &recovery_info,
+ const ObjectRecoveryProgress &progress,
+ ObjectRecoveryProgress *out_progress,
+ PushOp *out_op,
+ object_stat_sum_t *stat,
+ bool cache_dont_need)
+{
+ ObjectRecoveryProgress _new_progress;
+ if (!out_progress)
+ out_progress = &_new_progress;
+ ObjectRecoveryProgress &new_progress = *out_progress;
+ new_progress = progress;
+
+ dout(7) << __func__ << " " << recovery_info.soid
+ << " v " << recovery_info.version
+ << " size " << recovery_info.size
+ << " recovery_info: " << recovery_info
+ << dendl;
+
+ eversion_t v = recovery_info.version;
+ object_info_t oi;
+ if (progress.first) {
+ int r = store->omap_get_header(ch, ghobject_t(recovery_info.soid), &out_op->omap_header);
+ if (r < 0) {
+ dout(1) << __func__ << " get omap header failed: " << cpp_strerror(-r) << dendl;
+ return r;
+ }
+ r = store->getattrs(ch, ghobject_t(recovery_info.soid), out_op->attrset);
+ if (r < 0) {
+ dout(1) << __func__ << " getattrs failed: " << cpp_strerror(-r) << dendl;
+ return r;
+ }
+
+ // Debug
+ bufferlist bv = out_op->attrset[OI_ATTR];
+ try {
+ auto bliter = bv.cbegin();
+ decode(oi, bliter);
+ } catch (...) {
+ dout(0) << __func__ << ": bad object_info_t: " << recovery_info.soid << dendl;
+ return -EINVAL;
+ }
+
+ // If requestor didn't know the version, use ours
+ if (v == eversion_t()) {
+ v = oi.version;
+ } else if (oi.version != v) {
+ get_parent()->clog_error() << get_info().pgid << " push "
+ << recovery_info.soid << " v "
+ << recovery_info.version
+ << " failed because local copy is "
+ << oi.version;
+ return -EINVAL;
+ }
+
+ new_progress.first = false;
+ }
+ // Once we provide the version subsequent requests will have it, so
+ // at this point it must be known.
+ ceph_assert(v != eversion_t());
+
+ uint64_t available = cct->_conf->osd_recovery_max_chunk;
+ if (!progress.omap_complete) {
+ ObjectMap::ObjectMapIterator iter =
+ store->get_omap_iterator(ch,
+ ghobject_t(recovery_info.soid));
+ ceph_assert(iter);
+ for (iter->lower_bound(progress.omap_recovered_to);
+ iter->valid();
+ iter->next()) {
+ if (!out_op->omap_entries.empty() &&
+ ((cct->_conf->osd_recovery_max_omap_entries_per_chunk > 0 &&
+ out_op->omap_entries.size() >= cct->_conf->osd_recovery_max_omap_entries_per_chunk) ||
+ available <= iter->key().size() + iter->value().length()))
+ break;
+ out_op->omap_entries.insert(make_pair(iter->key(), iter->value()));
+
+ if ((iter->key().size() + iter->value().length()) <= available)
+ available -= (iter->key().size() + iter->value().length());
+ else
+ available = 0;
+ }
+ if (!iter->valid())
+ new_progress.omap_complete = true;
+ else
+ new_progress.omap_recovered_to = iter->key();
+ }
+
+ if (available > 0) {
+ if (!recovery_info.copy_subset.empty()) {
+ interval_set<uint64_t> copy_subset = recovery_info.copy_subset;
+ map<uint64_t, uint64_t> m;
+ int r = store->fiemap(ch, ghobject_t(recovery_info.soid), 0,
+ copy_subset.range_end(), m);
+ if (r >= 0) {
+ interval_set<uint64_t> fiemap_included(std::move(m));
+ copy_subset.intersection_of(fiemap_included);
+ } else {
+ // intersection of copy_subset and empty interval_set would be empty anyway
+ copy_subset.clear();
+ }
+
+ out_op->data_included.span_of(copy_subset, progress.data_recovered_to,
+ available);
+ // zero filled section, skip to end!
+ if (out_op->data_included.empty() ||
+ out_op->data_included.range_end() == copy_subset.range_end())
+ new_progress.data_recovered_to = recovery_info.copy_subset.range_end();
+ else
+ new_progress.data_recovered_to = out_op->data_included.range_end();
+ }
+ } else {
+ out_op->data_included.clear();
+ }
+
+ auto origin_size = out_op->data_included.size();
+ bufferlist bit;
+ int r = store->readv(ch, ghobject_t(recovery_info.soid),
+ out_op->data_included, bit,
+ cache_dont_need ? CEPH_OSD_OP_FLAG_FADVISE_DONTNEED: 0);
+ if (cct->_conf->osd_debug_random_push_read_error &&
+ (rand() % (int)(cct->_conf->osd_debug_random_push_read_error * 100.0)) == 0) {
+ dout(0) << __func__ << ": inject EIO " << recovery_info.soid << dendl;
+ r = -EIO;
+ }
+ if (r < 0) {
+ return r;
+ }
+ if (out_op->data_included.size() != origin_size) {
+ dout(10) << __func__ << " some extents get pruned "
+ << out_op->data_included.size() << "/" << origin_size
+ << dendl;
+ new_progress.data_complete = true;
+ }
+ out_op->data.claim_append(bit);
+ if (progress.first && !out_op->data_included.empty() &&
+ out_op->data_included.begin().get_start() == 0 &&
+ out_op->data.length() == oi.size && oi.is_data_digest()) {
+ uint32_t crc = out_op->data.crc32c(-1);
+ if (oi.data_digest != crc) {
+ dout(0) << __func__ << " " << coll << std::hex
+ << " full-object read crc 0x" << crc
+ << " != expected 0x" << oi.data_digest
+ << std::dec << " on " << recovery_info.soid << dendl;
+ return -EIO;
+ }
+ }
+
+ if (new_progress.is_complete(recovery_info)) {
+ new_progress.data_complete = true;
+ if (stat) {
+ stat->num_objects_recovered++;
+ if (get_parent()->pg_is_repair())
+ stat->num_objects_repaired++;
+ }
+ } else if (progress.first && progress.omap_complete) {
+ // Even if the omap is unchanged, we still need to recover it when recovery cannot complete in a single pass
+ new_progress.omap_complete = false;
+ }
+
+ if (stat) {
+ stat->num_keys_recovered += out_op->omap_entries.size();
+ stat->num_bytes_recovered += out_op->data.length();
+ get_parent()->get_logger()->inc(l_osd_rbytes, out_op->omap_entries.size() + out_op->data.length());
+ }
+
+ get_parent()->get_logger()->inc(l_osd_push);
+ get_parent()->get_logger()->inc(l_osd_push_outb, out_op->data.length());
+
+ // send
+ out_op->version = v;
+ out_op->soid = recovery_info.soid;
+ out_op->recovery_info = recovery_info;
+ out_op->after_progress = new_progress;
+ out_op->before_progress = progress;
+ return 0;
+}
+
+void ReplicatedBackend::prep_push_op_blank(const hobject_t& soid, PushOp *op)
+{
+ op->recovery_info.version = eversion_t();
+ op->version = eversion_t();
+ op->soid = soid;
+}
+
+bool ReplicatedBackend::handle_push_reply(
+ pg_shard_t peer, const PushReplyOp &op, PushOp *reply)
+{
+ const hobject_t &soid = op.soid;
+ if (pushing.count(soid) == 0) {
+ dout(10) << "huh, i wasn't pushing " << soid << " to osd." << peer
+ << ", or anybody else"
+ << dendl;
+ return false;
+ } else if (pushing[soid].count(peer) == 0) {
+ dout(10) << "huh, i wasn't pushing " << soid << " to osd." << peer
+ << dendl;
+ return false;
+ } else {
+ PushInfo *pi = &pushing[soid][peer];
+ bool error = pushing[soid].begin()->second.recovery_progress.error;
+
+ if (!pi->recovery_progress.data_complete && !error) {
+ dout(10) << " pushing more from, "
+ << pi->recovery_progress.data_recovered_to
+ << " of " << pi->recovery_info.copy_subset << dendl;
+ ObjectRecoveryProgress new_progress;
+ int r = build_push_op(
+ pi->recovery_info,
+ pi->recovery_progress, &new_progress, reply,
+ &(pi->stat));
+ // Handle the case of a read error right after we wrote, which is
+ // hopefully extremely rare.
+ if (r < 0) {
+ dout(5) << __func__ << ": oid " << soid << " error " << r << dendl;
+
+ error = true;
+ goto done;
+ }
+ pi->recovery_progress = new_progress;
+ return true;
+ } else {
+ // done!
+done:
+ if (!error)
+ get_parent()->on_peer_recover( peer, soid, pi->recovery_info);
+
+ get_parent()->release_locks(pi->lock_manager);
+ object_stat_sum_t stat = pi->stat;
+ eversion_t v = pi->recovery_info.version;
+ pushing[soid].erase(peer);
+ pi = NULL;
+
+ if (pushing[soid].empty()) {
+ if (!error)
+ get_parent()->on_global_recover(soid, stat, false);
+ else
+ get_parent()->on_failed_pull(
+ std::set<pg_shard_t>{ get_parent()->whoami_shard() },
+ soid,
+ v);
+ pushing.erase(soid);
+ } else {
+ // This looks weird, but we erased the current peer and need to remember
+ // the error on any other one, while getting more acks.
+ if (error)
+ pushing[soid].begin()->second.recovery_progress.error = true;
+ dout(10) << "pushed " << soid << ", still waiting for push ack from "
+ << pushing[soid].size() << " others" << dendl;
+ }
+ return false;
+ }
+ }
+}
+
+void ReplicatedBackend::handle_pull(pg_shard_t peer, PullOp &op, PushOp *reply)
+{
+ const hobject_t &soid = op.soid;
+ struct stat st;
+ int r = store->stat(ch, ghobject_t(soid), &st);
+ if (r != 0) {
+ get_parent()->clog_error() << get_info().pgid << " "
+ << peer << " tried to pull " << soid
+ << " but got " << cpp_strerror(-r);
+ prep_push_op_blank(soid, reply);
+ } else {
+ ObjectRecoveryInfo &recovery_info = op.recovery_info;
+ ObjectRecoveryProgress &progress = op.recovery_progress;
+ if (progress.first && recovery_info.size == ((uint64_t)-1)) {
+ // Adjust size and copy_subset
+ recovery_info.size = st.st_size;
+ if (st.st_size) {
+ interval_set<uint64_t> object_range;
+ object_range.insert(0, st.st_size);
+ recovery_info.copy_subset.intersection_of(object_range);
+ } else {
+ recovery_info.copy_subset.clear();
+ }
+ assert(recovery_info.clone_subset.empty());
+ }
+
+ r = build_push_op(recovery_info, progress, 0, reply);
+ if (r < 0)
+ prep_push_op_blank(soid, reply);
+ }
+}
+
+/**
+ * trim received data to remove what we don't want
+ *
+ * @param copy_subset intervals we want
+ * @param data_included intervals we got
+ * @param data_received data we got
+ * @param intervals_usable intervals we want to keep
+ * @param data_usable matching data we want to keep
+ */
+void ReplicatedBackend::trim_pushed_data(
+ const interval_set<uint64_t> &copy_subset,
+ const interval_set<uint64_t> &intervals_received,
+ bufferlist data_received,
+ interval_set<uint64_t> *intervals_usable,
+ bufferlist *data_usable)
+{
+ if (intervals_received.subset_of(copy_subset)) {
+ *intervals_usable = intervals_received;
+ *data_usable = data_received;
+ return;
+ }
+
+ intervals_usable->intersection_of(copy_subset,
+ intervals_received);
+
+ uint64_t off = 0;
+ for (interval_set<uint64_t>::const_iterator p = intervals_received.begin();
+ p != intervals_received.end();
+ ++p) {
+ interval_set<uint64_t> x;
+ x.insert(p.get_start(), p.get_len());
+ x.intersection_of(copy_subset);
+ for (interval_set<uint64_t>::const_iterator q = x.begin();
+ q != x.end();
+ ++q) {
+ bufferlist sub;
+ uint64_t data_off = off + (q.get_start() - p.get_start());
+ sub.substr_of(data_received, data_off, q.get_len());
+ data_usable->claim_append(sub);
+ }
+ off += p.get_len();
+ }
+}
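
trim_pushed_data walks the received intervals in order, keeps only the parts that fall inside copy_subset, and copies the matching slices out of the flat data buffer by tracking the running offset of each interval. A compact, self-contained sketch of the same trim over std::string data (Range and the helper names below are illustrative):

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

struct Range { uint64_t start, len; };           // offset + length, like interval_set entries

// Keep only the parts of (intervals_received, data_received) that lie in copy_subset.
void trim_pushed(const std::vector<Range>& copy_subset,
                 const std::vector<Range>& intervals_received,
                 const std::string& data_received,
                 std::vector<Range>* intervals_usable,
                 std::string* data_usable) {
  uint64_t off = 0;                              // offset of the current interval in the flat buffer
  for (const auto& p : intervals_received) {
    for (const auto& w : copy_subset) {
      uint64_t lo = std::max(p.start, w.start);
      uint64_t hi = std::min(p.start + p.len, w.start + w.len);
      if (lo >= hi) continue;                    // no overlap with this wanted range
      intervals_usable->push_back({lo, hi - lo});
      data_usable->append(data_received, off + (lo - p.start), hi - lo);
    }
    off += p.len;                                // data for the next interval starts here
  }
}

int main() {
  std::vector<Range> wanted = {{0, 4}};          // we only want bytes [0,4)
  std::vector<Range> got = {{0, 4}, {8, 4}};     // the peer sent [0,4) and [8,12)
  std::string data = "abcdWXYZ";                 // flat buffer holding both intervals
  std::vector<Range> usable;
  std::string usable_data;
  trim_pushed(wanted, got, data, &usable, &usable_data);
  std::cout << usable_data << "\n";              // prints "abcd"
}
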
+
+void ReplicatedBackend::_failed_pull(pg_shard_t from, const hobject_t &soid)
+{
+ dout(20) << __func__ << ": " << soid << " from " << from << dendl;
+ auto it = pulling.find(soid);
+ assert(it != pulling.end());
+ get_parent()->on_failed_pull(
+ { from },
+ soid,
+ it->second.recovery_info.version);
+
+ clear_pull(it);
+}
+
+void ReplicatedBackend::clear_pull_from(
+ map<hobject_t, PullInfo>::iterator piter)
+{
+ auto from = piter->second.from;
+ pull_from_peer[from].erase(piter->second.soid);
+ if (pull_from_peer[from].empty())
+ pull_from_peer.erase(from);
+}
+
+void ReplicatedBackend::clear_pull(
+ map<hobject_t, PullInfo>::iterator piter,
+ bool clear_pull_from_peer)
+{
+ if (clear_pull_from_peer) {
+ clear_pull_from(piter);
+ }
+ get_parent()->release_locks(piter->second.lock_manager);
+ pulling.erase(piter);
+}
+
+int ReplicatedBackend::start_pushes(
+ const hobject_t &soid,
+ ObjectContextRef obc,
+ RPGHandle *h)
+{
+ list< map<pg_shard_t, pg_missing_t>::const_iterator > shards;
+
+ dout(20) << __func__ << " soid " << soid << dendl;
+ // who needs it?
+ ceph_assert(get_parent()->get_acting_recovery_backfill_shards().size() > 0);
+ for (set<pg_shard_t>::iterator i =
+ get_parent()->get_acting_recovery_backfill_shards().begin();
+ i != get_parent()->get_acting_recovery_backfill_shards().end();
+ ++i) {
+ if (*i == get_parent()->whoami_shard()) continue;
+ pg_shard_t peer = *i;
+ map<pg_shard_t, pg_missing_t>::const_iterator j =
+ get_parent()->get_shard_missing().find(peer);
+ ceph_assert(j != get_parent()->get_shard_missing().end());
+ if (j->second.is_missing(soid)) {
+ shards.push_back(j);
+ }
+ }
+
+ // If more than one read will occur, ignore a possible request not to cache
+ bool cache = shards.size() == 1 ? h->cache_dont_need : false;
+
+ for (auto j : shards) {
+ pg_shard_t peer = j->first;
+ h->pushes[peer].push_back(PushOp());
+ int r = prep_push_to_replica(obc, soid, peer,
+ &(h->pushes[peer].back()), cache);
+ if (r < 0) {
+ // Back out all failed reads
+ for (auto k : shards) {
+ pg_shard_t p = k->first;
+ dout(10) << __func__ << " clean up peer " << p << dendl;
+ h->pushes[p].pop_back();
+ if (p == peer) break;
+ }
+ return r;
+ }
+ }
+ return shards.size();
+}
diff --git a/src/osd/ReplicatedBackend.h b/src/osd/ReplicatedBackend.h
new file mode 100644
index 000000000..f4b506357
--- /dev/null
+++ b/src/osd/ReplicatedBackend.h
@@ -0,0 +1,437 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Inktank Storage, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef REPBACKEND_H
+#define REPBACKEND_H
+
+#include "PGBackend.h"
+
+struct C_ReplicatedBackend_OnPullComplete;
+class ReplicatedBackend : public PGBackend {
+ struct RPGHandle : public PGBackend::RecoveryHandle {
+ std::map<pg_shard_t, std::vector<PushOp> > pushes;
+ std::map<pg_shard_t, std::vector<PullOp> > pulls;
+ };
+ friend struct C_ReplicatedBackend_OnPullComplete;
+public:
+ ReplicatedBackend(
+ PGBackend::Listener *pg,
+ const coll_t &coll,
+ ObjectStore::CollectionHandle &ch,
+ ObjectStore *store,
+ CephContext *cct);
+
+ /// @see PGBackend::open_recovery_op
+ RPGHandle *_open_recovery_op() {
+ return new RPGHandle();
+ }
+ PGBackend::RecoveryHandle *open_recovery_op() override {
+ return _open_recovery_op();
+ }
+
+ /// @see PGBackend::run_recovery_op
+ void run_recovery_op(
+ PGBackend::RecoveryHandle *h,
+ int priority) override;
+
+ /// @see PGBackend::recover_object
+ int recover_object(
+ const hobject_t &hoid,
+ eversion_t v,
+ ObjectContextRef head,
+ ObjectContextRef obc,
+ RecoveryHandle *h
+ ) override;
+
+ void check_recovery_sources(const OSDMapRef& osdmap) override;
+
+ bool can_handle_while_inactive(OpRequestRef op) override;
+
+ /// @see PGBackend::handle_message
+ bool _handle_message(
+ OpRequestRef op
+ ) override;
+
+ void on_change() override;
+ void clear_recovery_state() override;
+
+ class RPCRecPred : public IsPGRecoverablePredicate {
+ public:
+ bool operator()(const std::set<pg_shard_t> &have) const override {
+ return !have.empty();
+ }
+ };
+ IsPGRecoverablePredicate *get_is_recoverable_predicate() const override {
+ return new RPCRecPred;
+ }
+
+ class RPCReadPred : public IsPGReadablePredicate {
+ pg_shard_t whoami;
+ public:
+ explicit RPCReadPred(pg_shard_t whoami) : whoami(whoami) {}
+ bool operator()(const std::set<pg_shard_t> &have) const override {
+ return have.count(whoami);
+ }
+ };
+ IsPGReadablePredicate *get_is_readable_predicate() const override {
+ return new RPCReadPred(get_parent()->whoami_shard());
+ }
+
+ void dump_recovery_info(ceph::Formatter *f) const override {
+ {
+ f->open_array_section("pull_from_peer");
+ for (std::map<pg_shard_t, std::set<hobject_t> >::const_iterator i = pull_from_peer.begin();
+ i != pull_from_peer.end();
+ ++i) {
+ f->open_object_section("pulling_from");
+ f->dump_stream("pull_from") << i->first;
+ {
+ f->open_array_section("pulls");
+ for (std::set<hobject_t>::const_iterator j = i->second.begin();
+ j != i->second.end();
+ ++j) {
+ f->open_object_section("pull_info");
+ ceph_assert(pulling.count(*j));
+ pulling.find(*j)->second.dump(f);
+ f->close_section();
+ }
+ f->close_section();
+ }
+ f->close_section();
+ }
+ f->close_section();
+ }
+ {
+ f->open_array_section("pushing");
+ for (std::map<hobject_t, std::map<pg_shard_t, PushInfo>>::const_iterator i =
+ pushing.begin();
+ i != pushing.end();
+ ++i) {
+ f->open_object_section("object");
+ f->dump_stream("pushing") << i->first;
+ {
+ f->open_array_section("pushing_to");
+ for (std::map<pg_shard_t, PushInfo>::const_iterator j = i->second.begin();
+ j != i->second.end();
+ ++j) {
+ f->open_object_section("push_progress");
+ f->dump_stream("pushing_to") << j->first;
+ {
+ f->open_object_section("push_info");
+ j->second.dump(f);
+ f->close_section();
+ }
+ f->close_section();
+ }
+ f->close_section();
+ }
+ f->close_section();
+ }
+ f->close_section();
+ }
+ }
+
+ int objects_read_sync(
+ const hobject_t &hoid,
+ uint64_t off,
+ uint64_t len,
+ uint32_t op_flags,
+ ceph::buffer::list *bl) override;
+
+ int objects_readv_sync(
+ const hobject_t &hoid,
+ std::map<uint64_t, uint64_t>&& m,
+ uint32_t op_flags,
+ ceph::buffer::list *bl) override;
+
+ void objects_read_async(
+ const hobject_t &hoid,
+ const std::list<std::pair<boost::tuple<uint64_t, uint64_t, uint32_t>,
+ std::pair<ceph::buffer::list*, Context*> > > &to_read,
+ Context *on_complete,
+ bool fast_read = false) override;
+
+private:
+ // push
+ struct PushInfo {
+ ObjectRecoveryProgress recovery_progress;
+ ObjectRecoveryInfo recovery_info;
+ ObjectContextRef obc;
+ object_stat_sum_t stat;
+ ObcLockManager lock_manager;
+
+ void dump(ceph::Formatter *f) const {
+ {
+ f->open_object_section("recovery_progress");
+ recovery_progress.dump(f);
+ f->close_section();
+ }
+ {
+ f->open_object_section("recovery_info");
+ recovery_info.dump(f);
+ f->close_section();
+ }
+ }
+ };
+ std::map<hobject_t, std::map<pg_shard_t, PushInfo>> pushing;
+
+ // pull
+ struct PullInfo {
+ pg_shard_t from;
+ hobject_t soid;
+ ObjectRecoveryProgress recovery_progress;
+ ObjectRecoveryInfo recovery_info;
+ ObjectContextRef head_ctx;
+ ObjectContextRef obc;
+ object_stat_sum_t stat;
+ bool cache_dont_need;
+ ObcLockManager lock_manager;
+
+ void dump(ceph::Formatter *f) const {
+ {
+ f->open_object_section("recovery_progress");
+ recovery_progress.dump(f);
+ f->close_section();
+ }
+ {
+ f->open_object_section("recovery_info");
+ recovery_info.dump(f);
+ f->close_section();
+ }
+ }
+
+ bool is_complete() const {
+ return recovery_progress.is_complete(recovery_info);
+ }
+ };
+
+ std::map<hobject_t, PullInfo> pulling;
+
+ // Reverse mapping from osd peer to objects being pulled from that peer
+ std::map<pg_shard_t, std::set<hobject_t> > pull_from_peer;
+ void clear_pull(
+ std::map<hobject_t, PullInfo>::iterator piter,
+ bool clear_pull_from_peer = true);
+ void clear_pull_from(
+ std::map<hobject_t, PullInfo>::iterator piter);
+
+ void _do_push(OpRequestRef op);
+ void _do_pull_response(OpRequestRef op);
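+  // The same push message is used in both directions: a replica receiving
+  // recovery data handles it as a push, while the primary handles it as the
+  // response to one of its own pulls.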
+ void do_push(OpRequestRef op) {
+ if (is_primary()) {
+ _do_pull_response(op);
+ } else {
+ _do_push(op);
+ }
+ }
+ void do_pull(OpRequestRef op);
+ void do_push_reply(OpRequestRef op);
+
+ bool handle_push_reply(pg_shard_t peer, const PushReplyOp &op, PushOp *reply);
+ void handle_pull(pg_shard_t peer, PullOp &op, PushOp *reply);
+
+ struct pull_complete_info {
+ hobject_t hoid;
+ object_stat_sum_t stat;
+ };
+ bool handle_pull_response(
+ pg_shard_t from, const PushOp &op, PullOp *response,
+ std::list<pull_complete_info> *to_continue,
+ ObjectStore::Transaction *t);
+ void handle_push(pg_shard_t from, const PushOp &op, PushReplyOp *response,
+ ObjectStore::Transaction *t, bool is_repair);
+
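+  /// Intersect the pushed intervals/data with copy_subset so we only keep
+  /// (and later write) the byte ranges this recovery actually asked for.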
+ static void trim_pushed_data(const interval_set<uint64_t> &copy_subset,
+ const interval_set<uint64_t> &intervals_received,
+ ceph::buffer::list data_received,
+ interval_set<uint64_t> *intervals_usable,
+ ceph::buffer::list *data_usable);
+ void _failed_pull(pg_shard_t from, const hobject_t &soid);
+
+ void send_pushes(int prio, std::map<pg_shard_t, std::vector<PushOp> > &pushes);
+ void prep_push_op_blank(const hobject_t& soid, PushOp *op);
+ void send_pulls(
+ int priority,
+ std::map<pg_shard_t, std::vector<PullOp> > &pulls);
+
+ int build_push_op(const ObjectRecoveryInfo &recovery_info,
+ const ObjectRecoveryProgress &progress,
+ ObjectRecoveryProgress *out_progress,
+ PushOp *out_op,
+ object_stat_sum_t *stat = 0,
+ bool cache_dont_need = true);
+ void submit_push_data(const ObjectRecoveryInfo &recovery_info,
+ bool first,
+ bool complete,
+ bool clear_omap,
+ bool cache_dont_need,
+ interval_set<uint64_t> &data_zeros,
+ const interval_set<uint64_t> &intervals_included,
+ ceph::buffer::list data_included,
+ ceph::buffer::list omap_header,
+ const std::map<std::string, ceph::buffer::list> &attrs,
+ const std::map<std::string, ceph::buffer::list> &omap_entries,
+ ObjectStore::Transaction *t);
+ void submit_push_complete(const ObjectRecoveryInfo &recovery_info,
+ ObjectStore::Transaction *t);
+
+ void calc_clone_subsets(
+ SnapSet& snapset, const hobject_t& poid, const pg_missing_t& missing,
+ const hobject_t &last_backfill,
+ interval_set<uint64_t>& data_subset,
+ std::map<hobject_t, interval_set<uint64_t>>& clone_subsets,
+ ObcLockManager &lock_manager);
+ void prepare_pull(
+ eversion_t v,
+ const hobject_t& soid,
+ ObjectContextRef headctx,
+ RPGHandle *h);
+ int start_pushes(
+ const hobject_t &soid,
+ ObjectContextRef obj,
+ RPGHandle *h);
+ int prep_push_to_replica(
+ ObjectContextRef obc, const hobject_t& soid, pg_shard_t peer,
+ PushOp *pop, bool cache_dont_need = true);
+ int prep_push(
+ ObjectContextRef obc,
+ const hobject_t& oid, pg_shard_t dest,
+ PushOp *op,
+ bool cache_dont_need);
+ int prep_push(
+ ObjectContextRef obc,
+ const hobject_t& soid, pg_shard_t peer,
+ eversion_t version,
+ interval_set<uint64_t> &data_subset,
+ std::map<hobject_t, interval_set<uint64_t>>& clone_subsets,
+ PushOp *op,
+ bool cache,
+ ObcLockManager &&lock_manager);
+ void calc_head_subsets(
+ ObjectContextRef obc, SnapSet& snapset, const hobject_t& head,
+ const pg_missing_t& missing,
+ const hobject_t &last_backfill,
+ interval_set<uint64_t>& data_subset,
+ std::map<hobject_t, interval_set<uint64_t>>& clone_subsets,
+ ObcLockManager &lock_manager);
+ ObjectRecoveryInfo recalc_subsets(
+ const ObjectRecoveryInfo& recovery_info,
+ SnapSetContext *ssc,
+ ObcLockManager &lock_manager);
+
+ /**
+ * Client IO
+ */
+ struct InProgressOp : public RefCountedObject {
+ ceph_tid_t tid;
+ std::set<pg_shard_t> waiting_for_commit;
+ Context *on_commit;
+ OpRequestRef op;
+ eversion_t v;
+ bool done() const {
+ return waiting_for_commit.empty();
+ }
+ private:
+ FRIEND_MAKE_REF(InProgressOp);
+ InProgressOp(ceph_tid_t tid, Context *on_commit, OpRequestRef op, eversion_t v)
+ :
+ tid(tid), on_commit(on_commit),
+ op(op), v(v) {}
+ };
+ std::map<ceph_tid_t, ceph::ref_t<InProgressOp>> in_progress_ops;
+public:
+ friend class C_OSD_OnOpCommit;
+
+ void call_write_ordered(std::function<void(void)> &&cb) override {
+ // ReplicatedBackend submits writes inline in submit_transaction, so
+ // we can just call the callback.
+ cb();
+ }
+
+ void submit_transaction(
+ const hobject_t &hoid,
+ const object_stat_sum_t &delta_stats,
+ const eversion_t &at_version,
+ PGTransactionUPtr &&t,
+ const eversion_t &trim_to,
+ const eversion_t &min_last_complete_ondisk,
+ std::vector<pg_log_entry_t>&& log_entries,
+ std::optional<pg_hit_set_history_t> &hset_history,
+ Context *on_all_commit,
+ ceph_tid_t tid,
+ osd_reqid_t reqid,
+ OpRequestRef op
+ ) override;
+
+private:
+ Message * generate_subop(
+ const hobject_t &soid,
+ const eversion_t &at_version,
+ ceph_tid_t tid,
+ osd_reqid_t reqid,
+ eversion_t pg_trim_to,
+ eversion_t min_last_complete_ondisk,
+ hobject_t new_temp_oid,
+ hobject_t discard_temp_oid,
+ const ceph::buffer::list &log_entries,
+ std::optional<pg_hit_set_history_t> &hset_history,
+ ObjectStore::Transaction &op_t,
+ pg_shard_t peer,
+ const pg_info_t &pinfo);
+ void issue_op(
+ const hobject_t &soid,
+ const eversion_t &at_version,
+ ceph_tid_t tid,
+ osd_reqid_t reqid,
+ eversion_t pg_trim_to,
+ eversion_t min_last_complete_ondisk,
+ hobject_t new_temp_oid,
+ hobject_t discard_temp_oid,
+ const std::vector<pg_log_entry_t> &log_entries,
+ std::optional<pg_hit_set_history_t> &hset_history,
+ InProgressOp *op,
+ ObjectStore::Transaction &op_t);
+ void op_commit(const ceph::ref_t<InProgressOp>& op);
+ void do_repop_reply(OpRequestRef op);
+ void do_repop(OpRequestRef op);
+
+ struct RepModify {
+ OpRequestRef op;
+ bool committed;
+ int ackerosd;
+ eversion_t last_complete;
+ epoch_t epoch_started;
+
+ ObjectStore::Transaction opt, localt;
+
+ RepModify() : committed(false), ackerosd(-1),
+ epoch_started(0) {}
+ };
+ typedef std::shared_ptr<RepModify> RepModifyRef;
+
+ struct C_OSD_RepModifyCommit;
+
+ void repop_commit(RepModifyRef rm);
+ bool auto_repair_supported() const override { return store->has_builtin_csum(); }
+
+
+ int be_deep_scrub(
+ const hobject_t &poid,
+ ScrubMap &map,
+ ScrubMapBuilder &pos,
+ ScrubMap::object &o) override;
+ uint64_t be_get_ondisk_size(uint64_t logical_size) override { return logical_size; }
+};
+
+#endif
diff --git a/src/osd/ScrubStore.cc b/src/osd/ScrubStore.cc
new file mode 100644
index 000000000..a692a4435
--- /dev/null
+++ b/src/osd/ScrubStore.cc
@@ -0,0 +1,198 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "ScrubStore.h"
+#include "osd_types.h"
+#include "common/scrub_types.h"
+#include "include/rados/rados_types.hpp"
+
+using std::ostringstream;
+using std::string;
+using std::vector;
+
+using ceph::bufferlist;
+
+namespace {
+ghobject_t make_scrub_object(const spg_t& pgid)
+{
+ ostringstream ss;
+ ss << "scrub_" << pgid;
+ return pgid.make_temp_ghobject(ss.str());
+}
+
+string first_object_key(int64_t pool)
+{
+ auto hoid = hobject_t(object_t(),
+ "",
+ 0,
+ 0x00000000,
+ pool,
+ "");
+ hoid.build_hash_cache();
+ return "SCRUB_OBJ_" + hoid.to_str();
+}
+
+// the object_key should be unique across pools
+string to_object_key(int64_t pool, const librados::object_id_t& oid)
+{
+ auto hoid = hobject_t(object_t(oid.name),
+ oid.locator, // key
+ oid.snap,
+ 0, // hash
+ pool,
+ oid.nspace);
+ hoid.build_hash_cache();
+ return "SCRUB_OBJ_" + hoid.to_str();
+}
+
+string last_object_key(int64_t pool)
+{
+ auto hoid = hobject_t(object_t(),
+ "",
+ 0,
+ 0xffffffff,
+ pool,
+ "");
+ hoid.build_hash_cache();
+ return "SCRUB_OBJ_" + hoid.to_str();
+}
+
+string first_snap_key(int64_t pool)
+{
+ // the scrub object is a per-spg_t object, so we can misuse the hash
+ // (pg.seed) to represent the minimal and maximum keys; this relies on how
+ // hobject_t::to_str() works: hex(pool).hex(revhash).
+ auto hoid = hobject_t(object_t(),
+ "",
+ 0,
+ 0x00000000,
+ pool,
+ "");
+ hoid.build_hash_cache();
+ return "SCRUB_SS_" + hoid.to_str();
+}
+
+string to_snap_key(int64_t pool, const librados::object_id_t& oid)
+{
+ auto hoid = hobject_t(object_t(oid.name),
+ oid.locator, // key
+ oid.snap,
+ 0x77777777, // hash
+ pool,
+ oid.nspace);
+ hoid.build_hash_cache();
+ return "SCRUB_SS_" + hoid.to_str();
+}
+
+string last_snap_key(int64_t pool)
+{
+ auto hoid = hobject_t(object_t(),
+ "",
+ 0,
+ 0xffffffff,
+ pool,
+ "");
+ hoid.build_hash_cache();
+ return "SCRUB_SS_" + hoid.to_str();
+}
+}
+
+namespace Scrub {
+
+Store*
+Store::create(ObjectStore* store,
+ ObjectStore::Transaction* t,
+ const spg_t& pgid,
+ const coll_t& coll)
+{
+ ceph_assert(store);
+ ceph_assert(t);
+ ghobject_t oid = make_scrub_object(pgid);
+ t->touch(coll, oid);
+ return new Store{coll, oid, store};
+}
+
+Store::Store(const coll_t& coll, const ghobject_t& oid, ObjectStore* store)
+ : coll(coll),
+ hoid(oid),
+ driver(store, coll, hoid),
+ backend(&driver)
+{}
+
+Store::~Store()
+{
+ ceph_assert(results.empty());
+}
+
+void Store::add_object_error(int64_t pool, const inconsistent_obj_wrapper& e)
+{
+ bufferlist bl;
+ e.encode(bl);
+ results[to_object_key(pool, e.object)] = bl;
+}
+
+void Store::add_snap_error(int64_t pool, const inconsistent_snapset_wrapper& e)
+{
+ bufferlist bl;
+ e.encode(bl);
+ results[to_snap_key(pool, e.object)] = bl;
+}
+
+bool Store::empty() const
+{
+ return results.empty();
+}
+
+void Store::flush(ObjectStore::Transaction* t)
+{
+ if (t) {
+ OSDriver::OSTransaction txn = driver.get_transaction(t);
+ backend.set_keys(results, &txn);
+ }
+ results.clear();
+}
+
+void Store::cleanup(ObjectStore::Transaction* t)
+{
+ t->remove(coll, hoid);
+}
+
+std::vector<bufferlist>
+Store::get_snap_errors(int64_t pool,
+ const librados::object_id_t& start,
+ uint64_t max_return) const
+{
+ const string begin = (start.name.empty() ?
+ first_snap_key(pool) : to_snap_key(pool, start));
+ const string end = last_snap_key(pool);
+ return get_errors(begin, end, max_return);
+}
+
+std::vector<bufferlist>
+Store::get_object_errors(int64_t pool,
+ const librados::object_id_t& start,
+ uint64_t max_return) const
+{
+ const string begin = (start.name.empty() ?
+ first_object_key(pool) : to_object_key(pool, start));
+ const string end = last_object_key(pool);
+ return get_errors(begin, end, max_return);
+}
+
+std::vector<bufferlist>
+Store::get_errors(const string& begin,
+ const string& end,
+ uint64_t max_return) const
+{
+ vector<bufferlist> errors;
+ auto next = std::make_pair(begin, bufferlist{});
+ while (max_return && !backend.get_next(next.first, &next)) {
+ if (next.first >= end)
+ break;
+ errors.push_back(next.second);
+ max_return--;
+ }
+ return errors;
+}
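+
+// A minimal usage sketch of Store (illustrative only; 'os', 'ch', 'txn',
+// 'pgid', 'coll', 'pool_id' and 'obj_err' are placeholders, not defined in
+// this file):
+//
+//   ObjectStore::Transaction txn;
+//   Scrub::Store* errstore = Scrub::Store::create(os, &txn, pgid, coll);
+//   errstore->add_object_error(pool_id, obj_err);  // queue one inconsistency
+//   errstore->flush(&txn);                         // persist queued results
+//   os->queue_transaction(ch, std::move(txn));
+//   auto errs = errstore->get_object_errors(pool_id, librados::object_id_t{},
+//                                           100 /* max_return */);
+//   delete errstore;   // destructor asserts everything was flushed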
+
+} // namespace Scrub
diff --git a/src/osd/ScrubStore.h b/src/osd/ScrubStore.h
new file mode 100644
index 000000000..721aae092
--- /dev/null
+++ b/src/osd/ScrubStore.h
@@ -0,0 +1,52 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_SCRUB_RESULT_H
+#define CEPH_SCRUB_RESULT_H
+
+#include "SnapMapper.h" // for OSDriver
+#include "common/map_cacher.hpp"
+
+namespace librados {
+ struct object_id_t;
+}
+
+struct inconsistent_obj_wrapper;
+struct inconsistent_snapset_wrapper;
+
+namespace Scrub {
+
+class Store {
+public:
+ ~Store();
+ static Store* create(ObjectStore* store,
+ ObjectStore::Transaction* t,
+ const spg_t& pgid,
+ const coll_t& coll);
+ void add_object_error(int64_t pool, const inconsistent_obj_wrapper& e);
+ void add_snap_error(int64_t pool, const inconsistent_snapset_wrapper& e);
+ bool empty() const;
+ void flush(ObjectStore::Transaction *);
+ void cleanup(ObjectStore::Transaction *);
+ std::vector<ceph::buffer::list> get_snap_errors(int64_t pool,
+ const librados::object_id_t& start,
+ uint64_t max_return) const;
+ std::vector<ceph::buffer::list> get_object_errors(int64_t pool,
+ const librados::object_id_t& start,
+ uint64_t max_return) const;
+private:
+ Store(const coll_t& coll, const ghobject_t& oid, ObjectStore* store);
+ std::vector<ceph::buffer::list> get_errors(const std::string& start, const std::string& end,
+ uint64_t max_return) const;
+private:
+ const coll_t coll;
+ const ghobject_t hoid;
+ // a temp object holding mappings from seq-id to inconsistencies found in
+ // scrubbing
+ OSDriver driver;
+ mutable MapCacher::MapCacher<std::string, ceph::buffer::list> backend;
+ std::map<std::string, ceph::buffer::list> results;
+};
+}
+
+#endif // CEPH_SCRUB_RESULT_H
diff --git a/src/osd/Session.cc b/src/osd/Session.cc
new file mode 100644
index 000000000..454e1b857
--- /dev/null
+++ b/src/osd/Session.cc
@@ -0,0 +1,106 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "PG.h"
+#include "Session.h"
+
+#include "common/debug.h"
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_osd
+
+using std::map;
+using std::set;
+
+void Session::clear_backoffs()
+{
+ map<spg_t,map<hobject_t,set<ceph::ref_t<Backoff>>>> ls;
+ {
+ std::lock_guard l(backoff_lock);
+ ls.swap(backoffs);
+ backoff_count = 0;
+ }
+ for (auto& i : ls) {
+ for (auto& p : i.second) {
+ for (auto& b : p.second) {
+ std::lock_guard l(b->lock);
+ if (b->pg) {
+ ceph_assert(b->session == this);
+ ceph_assert(b->is_new() || b->is_acked());
+ b->pg->rm_backoff(b);
+ b->pg.reset();
+ b->session.reset();
+ } else if (b->session) {
+ ceph_assert(b->session == this);
+ ceph_assert(b->is_deleting());
+ b->session.reset();
+ }
+ }
+ }
+ }
+}
+
+void Session::ack_backoff(
+ CephContext *cct,
+ spg_t pgid,
+ uint64_t id,
+ const hobject_t& begin,
+ const hobject_t& end)
+{
+ std::lock_guard l(backoff_lock);
+ auto p = backoffs.find(pgid);
+ if (p == backoffs.end()) {
+ dout(20) << __func__ << " " << pgid << " " << id << " [" << begin << ","
+ << end << ") pg not found" << dendl;
+ return;
+ }
+ auto q = p->second.find(begin);
+ if (q == p->second.end()) {
+ dout(20) << __func__ << " " << pgid << " " << id << " [" << begin << ","
+ << end << ") begin not found" << dendl;
+ return;
+ }
+ for (auto i = q->second.begin(); i != q->second.end(); ++i) {
+ Backoff *b = (*i).get();
+ if (b->id == id) {
+ if (b->is_new()) {
+ b->state = Backoff::STATE_ACKED;
+ dout(20) << __func__ << " now " << *b << dendl;
+ } else if (b->is_deleting()) {
+ dout(20) << __func__ << " deleting " << *b << dendl;
+ q->second.erase(i);
+ --backoff_count;
+ }
+ break;
+ }
+ }
+ if (q->second.empty()) {
+ dout(20) << __func__ << " clearing begin bin " << q->first << dendl;
+ p->second.erase(q);
+ if (p->second.empty()) {
+ dout(20) << __func__ << " clearing pg bin " << p->first << dendl;
+ backoffs.erase(p);
+ }
+ }
+ ceph_assert(!backoff_count == backoffs.empty());
+}
+
+bool Session::check_backoff(
+ CephContext *cct, spg_t pgid, const hobject_t& oid, const Message *m)
+{
+ auto b = have_backoff(pgid, oid);
+ if (b) {
+ dout(10) << __func__ << " session " << this << " has backoff " << *b
+ << " for " << *m << dendl;
+ ceph_assert(!b->is_acked() || !g_conf()->osd_debug_crash_on_ignored_backoff);
+ return true;
+ }
+ // We may race with ms_handle_reset; it clears session->con before removing
+ // backoffs, so if we see that con has been cleared here we have to abort
+ // this request.
+ if (!con) {
+ dout(10) << __func__ << " session " << this << " disconnected" << dendl;
+ return true;
+ }
+ return false;
+}
diff --git a/src/osd/Session.h b/src/osd/Session.h
new file mode 100644
index 000000000..a42d37bfe
--- /dev/null
+++ b/src/osd/Session.h
@@ -0,0 +1,240 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_OSD_SESSION_H
+#define CEPH_OSD_SESSION_H
+
+#include "common/RefCountedObj.h"
+#include "common/ceph_mutex.h"
+#include "global/global_context.h"
+#include "include/spinlock.h"
+#include "OSDCap.h"
+#include "Watch.h"
+#include "OSDMap.h"
+#include "PeeringState.h"
+
+//#define PG_DEBUG_REFS
+
+class PG;
+#ifdef PG_DEBUG_REFS
+#include "common/tracked_int_ptr.hpp"
+typedef TrackedIntPtr<PG> PGRef;
+#else
+typedef boost::intrusive_ptr<PG> PGRef;
+#endif
+
+/*
+ * A Backoff represents one instance of either a PG or an OID
+ * being plugged at the client. It's refcounted and linked from
+ * the PG {pg_oid}_backoffs map and from the client Session
+ * object.
+ *
+ * The Backoff has a lock that protects its internal fields.
+ *
+ * The PG has a backoff_lock that protects its maps of Backoffs.
+ * This lock is *inside* Backoff::lock.
+ *
+ * The Session has a backoff_lock that protects its map of pg and
+ * oid backoffs. This lock is *inside* Backoff::lock *and*
+ * PG::backoff_lock.
+ *
+ * That is, the nesting order is
+ *
+ * Backoff::lock
+ * PG::backoff_lock
+ * Session::backoff_lock
+ *
+ * When the Session goes away, we move our backoff lists aside,
+ * then we lock each of the Backoffs we
+ * previously referenced and clear the Session* pointer. If the PG
+ * is still linked, we unlink it, too.
+ *
+ * When the PG clears the backoff, it will send an unblock message
+ * if the Session* is still non-null, and unlink the session.
+ *
+ */
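+
+/*
+ * A minimal lock-ordering sketch (illustrative only; 'b', 'pg' and 'session'
+ * here are placeholder references, not members of this header). Code that
+ * needs more than one of these locks must take the outer ones first:
+ *
+ *   std::lock_guard bl(b->lock);               // Backoff::lock, outermost
+ *   std::lock_guard pl(pg->backoff_lock);      // then PG::backoff_lock
+ *   std::lock_guard sl(session->backoff_lock); // Session::backoff_lock, innermost
+ */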
+
+struct Backoff : public RefCountedObject {
+ enum {
+ STATE_NEW = 1, ///< backoff in flight to client
+ STATE_ACKED = 2, ///< backoff acked
+ STATE_DELETING = 3 ///< backoff deleted, but un-acked
+ };
+ std::atomic<int> state = {STATE_NEW};
+ spg_t pgid; ///< owning pgid
+ uint64_t id = 0; ///< unique id (within the Session)
+
+ bool is_new() const {
+ return state.load() == STATE_NEW;
+ }
+ bool is_acked() const {
+ return state.load() == STATE_ACKED;
+ }
+ bool is_deleting() const {
+ return state.load() == STATE_DELETING;
+ }
+ const char *get_state_name() const {
+ switch (state.load()) {
+ case STATE_NEW: return "new";
+ case STATE_ACKED: return "acked";
+ case STATE_DELETING: return "deleting";
+ default: return "???";
+ }
+ }
+
+ ceph::mutex lock = ceph::make_mutex("Backoff::lock");
+ // NOTE: the owning PG and session are either
+ // - *both* set, or
+ // - both null (teardown), or
+ // - only session is set (and state == DELETING)
+ PGRef pg; ///< owning pg
+ ceph::ref_t<struct Session> session; ///< owning session
+ hobject_t begin, end; ///< [) range to block, unless ==, then single obj
+
+ friend ostream& operator<<(ostream& out, const Backoff& b) {
+ return out << "Backoff(" << &b << " " << b.pgid << " " << b.id
+ << " " << b.get_state_name()
+ << " [" << b.begin << "," << b.end << ") "
+ << " session " << b.session
+ << " pg " << b.pg << ")";
+ }
+
+private:
+ FRIEND_MAKE_REF(Backoff);
+ Backoff(spg_t pgid, PGRef pg, ceph::ref_t<Session> s,
+ uint64_t i,
+ const hobject_t& b, const hobject_t& e)
+ : RefCountedObject(g_ceph_context),
+ pgid(pgid),
+ id(i),
+ pg(pg),
+ session(std::move(s)),
+ begin(b),
+ end(e) {}
+};
+
+
+
+struct Session : public RefCountedObject {
+ EntityName entity_name;
+ OSDCap caps;
+ ConnectionRef con;
+ entity_addr_t socket_addr;
+ WatchConState wstate;
+
+ ceph::mutex session_dispatch_lock =
+ ceph::make_mutex("Session::session_dispatch_lock");
+ boost::intrusive::list<OpRequest> waiting_on_map;
+
+ ceph::spinlock sent_epoch_lock;
+ epoch_t last_sent_epoch = 0;
+
+ /// protects backoffs; orders inside Backoff::lock *and* PG::backoff_lock
+ ceph::mutex backoff_lock = ceph::make_mutex("Session::backoff_lock");
+ std::atomic<int> backoff_count = {0};  ///< simple count of backoffs
+ std::map<spg_t, std::map<hobject_t, std::set<ceph::ref_t<Backoff>>>> backoffs;
+
+ std::atomic<uint64_t> backoff_seq = {0};
+
+ // for heartbeat connections only
+ int peer = -1;
+ HeartbeatStampsRef stamps;
+
+ entity_addr_t& get_peer_socket_addr() {
+ return socket_addr;
+ }
+
+ void ack_backoff(
+ CephContext *cct,
+ spg_t pgid,
+ uint64_t id,
+ const hobject_t& start,
+ const hobject_t& end);
+
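+  /// Return the backoff whose [begin,end) range (or the single object, when
+  /// begin == end) covers 'oid' in this pg, or null if none applies.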
+ ceph::ref_t<Backoff> have_backoff(spg_t pgid, const hobject_t& oid) {
+ if (!backoff_count.load()) {
+ return nullptr;
+ }
+ std::lock_guard l(backoff_lock);
+ ceph_assert(!backoff_count == backoffs.empty());
+ auto i = backoffs.find(pgid);
+ if (i == backoffs.end()) {
+ return nullptr;
+ }
+ auto p = i->second.lower_bound(oid);
+ if (p != i->second.begin() &&
+ (p == i->second.end() || p->first > oid)) {
+ --p;
+ }
+ if (p != i->second.end()) {
+ int r = cmp(oid, p->first);
+ if (r == 0 || r > 0) {
+ for (auto& q : p->second) {
+ if (r == 0 || oid < q->end) {
+ return &(*q);
+ }
+ }
+ }
+ }
+ return nullptr;
+ }
+
+ bool check_backoff(
+ CephContext *cct, spg_t pgid, const hobject_t& oid, const Message *m);
+
+ void add_backoff(ceph::ref_t<Backoff> b) {
+ std::lock_guard l(backoff_lock);
+ ceph_assert(!backoff_count == backoffs.empty());
+ backoffs[b->pgid][b->begin].insert(std::move(b));
+ ++backoff_count;
+ }
+
+ // called by PG::release_*_backoffs and PG::clear_backoffs()
+ void rm_backoff(const ceph::ref_t<Backoff>& b) {
+ std::lock_guard l(backoff_lock);
+ ceph_assert(ceph_mutex_is_locked_by_me(b->lock));
+ ceph_assert(b->session == this);
+ auto i = backoffs.find(b->pgid);
+ if (i != backoffs.end()) {
+ // may race with clear_backoffs()
+ auto p = i->second.find(b->begin);
+ if (p != i->second.end()) {
+ auto q = p->second.find(b);
+ if (q != p->second.end()) {
+ p->second.erase(q);
+ --backoff_count;
+ if (p->second.empty()) {
+ i->second.erase(p);
+ if (i->second.empty()) {
+ backoffs.erase(i);
+ }
+ }
+ }
+ }
+ }
+ ceph_assert(!backoff_count == backoffs.empty());
+ }
+ void clear_backoffs();
+
+private:
+ FRIEND_MAKE_REF(Session);
+ explicit Session(CephContext *cct, Connection *con_) :
+ RefCountedObject(cct),
+ con(con_),
+ socket_addr(con_->get_peer_socket_addr()),
+ wstate(cct)
+ {}
+};
+
+#endif
diff --git a/src/osd/SnapMapper.cc b/src/osd/SnapMapper.cc
new file mode 100644
index 000000000..804213b1f
--- /dev/null
+++ b/src/osd/SnapMapper.cc
@@ -0,0 +1,752 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "SnapMapper.h"
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_osd
+#undef dout_prefix
+#define dout_prefix *_dout << "snap_mapper."
+
+using std::make_pair;
+using std::map;
+using std::pair;
+using std::set;
+using std::string;
+using std::vector;
+
+using ceph::bufferlist;
+using ceph::decode;
+using ceph::encode;
+using ceph::timespan_str;
+
+const string SnapMapper::LEGACY_MAPPING_PREFIX = "MAP_";
+const string SnapMapper::MAPPING_PREFIX = "SNA_";
+const string SnapMapper::OBJECT_PREFIX = "OBJ_";
+
+const char *SnapMapper::PURGED_SNAP_PREFIX = "PSN_";
+
+/*
+
+ We have a bidirectional mapping, (1) from each snap+obj to object,
+ sorted by snapshot, such that we can enumerate to identify all clones
+ mapped to a particular snapshot, and (2) from object to snaps, so we
+ can identify which reverse mappings exist for any given object (and,
+ e.g., clean up on deletion).
+
+ "MAP_"
+ + ("%016x" % snapid)
+ + "_"
+ + (".%x" % shard_id)
+ + "_"
+ + hobject_t::to_str() ("%llx.%8x.%lx.name...." % pool, hash, snap)
+ -> SnapMapper::Mapping { snap, hoid }
+
+ "SNA_"
+ + ("%lld" % poolid)
+ + "_"
+ + ("%016x" % snapid)
+ + "_"
+ + (".%x" % shard_id)
+ + "_"
+ + hobject_t::to_str() ("%llx.%8x.%lx.name...." % pool, hash, snap)
+ -> SnapMapper::Mapping { snap, hoid }
+
+ "OBJ_" +
+ + (".%x" % shard_id)
+ + hobject_t::to_str()
+ -> SnapMapper::object_snaps { oid, set<snapid_t> }
+
+ */
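+
+/*
+ * A concrete (illustrative) example, assuming pool 3, snap 0x1c and shard 1;
+ * the hobject_t portion is abbreviated:
+ *
+ *   to_raw_key()    -> "SNA_3_000000000000001C_.1_" + hoid.to_str()
+ *   to_object_key() -> "OBJ_.1_" + hoid.to_str()
+ *
+ * For a replicated pool (shard NO_SHARD) the ".1_" shard prefix is simply
+ * absent.
+ */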
+
+int OSDriver::get_keys(
+ const std::set<std::string> &keys,
+ std::map<std::string, bufferlist> *out)
+{
+ return os->omap_get_values(ch, hoid, keys, out);
+}
+
+int OSDriver::get_next(
+ const std::string &key,
+ pair<std::string, bufferlist> *next)
+{
+ ObjectMap::ObjectMapIterator iter =
+ os->get_omap_iterator(ch, hoid);
+ if (!iter) {
+ ceph_abort();
+ return -EINVAL;
+ }
+ iter->upper_bound(key);
+ if (iter->valid()) {
+ if (next)
+ *next = make_pair(iter->key(), iter->value());
+ return 0;
+ } else {
+ return -ENOENT;
+ }
+}
+
+string SnapMapper::get_prefix(int64_t pool, snapid_t snap)
+{
+ char buf[100];
+ int len = snprintf(
+ buf, sizeof(buf),
+ "%lld_%.*X_",
+ (long long)pool,
+ (int)(sizeof(snap)*2), static_cast<unsigned>(snap));
+ return MAPPING_PREFIX + string(buf, len);
+}
+
+string SnapMapper::to_raw_key(
+ const pair<snapid_t, hobject_t> &in)
+{
+ return get_prefix(in.second.pool, in.first) + shard_prefix + in.second.to_str();
+}
+
+pair<string, bufferlist> SnapMapper::to_raw(
+ const pair<snapid_t, hobject_t> &in)
+{
+ bufferlist bl;
+ encode(Mapping(in), bl);
+ return make_pair(
+ to_raw_key(in),
+ bl);
+}
+
+pair<snapid_t, hobject_t> SnapMapper::from_raw(
+ const pair<std::string, bufferlist> &image)
+{
+ using ceph::decode;
+ Mapping map;
+ bufferlist bl(image.second);
+ auto bp = bl.cbegin();
+ decode(map, bp);
+ return make_pair(map.snap, map.hoid);
+}
+
+bool SnapMapper::is_mapping(const string &to_test)
+{
+ return to_test.substr(0, MAPPING_PREFIX.size()) == MAPPING_PREFIX;
+}
+
+string SnapMapper::to_object_key(const hobject_t &hoid)
+{
+ return OBJECT_PREFIX + shard_prefix + hoid.to_str();
+}
+
+void SnapMapper::object_snaps::encode(bufferlist &bl) const
+{
+ ENCODE_START(1, 1, bl);
+ encode(oid, bl);
+ encode(snaps, bl);
+ ENCODE_FINISH(bl);
+}
+
+void SnapMapper::object_snaps::decode(bufferlist::const_iterator &bl)
+{
+ DECODE_START(1, bl);
+ decode(oid, bl);
+ decode(snaps, bl);
+ DECODE_FINISH(bl);
+}
+
+bool SnapMapper::check(const hobject_t &hoid) const
+{
+ if (hoid.match(mask_bits, match)) {
+ return true;
+ }
+ derr << __func__ << " " << hoid << " mask_bits " << mask_bits
+ << " match 0x" << std::hex << match << std::dec << " is false"
+ << dendl;
+ return false;
+}
+
+int SnapMapper::get_snaps(
+ const hobject_t &oid,
+ object_snaps *out)
+{
+ ceph_assert(check(oid));
+ set<string> keys;
+ map<string, bufferlist> got;
+ keys.insert(to_object_key(oid));
+ int r = backend.get_keys(keys, &got);
+ if (r < 0) {
+ dout(20) << __func__ << " " << oid << " got err " << r << dendl;
+ return r;
+ }
+ if (got.empty()) {
+ dout(20) << __func__ << " " << oid << " got.empty()" << dendl;
+ return -ENOENT;
+ }
+ if (out) {
+ auto bp = got.begin()->second.cbegin();
+ decode(*out, bp);
+ dout(20) << __func__ << " " << oid << " " << out->snaps << dendl;
+ if (out->snaps.empty()) {
+ dout(1) << __func__ << " " << oid << " empty snapset" << dendl;
+ ceph_assert(!cct->_conf->osd_debug_verify_snaps);
+ }
+ } else {
+ dout(20) << __func__ << " " << oid << " (out == NULL)" << dendl;
+ }
+ return 0;
+}
+
+void SnapMapper::clear_snaps(
+ const hobject_t &oid,
+ MapCacher::Transaction<std::string, bufferlist> *t)
+{
+ dout(20) << __func__ << " " << oid << dendl;
+ ceph_assert(check(oid));
+ set<string> to_remove;
+ to_remove.insert(to_object_key(oid));
+ if (g_conf()->subsys.should_gather<ceph_subsys_osd, 20>()) {
+ for (auto& i : to_remove) {
+ dout(20) << __func__ << " rm " << i << dendl;
+ }
+ }
+ backend.remove_keys(to_remove, t);
+}
+
+void SnapMapper::set_snaps(
+ const hobject_t &oid,
+ const object_snaps &in,
+ MapCacher::Transaction<std::string, bufferlist> *t)
+{
+ ceph_assert(check(oid));
+ map<string, bufferlist> to_set;
+ bufferlist bl;
+ encode(in, bl);
+ to_set[to_object_key(oid)] = bl;
+ dout(20) << __func__ << " " << oid << " " << in.snaps << dendl;
+ if (g_conf()->subsys.should_gather<ceph_subsys_osd, 20>()) {
+ for (auto& i : to_set) {
+ dout(20) << __func__ << " set " << i.first << dendl;
+ }
+ }
+ backend.set_keys(to_set, t);
+}
+
+int SnapMapper::update_snaps(
+ const hobject_t &oid,
+ const set<snapid_t> &new_snaps,
+ const set<snapid_t> *old_snaps_check,
+ MapCacher::Transaction<std::string, bufferlist> *t)
+{
+ dout(20) << __func__ << " " << oid << " " << new_snaps
+ << " was " << (old_snaps_check ? *old_snaps_check : set<snapid_t>())
+ << dendl;
+ ceph_assert(check(oid));
+ if (new_snaps.empty())
+ return remove_oid(oid, t);
+
+ object_snaps out;
+ int r = get_snaps(oid, &out);
+ // Tolerate missing keys but not disk errors
+ if (r < 0 && r != -ENOENT)
+ return r;
+ if (old_snaps_check)
+ ceph_assert(out.snaps == *old_snaps_check);
+
+ object_snaps in(oid, new_snaps);
+ set_snaps(oid, in, t);
+
+ set<string> to_remove;
+ for (set<snapid_t>::iterator i = out.snaps.begin();
+ i != out.snaps.end();
+ ++i) {
+ if (!new_snaps.count(*i)) {
+ to_remove.insert(to_raw_key(make_pair(*i, oid)));
+ }
+ }
+ if (g_conf()->subsys.should_gather<ceph_subsys_osd, 20>()) {
+ for (auto& i : to_remove) {
+ dout(20) << __func__ << " rm " << i << dendl;
+ }
+ }
+ backend.remove_keys(to_remove, t);
+ return 0;
+}
+
+void SnapMapper::add_oid(
+ const hobject_t &oid,
+ const set<snapid_t>& snaps,
+ MapCacher::Transaction<std::string, bufferlist> *t)
+{
+ dout(20) << __func__ << " " << oid << " " << snaps << dendl;
+ ceph_assert(!snaps.empty());
+ ceph_assert(check(oid));
+ {
+ object_snaps out;
+ int r = get_snaps(oid, &out);
+ if (r != -ENOENT) {
+ derr << __func__ << " found existing snaps mapped on " << oid
+ << ", removing" << dendl;
+ ceph_assert(!cct->_conf->osd_debug_verify_snaps);
+ remove_oid(oid, t);
+ }
+ }
+
+ object_snaps _snaps(oid, snaps);
+ set_snaps(oid, _snaps, t);
+
+ map<string, bufferlist> to_add;
+ for (set<snapid_t>::iterator i = snaps.begin();
+ i != snaps.end();
+ ++i) {
+ to_add.insert(to_raw(make_pair(*i, oid)));
+ }
+ if (g_conf()->subsys.should_gather<ceph_subsys_osd, 20>()) {
+ for (auto& i : to_add) {
+ dout(20) << __func__ << " set " << i.first << dendl;
+ }
+ }
+ backend.set_keys(to_add, t);
+}
+
+int SnapMapper::get_next_objects_to_trim(
+ snapid_t snap,
+ unsigned max,
+ vector<hobject_t> *out)
+{
+ ceph_assert(out);
+ ceph_assert(out->empty());
+
+ // if max were 0, we would return -ENOENT and the caller would mistakenly
+ // trim the snaptrim queue
+ ceph_assert(max > 0);
+ int r = 0;
+ for (set<string>::iterator i = prefixes.begin();
+ i != prefixes.end() && out->size() < max && r == 0;
+ ++i) {
+ string prefix(get_prefix(pool, snap) + *i);
+ string pos = prefix;
+ while (out->size() < max) {
+ pair<string, bufferlist> next;
+ r = backend.get_next(pos, &next);
+ dout(20) << __func__ << " get_next(" << pos << ") returns " << r
+ << " " << next << dendl;
+ if (r != 0) {
+ break; // Done
+ }
+
+ if (next.first.substr(0, prefix.size()) !=
+ prefix) {
+ break; // Done with this prefix
+ }
+
+ ceph_assert(is_mapping(next.first));
+
+ dout(20) << __func__ << " " << next.first << dendl;
+ pair<snapid_t, hobject_t> next_decoded(from_raw(next));
+ ceph_assert(next_decoded.first == snap);
+ ceph_assert(check(next_decoded.second));
+
+ out->push_back(next_decoded.second);
+ pos = next.first;
+ }
+ }
+ if (out->size() == 0) {
+ return -ENOENT;
+ } else {
+ return 0;
+ }
+}
+
+
+int SnapMapper::remove_oid(
+ const hobject_t &oid,
+ MapCacher::Transaction<std::string, bufferlist> *t)
+{
+ dout(20) << __func__ << " " << oid << dendl;
+ ceph_assert(check(oid));
+ return _remove_oid(oid, t);
+}
+
+int SnapMapper::_remove_oid(
+ const hobject_t &oid,
+ MapCacher::Transaction<std::string, bufferlist> *t)
+{
+ dout(20) << __func__ << " " << oid << dendl;
+ object_snaps out;
+ int r = get_snaps(oid, &out);
+ if (r < 0)
+ return r;
+
+ clear_snaps(oid, t);
+
+ set<string> to_remove;
+ for (set<snapid_t>::iterator i = out.snaps.begin();
+ i != out.snaps.end();
+ ++i) {
+ to_remove.insert(to_raw_key(make_pair(*i, oid)));
+ }
+ if (g_conf()->subsys.should_gather<ceph_subsys_osd, 20>()) {
+ for (auto& i : to_remove) {
+ dout(20) << __func__ << " rm " << i << dendl;
+ }
+ }
+ backend.remove_keys(to_remove, t);
+ return 0;
+}
+
+int SnapMapper::get_snaps(
+ const hobject_t &oid,
+ std::set<snapid_t> *snaps)
+{
+ ceph_assert(check(oid));
+ object_snaps out;
+ int r = get_snaps(oid, &out);
+ if (r < 0)
+ return r;
+ if (snaps)
+ snaps->swap(out.snaps);
+ return 0;
+}
+
+
+// -- purged snaps --
+
+string SnapMapper::make_purged_snap_key(int64_t pool, snapid_t last)
+{
+ char k[80];
+ snprintf(k, sizeof(k), "%s_%llu_%016llx", PURGED_SNAP_PREFIX,
+ (unsigned long long)pool, (unsigned long long)last);
+ return k;
+}
+
+void SnapMapper::make_purged_snap_key_value(
+ int64_t pool, snapid_t begin, snapid_t end, map<string,bufferlist> *m)
+{
+ string k = make_purged_snap_key(pool, end - 1);
+ auto& v = (*m)[k];
+ ceph::encode(pool, v);
+ ceph::encode(begin, v);
+ ceph::encode(end, v);
+}
+
+int SnapMapper::_lookup_purged_snap(
+ CephContext *cct,
+ ObjectStore *store,
+ ObjectStore::CollectionHandle& ch,
+ const ghobject_t& hoid,
+ int64_t pool, snapid_t snap,
+ snapid_t *begin, snapid_t *end)
+{
+ string k = make_purged_snap_key(pool, snap);
+ auto it = store->get_omap_iterator(ch, hoid);
+ it->lower_bound(k);
+ if (!it->valid()) {
+ dout(20) << __func__ << " pool " << pool << " snap " << snap
+ << " key '" << k << "' lower_bound not found" << dendl;
+ return -ENOENT;
+ }
+ if (it->key().find(PURGED_SNAP_PREFIX) != 0) {
+ dout(20) << __func__ << " pool " << pool << " snap " << snap
+ << " key '" << k << "' lower_bound got mismatched prefix '"
+ << it->key() << "'" << dendl;
+ return -ENOENT;
+ }
+ bufferlist v = it->value();
+ auto p = v.cbegin();
+ int64_t gotpool;
+ decode(gotpool, p);
+ decode(*begin, p);
+ decode(*end, p);
+ if (snap < *begin || snap >= *end) {
+ dout(20) << __func__ << " pool " << pool << " snap " << snap
+ << " found [" << *begin << "," << *end << "), no overlap" << dendl;
+ return -ENOENT;
+ }
+ return 0;
+}
+
+void SnapMapper::record_purged_snaps(
+ CephContext *cct,
+ ObjectStore *store,
+ ObjectStore::CollectionHandle& ch,
+ ghobject_t hoid,
+ ObjectStore::Transaction *t,
+ map<epoch_t,mempool::osdmap::map<int64_t,snap_interval_set_t>> purged_snaps)
+{
+ dout(10) << __func__ << " purged_snaps " << purged_snaps << dendl;
+ map<string,bufferlist> m;
+ set<string> rm;
+ for (auto& [epoch, bypool] : purged_snaps) {
+ // index by (pool, snap)
+ for (auto& [pool, snaps] : bypool) {
+ for (auto i = snaps.begin();
+ i != snaps.end();
+ ++i) {
+ snapid_t begin = i.get_start();
+ snapid_t end = i.get_end();
+ snapid_t before_begin, before_end;
+ snapid_t after_begin, after_end;
+ int b = _lookup_purged_snap(cct, store, ch, hoid,
+ pool, begin - 1, &before_begin, &before_end);
+ int a = _lookup_purged_snap(cct, store, ch, hoid,
+ pool, end, &after_begin, &after_end);
+ if (!b && !a) {
+ dout(10) << __func__
+ << " [" << begin << "," << end << ") - joins ["
+ << before_begin << "," << before_end << ") and ["
+ << after_begin << "," << after_end << ")" << dendl;
+ // erase only the begin record; we'll overwrite the end one
+ rm.insert(make_purged_snap_key(pool, before_end - 1));
+ make_purged_snap_key_value(pool, before_begin, after_end, &m);
+ } else if (!b) {
+ dout(10) << __func__
+ << " [" << begin << "," << end << ") - join with earlier ["
+ << before_begin << "," << before_end << ")" << dendl;
+ rm.insert(make_purged_snap_key(pool, before_end - 1));
+ make_purged_snap_key_value(pool, before_begin, end, &m);
+ } else if (!a) {
+ dout(10) << __func__
+ << " [" << begin << "," << end << ") - join with later ["
+ << after_begin << "," << after_end << ")" << dendl;
+ // overwrite after record
+ make_purged_snap_key_value(pool, begin, after_end, &m);
+ } else {
+ make_purged_snap_key_value(pool, begin, end, &m);
+ }
+ }
+ }
+ }
+ t->omap_rmkeys(ch->cid, hoid, rm);
+ t->omap_setkeys(ch->cid, hoid, m);
+ dout(10) << __func__ << " rm " << rm.size() << " keys, set " << m.size()
+ << " keys" << dendl;
+}
+
+
+bool SnapMapper::Scrubber::_parse_p()
+{
+ if (!psit->valid()) {
+ pool = -1;
+ return false;
+ }
+ if (psit->key().find(PURGED_SNAP_PREFIX) != 0) {
+ pool = -1;
+ return false;
+ }
+ bufferlist v = psit->value();
+ auto p = v.cbegin();
+ ceph::decode(pool, p);
+ ceph::decode(begin, p);
+ ceph::decode(end, p);
+ dout(20) << __func__ << " purged_snaps pool " << pool
+ << " [" << begin << "," << end << ")" << dendl;
+ psit->next();
+ return true;
+}
+
+bool SnapMapper::Scrubber::_parse_m()
+{
+ if (!mapit->valid()) {
+ return false;
+ }
+ if (mapit->key().find(MAPPING_PREFIX) != 0) {
+ return false;
+ }
+ auto v = mapit->value();
+ auto p = v.cbegin();
+ mapping.decode(p);
+
+ {
+ // the key looks like "SNA_<pool>_<snap>_[.<shard>_]<hoid>"; all three
+ // fields parse only when a shard prefix is present
+ long long p;
+ unsigned long long s;
+ long sh;
+ string k = mapit->key();
+ int r = sscanf(k.c_str(), "SNA_%lld_%llx_.%lx", &p, &s, &sh);
+ if (r != 3) {
+ shard = shard_id_t::NO_SHARD;
+ } else {
+ shard = shard_id_t(sh);
+ }
+ }
+ dout(20) << __func__ << " mapping pool " << mapping.hoid.pool
+ << " snap " << mapping.snap
+ << " shard " << shard
+ << " " << mapping.hoid << dendl;
+ mapit->next();
+ return true;
+}
+
+void SnapMapper::Scrubber::run()
+{
+ dout(10) << __func__ << dendl;
+
+ psit = store->get_omap_iterator(ch, purged_snaps_hoid);
+ psit->upper_bound(PURGED_SNAP_PREFIX);
+ _parse_p();
+
+ mapit = store->get_omap_iterator(ch, mapping_hoid);
+ mapit->upper_bound(MAPPING_PREFIX);
+
+ while (_parse_m()) {
+ // advance to next purged_snaps range?
+ while (pool >= 0 &&
+ (mapping.hoid.pool > pool ||
+ (mapping.hoid.pool == pool && mapping.snap >= end))) {
+ _parse_p();
+ }
+ if (pool < 0) {
+ dout(10) << __func__ << " passed final purged_snaps interval, rest ok"
+ << dendl;
+ break;
+ }
+ if (mapping.hoid.pool < pool ||
+ mapping.snap < begin) {
+ // ok
+ dout(20) << __func__ << " ok " << mapping.hoid
+ << " snap " << mapping.snap
+ << " precedes pool " << pool
+ << " purged_snaps [" << begin << "," << end << ")" << dendl;
+ } else {
+ assert(mapping.snap >= begin &&
+ mapping.snap < end &&
+ mapping.hoid.pool == pool);
+ // invalid
+ dout(10) << __func__ << " stray " << mapping.hoid
+ << " snap " << mapping.snap
+ << " in pool " << pool
+ << " shard " << shard
+ << " purged_snaps [" << begin << "," << end << ")" << dendl;
+ stray.emplace_back(std::tuple<int64_t,snapid_t,uint32_t,shard_id_t>(
+ pool, mapping.snap, mapping.hoid.get_hash(),
+ shard
+ ));
+ }
+ }
+
+ dout(10) << __func__ << " end, found " << stray.size() << " stray" << dendl;
+ psit = ObjectMap::ObjectMapIterator();
+ mapit = ObjectMap::ObjectMapIterator();
+}
+
+
+// -------------------------------------
+// legacy conversion/support
+
+string SnapMapper::get_legacy_prefix(snapid_t snap)
+{
+ char buf[100];
+ int len = snprintf(
+ buf, sizeof(buf),
+ "%.*X_",
+ (int)(sizeof(snap)*2), static_cast<unsigned>(snap));
+ return LEGACY_MAPPING_PREFIX + string(buf, len);
+}
+
+string SnapMapper::to_legacy_raw_key(
+ const pair<snapid_t, hobject_t> &in)
+{
+ return get_legacy_prefix(in.first) + shard_prefix + in.second.to_str();
+}
+
+bool SnapMapper::is_legacy_mapping(const string &to_test)
+{
+ return to_test.substr(0, LEGACY_MAPPING_PREFIX.size()) ==
+ LEGACY_MAPPING_PREFIX;
+}
+
+/* Octopus modified the SnapMapper key format from
+ *
+ * <LEGACY_MAPPING_PREFIX><snapid>_<shardid>_<hobject_t::to_str()>
+ *
+ * to
+ *
+ * <MAPPING_PREFIX><pool>_<snapid>_<shardid>_<hobject_t::to_str()>
+ *
+ * We can't reconstruct the new key format just from the value since the
+ * Mapping object contains an hobject rather than a ghobject. Instead,
+ * we exploit the fact that the new format is identical starting at <snapid>.
+ *
+ * Note that the original version of this conversion introduced in 94ebe0ea
+ * had a crucial bug which essentially destroyed legacy keys by mapping
+ * them to
+ *
+ * <MAPPING_PREFIX><poolid>_<snapid>_
+ *
+ * without the object-unique suffix.
+ * See https://tracker.ceph.com/issues/56147
+ */
+std::string SnapMapper::convert_legacy_key(
+ const std::string& old_key,
+ const bufferlist& value)
+{
+ auto old = from_raw(make_pair(old_key, value));
+ std::string object_suffix = old_key.substr(
+ SnapMapper::LEGACY_MAPPING_PREFIX.length());
+ return SnapMapper::MAPPING_PREFIX + std::to_string(old.second.pool)
+ + "_" + object_suffix;
+}
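+
+// A worked (illustrative) example of the conversion above, for a clone in
+// pool 3 mapped to snap 0x1c on shard 1:
+//
+//   old key: "MAP_000000000000001C_.1_" + hoid.to_str()
+//   new key: "SNA_3_000000000000001C_.1_" + hoid.to_str()
+//
+// i.e. the pool id, recovered from the decoded Mapping value, is spliced in
+// between the new prefix and the untouched legacy suffix.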
+
+int SnapMapper::convert_legacy(
+ CephContext *cct,
+ ObjectStore *store,
+ ObjectStore::CollectionHandle& ch,
+ ghobject_t hoid,
+ unsigned max)
+{
+ uint64_t n = 0;
+
+ ObjectMap::ObjectMapIterator iter = store->get_omap_iterator(ch, hoid);
+ if (!iter) {
+ return -EIO;
+ }
+
+ auto start = ceph::mono_clock::now();
+
+ iter->upper_bound(SnapMapper::LEGACY_MAPPING_PREFIX);
+ map<string,bufferlist> to_set;
+ while (iter->valid()) {
+ bool valid = SnapMapper::is_legacy_mapping(iter->key());
+ if (valid) {
+ to_set.emplace(
+ convert_legacy_key(iter->key(), iter->value()),
+ iter->value());
+ ++n;
+ iter->next();
+ }
+ if (!valid || !iter->valid() || to_set.size() >= max) {
+ ObjectStore::Transaction t;
+ t.omap_setkeys(ch->cid, hoid, to_set);
+ int r = store->queue_transaction(ch, std::move(t));
+ ceph_assert(r == 0);
+ to_set.clear();
+ if (!valid) {
+ break;
+ }
+ dout(10) << __func__ << " converted " << n << " keys" << dendl;
+ }
+ }
+
+ auto end = ceph::mono_clock::now();
+
+ dout(1) << __func__ << " converted " << n << " keys in "
+ << timespan_str(end - start) << dendl;
+
+ // remove the old keys
+ {
+ ObjectStore::Transaction t;
+ string end = SnapMapper::LEGACY_MAPPING_PREFIX;
+ ++end[end.size()-1]; // turn _ to whatever comes after _
+ t.omap_rmkeyrange(ch->cid, hoid,
+ SnapMapper::LEGACY_MAPPING_PREFIX,
+ end);
+ int r = store->queue_transaction(ch, std::move(t));
+ ceph_assert(r == 0);
+ }
+ return 0;
+}
diff --git a/src/osd/SnapMapper.h b/src/osd/SnapMapper.h
new file mode 100644
index 000000000..90b0c7c8d
--- /dev/null
+++ b/src/osd/SnapMapper.h
@@ -0,0 +1,338 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Inktank Storage, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef SNAPMAPPER_H
+#define SNAPMAPPER_H
+
+#include <string>
+#include <set>
+#include <utility>
+#include <cstring>
+
+#include "common/map_cacher.hpp"
+#include "common/hobject.h"
+#include "include/buffer.h"
+#include "include/encoding.h"
+#include "include/object.h"
+#include "os/ObjectStore.h"
+#include "osd/OSDMap.h"
+
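+/// OSDriver exposes a single object's omap in an ObjectStore through the
+/// MapCacher::StoreDriver interface, so users such as SnapMapper and the
+/// scrub Store can read and write their keys via ordinary ObjectStore
+/// transactions.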
+class OSDriver : public MapCacher::StoreDriver<std::string, ceph::buffer::list> {
+ ObjectStore *os;
+ ObjectStore::CollectionHandle ch;
+ ghobject_t hoid;
+
+public:
+ class OSTransaction : public MapCacher::Transaction<std::string, ceph::buffer::list> {
+ friend class OSDriver;
+ coll_t cid;
+ ghobject_t hoid;
+ ObjectStore::Transaction *t;
+ OSTransaction(
+ const coll_t &cid,
+ const ghobject_t &hoid,
+ ObjectStore::Transaction *t)
+ : cid(cid), hoid(hoid), t(t) {}
+ public:
+ void set_keys(
+ const std::map<std::string, ceph::buffer::list> &to_set) override {
+ t->omap_setkeys(cid, hoid, to_set);
+ }
+ void remove_keys(
+ const std::set<std::string> &to_remove) override {
+ t->omap_rmkeys(cid, hoid, to_remove);
+ }
+ void add_callback(
+ Context *c) override {
+ t->register_on_applied(c);
+ }
+ };
+
+ OSTransaction get_transaction(
+ ObjectStore::Transaction *t) {
+ return OSTransaction(ch->cid, hoid, t);
+ }
+
+ OSDriver(ObjectStore *os, const coll_t& cid, const ghobject_t &hoid) :
+ os(os),
+ hoid(hoid) {
+ ch = os->open_collection(cid);
+ }
+ int get_keys(
+ const std::set<std::string> &keys,
+ std::map<std::string, ceph::buffer::list> *out) override;
+ int get_next(
+ const std::string &key,
+ std::pair<std::string, ceph::buffer::list> *next) override;
+};
+
+/**
+ * SnapMapper
+ *
+ * Manages two mappings:
+ * 1) hobject_t -> {snapid}
+ * 2) snapid -> {hobject_t}
+ *
+ * We accomplish this using two sets of keys:
+ * 1) OBJECT_PREFIX + obj.str() -> encoding of object_snaps
+ * 2) MAPPING_PREFIX + poolid + snapid_t + obj.str() -> encoding of std::pair<snapid_t, obj>
+ *
+ * The on disk strings and encodings are implemented in to_raw, to_raw_key,
+ * from_raw, to_object_key.
+ *
+ * The object -> {snapid} mapping is primarily included so that the
+ * SnapMapper state can be verified against the external PG state during
+ * scrub etc.
+ *
+ * The 2) mapping is arranged such that all objects in a particular
+ * snap will sort together, and so that all objects in a pg for a
+ * particular snap will group under up to 8 prefixes.
+ */
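+
+/*
+ * A minimal usage sketch (illustrative only; 'cct', 'store', 'coll', 'ch',
+ * 'snapmapper_oid', 'pg_seed', 'split_bits', 'pool_id', 'shard' and 'clone'
+ * are placeholders, not defined in this header):
+ *
+ *   OSDriver driver(store, coll, snapmapper_oid);
+ *   SnapMapper mapper(cct, &driver, pg_seed, split_bits, pool_id, shard);
+ *   ObjectStore::Transaction t;
+ *   OSDriver::OSTransaction txn = driver.get_transaction(&t);
+ *   mapper.add_oid(clone, {snapid_t(4), snapid_t(7)}, &txn); // snaps 4 and 7
+ *   store->queue_transaction(ch, std::move(t));
+ */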
+class SnapMapper {
+ friend class MapperVerifier;
+public:
+ CephContext* cct;
+ struct object_snaps {
+ hobject_t oid;
+ std::set<snapid_t> snaps;
+ object_snaps(hobject_t oid, const std::set<snapid_t> &snaps)
+ : oid(oid), snaps(snaps) {}
+ object_snaps() {}
+ void encode(ceph::buffer::list &bl) const;
+ void decode(ceph::buffer::list::const_iterator &bp);
+ };
+
+ struct Mapping {
+ snapid_t snap;
+ hobject_t hoid;
+ explicit Mapping(const std::pair<snapid_t, hobject_t> &in)
+ : snap(in.first), hoid(in.second) {}
+ Mapping() : snap(0) {}
+ void encode(ceph::buffer::list &bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(snap, bl);
+ encode(hoid, bl);
+ ENCODE_FINISH(bl);
+ }
+ void decode(ceph::buffer::list::const_iterator &bl) {
+ DECODE_START(1, bl);
+ decode(snap, bl);
+ decode(hoid, bl);
+ DECODE_FINISH(bl);
+ }
+ };
+
+ static const std::string LEGACY_MAPPING_PREFIX;
+ static const std::string MAPPING_PREFIX;
+ static const std::string OBJECT_PREFIX;
+ static const char *PURGED_SNAP_EPOCH_PREFIX;
+ static const char *PURGED_SNAP_PREFIX;
+
+ struct Scrubber {
+ CephContext *cct;
+ ObjectStore *store;
+ ObjectStore::CollectionHandle ch;
+ ghobject_t mapping_hoid;
+ ghobject_t purged_snaps_hoid;
+
+ ObjectMap::ObjectMapIterator psit;
+ int64_t pool;
+ snapid_t begin, end;
+
+ bool _parse_p(); ///< advance the purged_snaps pointer
+
+ ObjectMap::ObjectMapIterator mapit;
+ Mapping mapping;
+ shard_id_t shard;
+
+ bool _parse_m(); ///< advance the (object) mapper pointer
+
+ std::vector<std::tuple<int64_t, snapid_t, uint32_t, shard_id_t>> stray;
+
+ Scrubber(
+ CephContext *cct,
+ ObjectStore *store,
+ ObjectStore::CollectionHandle& ch,
+ ghobject_t mapping_hoid,
+ ghobject_t purged_snaps_hoid)
+ : cct(cct),
+ store(store),
+ ch(ch),
+ mapping_hoid(mapping_hoid),
+ purged_snaps_hoid(purged_snaps_hoid) {}
+
+ void run();
+ };
+
+ static std::string convert_legacy_key(
+ const std::string& old_key,
+ const bufferlist& value);
+
+ static int convert_legacy(
+ CephContext *cct,
+ ObjectStore *store,
+ ObjectStore::CollectionHandle& ch,
+ ghobject_t hoid,
+ unsigned max);
+
+ static void record_purged_snaps(
+ CephContext *cct,
+ ObjectStore *store,
+ ObjectStore::CollectionHandle& ch,
+ ghobject_t hoid,
+ ObjectStore::Transaction *t,
+ std::map<epoch_t,mempool::osdmap::map<int64_t,snap_interval_set_t>> purged_snaps);
+ static void scrub_purged_snaps(
+ CephContext *cct,
+ ObjectStore *store,
+ ObjectStore::CollectionHandle& ch,
+ ghobject_t mapper_hoid,
+ ghobject_t purged_snaps_hoid);
+
+private:
+ static int _lookup_purged_snap(
+ CephContext *cct,
+ ObjectStore *store,
+ ObjectStore::CollectionHandle& ch,
+ const ghobject_t& hoid,
+ int64_t pool, snapid_t snap,
+ snapid_t *begin, snapid_t *end);
+ static void make_purged_snap_key_value(
+ int64_t pool, snapid_t begin,
+ snapid_t end, std::map<std::string,ceph::buffer::list> *m);
+ static std::string make_purged_snap_key(int64_t pool, snapid_t last);
+
+
+ MapCacher::MapCacher<std::string, ceph::buffer::list> backend;
+
+ static std::string get_legacy_prefix(snapid_t snap);
+ std::string to_legacy_raw_key(
+ const std::pair<snapid_t, hobject_t> &to_map);
+ static bool is_legacy_mapping(const std::string &to_test);
+
+ static std::string get_prefix(int64_t pool, snapid_t snap);
+ std::string to_raw_key(
+ const std::pair<snapid_t, hobject_t> &to_map);
+
+ std::pair<std::string, ceph::buffer::list> to_raw(
+ const std::pair<snapid_t, hobject_t> &to_map);
+
+ static bool is_mapping(const std::string &to_test);
+
+ static std::pair<snapid_t, hobject_t> from_raw(
+ const std::pair<std::string, ceph::buffer::list> &image);
+
+ std::string to_object_key(const hobject_t &hoid);
+
+ int get_snaps(const hobject_t &oid, object_snaps *out);
+
+ void set_snaps(
+ const hobject_t &oid,
+ const object_snaps &out,
+ MapCacher::Transaction<std::string, ceph::buffer::list> *t);
+
+ void clear_snaps(
+ const hobject_t &oid,
+ MapCacher::Transaction<std::string, ceph::buffer::list> *t);
+
+ // True if hoid belongs in this mapping based on mask_bits and match
+ bool check(const hobject_t &hoid) const;
+
+ int _remove_oid(
+ const hobject_t &oid, ///< [in] oid to remove
+ MapCacher::Transaction<std::string, ceph::buffer::list> *t ///< [out] transaction
+ );
+
+public:
+ static std::string make_shard_prefix(shard_id_t shard) {
+ if (shard == shard_id_t::NO_SHARD)
+ return std::string();
+ char buf[20];
+ int r = snprintf(buf, sizeof(buf), ".%x", (int)shard);
+ ceph_assert(r < (int)sizeof(buf));
+ return std::string(buf, r) + '_';
+ }
+ uint32_t mask_bits;
+ const uint32_t match;
+ std::string last_key_checked;
+ const int64_t pool;
+ const shard_id_t shard;
+ const std::string shard_prefix;
+ SnapMapper(
+ CephContext* cct,
+ MapCacher::StoreDriver<std::string, ceph::buffer::list> *driver,
+ uint32_t match, ///< [in] pgid
+ uint32_t bits, ///< [in] current split bits
+ int64_t pool, ///< [in] pool
+ shard_id_t shard ///< [in] shard
+ )
+ : cct(cct), backend(driver), mask_bits(bits), match(match), pool(pool),
+ shard(shard), shard_prefix(make_shard_prefix(shard)) {
+ update_bits(mask_bits);
+ }
+
+ std::set<std::string> prefixes;
+ /// Update bits in case of pg split or merge
+ void update_bits(
+ uint32_t new_bits ///< [in] new split bits
+ ) {
+ mask_bits = new_bits;
+ std::set<std::string> _prefixes = hobject_t::get_prefixes(
+ mask_bits,
+ match,
+ pool);
+ prefixes.clear();
+ for (auto i = _prefixes.begin(); i != _prefixes.end(); ++i) {
+ prefixes.insert(shard_prefix + *i);
+ }
+ }
+
+ /// Update snaps for oid, empty new_snaps removes the mapping
+ int update_snaps(
+ const hobject_t &oid, ///< [in] oid to update
+ const std::set<snapid_t> &new_snaps, ///< [in] new snap std::set
+ const std::set<snapid_t> *old_snaps, ///< [in] old snaps (for debugging)
+ MapCacher::Transaction<std::string, ceph::buffer::list> *t ///< [out] transaction
+ ); ///< @return error, 0 on success
+
+ /// Add mapping for oid, must not already be mapped
+ void add_oid(
+ const hobject_t &oid, ///< [in] oid to add
+ const std::set<snapid_t>& new_snaps, ///< [in] snaps
+ MapCacher::Transaction<std::string, ceph::buffer::list> *t ///< [out] transaction
+ );
+
+ /// Returns first object with snap as a snap
+ int get_next_objects_to_trim(
+ snapid_t snap, ///< [in] snap to check
+ unsigned max, ///< [in] max to get
+ std::vector<hobject_t> *out ///< [out] next objects to trim (must be empty)
+ ); ///< @return error, -ENOENT if no more objects
+
+ /// Remove mapping for oid
+ int remove_oid(
+ const hobject_t &oid, ///< [in] oid to remove
+ MapCacher::Transaction<std::string, ceph::buffer::list> *t ///< [out] transaction
+ ); ///< @return error, -ENOENT if the object is not mapped
+
+ /// Get snaps for oid
+ int get_snaps(
+ const hobject_t &oid, ///< [in] oid to get snaps for
+ std::set<snapid_t> *snaps ///< [out] snaps
+ ); ///< @return error, -ENOENT if oid is not recorded
+};
+WRITE_CLASS_ENCODER(SnapMapper::object_snaps)
+WRITE_CLASS_ENCODER(SnapMapper::Mapping)
+
+#endif
diff --git a/src/osd/TierAgentState.h b/src/osd/TierAgentState.h
new file mode 100644
index 000000000..28e1598a9
--- /dev/null
+++ b/src/osd/TierAgentState.h
@@ -0,0 +1,128 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Sage Weil <sage@inktank.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_OSD_TIERAGENT_H
+#define CEPH_OSD_TIERAGENT_H
+
+#include <ctime>
+#include <list>
+#include <map>
+#include <utility>
+
+#include "common/Formatter.h"
+#include "common/histogram.h"
+#include "common/hobject.h"
+
+#include "osd/HitSet.h"
+
+struct TierAgentState {
+ /// current position iterating across pool
+ hobject_t position;
+ /// Count of agent_work since "start" position of object hash space
+ int started;
+ hobject_t start;
+ bool delaying;
+
+ /// histogram of ages we've encountered
+ pow2_hist_t temp_hist;
+ int hist_age;
+
+ /// past HitSet(s) (not current)
+ std::map<time_t,HitSetRef> hit_set_map;
+
+ /// a few recent things we've seen that are clean
+ std::list<hobject_t> recent_clean;
+
+ enum flush_mode_t {
+ FLUSH_MODE_IDLE, // nothing to flush
+ FLUSH_MODE_LOW, // flush dirty objects at a low speed
+ FLUSH_MODE_HIGH, // flush dirty objects at a high speed
+ } flush_mode; ///< current flush behavior
+ static const char *get_flush_mode_name(flush_mode_t m) {
+ switch (m) {
+ case FLUSH_MODE_IDLE: return "idle";
+ case FLUSH_MODE_LOW: return "low";
+ case FLUSH_MODE_HIGH: return "high";
+ default: ceph_abort_msg("bad flush mode");
+ }
+ }
+ const char *get_flush_mode_name() const {
+ return get_flush_mode_name(flush_mode);
+ }
+
+ enum evict_mode_t {
+ EVICT_MODE_IDLE, // no need to evict anything
+ EVICT_MODE_SOME, // evict some things as we are near the target
+ EVICT_MODE_FULL, // evict anything
+ } evict_mode; ///< current evict behavior
+ static const char *get_evict_mode_name(evict_mode_t m) {
+ switch (m) {
+ case EVICT_MODE_IDLE: return "idle";
+ case EVICT_MODE_SOME: return "some";
+ case EVICT_MODE_FULL: return "full";
+ default: ceph_abort_msg("bad evict mode");
+ }
+ }
+ const char *get_evict_mode_name() const {
+ return get_evict_mode_name(evict_mode);
+ }
+
+ /// approximate ratio of objects (assuming they are uniformly
+ /// distributed) that we should aim to evict.
+ unsigned evict_effort;
+
+ TierAgentState()
+ : started(0),
+ delaying(false),
+ hist_age(0),
+ flush_mode(FLUSH_MODE_IDLE),
+ evict_mode(EVICT_MODE_IDLE),
+ evict_effort(0)
+ {}
+
+ /// false if we have any work to do
+ bool is_idle() const {
+ return
+ delaying ||
+ (flush_mode == FLUSH_MODE_IDLE &&
+ evict_mode == EVICT_MODE_IDLE);
+ }
+
+ /// add archived HitSet
+ void add_hit_set(time_t start, HitSetRef hs) {
+ hit_set_map.insert(std::make_pair(start, hs));
+ }
+
+ /// remove old/trimmed HitSet
+ void remove_oldest_hit_set() {
+ if (!hit_set_map.empty())
+ hit_set_map.erase(hit_set_map.begin());
+ }
+
+ /// discard all open hit sets
+ void discard_hit_sets() {
+ hit_set_map.clear();
+ }
+
+ void dump(ceph::Formatter *f) const {
+ f->dump_string("flush_mode", get_flush_mode_name());
+ f->dump_string("evict_mode", get_evict_mode_name());
+ f->dump_unsigned("evict_effort", evict_effort);
+ f->dump_stream("position") << position;
+ f->open_object_section("temp_hist");
+ temp_hist.dump(f);
+ f->close_section();
+ }
+};
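+
+// Example (hypothetical sketch, not part of the OSD itself): how a caller
+// holding this state might report it, using only the members above. The
+// function name is illustrative.
+inline void example_dump_tier_agent(const TierAgentState& agent,
+                                    ceph::Formatter* f) {
+  if (agent.is_idle())
+    return; // delaying, or no flush/evict work pending
+  f->open_object_section("tier_agent");
+  agent.dump(f); // emits flush_mode, evict_mode, evict_effort, position, temp_hist
+  f->close_section();
+}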
+
+#endif
diff --git a/src/osd/Watch.cc b/src/osd/Watch.cc
new file mode 100644
index 000000000..78aae6e2d
--- /dev/null
+++ b/src/osd/Watch.cc
@@ -0,0 +1,550 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+#include "PG.h"
+
+#include "include/types.h"
+#include "messages/MWatchNotify.h"
+
+#include <map>
+
+#include "OSD.h"
+#include "PrimaryLogPG.h"
+#include "Watch.h"
+#include "Session.h"
+
+#include "common/config.h"
+
+#define dout_context osd->cct
+#define dout_subsys ceph_subsys_osd
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, this)
+
+using std::list;
+using std::make_pair;
+using std::pair;
+using std::ostream;
+using std::set;
+
+using ceph::bufferlist;
+using ceph::decode;
+using ceph::encode;
+
+struct CancelableContext : public Context {
+ virtual void cancel() = 0;
+};
+
+
+static ostream& _prefix(
+ std::ostream* _dout,
+ Notify *notify) {
+ return notify->gen_dbg_prefix(*_dout);
+}
+
+Notify::Notify(
+ ConnectionRef client,
+ uint64_t client_gid,
+ bufferlist &payload,
+ uint32_t timeout,
+ uint64_t cookie,
+ uint64_t notify_id,
+ uint64_t version,
+ OSDService *osd)
+ : client(client),
+ client_gid(client_gid),
+ complete(false),
+ discarded(false),
+ timed_out(false),
+ payload(payload),
+ timeout(timeout),
+ cookie(cookie),
+ notify_id(notify_id),
+ version(version),
+ osd(osd),
+ cb(nullptr) {}
+
+NotifyRef Notify::makeNotifyRef(
+ ConnectionRef client,
+ uint64_t client_gid,
+ bufferlist &payload,
+ uint32_t timeout,
+ uint64_t cookie,
+ uint64_t notify_id,
+ uint64_t version,
+ OSDService *osd) {
+ NotifyRef ret(
+ new Notify(
+ client, client_gid,
+ payload, timeout,
+ cookie, notify_id,
+ version, osd));
+ ret->set_self(ret);
+ return ret;
+}
+
+class NotifyTimeoutCB : public CancelableContext {
+ NotifyRef notif;
+ bool canceled; // protected by notif lock
+public:
+ explicit NotifyTimeoutCB(NotifyRef notif) : notif(notif), canceled(false) {}
+ void finish(int) override {
+ notif->osd->watch_lock.unlock();
+ notif->lock.lock();
+ if (!canceled)
+ notif->do_timeout(); // drops lock
+ else
+ notif->lock.unlock();
+ notif->osd->watch_lock.lock();
+ }
+ void cancel() override {
+ ceph_assert(ceph_mutex_is_locked(notif->lock));
+ canceled = true;
+ }
+};
+
+void Notify::do_timeout()
+{
+ ceph_assert(ceph_mutex_is_locked(lock));
+ dout(10) << "timeout" << dendl;
+ cb = nullptr;
+ if (is_discarded()) {
+ lock.unlock();
+ return;
+ }
+
+ timed_out = true; // we will send the client an error code
+ maybe_complete_notify();
+ ceph_assert(complete);
+ set<WatchRef> _watchers;
+ _watchers.swap(watchers);
+ lock.unlock();
+
+ for (auto i = _watchers.begin(); i != _watchers.end(); ++i) {
+ boost::intrusive_ptr<PrimaryLogPG> pg((*i)->get_pg());
+ pg->lock();
+ if (!(*i)->is_discarded()) {
+ (*i)->cancel_notify(self.lock());
+ }
+ pg->unlock();
+ }
+}
+
+void Notify::register_cb()
+{
+ ceph_assert(ceph_mutex_is_locked(lock));
+ {
+ std::lock_guard l{osd->watch_lock};
+ cb = new NotifyTimeoutCB(self.lock());
+ if (!osd->watch_timer.add_event_after(timeout, cb)) {
+ cb = nullptr;
+ }
+ }
+}
+
+void Notify::unregister_cb()
+{
+ ceph_assert(ceph_mutex_is_locked(lock));
+ if (!cb)
+ return;
+ cb->cancel();
+ {
+ std::lock_guard l{osd->watch_lock};
+ osd->watch_timer.cancel_event(cb);
+ cb = nullptr;
+ }
+}
+
+void Notify::start_watcher(WatchRef watch)
+{
+ std::lock_guard l(lock);
+ dout(10) << "start_watcher" << dendl;
+ watchers.insert(watch);
+}
+
+void Notify::complete_watcher(WatchRef watch, bufferlist& reply_bl)
+{
+ std::lock_guard l(lock);
+ dout(10) << "complete_watcher" << dendl;
+ if (is_discarded())
+ return;
+ ceph_assert(watchers.count(watch));
+ watchers.erase(watch);
+ notify_replies.insert(make_pair(make_pair(watch->get_watcher_gid(),
+ watch->get_cookie()),
+ reply_bl));
+ maybe_complete_notify();
+}
+
+void Notify::complete_watcher_remove(WatchRef watch)
+{
+ std::lock_guard l(lock);
+ dout(10) << __func__ << dendl;
+ if (is_discarded())
+ return;
+ ceph_assert(watchers.count(watch));
+ watchers.erase(watch);
+ maybe_complete_notify();
+}
+
+void Notify::maybe_complete_notify()
+{
+ dout(10) << "maybe_complete_notify -- "
+ << watchers.size()
+ << " in progress watchers " << dendl;
+ if (watchers.empty() || timed_out) {
+ // prepare reply
+ bufferlist bl;
+ encode(notify_replies, bl);
+ list<pair<uint64_t,uint64_t> > missed;
+ for (auto p = watchers.begin(); p != watchers.end(); ++p) {
+ missed.push_back(make_pair((*p)->get_watcher_gid(),
+ (*p)->get_cookie()));
+ }
+ encode(missed, bl);
+
+ bufferlist empty;
+ auto* const reply = new MWatchNotify(
+ cookie,
+ version,
+ notify_id,
+ CEPH_WATCH_EVENT_NOTIFY_COMPLETE,
+ empty,
+ client_gid);
+ reply->set_data(bl);
+ if (timed_out)
+ reply->return_code = -ETIMEDOUT;
+ client->send_message(reply);
+ unregister_cb();
+
+ complete = true;
+ }
+}
+
+void Notify::discard()
+{
+ std::lock_guard l(lock);
+ discarded = true;
+ unregister_cb();
+ watchers.clear();
+}
+
+void Notify::init()
+{
+ std::lock_guard l(lock);
+ register_cb();
+ maybe_complete_notify();
+}
+
+#define dout_subsys ceph_subsys_osd
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, watch.get())
+
+static ostream& _prefix(
+ std::ostream* _dout,
+ Watch *watch) {
+ return watch->gen_dbg_prefix(*_dout);
+}
+
+class HandleWatchTimeout : public CancelableContext {
+ WatchRef watch;
+public:
+ bool canceled; // protected by watch->pg->lock
+ explicit HandleWatchTimeout(WatchRef watch) : watch(watch), canceled(false) {}
+ void cancel() override {
+ canceled = true;
+ }
+ void finish(int) override { ceph_abort(); /* not used */ }
+ void complete(int) override {
+ OSDService *osd(watch->osd);
+ ldout(osd->cct, 10) << "HandleWatchTimeout" << dendl;
+ boost::intrusive_ptr<PrimaryLogPG> pg(watch->pg);
+ osd->watch_lock.unlock();
+ pg->lock();
+ watch->cb = nullptr;
+ if (!watch->is_discarded() && !canceled)
+ watch->pg->handle_watch_timeout(watch);
+ delete this; // ~Watch requires pg lock!
+ pg->unlock();
+ osd->watch_lock.lock();
+ }
+};
+
+class HandleDelayedWatchTimeout : public CancelableContext {
+ WatchRef watch;
+public:
+ bool canceled;
+ explicit HandleDelayedWatchTimeout(WatchRef watch) : watch(watch), canceled(false) {}
+ void cancel() override {
+ canceled = true;
+ }
+ void finish(int) override {
+ OSDService *osd(watch->osd);
+ dout(10) << "HandleDelayedWatchTimeout" << dendl;
+ ceph_assert(watch->pg->is_locked());
+ watch->cb = nullptr;
+ if (!watch->is_discarded() && !canceled)
+ watch->pg->handle_watch_timeout(watch);
+ }
+};
+
+#define dout_subsys ceph_subsys_osd
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, this)
+
+std::ostream& Watch::gen_dbg_prefix(std::ostream& out) {
+ return pg->gen_prefix(out) << " -- Watch("
+ << make_pair(cookie, entity) << ") ";
+}
+
+Watch::Watch(
+ PrimaryLogPG *pg,
+ OSDService *osd,
+ ObjectContextRef obc,
+ uint32_t timeout,
+ uint64_t cookie,
+ entity_name_t entity,
+ const entity_addr_t &addr)
+ : cb(NULL),
+ osd(osd),
+ pg(pg),
+ obc(obc),
+ timeout(timeout),
+ cookie(cookie),
+ addr(addr),
+ will_ping(false),
+ entity(entity),
+ discarded(false) {
+ dout(10) << "Watch()" << dendl;
+}
+
+Watch::~Watch() {
+ dout(10) << "~Watch" << dendl;
+ // users must have called remove() or discard() prior to this point
+ ceph_assert(!obc);
+ ceph_assert(!is_connected());
+}
+
+Context *Watch::get_delayed_cb()
+{
+ ceph_assert(!cb);
+ cb = new HandleDelayedWatchTimeout(self.lock());
+ return cb;
+}
+
+void Watch::register_cb()
+{
+ std::lock_guard l(osd->watch_lock);
+ if (cb) {
+ dout(15) << "re-registering callback, timeout: " << timeout << dendl;
+ cb->cancel();
+ osd->watch_timer.cancel_event(cb);
+ } else {
+ dout(15) << "registering callback, timeout: " << timeout << dendl;
+ }
+ cb = new HandleWatchTimeout(self.lock());
+ if (!osd->watch_timer.add_event_after(timeout, cb)) {
+ cb = nullptr;
+ }
+}
+
+void Watch::unregister_cb()
+{
+ dout(15) << "unregister_cb" << dendl;
+ if (!cb)
+ return;
+ dout(15) << "actually registered, cancelling" << dendl;
+ cb->cancel();
+ {
+ std::lock_guard l(osd->watch_lock);
+ osd->watch_timer.cancel_event(cb); // harmless if not registered with timer
+ }
+ cb = nullptr;
+}
+
+void Watch::got_ping(utime_t t)
+{
+ last_ping = t;
+ if (is_connected()) {
+ register_cb();
+ }
+}
+
+void Watch::connect(ConnectionRef con, bool _will_ping)
+{
+ if (is_connected(con.get())) {
+ dout(10) << __func__ << " con " << con << " - already connected" << dendl;
+ return;
+ }
+ dout(10) << __func__ << " con " << con << dendl;
+ conn = con;
+ will_ping = _will_ping;
+ auto priv = con->get_priv();
+ if (priv) {
+ auto sessionref = static_cast<Session*>(priv.get());
+ sessionref->wstate.addWatch(self.lock());
+ priv.reset();
+ for (auto i = in_progress_notifies.begin();
+ i != in_progress_notifies.end();
+ ++i) {
+ send_notify(i->second);
+ }
+ }
+ if (will_ping) {
+ last_ping = ceph_clock_now();
+ register_cb();
+ } else {
+ unregister_cb();
+ }
+}
+
+void Watch::disconnect()
+{
+ dout(10) << "disconnect (con was " << conn << ")" << dendl;
+ conn = ConnectionRef();
+ if (!will_ping)
+ register_cb();
+}
+
+void Watch::discard()
+{
+ dout(10) << "discard" << dendl;
+ for (auto i = in_progress_notifies.begin();
+ i != in_progress_notifies.end();
+ ++i) {
+ i->second->discard();
+ }
+ discard_state();
+}
+
+void Watch::discard_state()
+{
+ ceph_assert(pg->is_locked());
+ ceph_assert(!discarded);
+ ceph_assert(obc);
+ in_progress_notifies.clear();
+ unregister_cb();
+ discarded = true;
+ if (is_connected()) {
+ if (auto priv = conn->get_priv(); priv) {
+ auto session = static_cast<Session*>(priv.get());
+ session->wstate.removeWatch(self.lock());
+ }
+ conn = ConnectionRef();
+ }
+ obc = ObjectContextRef();
+}
+
+bool Watch::is_discarded() const
+{
+ return discarded;
+}
+
+void Watch::remove(bool send_disconnect)
+{
+ dout(10) << "remove" << dendl;
+ if (send_disconnect && is_connected()) {
+ bufferlist empty;
+ MWatchNotify *reply(new MWatchNotify(cookie, 0, 0,
+ CEPH_WATCH_EVENT_DISCONNECT, empty));
+ conn->send_message(reply);
+ }
+ for (auto i = in_progress_notifies.begin();
+ i != in_progress_notifies.end();
+ ++i) {
+ i->second->complete_watcher_remove(self.lock());
+ }
+ discard_state();
+}
+
+void Watch::start_notify(NotifyRef notif)
+{
+ ceph_assert(in_progress_notifies.find(notif->notify_id) ==
+ in_progress_notifies.end());
+ if (will_ping) {
+ utime_t cutoff = ceph_clock_now();
+ cutoff.sec_ref() -= timeout;
+ if (last_ping < cutoff) {
+ dout(10) << __func__ << " " << notif->notify_id
+ << " last_ping " << last_ping << " < cutoff " << cutoff
+ << ", disconnecting" << dendl;
+ disconnect();
+ return;
+ }
+ }
+ dout(10) << "start_notify " << notif->notify_id << dendl;
+ in_progress_notifies[notif->notify_id] = notif;
+ notif->start_watcher(self.lock());
+ if (is_connected())
+ send_notify(notif);
+}
+
+void Watch::cancel_notify(NotifyRef notif)
+{
+ dout(10) << "cancel_notify " << notif->notify_id << dendl;
+ in_progress_notifies.erase(notif->notify_id);
+}
+
+void Watch::send_notify(NotifyRef notif)
+{
+ dout(10) << "send_notify" << dendl;
+ MWatchNotify *notify_msg = new MWatchNotify(
+ cookie,
+ notif->version,
+ notif->notify_id,
+ CEPH_WATCH_EVENT_NOTIFY,
+ notif->payload,
+ notif->client_gid);
+ conn->send_message(notify_msg);
+}
+
+void Watch::notify_ack(uint64_t notify_id, bufferlist& reply_bl)
+{
+ dout(10) << "notify_ack" << dendl;
+ auto i = in_progress_notifies.find(notify_id);
+ if (i != in_progress_notifies.end()) {
+ i->second->complete_watcher(self.lock(), reply_bl);
+ in_progress_notifies.erase(i);
+ }
+}
+
+WatchRef Watch::makeWatchRef(
+ PrimaryLogPG *pg, OSDService *osd,
+ ObjectContextRef obc, uint32_t timeout, uint64_t cookie, entity_name_t entity, const entity_addr_t& addr)
+{
+ WatchRef ret(new Watch(pg, osd, obc, timeout, cookie, entity, addr));
+ ret->set_self(ret);
+ return ret;
+}
+
+void WatchConState::addWatch(WatchRef watch)
+{
+ std::lock_guard l(lock);
+ watches.insert(watch);
+}
+
+void WatchConState::removeWatch(WatchRef watch)
+{
+ std::lock_guard l(lock);
+ watches.erase(watch);
+}
+
+void WatchConState::reset(Connection *con)
+{
+ set<WatchRef> _watches;
+ {
+ std::lock_guard l(lock);
+ _watches.swap(watches);
+ }
+ for (set<WatchRef>::iterator i = _watches.begin();
+ i != _watches.end();
+ ++i) {
+ boost::intrusive_ptr<PrimaryLogPG> pg((*i)->get_pg());
+ pg->lock();
+ if (!(*i)->is_discarded()) {
+ if ((*i)->is_connected(con)) {
+ (*i)->disconnect();
+ } else {
+ lgeneric_derr(cct) << __func__ << " no longer connected to " << (*i) << dendl;
+ }
+ }
+ pg->unlock();
+ }
+}
diff --git a/src/osd/Watch.h b/src/osd/Watch.h
new file mode 100644
index 000000000..8d6d93a7d
--- /dev/null
+++ b/src/osd/Watch.h
@@ -0,0 +1,291 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+#ifndef CEPH_WATCH_H
+#define CEPH_WATCH_H
+
+#include <set>
+#include "msg/Connection.h"
+#include "include/Context.h"
+
+enum WatcherState {
+ WATCHER_PENDING,
+ WATCHER_NOTIFIED,
+};
+
+class OSDService;
+class PrimaryLogPG;
+void intrusive_ptr_add_ref(PrimaryLogPG *pg);
+void intrusive_ptr_release(PrimaryLogPG *pg);
+struct ObjectContext;
+class MWatchNotify;
+
+class Watch;
+typedef std::shared_ptr<Watch> WatchRef;
+typedef std::weak_ptr<Watch> WWatchRef;
+
+class Notify;
+typedef std::shared_ptr<Notify> NotifyRef;
+typedef std::weak_ptr<Notify> WNotifyRef;
+
+struct CancelableContext;
+
+/**
+ * Notify tracks the progress of a particular notify
+ *
+ * References are held by Watch and the timeout callback.
+ */
+class Notify {
+ friend class NotifyTimeoutCB;
+ friend class Watch;
+ WNotifyRef self;
+ ConnectionRef client;
+ uint64_t client_gid;
+ bool complete;
+ bool discarded;
+ bool timed_out; ///< true if the notify timed out
+ std::set<WatchRef> watchers;
+
+ ceph::buffer::list payload;
+ uint32_t timeout;
+ uint64_t cookie;
+ uint64_t notify_id;
+ uint64_t version;
+
+ OSDService *osd;
+ CancelableContext *cb;
+ ceph::mutex lock = ceph::make_mutex("Notify::lock");
+
+ /// (gid,cookie) -> reply_bl for everyone who acked the notify
+ std::multimap<std::pair<uint64_t,uint64_t>, ceph::buffer::list> notify_replies;
+
+ /// true if this notify is being discarded
+ bool is_discarded() {
+ return discarded || complete;
+ }
+
+ /// Sends notify completion if watchers.empty() or timeout
+ void maybe_complete_notify();
+
+ /// Called on Notify timeout
+ void do_timeout();
+
+ Notify(
+ ConnectionRef client,
+ uint64_t client_gid,
+ ceph::buffer::list& payload,
+ uint32_t timeout,
+ uint64_t cookie,
+ uint64_t notify_id,
+ uint64_t version,
+ OSDService *osd);
+
+ /// registers a timeout callback with the watch_timer
+ void register_cb();
+
+ /// removes the timeout callback, called on completion or cancellation
+ void unregister_cb();
+public:
+
+ std::ostream& gen_dbg_prefix(std::ostream& out) {
+ return out << "Notify(" << std::make_pair(cookie, notify_id)
+ << " watchers=" << watchers.size()
+ << ") ";
+ }
+ void set_self(NotifyRef _self) {
+ self = _self;
+ }
+ static NotifyRef makeNotifyRef(
+ ConnectionRef client,
+ uint64_t client_gid,
+ ceph::buffer::list &payload,
+ uint32_t timeout,
+ uint64_t cookie,
+ uint64_t notify_id,
+ uint64_t version,
+ OSDService *osd);
+
+ /// Call after creation to initialize
+ void init();
+
+ /// Called once per watcher prior to init()
+ void start_watcher(
+ WatchRef watcher ///< [in] watcher to complete
+ );
+
+ /// Called once per NotifyAck
+ void complete_watcher(
+ WatchRef watcher, ///< [in] watcher to complete
+ ceph::buffer::list& reply_bl ///< [in] reply buffer from the notified watcher
+ );
+ /// Called when a watcher unregisters or times out
+ void complete_watcher_remove(
+ WatchRef watcher ///< [in] watcher to complete
+ );
+
+ /// Called when the notify is canceled due to a new peering interval
+ void discard();
+};
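+
+/**
+ * Example (hypothetical sketch of the intended call sequence; names other
+ * than the Notify/Watch methods are placeholders):
+ *
+ *   NotifyRef n = Notify::makeNotifyRef(conn, client_gid, payload, timeout,
+ *                                       cookie, notify_id, version, osd);
+ *   for (auto& w : watches_on_object)
+ *     w->start_notify(n);   // registers each watcher via start_watcher()
+ *   n->init();              // arms the timeout and may complete immediately
+ *
+ * Completion is then driven by complete_watcher() / complete_watcher_remove()
+ * as acks arrive, or by the timeout callback.
+ */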
+
+/**
+ * Watch is a mapping between a Connection and an ObjectContext
+ *
+ * References are held by ObjectContext and the timeout callback
+ */
+class HandleWatchTimeout;
+class HandleDelayedWatchTimeout;
+class Watch {
+ WWatchRef self;
+ friend class HandleWatchTimeout;
+ friend class HandleDelayedWatchTimeout;
+ ConnectionRef conn;
+ CancelableContext *cb;
+
+ OSDService *osd;
+ boost::intrusive_ptr<PrimaryLogPG> pg;
+ std::shared_ptr<ObjectContext> obc;
+
+ std::map<uint64_t, NotifyRef> in_progress_notifies;
+
+ // Could hold a watch_info_t here, but that would require this header to include osd_types.h
+ uint32_t timeout; ///< timeout in seconds
+ uint64_t cookie;
+ entity_addr_t addr;
+
+ bool will_ping; ///< is client new enough to ping the watch
+ utime_t last_ping; ///< last client ping
+
+ entity_name_t entity;
+ bool discarded;
+
+ Watch(
+ PrimaryLogPG *pg, OSDService *osd,
+ std::shared_ptr<ObjectContext> obc, uint32_t timeout,
+ uint64_t cookie, entity_name_t entity,
+ const entity_addr_t& addr);
+
+ /// Registers the timeout callback with watch_timer
+ void register_cb();
+
+ /// send a Notify message when connected for notif
+ void send_notify(NotifyRef notif);
+
+ /// Cleans up state on discard or remove (including Connection state, obc)
+ void discard_state();
+public:
+ /// Unregisters the timeout callback
+ void unregister_cb();
+
+ /// note receipt of a ping
+ void got_ping(utime_t t);
+ utime_t get_last_ping() const {
+ return last_ping;
+ }
+
+ /// True if currently connected
+ bool is_connected() const {
+ return conn.get() != NULL;
+ }
+ bool is_connected(Connection *con) const {
+ return conn.get() == con;
+ }
+
+ /// NOTE: must be called with pg lock held
+ ~Watch();
+
+ uint64_t get_watcher_gid() const {
+ return entity.num();
+ }
+
+ std::ostream& gen_dbg_prefix(std::ostream& out);
+ static WatchRef makeWatchRef(
+ PrimaryLogPG *pg, OSDService *osd,
+ std::shared_ptr<ObjectContext> obc, uint32_t timeout, uint64_t cookie, entity_name_t entity, const entity_addr_t &addr);
+ void set_self(WatchRef _self) {
+ self = _self;
+ }
+
+ /// Does not grant a ref count!
+ boost::intrusive_ptr<PrimaryLogPG> get_pg() { return pg; }
+
+ std::shared_ptr<ObjectContext> get_obc() { return obc; }
+
+ uint64_t get_cookie() const { return cookie; }
+ entity_name_t get_entity() const { return entity; }
+ entity_addr_t get_peer_addr() const { return addr; }
+ uint32_t get_timeout() const { return timeout; }
+
+ /// Generates context for use if watch timeout is delayed by scrub or recovery
+ Context *get_delayed_cb();
+
+ /// Transitions Watch to connected, unregister_cb, resends pending Notifies
+ void connect(
+ ConnectionRef con, ///< [in] Reference to new connection
+ bool will_ping ///< [in] client is new and will send pings
+ );
+
+ /// Transitions watch to disconnected, register_cb
+ void disconnect();
+
+ /// Called if Watch state is discarded due to new peering interval
+ void discard();
+
+ /// True if removed or discarded
+ bool is_discarded() const;
+
+ /// Called on unwatch
+ void remove(bool send_disconnect);
+
+ /// Adds notif as in-progress notify
+ void start_notify(
+ NotifyRef notif ///< [in] Reference to new in-progress notify
+ );
+
+ /// Removes timed out notify
+ void cancel_notify(
+ NotifyRef notif ///< [in] notify which timed out
+ );
+
+ /// Call when notify_ack received on notify_id
+ void notify_ack(
+ uint64_t notify_id, ///< [in] id of acked notify
+ ceph::buffer::list& reply_bl ///< [in] notify reply buffer
+ );
+};
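+
+/**
+ * Example (hypothetical sketch of a watch lifecycle as the OSD drives it;
+ * the variable names are placeholders):
+ *
+ *   WatchRef w = Watch::makeWatchRef(pg, osd, obc, timeout, cookie,
+ *                                    entity, addr);
+ *   w->connect(conn, true);          // resends pending notifies, arms ping timeout
+ *   w->got_ping(ceph_clock_now());   // each ping re-registers the timeout callback
+ *   ...
+ *   w->remove(true);                 // on unwatch: completes notifies, drops obc
+ */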
+
+/**
+ * Holds weak refs to Watch structures corresponding to a connection
+ * Lives in the Session object of an OSD connection
+ */
+class WatchConState {
+ ceph::mutex lock = ceph::make_mutex("WatchConState");
+ std::set<WatchRef> watches;
+public:
+ CephContext* cct;
+ explicit WatchConState(CephContext* cct) : cct(cct) {}
+
+ /// Add a watch
+ void addWatch(
+ WatchRef watch ///< [in] Ref to new watch object
+ );
+
+ /// Remove a watch
+ void removeWatch(
+ WatchRef watch ///< [in] Ref to watch object to remove
+ );
+
+ /// Called on session reset, disconnects watchers
+ void reset(Connection *con);
+};
+
+#endif
diff --git a/src/osd/error_code.cc b/src/osd/error_code.cc
new file mode 100644
index 000000000..97f0012fd
--- /dev/null
+++ b/src/osd/error_code.cc
@@ -0,0 +1,105 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2019 Red Hat <contact@redhat.com>
+ * Author: Adam C. Emerson <aemerson@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <string>
+
+#include "common/error_code.h"
+#include "common/errno.h"
+#include "error_code.h"
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wnon-virtual-dtor"
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wnon-virtual-dtor"
+class osd_error_category : public ceph::converting_category {
+public:
+ osd_error_category(){}
+ const char* name() const noexcept override;
+ const char* message(int ev, char*, std::size_t) const noexcept override;
+ std::string message(int ev) const override;
+ boost::system::error_condition default_error_condition(int ev) const noexcept
+ override;
+ bool equivalent(int ev, const boost::system::error_condition& c) const
+ noexcept override;
+ using ceph::converting_category::equivalent;
+ int from_code(int ev) const noexcept override;
+};
+#pragma GCC diagnostic pop
+#pragma clang diagnostic pop
+
+const char* osd_error_category::name() const noexcept {
+ return "osd";
+}
+
+const char* osd_error_category::message(int ev, char* buf,
+ std::size_t len) const noexcept {
+ if (ev == 0)
+ return "No error";
+
+ switch (static_cast<osd_errc>(ev)) {
+ case osd_errc::old_snapc:
+ return "ORDERSNAP flag set; writer has old snapc";
+ case osd_errc::blocklisted:
+ return "Blocklisted";
+ }
+
+ if (len) {
+ auto s = cpp_strerror(ev);
+ auto n = s.copy(buf, len - 1);
+ *(buf + n) = '\0';
+ }
+ return buf;
+}
+
+std::string osd_error_category::message(int ev) const {
+ if (ev == 0)
+ return "No error";
+
+ switch (static_cast<osd_errc>(ev)) {
+ case osd_errc::old_snapc:
+ return "ORDERSNAP flag set; writer has old snapc";
+ case osd_errc::blocklisted:
+ return "Blocklisted";
+ }
+
+ return cpp_strerror(ev);
+}
+
+boost::system::error_condition osd_error_category::default_error_condition(int ev) const noexcept {
+ if (ev == static_cast<int>(osd_errc::old_snapc) ||
+ ev == static_cast<int>(osd_errc::blocklisted))
+ return { ev, *this };
+ else
+ return { ev, boost::system::generic_category() };
+}
+
+bool osd_error_category::equivalent(int ev, const boost::system::error_condition& c) const noexcept {
+ switch (static_cast<osd_errc>(ev)) {
+ case osd_errc::old_snapc:
+ return c == boost::system::errc::invalid_argument;
+ case osd_errc::blocklisted:
+ return c == boost::system::errc::operation_not_permitted;
+ }
+ return default_error_condition(ev) == c;
+}
+
+int osd_error_category::from_code(int ev) const noexcept {
+ return -ev;
+}
+
+const boost::system::error_category& osd_category() noexcept {
+ static const osd_error_category c;
+ return c;
+}
diff --git a/src/osd/error_code.h b/src/osd/error_code.h
new file mode 100644
index 000000000..d36e79db4
--- /dev/null
+++ b/src/osd/error_code.h
@@ -0,0 +1,53 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2019 Red Hat <contact@redhat.com>
+ * Author: Adam C. Emerson <aemerson@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include <boost/system/error_code.hpp>
+
+#include "include/rados.h"
+
+const boost::system::error_category& osd_category() noexcept;
+
+// Since the OSD mostly uses POSIX error codes plus a couple
+// additions, this will be a degenerate error category for now that
+// mostly forwards to POSIX.
+
+enum class osd_errc {
+ old_snapc = 85, /* ORDERSNAP flag set; writer has old snapc */
+ blocklisted = 108 /* blocklisted */
+};
+
+namespace boost::system {
+template<>
+struct is_error_code_enum<::osd_errc> {
+ static const bool value = true;
+};
+
+template<>
+struct is_error_condition_enum<::osd_errc> {
+ static const bool value = false;
+};
+}
+
+// implicit conversion:
+inline boost::system::error_code make_error_code(osd_errc e) noexcept {
+ return { static_cast<int>(e), osd_category() };
+}
+
+// explicit conversion:
+inline boost::system::error_condition make_error_condition(osd_errc e) noexcept {
+ return { static_cast<int>(e), osd_category() };
+}
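+
+// Example (hypothetical, illustrative only): because is_error_code_enum is
+// specialized above, an osd_errc converts implicitly to an error_code in
+// osd_category(), and equivalent() maps it onto the generic POSIX conditions.
+inline bool example_is_invalid_snapc(boost::system::error_code ec) noexcept {
+  // true for osd_errc::old_snapc, which osd_category() declares equivalent
+  // to errc::invalid_argument
+  return ec == boost::system::errc::invalid_argument;
+}
+
+// usage sketch:
+//   boost::system::error_code ec = osd_errc::old_snapc; // implicit conversion
+//   assert(example_is_invalid_snapc(ec));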
diff --git a/src/osd/objclass.cc b/src/osd/objclass.cc
new file mode 100644
index 000000000..274f5e063
--- /dev/null
+++ b/src/osd/objclass.cc
@@ -0,0 +1,702 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <cstdarg>
+#include "common/ceph_context.h"
+#include "common/ceph_releases.h"
+#include "common/config.h"
+#include "common/debug.h"
+
+#include "objclass/objclass.h"
+#include "osd/PrimaryLogPG.h"
+
+#include "osd/ClassHandler.h"
+
+#include "auth/Crypto.h"
+#include "common/armor.h"
+
+#define dout_context ClassHandler::get_instance().cct
+
+using std::map;
+using std::set;
+using std::string;
+using std::vector;
+
+using ceph::bufferlist;
+using ceph::decode;
+using ceph::encode;
+using ceph::real_time;
+
+
+int cls_call(cls_method_context_t hctx, const char *cls, const char *method,
+ char *indata, int datalen, char **outdata, int *outdatalen)
+{
+ PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx;
+ bufferlist idata;
+ vector<OSDOp> nops(1);
+ OSDOp& op = nops[0];
+ int r;
+
+ op.op.op = CEPH_OSD_OP_CALL;
+ op.op.cls.class_len = strlen(cls);
+ op.op.cls.method_len = strlen(method);
+ op.op.cls.indata_len = datalen;
+ op.indata.append(cls, op.op.cls.class_len);
+ op.indata.append(method, op.op.cls.method_len);
+ op.indata.append(indata, datalen);
+ r = (*pctx)->pg->do_osd_ops(*pctx, nops);
+ if (r < 0)
+ return r;
+
+ *outdata = (char *)malloc(op.outdata.length());
+ if (!*outdata)
+ return -ENOMEM;
+ memcpy(*outdata, op.outdata.c_str(), op.outdata.length());
+ *outdatalen = op.outdata.length();
+
+ return r;
+}
+
+int cls_getxattr(cls_method_context_t hctx, const char *name,
+ char **outdata, int *outdatalen)
+{
+ PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx;
+ vector<OSDOp> nops(1);
+ OSDOp& op = nops[0];
+ int r;
+
+ op.op.op = CEPH_OSD_OP_GETXATTR;
+ op.op.xattr.name_len = strlen(name);
+ op.indata.append(name, op.op.xattr.name_len);
+ r = (*pctx)->pg->do_osd_ops(*pctx, nops);
+ if (r < 0)
+ return r;
+
+ *outdata = (char *)malloc(op.outdata.length());
+ if (!*outdata)
+ return -ENOMEM;
+ memcpy(*outdata, op.outdata.c_str(), op.outdata.length());
+ *outdatalen = op.outdata.length();
+
+ return r;
+}
+
+int cls_setxattr(cls_method_context_t hctx, const char *name,
+ const char *value, int val_len)
+{
+ PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx;
+ vector<OSDOp> nops(1);
+ OSDOp& op = nops[0];
+ int r;
+
+ op.op.op = CEPH_OSD_OP_SETXATTR;
+ op.op.xattr.name_len = strlen(name);
+ op.op.xattr.value_len = val_len;
+ op.indata.append(name, op.op.xattr.name_len);
+ op.indata.append(value, val_len);
+ r = (*pctx)->pg->do_osd_ops(*pctx, nops);
+
+ return r;
+}
+
+int cls_read(cls_method_context_t hctx, int ofs, int len,
+ char **outdata, int *outdatalen)
+{
+ PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx;
+ vector<OSDOp> ops(1);
+ ops[0].op.op = CEPH_OSD_OP_SYNC_READ;
+ ops[0].op.extent.offset = ofs;
+ ops[0].op.extent.length = len;
+ int r = (*pctx)->pg->do_osd_ops(*pctx, ops);
+ if (r < 0)
+ return r;
+
+ *outdata = (char *)malloc(ops[0].outdata.length());
+ if (!*outdata)
+ return -ENOMEM;
+ memcpy(*outdata, ops[0].outdata.c_str(), ops[0].outdata.length());
+ *outdatalen = ops[0].outdata.length();
+
+ return *outdatalen;
+}
+
+int cls_get_request_origin(cls_method_context_t hctx, entity_inst_t *origin)
+{
+ PrimaryLogPG::OpContext **pctx = static_cast<PrimaryLogPG::OpContext **>(hctx);
+ *origin = (*pctx)->op->get_req()->get_orig_source_inst();
+ return 0;
+}
+
+int cls_cxx_create(cls_method_context_t hctx, bool exclusive)
+{
+ PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx;
+ vector<OSDOp> ops(1);
+ ops[0].op.op = CEPH_OSD_OP_CREATE;
+ ops[0].op.flags = (exclusive ? CEPH_OSD_OP_FLAG_EXCL : 0);
+ return (*pctx)->pg->do_osd_ops(*pctx, ops);
+}
+
+int cls_cxx_remove(cls_method_context_t hctx)
+{
+ PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx;
+ vector<OSDOp> ops(1);
+ ops[0].op.op = CEPH_OSD_OP_DELETE;
+ return (*pctx)->pg->do_osd_ops(*pctx, ops);
+}
+
+int cls_cxx_stat(cls_method_context_t hctx, uint64_t *size, time_t *mtime)
+{
+ PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx;
+ vector<OSDOp> ops(1);
+ int ret;
+ ops[0].op.op = CEPH_OSD_OP_STAT;
+ ret = (*pctx)->pg->do_osd_ops(*pctx, ops);
+ if (ret < 0)
+ return ret;
+ auto iter = ops[0].outdata.cbegin();
+ utime_t ut;
+ uint64_t s;
+ try {
+ decode(s, iter);
+ decode(ut, iter);
+ } catch (ceph::buffer::error& err) {
+ return -EIO;
+ }
+ if (size)
+ *size = s;
+ if (mtime)
+ *mtime = ut.sec();
+ return 0;
+}
+
+int cls_cxx_stat2(cls_method_context_t hctx, uint64_t *size, ceph::real_time *mtime)
+{
+ PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx;
+ vector<OSDOp> ops(1);
+ int ret;
+ ops[0].op.op = CEPH_OSD_OP_STAT;
+ ret = (*pctx)->pg->do_osd_ops(*pctx, ops);
+ if (ret < 0)
+ return ret;
+ auto iter = ops[0].outdata.cbegin();
+ real_time ut;
+ uint64_t s;
+ try {
+ decode(s, iter);
+ decode(ut, iter);
+ } catch (ceph::buffer::error& err) {
+ return -EIO;
+ }
+ if (size)
+ *size = s;
+ if (mtime)
+ *mtime = ut;
+ return 0;
+}
+
+int cls_cxx_read2(cls_method_context_t hctx, int ofs, int len,
+ bufferlist *outbl, uint32_t op_flags)
+{
+ PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx;
+ vector<OSDOp> ops(1);
+ int ret;
+ ops[0].op.op = CEPH_OSD_OP_SYNC_READ;
+ ops[0].op.extent.offset = ofs;
+ ops[0].op.extent.length = len;
+ ops[0].op.flags = op_flags;
+ ret = (*pctx)->pg->do_osd_ops(*pctx, ops);
+ if (ret < 0)
+ return ret;
+ *outbl = std::move(ops[0].outdata);
+ return outbl->length();
+}
+
+int cls_cxx_write2(cls_method_context_t hctx, int ofs, int len,
+ bufferlist *inbl, uint32_t op_flags)
+{
+ PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx;
+ vector<OSDOp> ops(1);
+ ops[0].op.op = CEPH_OSD_OP_WRITE;
+ ops[0].op.extent.offset = ofs;
+ ops[0].op.extent.length = len;
+ ops[0].op.flags = op_flags;
+ ops[0].indata = *inbl;
+ return (*pctx)->pg->do_osd_ops(*pctx, ops);
+}
+
+int cls_cxx_write_full(cls_method_context_t hctx, bufferlist *inbl)
+{
+ PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx;
+ vector<OSDOp> ops(1);
+ ops[0].op.op = CEPH_OSD_OP_WRITEFULL;
+ ops[0].op.extent.offset = 0;
+ ops[0].op.extent.length = inbl->length();
+ ops[0].indata = *inbl;
+ return (*pctx)->pg->do_osd_ops(*pctx, ops);
+}
+
+int cls_cxx_replace(cls_method_context_t hctx, int ofs, int len, bufferlist *inbl)
+{
+ PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx;
+ vector<OSDOp> ops(2);
+ ops[0].op.op = CEPH_OSD_OP_TRUNCATE;
+ ops[0].op.extent.offset = 0;
+ ops[0].op.extent.length = 0;
+ ops[1].op.op = CEPH_OSD_OP_WRITE;
+ ops[1].op.extent.offset = ofs;
+ ops[1].op.extent.length = len;
+ ops[1].indata = *inbl;
+ return (*pctx)->pg->do_osd_ops(*pctx, ops);
+}
+
+int cls_cxx_truncate(cls_method_context_t hctx, int ofs)
+{
+ PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx;
+ vector<OSDOp> ops(1);
+ ops[0].op.op = CEPH_OSD_OP_TRUNCATE;
+ ops[0].op.extent.offset = ofs;
+ ops[0].op.extent.length = 0;
+ return (*pctx)->pg->do_osd_ops(*pctx, ops);
+}
+
+int cls_cxx_write_zero(cls_method_context_t hctx, int ofs, int len)
+{
+ PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx;
+ vector<OSDOp> ops(1);
+ ops[0].op.op = CEPH_OSD_OP_ZERO;
+ ops[0].op.extent.offset = ofs;
+ ops[0].op.extent.length = len;
+ return (*pctx)->pg->do_osd_ops(*pctx, ops);
+}
+
+int cls_cxx_getxattr(cls_method_context_t hctx, const char *name,
+ bufferlist *outbl)
+{
+ PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx;
+ vector<OSDOp> nops(1);
+ OSDOp& op = nops[0];
+ int r;
+
+ op.op.op = CEPH_OSD_OP_GETXATTR;
+ op.op.xattr.name_len = strlen(name);
+ op.indata.append(name, op.op.xattr.name_len);
+ r = (*pctx)->pg->do_osd_ops(*pctx, nops);
+ if (r < 0)
+ return r;
+
+ *outbl = std::move(op.outdata);
+ return outbl->length();
+}
+
+int cls_cxx_getxattrs(cls_method_context_t hctx, map<string, bufferlist> *attrset)
+{
+ PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx;
+ vector<OSDOp> nops(1);
+ OSDOp& op = nops[0];
+ int r;
+
+ op.op.op = CEPH_OSD_OP_GETXATTRS;
+ r = (*pctx)->pg->do_osd_ops(*pctx, nops);
+ if (r < 0)
+ return r;
+
+ auto iter = op.outdata.cbegin();
+ try {
+ decode(*attrset, iter);
+ } catch (ceph::buffer::error& err) {
+ return -EIO;
+ }
+ return 0;
+}
+
+int cls_cxx_setxattr(cls_method_context_t hctx, const char *name,
+ bufferlist *inbl)
+{
+ PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx;
+ vector<OSDOp> nops(1);
+ OSDOp& op = nops[0];
+ int r;
+
+ op.op.op = CEPH_OSD_OP_SETXATTR;
+ op.op.xattr.name_len = strlen(name);
+ op.op.xattr.value_len = inbl->length();
+ op.indata.append(name, op.op.xattr.name_len);
+ op.indata.append(*inbl);
+ r = (*pctx)->pg->do_osd_ops(*pctx, nops);
+
+ return r;
+}
+
+int cls_cxx_snap_revert(cls_method_context_t hctx, snapid_t snapid)
+{
+ PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx;
+ vector<OSDOp> ops(1);
+ ops[0].op.op = CEPH_OSD_OP_ROLLBACK;
+ ops[0].op.snap.snapid = snapid;
+ return (*pctx)->pg->do_osd_ops(*pctx, ops);
+}
+
+int cls_cxx_map_get_all_vals(cls_method_context_t hctx, map<string, bufferlist>* vals,
+ bool *more)
+{
+ PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx;
+ vector<OSDOp> ops(1);
+ OSDOp& op = ops[0];
+ int ret;
+
+ string start_after;
+ string filter_prefix;
+ uint64_t max = (uint64_t)-1;
+
+ encode(start_after, op.indata);
+ encode(max, op.indata);
+ encode(filter_prefix, op.indata);
+
+ op.op.op = CEPH_OSD_OP_OMAPGETVALS;
+
+ ret = (*pctx)->pg->do_osd_ops(*pctx, ops);
+ if (ret < 0)
+ return ret;
+
+ auto iter = op.outdata.cbegin();
+ try {
+ decode(*vals, iter);
+ decode(*more, iter);
+ } catch (ceph::buffer::error& err) {
+ return -EIO;
+ }
+ return vals->size();
+}
+
+int cls_cxx_map_get_keys(cls_method_context_t hctx, const string &start_obj,
+ uint64_t max_to_get, set<string> *keys,
+ bool *more)
+{
+ PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx;
+ vector<OSDOp> ops(1);
+ OSDOp& op = ops[0];
+ int ret;
+
+ encode(start_obj, op.indata);
+ encode(max_to_get, op.indata);
+
+ op.op.op = CEPH_OSD_OP_OMAPGETKEYS;
+
+ ret = (*pctx)->pg->do_osd_ops(*pctx, ops);
+ if (ret < 0)
+ return ret;
+
+ auto iter = op.outdata.cbegin();
+ try {
+ decode(*keys, iter);
+ decode(*more, iter);
+ } catch (ceph::buffer::error& err) {
+ return -EIO;
+ }
+ return keys->size();
+}
+
+int cls_cxx_map_get_vals(cls_method_context_t hctx, const string &start_obj,
+ const string &filter_prefix, uint64_t max_to_get,
+ map<string, bufferlist> *vals, bool *more)
+{
+ PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx;
+ vector<OSDOp> ops(1);
+ OSDOp& op = ops[0];
+ int ret;
+
+ encode(start_obj, op.indata);
+ encode(max_to_get, op.indata);
+ encode(filter_prefix, op.indata);
+
+ op.op.op = CEPH_OSD_OP_OMAPGETVALS;
+
+ ret = (*pctx)->pg->do_osd_ops(*pctx, ops);
+ if (ret < 0)
+ return ret;
+
+ auto iter = op.outdata.cbegin();
+ try {
+ decode(*vals, iter);
+ decode(*more, iter);
+ } catch (ceph::buffer::error& err) {
+ return -EIO;
+ }
+ return vals->size();
+}
+
+int cls_cxx_map_read_header(cls_method_context_t hctx, bufferlist *outbl)
+{
+ PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx;
+ vector<OSDOp> ops(1);
+ OSDOp& op = ops[0];
+ int ret;
+ op.op.op = CEPH_OSD_OP_OMAPGETHEADER;
+ ret = (*pctx)->pg->do_osd_ops(*pctx, ops);
+ if (ret < 0)
+ return ret;
+
+ *outbl = std::move(op.outdata);
+
+ return 0;
+}
+
+int cls_cxx_map_get_val(cls_method_context_t hctx, const string &key,
+ bufferlist *outbl)
+{
+ PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx;
+ vector<OSDOp> ops(1);
+ OSDOp& op = ops[0];
+ int ret;
+
+ set<string> k;
+ k.insert(key);
+ encode(k, op.indata);
+
+ op.op.op = CEPH_OSD_OP_OMAPGETVALSBYKEYS;
+ ret = (*pctx)->pg->do_osd_ops(*pctx, ops);
+ if (ret < 0)
+ return ret;
+
+ auto iter = op.outdata.cbegin();
+ try {
+ map<string, bufferlist> m;
+
+ decode(m, iter);
+ auto it = m.begin();
+ if (it == m.end())
+ return -ENOENT;
+
+ *outbl = it->second;
+ } catch (ceph::buffer::error& e) {
+ return -EIO;
+ }
+ return 0;
+}
+
+int cls_cxx_map_get_vals_by_keys(cls_method_context_t hctx,
+ const std::set<std::string> &keys,
+ std::map<std::string, bufferlist> *map)
+{
+ PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx;
+ vector<OSDOp> ops(1);
+ OSDOp& op = ops[0];
+ int ret;
+
+ encode(keys, op.indata);
+
+ op.op.op = CEPH_OSD_OP_OMAPGETVALSBYKEYS;
+ ret = (*pctx)->pg->do_osd_ops(*pctx, ops);
+ if (ret < 0)
+ return ret;
+
+ auto iter = op.outdata.cbegin();
+ try {
+ decode(*map, iter);
+ } catch (buffer::error& e) {
+ return -EIO;
+ }
+ return 0;
+}
+
+int cls_cxx_map_set_val(cls_method_context_t hctx, const string &key,
+ bufferlist *inbl)
+{
+ PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx;
+ vector<OSDOp> ops(1);
+ OSDOp& op = ops[0];
+ bufferlist& update_bl = op.indata;
+ map<string, bufferlist> m;
+ m[key] = *inbl;
+ encode(m, update_bl);
+
+ op.op.op = CEPH_OSD_OP_OMAPSETVALS;
+
+ return (*pctx)->pg->do_osd_ops(*pctx, ops);
+}
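+
+/* Example (hypothetical object-class method, not part of this file): a cls
+ * method would typically combine the omap helpers above, e.g. to increment a
+ * counter stored under a single key:
+ *
+ *   static int bump_counter(cls_method_context_t hctx,
+ *                           bufferlist *in, bufferlist *out) {
+ *     bufferlist cur;
+ *     uint64_t v = 0;
+ *     int r = cls_cxx_map_get_val(hctx, "counter", &cur);
+ *     if (r < 0 && r != -ENOENT)
+ *       return r;
+ *     if (r == 0) {
+ *       auto it = cur.cbegin();
+ *       decode(v, it);
+ *     }
+ *     ++v;
+ *     bufferlist next;
+ *     encode(v, next);
+ *     return cls_cxx_map_set_val(hctx, "counter", &next);
+ *   }
+ */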
+
+int cls_cxx_map_set_vals(cls_method_context_t hctx,
+ const std::map<string, bufferlist> *map)
+{
+ PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx;
+ vector<OSDOp> ops(1);
+ OSDOp& op = ops[0];
+ bufferlist& update_bl = op.indata;
+ encode(*map, update_bl);
+
+ op.op.op = CEPH_OSD_OP_OMAPSETVALS;
+
+ return (*pctx)->pg->do_osd_ops(*pctx, ops);
+}
+
+int cls_cxx_map_clear(cls_method_context_t hctx)
+{
+ PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx;
+ vector<OSDOp> ops(1);
+ OSDOp& op = ops[0];
+
+ op.op.op = CEPH_OSD_OP_OMAPCLEAR;
+
+ return (*pctx)->pg->do_osd_ops(*pctx, ops);
+}
+
+int cls_cxx_map_write_header(cls_method_context_t hctx, bufferlist *inbl)
+{
+ PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx;
+ vector<OSDOp> ops(1);
+ OSDOp& op = ops[0];
+ op.indata = std::move(*inbl);
+
+ op.op.op = CEPH_OSD_OP_OMAPSETHEADER;
+
+ return (*pctx)->pg->do_osd_ops(*pctx, ops);
+}
+
+int cls_cxx_map_remove_range(cls_method_context_t hctx,
+ const std::string& key_begin,
+ const std::string& key_end)
+{
+ PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx;
+ vector<OSDOp> ops(1);
+ OSDOp& op = ops[0];
+ bufferlist& update_bl = op.indata;
+
+ ::encode(key_begin, update_bl);
+ ::encode(key_end, update_bl);
+
+ op.op.op = CEPH_OSD_OP_OMAPRMKEYRANGE;
+
+ return (*pctx)->pg->do_osd_ops(*pctx, ops);
+}
+
+int cls_cxx_map_remove_key(cls_method_context_t hctx, const string &key)
+{
+ PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx;
+ vector<OSDOp> ops(1);
+ OSDOp& op = ops[0];
+ bufferlist& update_bl = op.indata;
+ set<string> to_rm;
+ to_rm.insert(key);
+
+ encode(to_rm, update_bl);
+
+ op.op.op = CEPH_OSD_OP_OMAPRMKEYS;
+
+ return (*pctx)->pg->do_osd_ops(*pctx, ops);
+}
+
+int cls_cxx_list_watchers(cls_method_context_t hctx,
+ obj_list_watch_response_t *watchers)
+{
+ PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx;
+ vector<OSDOp> nops(1);
+ OSDOp& op = nops[0];
+ int r;
+
+ op.op.op = CEPH_OSD_OP_LIST_WATCHERS;
+ r = (*pctx)->pg->do_osd_ops(*pctx, nops);
+ if (r < 0)
+ return r;
+
+ auto iter = op.outdata.cbegin();
+ try {
+ decode(*watchers, iter);
+ } catch (ceph::buffer::error& err) {
+ return -EIO;
+ }
+ return 0;
+}
+
+uint64_t cls_current_version(cls_method_context_t hctx)
+{
+ PrimaryLogPG::OpContext *ctx = *(PrimaryLogPG::OpContext **)hctx;
+
+ return ctx->pg->get_last_user_version();
+}
+
+
+int cls_current_subop_num(cls_method_context_t hctx)
+{
+ PrimaryLogPG::OpContext *ctx = *(PrimaryLogPG::OpContext **)hctx;
+
+ return ctx->processed_subop_count;
+}
+
+uint64_t cls_get_features(cls_method_context_t hctx)
+{
+ PrimaryLogPG::OpContext *ctx = *(PrimaryLogPG::OpContext **)hctx;
+ return ctx->pg->get_osdmap()->get_up_osd_features();
+}
+
+uint64_t cls_get_client_features(cls_method_context_t hctx)
+{
+ PrimaryLogPG::OpContext *ctx = *(PrimaryLogPG::OpContext **)hctx;
+ return ctx->op->get_req()->get_connection()->get_features();
+}
+
+ceph_release_t cls_get_required_osd_release(cls_method_context_t hctx)
+{
+ PrimaryLogPG::OpContext *ctx = *(PrimaryLogPG::OpContext **)hctx;
+ return ctx->pg->get_osdmap()->require_osd_release;
+}
+
+ceph_release_t cls_get_min_compatible_client(cls_method_context_t hctx)
+{
+ PrimaryLogPG::OpContext *ctx = *(PrimaryLogPG::OpContext **)hctx;
+ return ctx->pg->get_osdmap()->get_require_min_compat_client();
+}
+
+int cls_get_snapset_seq(cls_method_context_t hctx, uint64_t *snap_seq) {
+ PrimaryLogPG::OpContext *ctx = *(PrimaryLogPG::OpContext **)hctx;
+ if (!ctx->new_obs.exists || (ctx->new_obs.oi.is_whiteout() &&
+ ctx->obc->ssc->snapset.clones.empty())) {
+ return -ENOENT;
+ }
+ *snap_seq = ctx->obc->ssc->snapset.seq;
+ return 0;
+}
+
+int cls_cxx_chunk_write_and_set(cls_method_context_t hctx, int ofs, int len,
+ bufferlist *write_inbl, uint32_t op_flags,
+ bufferlist *set_inbl, int set_len)
+{
+ PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx;
+ char cname[] = "cas";
+ char method[] = "chunk_set";
+
+ vector<OSDOp> ops(2);
+ ops[0].op.op = CEPH_OSD_OP_WRITE;
+ ops[0].op.extent.offset = ofs;
+ ops[0].op.extent.length = len;
+ ops[0].op.flags = op_flags;
+ ops[0].indata = *write_inbl;
+
+ ops[1].op.op = CEPH_OSD_OP_CALL;
+ ops[1].op.cls.class_len = strlen(cname);
+ ops[1].op.cls.method_len = strlen(method);
+ ops[1].op.cls.indata_len = set_len;
+ ops[1].indata.append(cname, ops[1].op.cls.class_len);
+ ops[1].indata.append(method, ops[1].op.cls.method_len);
+ ops[1].indata.append(*set_inbl);
+
+ return (*pctx)->pg->do_osd_ops(*pctx, ops);
+}
+
+int cls_get_manifest_ref_count(cls_method_context_t hctx, string fp_oid)
+{
+ PrimaryLogPG::OpContext *ctx = *(PrimaryLogPG::OpContext **)hctx;
+ return ctx->pg->get_manifest_ref_count(ctx->obc, fp_oid, ctx->op);
+}
+
+uint64_t cls_get_osd_min_alloc_size(cls_method_context_t hctx) {
+ PrimaryLogPG::OpContext *ctx = *(PrimaryLogPG::OpContext **)hctx;
+
+ return ctx->pg->get_min_alloc_size();
+}
+
+uint64_t cls_get_pool_stripe_width(cls_method_context_t hctx)
+{
+ PrimaryLogPG::OpContext *ctx = *(PrimaryLogPG::OpContext **)hctx;
+
+ return ctx->pg->get_pool().stripe_width;
+}
diff --git a/src/osd/object_state.h b/src/osd/object_state.h
new file mode 100644
index 000000000..31987d2a4
--- /dev/null
+++ b/src/osd/object_state.h
@@ -0,0 +1,190 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "osd_types.h"
+
+struct ObjectState {
+ object_info_t oi;
+ bool exists; ///< the stored object exists (i.e., we will remember the object_info_t)
+
+ ObjectState() : exists(false) {}
+
+ ObjectState(const object_info_t &oi_, bool exists_)
+ : oi(oi_), exists(exists_) {}
+ ObjectState(object_info_t &&oi_, bool exists_)
+ : oi(std::move(oi_)), exists(exists_) {}
+ ObjectState(const hobject_t &obj) : oi(obj), exists(false) {}
+};
+
+struct RWState {
+ enum State {
+ RWNONE,
+ RWREAD,
+ RWWRITE,
+ RWEXCL,
+ };
+ static const char *get_state_name(State s) {
+ switch (s) {
+ case RWNONE: return "none";
+ case RWREAD: return "read";
+ case RWWRITE: return "write";
+ case RWEXCL: return "excl";
+ default: return "???";
+ }
+ }
+ const char *get_state_name() const {
+ return get_state_name(state);
+ }
+
+ int count; ///< number of readers or writers
+ int waiters = 0; ///< number waiting
+
+ State state:4; ///< rw state
+ /// if set, restart backfill when we can get a read lock
+ bool recovery_read_marker:1;
+ /// if set, requeue snaptrim on lock release
+ bool snaptrimmer_write_marker:1;
+
+ RWState()
+ : count(0),
+ state(RWNONE),
+ recovery_read_marker(false),
+ snaptrimmer_write_marker(false)
+ {}
+
+ /// take a read lock if possible, adjusting the reader count on success
+ bool get_read_lock() {
+ // don't starve anybody!
+ if (waiters > 0) {
+ return false;
+ }
+ switch (state) {
+ case RWNONE:
+ ceph_assert(count == 0);
+ state = RWREAD;
+ // fall through
+ case RWREAD:
+ count++;
+ return true;
+ case RWWRITE:
+ return false;
+ case RWEXCL:
+ return false;
+ default:
+ ceph_abort_msg("unhandled case");
+ return false;
+ }
+ }
+
+ bool get_write_lock(bool greedy=false) {
+ if (!greedy) {
+ // don't starve anybody!
+ if (waiters > 0 ||
+ recovery_read_marker) {
+ return false;
+ }
+ }
+ switch (state) {
+ case RWNONE:
+ ceph_assert(count == 0);
+ state = RWWRITE;
+ // fall through
+ case RWWRITE:
+ count++;
+ return true;
+ case RWREAD:
+ return false;
+ case RWEXCL:
+ return false;
+ default:
+ ceph_abort_msg("unhandled case");
+ return false;
+ }
+ }
+ bool get_excl_lock() {
+ switch (state) {
+ case RWNONE:
+ ceph_assert(count == 0);
+ state = RWEXCL;
+ count = 1;
+ return true;
+ case RWWRITE:
+ return false;
+ case RWREAD:
+ return false;
+ case RWEXCL:
+ return false;
+ default:
+ ceph_abort_msg("unhandled case");
+ return false;
+ }
+ }
+ /// same as get_write_lock, but ignore starvation
+ bool take_write_lock() {
+ if (state == RWWRITE) {
+ count++;
+ return true;
+ }
+ return get_write_lock();
+ }
+ bool dec() {
+ ceph_assert(count > 0);
+ count--;
+ if (count == 0) {
+ state = RWNONE;
+ return true;
+ } else {
+ return false;
+ }
+ }
+ bool put_read() {
+ ceph_assert(state == RWREAD);
+ return dec();
+ }
+ bool put_write() {
+ ceph_assert(state == RWWRITE);
+ return dec();
+ }
+ bool put_excl() {
+ ceph_assert(state == RWEXCL);
+ return dec();
+ }
+ void inc_waiters() {
+ ++waiters;
+ }
+ void release_waiters() {
+ waiters = 0;
+ }
+ void dec_waiters(int count) {
+ ceph_assert(waiters >= count);
+ waiters -= count;
+ }
+ bool empty() const { return state == RWNONE; }
+
+ bool get_snaptrimmer_write(bool mark_if_unsuccessful) {
+ if (get_write_lock()) {
+ return true;
+ } else {
+ if (mark_if_unsuccessful)
+ snaptrimmer_write_marker = true;
+ return false;
+ }
+ }
+ bool get_recovery_read() {
+ recovery_read_marker = true;
+ if (get_read_lock()) {
+ return true;
+ }
+ return false;
+ }
+};
+
+inline std::ostream& operator<<(std::ostream& out, const RWState& rw)
+{
+ return out << "rwstate(" << rw.get_state_name()
+ << " n=" << rw.count
+ << " w=" << rw.waiters
+ << ")";
+}
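+
+// Example (hypothetical, illustrative only): RWState is not a mutex but a
+// counter-based state machine; readers stack, writers exclude readers, and
+// registered waiters block new grants so that nobody is starved.
+inline void example_rwstate_semantics() {
+  RWState rw;
+  ceph_assert(rw.get_read_lock());    // RWNONE -> RWREAD, count = 1
+  ceph_assert(rw.get_read_lock());    // readers stack, count = 2
+  ceph_assert(!rw.get_write_lock());  // writers excluded while readers hold it
+  rw.inc_waiters();                   // a registered waiter blocks new grants
+  ceph_assert(!rw.get_read_lock());   // even another reader is refused now
+  rw.dec_waiters(1);
+  rw.put_read();                      // count = 1
+  rw.put_read();                      // count = 0 -> back to RWNONE
+  ceph_assert(rw.empty());
+}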
diff --git a/src/osd/osd_internal_types.h b/src/osd/osd_internal_types.h
new file mode 100644
index 000000000..17f4f3146
--- /dev/null
+++ b/src/osd/osd_internal_types.h
@@ -0,0 +1,320 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_OSD_INTERNAL_TYPES_H
+#define CEPH_OSD_INTERNAL_TYPES_H
+
+#include "osd_types.h"
+#include "OpRequest.h"
+#include "object_state.h"
+
+/*
+ * keep tabs on object modifications that are in flight.
+ * we need to know the projected existence, size, snapset,
+ * etc., because we don't send writes down to disk until after
+ * replicas ack.
+ */
+
+struct SnapSetContext {
+ hobject_t oid;
+ SnapSet snapset;
+ int ref;
+ bool registered : 1;
+ bool exists : 1;
+
+ explicit SnapSetContext(const hobject_t& o) :
+ oid(o), ref(0), registered(false), exists(true) { }
+};
+struct ObjectContext;
+typedef std::shared_ptr<ObjectContext> ObjectContextRef;
+
+struct ObjectContext {
+ ObjectState obs;
+
+ SnapSetContext *ssc; // may be null
+
+ Context *destructor_callback;
+
+public:
+
+ // any entity in obs.oi.watchers MUST be in either watchers or unconnected_watchers.
+ std::map<std::pair<uint64_t, entity_name_t>, WatchRef> watchers;
+
+ // attr cache
+ std::map<std::string, ceph::buffer::list> attr_cache;
+
+ RWState rwstate;
+ std::list<OpRequestRef> waiters; ///< ops waiting on state change
+ bool get_read(OpRequestRef& op) {
+ if (rwstate.get_read_lock()) {
+ return true;
+ } // else
+ // Now we really need to bump up the ref-counter.
+ waiters.emplace_back(op);
+ rwstate.inc_waiters();
+ return false;
+ }
+ bool get_write(OpRequestRef& op, bool greedy=false) {
+ if (rwstate.get_write_lock(greedy)) {
+ return true;
+ } // else
+ if (op) {
+ waiters.emplace_back(op);
+ rwstate.inc_waiters();
+ }
+ return false;
+ }
+ bool get_excl(OpRequestRef& op) {
+ if (rwstate.get_excl_lock()) {
+ return true;
+ } // else
+ if (op) {
+ waiters.emplace_back(op);
+ rwstate.inc_waiters();
+ }
+ return false;
+ }
+ void wake(std::list<OpRequestRef> *requeue) {
+ rwstate.release_waiters();
+ requeue->splice(requeue->end(), waiters);
+ }
+ void put_read(std::list<OpRequestRef> *requeue) {
+ if (rwstate.put_read()) {
+ wake(requeue);
+ }
+ }
+ void put_write(std::list<OpRequestRef> *requeue) {
+ if (rwstate.put_write()) {
+ wake(requeue);
+ }
+ }
+ void put_excl(std::list<OpRequestRef> *requeue) {
+ if (rwstate.put_excl()) {
+ wake(requeue);
+ }
+ }
+ bool empty() const { return rwstate.empty(); }
+
+ bool get_lock_type(OpRequestRef& op, RWState::State type) {
+ switch (type) {
+ case RWState::RWWRITE:
+ return get_write(op);
+ case RWState::RWREAD:
+ return get_read(op);
+ case RWState::RWEXCL:
+ return get_excl(op);
+ default:
+ ceph_abort_msg("invalid lock type");
+ return true;
+ }
+ }
+ bool get_write_greedy(OpRequestRef& op) {
+ return get_write(op, true);
+ }
+ bool get_snaptrimmer_write(bool mark_if_unsuccessful) {
+ return rwstate.get_snaptrimmer_write(mark_if_unsuccessful);
+ }
+ bool get_recovery_read() {
+ return rwstate.get_recovery_read();
+ }
+ bool try_get_read_lock() {
+ return rwstate.get_read_lock();
+ }
+ void drop_recovery_read(std::list<OpRequestRef> *ls) {
+ ceph_assert(rwstate.recovery_read_marker);
+ put_read(ls);
+ rwstate.recovery_read_marker = false;
+ }
+ void put_lock_type(
+ RWState::State type,
+ std::list<OpRequestRef> *to_wake,
+ bool *requeue_recovery,
+ bool *requeue_snaptrimmer) {
+ switch (type) {
+ case RWState::RWWRITE:
+ put_write(to_wake);
+ break;
+ case RWState::RWREAD:
+ put_read(to_wake);
+ break;
+ case RWState::RWEXCL:
+ put_excl(to_wake);
+ break;
+ default:
+ ceph_abort_msg("invalid lock type");
+ }
+ if (rwstate.empty() && rwstate.recovery_read_marker) {
+ rwstate.recovery_read_marker = false;
+ *requeue_recovery = true;
+ }
+ if (rwstate.empty() && rwstate.snaptrimmer_write_marker) {
+ rwstate.snaptrimmer_write_marker = false;
+ *requeue_snaptrimmer = true;
+ }
+ }
+ bool is_request_pending() {
+ return !rwstate.empty();
+ }
+
+ ObjectContext()
+ : ssc(NULL),
+ destructor_callback(0),
+ blocked(false), requeue_scrub_on_unblock(false) {}
+
+ ~ObjectContext() {
+ ceph_assert(rwstate.empty());
+ if (destructor_callback)
+ destructor_callback->complete(0);
+ }
+
+ void start_block() {
+ ceph_assert(!blocked);
+ blocked = true;
+ }
+ void stop_block() {
+ ceph_assert(blocked);
+ blocked = false;
+ }
+ bool is_blocked() const {
+ return blocked;
+ }
+
+ /// true while an in-progress copy-from op blocks access to this object
+ bool blocked:1;
+ bool requeue_scrub_on_unblock:1; // true if we need to requeue scrub on unblock
+
+};
+
+inline std::ostream& operator<<(std::ostream& out, const ObjectState& obs)
+{
+ out << obs.oi.soid;
+ if (!obs.exists)
+ out << "(dne)";
+ return out;
+}
+
+inline std::ostream& operator<<(std::ostream& out, const ObjectContext& obc)
+{
+ return out << "obc(" << obc.obs << " " << obc.rwstate << ")";
+}
+
+class ObcLockManager {
+ struct ObjectLockState {
+ ObjectContextRef obc;
+ RWState::State type;
+ ObjectLockState(
+ ObjectContextRef obc,
+ RWState::State type)
+ : obc(std::move(obc)), type(type) {}
+ };
+ std::map<hobject_t, ObjectLockState> locks;
+public:
+ ObcLockManager() = default;
+ ObcLockManager(ObcLockManager &&) = default;
+ ObcLockManager(const ObcLockManager &) = delete;
+ ObcLockManager &operator=(ObcLockManager &&) = default;
+ bool empty() const {
+ return locks.empty();
+ }
+ bool get_lock_type(
+ RWState::State type,
+ const hobject_t &hoid,
+ ObjectContextRef& obc,
+ OpRequestRef& op) {
+ ceph_assert(locks.find(hoid) == locks.end());
+ if (obc->get_lock_type(op, type)) {
+ locks.insert(std::make_pair(hoid, ObjectLockState(obc, type)));
+ return true;
+ } else {
+ return false;
+ }
+ }
+ /// Get write lock, ignore starvation
+ bool take_write_lock(
+ const hobject_t &hoid,
+ ObjectContextRef obc) {
+ ceph_assert(locks.find(hoid) == locks.end());
+ if (obc->rwstate.take_write_lock()) {
+ locks.insert(
+ std::make_pair(
+ hoid, ObjectLockState(obc, RWState::RWWRITE)));
+ return true;
+ } else {
+ return false;
+ }
+ }
+ /// Get write lock for snap trim
+ bool get_snaptrimmer_write(
+ const hobject_t &hoid,
+ ObjectContextRef obc,
+ bool mark_if_unsuccessful) {
+ ceph_assert(locks.find(hoid) == locks.end());
+ if (obc->get_snaptrimmer_write(mark_if_unsuccessful)) {
+ locks.insert(
+ std::make_pair(
+ hoid, ObjectLockState(obc, RWState::RWWRITE)));
+ return true;
+ } else {
+ return false;
+ }
+ }
+ /// Get write lock greedy
+ bool get_write_greedy(
+ const hobject_t &hoid,
+ ObjectContextRef obc,
+ OpRequestRef op) {
+ ceph_assert(locks.find(hoid) == locks.end());
+ if (obc->get_write_greedy(op)) {
+ locks.insert(
+ std::make_pair(
+ hoid, ObjectLockState(obc, RWState::RWWRITE)));
+ return true;
+ } else {
+ return false;
+ }
+ }
+
+ /// try get read lock
+ bool try_get_read_lock(
+ const hobject_t &hoid,
+ ObjectContextRef obc) {
+ ceph_assert(locks.find(hoid) == locks.end());
+ if (obc->try_get_read_lock()) {
+ locks.insert(
+ std::make_pair(
+ hoid,
+ ObjectLockState(obc, RWState::RWREAD)));
+ return true;
+ } else {
+ return false;
+ }
+ }
+
+ void put_locks(
+ std::list<std::pair<ObjectContextRef, std::list<OpRequestRef> > > *to_requeue,
+ bool *requeue_recovery,
+ bool *requeue_snaptrimmer) {
+ for (auto& p: locks) {
+ std::list<OpRequestRef> _to_requeue;
+ p.second.obc->put_lock_type(
+ p.second.type,
+ &_to_requeue,
+ requeue_recovery,
+ requeue_snaptrimmer);
+ if (to_requeue) {
+ // We can safely std::move here as the whole `locks` is going
+ // to die just after the loop.
+ to_requeue->emplace_back(std::move(p.second.obc),
+ std::move(_to_requeue));
+ }
+ }
+ locks.clear();
+ }
+ ~ObcLockManager() {
+ ceph_assert(locks.empty());
+ }
+};
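+
+// Example (hypothetical, illustrative only): ObcLockManager tracks one lock
+// per object, and every lock must be released through put_locks() before the
+// manager is destroyed. The function and variable names are placeholders.
+inline void example_obc_lock_scope(ObjectContextRef obc, OpRequestRef op) {
+  ObcLockManager manager;
+  if (manager.get_lock_type(RWState::RWWRITE, obc->obs.oi.soid, obc, op)) {
+    // ... mutate the object while the write lock is held ...
+    std::list<std::pair<ObjectContextRef, std::list<OpRequestRef>>> to_requeue;
+    bool requeue_recovery = false, requeue_snaptrimmer = false;
+    manager.put_locks(&to_requeue, &requeue_recovery, &requeue_snaptrimmer);
+  } // else: op was queued as a waiter on the obc and will be requeued later
+}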
+
+
+
+#endif
diff --git a/src/osd/osd_op_util.cc b/src/osd/osd_op_util.cc
new file mode 100644
index 000000000..54c590ee2
--- /dev/null
+++ b/src/osd/osd_op_util.cc
@@ -0,0 +1,263 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "osd/osd_op_util.h"
+
+#include "osd/ClassHandler.h"
+#include "messages/MOSDOp.h"
+
+using std::ostream;
+using std::string;
+using std::vector;
+
+using ceph::bufferlist;
+
+bool OpInfo::check_rmw(int flag) const {
+ ceph_assert(rmw_flags != 0);
+ return rmw_flags & flag;
+}
+bool OpInfo::may_read() const {
+ return need_read_cap() || check_rmw(CEPH_OSD_RMW_FLAG_CLASS_READ);
+}
+bool OpInfo::may_write() const {
+ return need_write_cap() || check_rmw(CEPH_OSD_RMW_FLAG_CLASS_WRITE);
+}
+bool OpInfo::may_cache() const { return check_rmw(CEPH_OSD_RMW_FLAG_CACHE); }
+bool OpInfo::rwordered_forced() const {
+ return check_rmw(CEPH_OSD_RMW_FLAG_RWORDERED);
+}
+bool OpInfo::rwordered() const {
+ return may_write() || may_cache() || rwordered_forced();
+}
+
+bool OpInfo::includes_pg_op() const {
+ return check_rmw(CEPH_OSD_RMW_FLAG_PGOP);
+}
+bool OpInfo::need_read_cap() const {
+ return check_rmw(CEPH_OSD_RMW_FLAG_READ);
+}
+bool OpInfo::need_write_cap() const {
+ return check_rmw(CEPH_OSD_RMW_FLAG_WRITE);
+}
+bool OpInfo::need_promote() const {
+ return check_rmw(CEPH_OSD_RMW_FLAG_FORCE_PROMOTE);
+}
+bool OpInfo::need_skip_handle_cache() const {
+ return check_rmw(CEPH_OSD_RMW_FLAG_SKIP_HANDLE_CACHE);
+}
+bool OpInfo::need_skip_promote() const {
+ return check_rmw(CEPH_OSD_RMW_FLAG_SKIP_PROMOTE);
+}
+bool OpInfo::allows_returnvec() const {
+ return check_rmw(CEPH_OSD_RMW_FLAG_RETURNVEC);
+}
+
+void OpInfo::set_rmw_flags(int flags) {
+ rmw_flags |= flags;
+}
+
+void OpInfo::set_read() { set_rmw_flags(CEPH_OSD_RMW_FLAG_READ); }
+void OpInfo::set_write() { set_rmw_flags(CEPH_OSD_RMW_FLAG_WRITE); }
+void OpInfo::set_class_read() { set_rmw_flags(CEPH_OSD_RMW_FLAG_CLASS_READ); }
+void OpInfo::set_class_write() { set_rmw_flags(CEPH_OSD_RMW_FLAG_CLASS_WRITE); }
+void OpInfo::set_pg_op() { set_rmw_flags(CEPH_OSD_RMW_FLAG_PGOP); }
+void OpInfo::set_cache() { set_rmw_flags(CEPH_OSD_RMW_FLAG_CACHE); }
+void OpInfo::set_promote() { set_rmw_flags(CEPH_OSD_RMW_FLAG_FORCE_PROMOTE); }
+void OpInfo::set_skip_handle_cache() { set_rmw_flags(CEPH_OSD_RMW_FLAG_SKIP_HANDLE_CACHE); }
+void OpInfo::set_skip_promote() { set_rmw_flags(CEPH_OSD_RMW_FLAG_SKIP_PROMOTE); }
+void OpInfo::set_force_rwordered() { set_rmw_flags(CEPH_OSD_RMW_FLAG_RWORDERED); }
+void OpInfo::set_returnvec() { set_rmw_flags(CEPH_OSD_RMW_FLAG_RETURNVEC); }
+
+
+int OpInfo::set_from_op(
+ const MOSDOp *m,
+ const OSDMap &osdmap)
+{
+ vector<OSDOp>::const_iterator iter;
+
+ // client flags have no bearing on whether an op is a read, write, etc.
+ clear();
+
+ if (m->has_flag(CEPH_OSD_FLAG_RWORDERED)) {
+ set_force_rwordered();
+ }
+ if (m->has_flag(CEPH_OSD_FLAG_RETURNVEC)) {
+ set_returnvec();
+ }
+
+ // set bits based on op codes, called methods.
+ for (iter = m->ops.begin(); iter != m->ops.end(); ++iter) {
+ if ((iter->op.op == CEPH_OSD_OP_WATCH &&
+ iter->op.watch.op == CEPH_OSD_WATCH_OP_PING)) {
+      /* This is a bit odd. PING isn't actually a write. It can't
+ * result in an update to the object_info. PINGs also aren't
+ * resent, so there's no reason to write out a log entry.
+ *
+ * However, we pipeline them behind writes, so let's force
+ * the write_ordered flag.
+ */
+ set_force_rwordered();
+ } else {
+ if (ceph_osd_op_mode_modify(iter->op.op))
+ set_write();
+ }
+ if (ceph_osd_op_mode_read(iter->op.op))
+ set_read();
+
+ // set READ flag if there are src_oids
+ if (iter->soid.oid.name.length())
+ set_read();
+
+ // set PGOP flag if there are PG ops
+ if (ceph_osd_op_type_pg(iter->op.op))
+ set_pg_op();
+
+ if (ceph_osd_op_mode_cache(iter->op.op))
+ set_cache();
+
+ // check for ec base pool
+ int64_t poolid = m->get_pg().pool();
+ const pg_pool_t *pool = osdmap.get_pg_pool(poolid);
+ if (pool && pool->is_tier()) {
+ const pg_pool_t *base_pool = osdmap.get_pg_pool(pool->tier_of);
+ if (base_pool && base_pool->require_rollback()) {
+ if ((iter->op.op != CEPH_OSD_OP_READ) &&
+ (iter->op.op != CEPH_OSD_OP_CHECKSUM) &&
+ (iter->op.op != CEPH_OSD_OP_CMPEXT) &&
+ (iter->op.op != CEPH_OSD_OP_STAT) &&
+ (iter->op.op != CEPH_OSD_OP_ISDIRTY) &&
+ (iter->op.op != CEPH_OSD_OP_UNDIRTY) &&
+ (iter->op.op != CEPH_OSD_OP_GETXATTR) &&
+ (iter->op.op != CEPH_OSD_OP_GETXATTRS) &&
+ (iter->op.op != CEPH_OSD_OP_CMPXATTR) &&
+ (iter->op.op != CEPH_OSD_OP_ASSERT_VER) &&
+ (iter->op.op != CEPH_OSD_OP_LIST_WATCHERS) &&
+ (iter->op.op != CEPH_OSD_OP_LIST_SNAPS) &&
+ (iter->op.op != CEPH_OSD_OP_SETALLOCHINT) &&
+ (iter->op.op != CEPH_OSD_OP_WRITEFULL) &&
+ (iter->op.op != CEPH_OSD_OP_ROLLBACK) &&
+ (iter->op.op != CEPH_OSD_OP_CREATE) &&
+ (iter->op.op != CEPH_OSD_OP_DELETE) &&
+ (iter->op.op != CEPH_OSD_OP_SETXATTR) &&
+ (iter->op.op != CEPH_OSD_OP_RMXATTR) &&
+ (iter->op.op != CEPH_OSD_OP_STARTSYNC) &&
+ (iter->op.op != CEPH_OSD_OP_COPY_GET) &&
+ (iter->op.op != CEPH_OSD_OP_COPY_FROM) &&
+ (iter->op.op != CEPH_OSD_OP_COPY_FROM2)) {
+ set_promote();
+ }
+ }
+ }
+
+ switch (iter->op.op) {
+ case CEPH_OSD_OP_CALL:
+ {
+ bufferlist::iterator bp = const_cast<bufferlist&>(iter->indata).begin();
+ int is_write, is_read;
+ string cname, mname;
+ bp.copy(iter->op.cls.class_len, cname);
+ bp.copy(iter->op.cls.method_len, mname);
+
+ ClassHandler::ClassData *cls;
+ int r = ClassHandler::get_instance().open_class(cname, &cls);
+ if (r) {
+ if (r == -ENOENT)
+ r = -EOPNOTSUPP;
+ else if (r != -EPERM) // propagate permission errors
+ r = -EIO;
+ return r;
+ }
+ int flags = cls->get_method_flags(mname);
+ if (flags < 0) {
+ if (flags == -ENOENT)
+ r = -EOPNOTSUPP;
+ else
+ r = flags;
+ return r;
+ }
+ is_read = flags & CLS_METHOD_RD;
+ is_write = flags & CLS_METHOD_WR;
+ bool is_promote = flags & CLS_METHOD_PROMOTE;
+
+ if (is_read)
+ set_class_read();
+ if (is_write)
+ set_class_write();
+ if (is_promote)
+ set_promote();
+ add_class(std::move(cname), std::move(mname), is_read, is_write,
+ cls->allowed);
+ break;
+ }
+
+ case CEPH_OSD_OP_WATCH:
+      // force the read bit for watch since it depends on previous
+ // watch state (and may return early if the watch exists) or, in
+ // the case of ping, is simply a read op.
+ set_read();
+ // fall through
+ case CEPH_OSD_OP_NOTIFY:
+ case CEPH_OSD_OP_NOTIFY_ACK:
+ {
+ set_promote();
+ break;
+ }
+
+ case CEPH_OSD_OP_DELETE:
+      // If we get a delete with FAILOK we can skip handle cache. Without
+      // FAILOK we still need to promote (or do something smarter) to
+      // determine whether to return ENOENT or 0.
+ if (iter == m->ops.begin() &&
+ iter->op.flags == CEPH_OSD_OP_FLAG_FAILOK) {
+ set_skip_handle_cache();
+ }
+ // skip promotion when proxying a delete op
+ if (m->ops.size() == 1) {
+ set_skip_promote();
+ }
+ break;
+
+ case CEPH_OSD_OP_CACHE_TRY_FLUSH:
+ case CEPH_OSD_OP_CACHE_FLUSH:
+ case CEPH_OSD_OP_CACHE_EVICT:
+ // If try_flush/flush/evict is the only op, can skip handle cache.
+ if (m->ops.size() == 1) {
+ set_skip_handle_cache();
+ }
+ break;
+
+ case CEPH_OSD_OP_READ:
+ case CEPH_OSD_OP_SYNC_READ:
+ case CEPH_OSD_OP_SPARSE_READ:
+ case CEPH_OSD_OP_CHECKSUM:
+ case CEPH_OSD_OP_WRITEFULL:
+ if (m->ops.size() == 1 &&
+ (iter->op.flags & CEPH_OSD_OP_FLAG_FADVISE_NOCACHE ||
+ iter->op.flags & CEPH_OSD_OP_FLAG_FADVISE_DONTNEED)) {
+ set_skip_promote();
+ }
+ break;
+
+    // force promotion when pinning an object in the cache tier
+ case CEPH_OSD_OP_CACHE_PIN:
+ set_promote();
+ break;
+
+ default:
+ break;
+ }
+ }
+
+ if (rmw_flags == 0)
+ return -EINVAL;
+
+ return 0;
+
+}
+
+ostream& operator<<(ostream& out, const OpInfo::ClassInfo& i)
+{
+ out << "class " << i.class_name << " method " << i.method_name
+ << " rd " << i.read << " wr " << i.write << " allowed " << i.allowed;
+ return out;
+}
diff --git a/src/osd/osd_op_util.h b/src/osd/osd_op_util.h
new file mode 100644
index 000000000..5fb568e40
--- /dev/null
+++ b/src/osd/osd_op_util.h
@@ -0,0 +1,83 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <vector>
+#include <string>
+
+#include "osd/OSDMap.h"
+
+#include "messages/MOSDOp.h"
+
+class OpInfo {
+public:
+ struct ClassInfo {
+ ClassInfo(std::string&& class_name, std::string&& method_name,
+ bool read, bool write, bool allowed) :
+ class_name(std::move(class_name)), method_name(std::move(method_name)),
+ read(read), write(write), allowed(allowed)
+ {}
+ const std::string class_name;
+ const std::string method_name;
+ const bool read, write, allowed;
+ };
+
+private:
+ uint64_t rmw_flags = 0;
+ std::vector<ClassInfo> classes;
+
+ void set_rmw_flags(int flags);
+
+ void add_class(std::string&& class_name, std::string&& method_name,
+ bool read, bool write, bool allowed) {
+ classes.emplace_back(std::move(class_name), std::move(method_name),
+ read, write, allowed);
+ }
+
+public:
+
+ void clear() {
+ rmw_flags = 0;
+ }
+
+ uint64_t get_flags() const {
+ return rmw_flags;
+ }
+
+  bool check_rmw(int flag) const;
+ bool may_read() const;
+ bool may_write() const;
+ bool may_cache() const;
+ bool rwordered_forced() const;
+ bool rwordered() const;
+ bool includes_pg_op() const;
+ bool need_read_cap() const;
+ bool need_write_cap() const;
+ bool need_promote() const;
+ bool need_skip_handle_cache() const;
+ bool need_skip_promote() const;
+ bool allows_returnvec() const;
+
+ void set_read();
+ void set_write();
+ void set_cache();
+ void set_class_read();
+ void set_class_write();
+ void set_pg_op();
+ void set_promote();
+ void set_skip_handle_cache();
+ void set_skip_promote();
+ void set_force_rwordered();
+ void set_returnvec();
+
+ int set_from_op(
+ const MOSDOp *m,
+ const OSDMap &osdmap);
+
+ std::vector<ClassInfo> get_classes() const {
+ return classes;
+ }
+};
+
+std::ostream& operator<<(std::ostream& out, const OpInfo::ClassInfo& i);
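
A hedged sketch of how OpInfo is typically consumed when dispatching a client request (`m` and `osdmap` stand in for the incoming MOSDOp and the current map; the real call sites live elsewhere in the OSD and are not part of this file):

    OpInfo op_info;
    int r = op_info.set_from_op(m, osdmap);
    if (r < 0) {
      // e.g. -EOPNOTSUPP for an unknown object class/method, -EINVAL when no flags were set
    } else if (op_info.may_write() || op_info.may_cache()) {
      // write path; rwordered() additionally forces write ordering
    } else if (op_info.may_read()) {
      // read path
    }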
diff --git a/src/osd/osd_perf_counters.cc b/src/osd/osd_perf_counters.cc
new file mode 100644
index 000000000..ed63b4d3f
--- /dev/null
+++ b/src/osd/osd_perf_counters.cc
@@ -0,0 +1,321 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "osd_perf_counters.h"
+#include "include/common_fwd.h"
+
+
+PerfCounters *build_osd_logger(CephContext *cct) {
+ PerfCountersBuilder osd_plb(cct, "osd", l_osd_first, l_osd_last);
+
+ // Latency axis configuration for op histograms, values are in nanoseconds
+ PerfHistogramCommon::axis_config_d op_hist_x_axis_config{
+ "Latency (usec)",
+ PerfHistogramCommon::SCALE_LOG2, ///< Latency in logarithmic scale
+ 0, ///< Start at 0
+ 100000, ///< Quantization unit is 100usec
+ 32, ///< Enough to cover much longer than slow requests
+ };
+
+ // Op size axis configuration for op histograms, values are in bytes
+ PerfHistogramCommon::axis_config_d op_hist_y_axis_config{
+ "Request size (bytes)",
+ PerfHistogramCommon::SCALE_LOG2, ///< Request size in logarithmic scale
+ 0, ///< Start at 0
+ 512, ///< Quantization unit is 512 bytes
+ 32, ///< Enough to cover requests larger than GB
+ };
+
+
+ // All the basic OSD operation stats are to be considered useful
+ osd_plb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
+
+ osd_plb.add_u64(
+ l_osd_op_wip, "op_wip",
+ "Replication operations currently being processed (primary)");
+ osd_plb.add_u64_counter(
+ l_osd_op, "op",
+ "Client operations",
+ "ops", PerfCountersBuilder::PRIO_CRITICAL);
+ osd_plb.add_u64_counter(
+ l_osd_op_inb, "op_in_bytes",
+ "Client operations total write size",
+ "wr", PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));
+ osd_plb.add_u64_counter(
+ l_osd_op_outb, "op_out_bytes",
+ "Client operations total read size",
+ "rd", PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));
+ osd_plb.add_time_avg(
+ l_osd_op_lat, "op_latency",
+ "Latency of client operations (including queue time)",
+ "l", 9);
+ osd_plb.add_time_avg(
+ l_osd_op_process_lat, "op_process_latency",
+ "Latency of client operations (excluding queue time)");
+ osd_plb.add_time_avg(
+ l_osd_op_prepare_lat, "op_prepare_latency",
+ "Latency of client operations (excluding queue time and wait for finished)");
+
+ osd_plb.add_u64_counter(
+ l_osd_op_r, "op_r", "Client read operations");
+ osd_plb.add_u64_counter(
+ l_osd_op_r_outb, "op_r_out_bytes", "Client data read", NULL, PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
+ osd_plb.add_time_avg(
+ l_osd_op_r_lat, "op_r_latency",
+ "Latency of read operation (including queue time)");
+ osd_plb.add_u64_counter_histogram(
+ l_osd_op_r_lat_outb_hist, "op_r_latency_out_bytes_histogram",
+ op_hist_x_axis_config, op_hist_y_axis_config,
+ "Histogram of operation latency (including queue time) + data read");
+ osd_plb.add_time_avg(
+ l_osd_op_r_process_lat, "op_r_process_latency",
+ "Latency of read operation (excluding queue time)");
+ osd_plb.add_time_avg(
+ l_osd_op_r_prepare_lat, "op_r_prepare_latency",
+ "Latency of read operations (excluding queue time and wait for finished)");
+ osd_plb.add_u64_counter(
+ l_osd_op_w, "op_w", "Client write operations");
+ osd_plb.add_u64_counter(
+ l_osd_op_w_inb, "op_w_in_bytes", "Client data written");
+ osd_plb.add_time_avg(
+ l_osd_op_w_lat, "op_w_latency",
+ "Latency of write operation (including queue time)");
+ osd_plb.add_u64_counter_histogram(
+ l_osd_op_w_lat_inb_hist, "op_w_latency_in_bytes_histogram",
+ op_hist_x_axis_config, op_hist_y_axis_config,
+ "Histogram of operation latency (including queue time) + data written");
+ osd_plb.add_time_avg(
+ l_osd_op_w_process_lat, "op_w_process_latency",
+ "Latency of write operation (excluding queue time)");
+ osd_plb.add_time_avg(
+ l_osd_op_w_prepare_lat, "op_w_prepare_latency",
+ "Latency of write operations (excluding queue time and wait for finished)");
+ osd_plb.add_u64_counter(
+ l_osd_op_rw, "op_rw",
+ "Client read-modify-write operations");
+ osd_plb.add_u64_counter(
+ l_osd_op_rw_inb, "op_rw_in_bytes",
+ "Client read-modify-write operations write in", NULL, PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
+ osd_plb.add_u64_counter(
+ l_osd_op_rw_outb,"op_rw_out_bytes",
+ "Client read-modify-write operations read out ", NULL, PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
+ osd_plb.add_time_avg(
+ l_osd_op_rw_lat, "op_rw_latency",
+ "Latency of read-modify-write operation (including queue time)");
+ osd_plb.add_u64_counter_histogram(
+ l_osd_op_rw_lat_inb_hist, "op_rw_latency_in_bytes_histogram",
+ op_hist_x_axis_config, op_hist_y_axis_config,
+ "Histogram of rw operation latency (including queue time) + data written");
+ osd_plb.add_u64_counter_histogram(
+ l_osd_op_rw_lat_outb_hist, "op_rw_latency_out_bytes_histogram",
+ op_hist_x_axis_config, op_hist_y_axis_config,
+ "Histogram of rw operation latency (including queue time) + data read");
+ osd_plb.add_time_avg(
+ l_osd_op_rw_process_lat, "op_rw_process_latency",
+ "Latency of read-modify-write operation (excluding queue time)");
+ osd_plb.add_time_avg(
+ l_osd_op_rw_prepare_lat, "op_rw_prepare_latency",
+ "Latency of read-modify-write operations (excluding queue time and wait for finished)");
+
+  // Now we move on to some more obscure stats; revert to assuming things
+  // are low priority unless otherwise specified.
+ osd_plb.set_prio_default(PerfCountersBuilder::PRIO_DEBUGONLY);
+
+ osd_plb.add_time_avg(l_osd_op_before_queue_op_lat, "op_before_queue_op_lat",
+ "Latency of IO before calling queue(before really queue into ShardedOpWq)"); // client io before queue op_wq latency
+ osd_plb.add_time_avg(l_osd_op_before_dequeue_op_lat, "op_before_dequeue_op_lat",
+ "Latency of IO before calling dequeue_op(already dequeued and get PG lock)"); // client io before dequeue_op latency
+
+ osd_plb.add_u64_counter(
+ l_osd_sop, "subop", "Suboperations");
+ osd_plb.add_u64_counter(
+ l_osd_sop_inb, "subop_in_bytes", "Suboperations total size", NULL, 0, unit_t(UNIT_BYTES));
+ osd_plb.add_time_avg(l_osd_sop_lat, "subop_latency", "Suboperations latency");
+
+ osd_plb.add_u64_counter(l_osd_sop_w, "subop_w", "Replicated writes");
+ osd_plb.add_u64_counter(
+ l_osd_sop_w_inb, "subop_w_in_bytes", "Replicated written data size", NULL, 0, unit_t(UNIT_BYTES));
+ osd_plb.add_time_avg(
+ l_osd_sop_w_lat, "subop_w_latency", "Replicated writes latency");
+ osd_plb.add_u64_counter(
+ l_osd_sop_pull, "subop_pull", "Suboperations pull requests");
+ osd_plb.add_time_avg(
+ l_osd_sop_pull_lat, "subop_pull_latency", "Suboperations pull latency");
+ osd_plb.add_u64_counter(
+ l_osd_sop_push, "subop_push", "Suboperations push messages");
+ osd_plb.add_u64_counter(
+ l_osd_sop_push_inb, "subop_push_in_bytes", "Suboperations pushed size", NULL, 0, unit_t(UNIT_BYTES));
+ osd_plb.add_time_avg(
+ l_osd_sop_push_lat, "subop_push_latency", "Suboperations push latency");
+
+ osd_plb.add_u64_counter(l_osd_pull, "pull", "Pull requests sent");
+ osd_plb.add_u64_counter(l_osd_push, "push", "Push messages sent");
+ osd_plb.add_u64_counter(l_osd_push_outb, "push_out_bytes", "Pushed size", NULL, 0, unit_t(UNIT_BYTES));
+
+ osd_plb.add_u64_counter(
+ l_osd_rop, "recovery_ops",
+ "Started recovery operations",
+ "rop", PerfCountersBuilder::PRIO_INTERESTING);
+
+ osd_plb.add_u64_counter(
+ l_osd_rbytes, "recovery_bytes",
+ "recovery bytes",
+ "rbt", PerfCountersBuilder::PRIO_INTERESTING);
+
+ osd_plb.add_u64(l_osd_loadavg, "loadavg", "CPU load");
+ osd_plb.add_u64(
+ l_osd_cached_crc, "cached_crc", "Total number getting crc from crc_cache");
+ osd_plb.add_u64(
+ l_osd_cached_crc_adjusted, "cached_crc_adjusted",
+ "Total number getting crc from crc_cache with adjusting");
+ osd_plb.add_u64(l_osd_missed_crc, "missed_crc",
+ "Total number of crc cache misses");
+
+ osd_plb.add_u64(l_osd_pg, "numpg", "Placement groups",
+ "pgs", PerfCountersBuilder::PRIO_USEFUL);
+ osd_plb.add_u64(
+ l_osd_pg_primary, "numpg_primary",
+ "Placement groups for which this osd is primary");
+ osd_plb.add_u64(
+ l_osd_pg_replica, "numpg_replica",
+ "Placement groups for which this osd is replica");
+ osd_plb.add_u64(
+ l_osd_pg_stray, "numpg_stray",
+ "Placement groups ready to be deleted from this osd");
+ osd_plb.add_u64(
+ l_osd_pg_removing, "numpg_removing",
+ "Placement groups queued for local deletion", "pgsr",
+ PerfCountersBuilder::PRIO_USEFUL);
+ osd_plb.add_u64(
+ l_osd_hb_to, "heartbeat_to_peers", "Heartbeat (ping) peers we send to");
+ osd_plb.add_u64_counter(l_osd_map, "map_messages", "OSD map messages");
+ osd_plb.add_u64_counter(l_osd_mape, "map_message_epochs", "OSD map epochs");
+ osd_plb.add_u64_counter(
+ l_osd_mape_dup, "map_message_epoch_dups", "OSD map duplicates");
+ osd_plb.add_u64_counter(
+ l_osd_waiting_for_map, "messages_delayed_for_map",
+ "Operations waiting for OSD map");
+
+ osd_plb.add_u64_counter(
+ l_osd_map_cache_hit, "osd_map_cache_hit", "osdmap cache hit");
+ osd_plb.add_u64_counter(
+ l_osd_map_cache_miss, "osd_map_cache_miss", "osdmap cache miss");
+ osd_plb.add_u64_counter(
+ l_osd_map_cache_miss_low, "osd_map_cache_miss_low",
+ "osdmap cache miss below cache lower bound");
+ osd_plb.add_u64_avg(
+ l_osd_map_cache_miss_low_avg, "osd_map_cache_miss_low_avg",
+ "osdmap cache miss, avg distance below cache lower bound");
+ osd_plb.add_u64_counter(
+ l_osd_map_bl_cache_hit, "osd_map_bl_cache_hit",
+ "OSDMap buffer cache hits");
+ osd_plb.add_u64_counter(
+ l_osd_map_bl_cache_miss, "osd_map_bl_cache_miss",
+ "OSDMap buffer cache misses");
+
+ osd_plb.add_u64(
+ l_osd_stat_bytes, "stat_bytes", "OSD size", "size",
+ PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
+ osd_plb.add_u64(
+ l_osd_stat_bytes_used, "stat_bytes_used", "Used space", "used",
+ PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
+ osd_plb.add_u64(l_osd_stat_bytes_avail, "stat_bytes_avail", "Available space", NULL, 0, unit_t(UNIT_BYTES));
+
+ osd_plb.add_u64_counter(
+ l_osd_copyfrom, "copyfrom", "Rados \"copy-from\" operations");
+
+ osd_plb.add_u64_counter(l_osd_tier_promote, "tier_promote", "Tier promotions");
+ osd_plb.add_u64_counter(l_osd_tier_flush, "tier_flush", "Tier flushes");
+ osd_plb.add_u64_counter(
+ l_osd_tier_flush_fail, "tier_flush_fail", "Failed tier flushes");
+ osd_plb.add_u64_counter(
+ l_osd_tier_try_flush, "tier_try_flush", "Tier flush attempts");
+ osd_plb.add_u64_counter(
+ l_osd_tier_try_flush_fail, "tier_try_flush_fail",
+ "Failed tier flush attempts");
+ osd_plb.add_u64_counter(
+ l_osd_tier_evict, "tier_evict", "Tier evictions");
+ osd_plb.add_u64_counter(
+ l_osd_tier_whiteout, "tier_whiteout", "Tier whiteouts");
+ osd_plb.add_u64_counter(
+ l_osd_tier_dirty, "tier_dirty", "Dirty tier flag set");
+ osd_plb.add_u64_counter(
+ l_osd_tier_clean, "tier_clean", "Dirty tier flag cleaned");
+ osd_plb.add_u64_counter(
+ l_osd_tier_delay, "tier_delay", "Tier delays (agent waiting)");
+ osd_plb.add_u64_counter(
+ l_osd_tier_proxy_read, "tier_proxy_read", "Tier proxy reads");
+ osd_plb.add_u64_counter(
+ l_osd_tier_proxy_write, "tier_proxy_write", "Tier proxy writes");
+
+ osd_plb.add_u64_counter(
+ l_osd_agent_wake, "agent_wake", "Tiering agent wake up");
+ osd_plb.add_u64_counter(
+ l_osd_agent_skip, "agent_skip", "Objects skipped by agent");
+ osd_plb.add_u64_counter(
+ l_osd_agent_flush, "agent_flush", "Tiering agent flushes");
+ osd_plb.add_u64_counter(
+ l_osd_agent_evict, "agent_evict", "Tiering agent evictions");
+
+ osd_plb.add_u64_counter(
+ l_osd_object_ctx_cache_hit, "object_ctx_cache_hit", "Object context cache hits");
+ osd_plb.add_u64_counter(
+ l_osd_object_ctx_cache_total, "object_ctx_cache_total", "Object context cache lookups");
+
+ osd_plb.add_u64_counter(l_osd_op_cache_hit, "op_cache_hit");
+ osd_plb.add_time_avg(
+ l_osd_tier_flush_lat, "osd_tier_flush_lat", "Object flush latency");
+ osd_plb.add_time_avg(
+ l_osd_tier_promote_lat, "osd_tier_promote_lat", "Object promote latency");
+ osd_plb.add_time_avg(
+ l_osd_tier_r_lat, "osd_tier_r_lat", "Object proxy read latency");
+
+ osd_plb.add_u64_counter(
+ l_osd_pg_info, "osd_pg_info", "PG updated its info (using any method)");
+ osd_plb.add_u64_counter(
+ l_osd_pg_fastinfo, "osd_pg_fastinfo",
+ "PG updated its info using fastinfo attr");
+ osd_plb.add_u64_counter(
+ l_osd_pg_biginfo, "osd_pg_biginfo", "PG updated its biginfo attr");
+
+ return osd_plb.create_perf_counters();
+}
+
+
+PerfCounters *build_recoverystate_perf(CephContext *cct) {
+ PerfCountersBuilder rs_perf(cct, "recoverystate_perf", rs_first, rs_last);
+
+ rs_perf.add_time_avg(rs_initial_latency, "initial_latency", "Initial recovery state latency");
+ rs_perf.add_time_avg(rs_started_latency, "started_latency", "Started recovery state latency");
+ rs_perf.add_time_avg(rs_reset_latency, "reset_latency", "Reset recovery state latency");
+ rs_perf.add_time_avg(rs_start_latency, "start_latency", "Start recovery state latency");
+ rs_perf.add_time_avg(rs_primary_latency, "primary_latency", "Primary recovery state latency");
+ rs_perf.add_time_avg(rs_peering_latency, "peering_latency", "Peering recovery state latency");
+ rs_perf.add_time_avg(rs_backfilling_latency, "backfilling_latency", "Backfilling recovery state latency");
+ rs_perf.add_time_avg(rs_waitremotebackfillreserved_latency, "waitremotebackfillreserved_latency", "Wait remote backfill reserved recovery state latency");
+ rs_perf.add_time_avg(rs_waitlocalbackfillreserved_latency, "waitlocalbackfillreserved_latency", "Wait local backfill reserved recovery state latency");
+ rs_perf.add_time_avg(rs_notbackfilling_latency, "notbackfilling_latency", "Notbackfilling recovery state latency");
+ rs_perf.add_time_avg(rs_repnotrecovering_latency, "repnotrecovering_latency", "Repnotrecovering recovery state latency");
+ rs_perf.add_time_avg(rs_repwaitrecoveryreserved_latency, "repwaitrecoveryreserved_latency", "Rep wait recovery reserved recovery state latency");
+ rs_perf.add_time_avg(rs_repwaitbackfillreserved_latency, "repwaitbackfillreserved_latency", "Rep wait backfill reserved recovery state latency");
+ rs_perf.add_time_avg(rs_reprecovering_latency, "reprecovering_latency", "RepRecovering recovery state latency");
+ rs_perf.add_time_avg(rs_activating_latency, "activating_latency", "Activating recovery state latency");
+ rs_perf.add_time_avg(rs_waitlocalrecoveryreserved_latency, "waitlocalrecoveryreserved_latency", "Wait local recovery reserved recovery state latency");
+ rs_perf.add_time_avg(rs_waitremoterecoveryreserved_latency, "waitremoterecoveryreserved_latency", "Wait remote recovery reserved recovery state latency");
+ rs_perf.add_time_avg(rs_recovering_latency, "recovering_latency", "Recovering recovery state latency");
+ rs_perf.add_time_avg(rs_recovered_latency, "recovered_latency", "Recovered recovery state latency");
+ rs_perf.add_time_avg(rs_clean_latency, "clean_latency", "Clean recovery state latency");
+ rs_perf.add_time_avg(rs_active_latency, "active_latency", "Active recovery state latency");
+ rs_perf.add_time_avg(rs_replicaactive_latency, "replicaactive_latency", "Replicaactive recovery state latency");
+ rs_perf.add_time_avg(rs_stray_latency, "stray_latency", "Stray recovery state latency");
+ rs_perf.add_time_avg(rs_getinfo_latency, "getinfo_latency", "Getinfo recovery state latency");
+ rs_perf.add_time_avg(rs_getlog_latency, "getlog_latency", "Getlog recovery state latency");
+ rs_perf.add_time_avg(rs_waitactingchange_latency, "waitactingchange_latency", "Waitactingchange recovery state latency");
+ rs_perf.add_time_avg(rs_incomplete_latency, "incomplete_latency", "Incomplete recovery state latency");
+ rs_perf.add_time_avg(rs_down_latency, "down_latency", "Down recovery state latency");
+ rs_perf.add_time_avg(rs_getmissing_latency, "getmissing_latency", "Getmissing recovery state latency");
+ rs_perf.add_time_avg(rs_waitupthru_latency, "waitupthru_latency", "Waitupthru recovery state latency");
+ rs_perf.add_time_avg(rs_notrecovering_latency, "notrecovering_latency", "Notrecovering recovery state latency");
+
+ return rs_perf.create_perf_counters();
+}
diff --git a/src/osd/osd_perf_counters.h b/src/osd/osd_perf_counters.h
new file mode 100644
index 000000000..9966a7f7d
--- /dev/null
+++ b/src/osd/osd_perf_counters.h
@@ -0,0 +1,163 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "include/common_fwd.h"
+#include "common/perf_counters.h"
+
+enum {
+ l_osd_first = 10000,
+ l_osd_op_wip,
+ l_osd_op,
+ l_osd_op_inb,
+ l_osd_op_outb,
+ l_osd_op_lat,
+ l_osd_op_process_lat,
+ l_osd_op_prepare_lat,
+ l_osd_op_r,
+ l_osd_op_r_outb,
+ l_osd_op_r_lat,
+ l_osd_op_r_lat_outb_hist,
+ l_osd_op_r_process_lat,
+ l_osd_op_r_prepare_lat,
+ l_osd_op_w,
+ l_osd_op_w_inb,
+ l_osd_op_w_lat,
+ l_osd_op_w_lat_inb_hist,
+ l_osd_op_w_process_lat,
+ l_osd_op_w_prepare_lat,
+ l_osd_op_rw,
+ l_osd_op_rw_inb,
+ l_osd_op_rw_outb,
+ l_osd_op_rw_lat,
+ l_osd_op_rw_lat_inb_hist,
+ l_osd_op_rw_lat_outb_hist,
+ l_osd_op_rw_process_lat,
+ l_osd_op_rw_prepare_lat,
+
+ l_osd_op_before_queue_op_lat,
+ l_osd_op_before_dequeue_op_lat,
+
+ l_osd_sop,
+ l_osd_sop_inb,
+ l_osd_sop_lat,
+ l_osd_sop_w,
+ l_osd_sop_w_inb,
+ l_osd_sop_w_lat,
+ l_osd_sop_pull,
+ l_osd_sop_pull_lat,
+ l_osd_sop_push,
+ l_osd_sop_push_inb,
+ l_osd_sop_push_lat,
+
+ l_osd_pull,
+ l_osd_push,
+ l_osd_push_outb,
+
+ l_osd_rop,
+ l_osd_rbytes,
+
+ l_osd_loadavg,
+ l_osd_cached_crc,
+ l_osd_cached_crc_adjusted,
+ l_osd_missed_crc,
+
+ l_osd_pg,
+ l_osd_pg_primary,
+ l_osd_pg_replica,
+ l_osd_pg_stray,
+ l_osd_pg_removing,
+ l_osd_hb_to,
+ l_osd_map,
+ l_osd_mape,
+ l_osd_mape_dup,
+
+ l_osd_waiting_for_map,
+
+ l_osd_map_cache_hit,
+ l_osd_map_cache_miss,
+ l_osd_map_cache_miss_low,
+ l_osd_map_cache_miss_low_avg,
+ l_osd_map_bl_cache_hit,
+ l_osd_map_bl_cache_miss,
+
+ l_osd_stat_bytes,
+ l_osd_stat_bytes_used,
+ l_osd_stat_bytes_avail,
+
+ l_osd_copyfrom,
+
+ l_osd_tier_promote,
+ l_osd_tier_flush,
+ l_osd_tier_flush_fail,
+ l_osd_tier_try_flush,
+ l_osd_tier_try_flush_fail,
+ l_osd_tier_evict,
+ l_osd_tier_whiteout,
+ l_osd_tier_dirty,
+ l_osd_tier_clean,
+ l_osd_tier_delay,
+ l_osd_tier_proxy_read,
+ l_osd_tier_proxy_write,
+
+ l_osd_agent_wake,
+ l_osd_agent_skip,
+ l_osd_agent_flush,
+ l_osd_agent_evict,
+
+ l_osd_object_ctx_cache_hit,
+ l_osd_object_ctx_cache_total,
+
+ l_osd_op_cache_hit,
+ l_osd_tier_flush_lat,
+ l_osd_tier_promote_lat,
+ l_osd_tier_r_lat,
+
+ l_osd_pg_info,
+ l_osd_pg_fastinfo,
+ l_osd_pg_biginfo,
+
+ l_osd_last,
+};
+
+PerfCounters *build_osd_logger(CephContext *cct);
+
+// PeeringState perf counters
+enum {
+ rs_first = 20000,
+ rs_initial_latency,
+ rs_started_latency,
+ rs_reset_latency,
+ rs_start_latency,
+ rs_primary_latency,
+ rs_peering_latency,
+ rs_backfilling_latency,
+ rs_waitremotebackfillreserved_latency,
+ rs_waitlocalbackfillreserved_latency,
+ rs_notbackfilling_latency,
+ rs_repnotrecovering_latency,
+ rs_repwaitrecoveryreserved_latency,
+ rs_repwaitbackfillreserved_latency,
+ rs_reprecovering_latency,
+ rs_activating_latency,
+ rs_waitlocalrecoveryreserved_latency,
+ rs_waitremoterecoveryreserved_latency,
+ rs_recovering_latency,
+ rs_recovered_latency,
+ rs_clean_latency,
+ rs_active_latency,
+ rs_replicaactive_latency,
+ rs_stray_latency,
+ rs_getinfo_latency,
+ rs_getlog_latency,
+ rs_waitactingchange_latency,
+ rs_incomplete_latency,
+ rs_down_latency,
+ rs_getmissing_latency,
+ rs_waitupthru_latency,
+ rs_notrecovering_latency,
+ rs_last,
+};
+
+PerfCounters *build_recoverystate_perf(CephContext *cct);
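
The standard pattern for wiring these builders into a CephContext is roughly the following sketch (`bytes` and `elapsed` are placeholder values; the actual registration happens in the OSD and PeeringState constructors, which are not part of this file):

    PerfCounters *logger = build_osd_logger(g_ceph_context);
    g_ceph_context->get_perfcounters_collection()->add(logger);

    logger->inc(l_osd_op);              // count one client op
    logger->inc(l_osd_op_inb, bytes);   // accumulate bytes written
    logger->tinc(l_osd_op_lat, elapsed); // record latency (a utime_t)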
diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc
new file mode 100644
index 000000000..13358560f
--- /dev/null
+++ b/src/osd/osd_types.cc
@@ -0,0 +1,7212 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
+ *
+ * Author: Loic Dachary <loic@dachary.org>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <list>
+#include <map>
+#include <ostream>
+#include <sstream>
+#include <set>
+#include <string>
+#include <utility>
+#include <vector>
+
+
+#include <boost/assign/list_of.hpp>
+
+#include "include/ceph_features.h"
+#include "include/encoding.h"
+#include "include/stringify.h"
+extern "C" {
+#include "crush/hash.h"
+}
+
+#include "common/Formatter.h"
+#include "common/StackStringStream.h"
+#include "OSDMap.h"
+#include "osd_types.h"
+#include "os/Transaction.h"
+
+using std::list;
+using std::make_pair;
+using std::map;
+using std::ostream;
+using std::pair;
+using std::set;
+using std::string;
+using std::unique_ptr;
+using std::vector;
+
+using ceph::bufferlist;
+using ceph::decode;
+using ceph::decode_nohead;
+using ceph::encode;
+using ceph::encode_nohead;
+using ceph::Formatter;
+using ceph::make_timespan;
+using ceph::JSONFormatter;
+
+using namespace std::literals;
+
+const char *ceph_osd_flag_name(unsigned flag)
+{
+ switch (flag) {
+ case CEPH_OSD_FLAG_ACK: return "ack";
+ case CEPH_OSD_FLAG_ONNVRAM: return "onnvram";
+ case CEPH_OSD_FLAG_ONDISK: return "ondisk";
+ case CEPH_OSD_FLAG_RETRY: return "retry";
+ case CEPH_OSD_FLAG_READ: return "read";
+ case CEPH_OSD_FLAG_WRITE: return "write";
+ case CEPH_OSD_FLAG_ORDERSNAP: return "ordersnap";
+ case CEPH_OSD_FLAG_PEERSTAT_OLD: return "peerstat_old";
+ case CEPH_OSD_FLAG_BALANCE_READS: return "balance_reads";
+ case CEPH_OSD_FLAG_PARALLELEXEC: return "parallelexec";
+ case CEPH_OSD_FLAG_PGOP: return "pgop";
+ case CEPH_OSD_FLAG_EXEC: return "exec";
+ case CEPH_OSD_FLAG_EXEC_PUBLIC: return "exec_public";
+ case CEPH_OSD_FLAG_LOCALIZE_READS: return "localize_reads";
+ case CEPH_OSD_FLAG_RWORDERED: return "rwordered";
+ case CEPH_OSD_FLAG_IGNORE_CACHE: return "ignore_cache";
+ case CEPH_OSD_FLAG_SKIPRWLOCKS: return "skiprwlocks";
+ case CEPH_OSD_FLAG_IGNORE_OVERLAY: return "ignore_overlay";
+ case CEPH_OSD_FLAG_FLUSH: return "flush";
+ case CEPH_OSD_FLAG_MAP_SNAP_CLONE: return "map_snap_clone";
+ case CEPH_OSD_FLAG_ENFORCE_SNAPC: return "enforce_snapc";
+ case CEPH_OSD_FLAG_REDIRECTED: return "redirected";
+ case CEPH_OSD_FLAG_KNOWN_REDIR: return "known_if_redirected";
+ case CEPH_OSD_FLAG_FULL_TRY: return "full_try";
+ case CEPH_OSD_FLAG_FULL_FORCE: return "full_force";
+ case CEPH_OSD_FLAG_IGNORE_REDIRECT: return "ignore_redirect";
+ case CEPH_OSD_FLAG_RETURNVEC: return "returnvec";
+ default: return "???";
+ }
+}
+
+string ceph_osd_flag_string(unsigned flags)
+{
+ string s;
+ for (unsigned i=0; i<32; ++i) {
+ if (flags & (1u<<i)) {
+ if (s.length())
+ s += "+";
+ s += ceph_osd_flag_name(1u << i);
+ }
+ }
+ if (s.length())
+ return s;
+ return string("-");
+}
+
+const char * ceph_osd_op_flag_name(unsigned flag)
+{
+ const char *name;
+
+ switch(flag) {
+ case CEPH_OSD_OP_FLAG_EXCL:
+ name = "excl";
+ break;
+ case CEPH_OSD_OP_FLAG_FAILOK:
+ name = "failok";
+ break;
+ case CEPH_OSD_OP_FLAG_FADVISE_RANDOM:
+ name = "fadvise_random";
+ break;
+ case CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL:
+ name = "fadvise_sequential";
+ break;
+ case CEPH_OSD_OP_FLAG_FADVISE_WILLNEED:
+ name = "favise_willneed";
+ break;
+ case CEPH_OSD_OP_FLAG_FADVISE_DONTNEED:
+ name = "fadvise_dontneed";
+ break;
+ case CEPH_OSD_OP_FLAG_FADVISE_NOCACHE:
+ name = "fadvise_nocache";
+ break;
+ case CEPH_OSD_OP_FLAG_WITH_REFERENCE:
+ name = "with_reference";
+ break;
+ case CEPH_OSD_OP_FLAG_BYPASS_CLEAN_CACHE:
+ name = "bypass_clean_cache";
+ break;
+ default:
+ name = "???";
+ };
+
+ return name;
+}
+
+string ceph_osd_op_flag_string(unsigned flags)
+{
+ string s;
+ for (unsigned i=0; i<32; ++i) {
+ if (flags & (1u<<i)) {
+ if (s.length())
+ s += "+";
+ s += ceph_osd_op_flag_name(1u << i);
+ }
+ }
+ if (s.length())
+ return s;
+ return string("-");
+}
+
+string ceph_osd_alloc_hint_flag_string(unsigned flags)
+{
+ string s;
+ for (unsigned i=0; i<32; ++i) {
+ if (flags & (1u<<i)) {
+ if (s.length())
+ s += "+";
+ s += ceph_osd_alloc_hint_flag_name(1u << i);
+ }
+ }
+ if (s.length())
+ return s;
+ return string("-");
+}
+
+void pg_shard_t::encode(ceph::buffer::list &bl) const
+{
+ ENCODE_START(1, 1, bl);
+ encode(osd, bl);
+ encode(shard, bl);
+ ENCODE_FINISH(bl);
+}
+void pg_shard_t::decode(ceph::buffer::list::const_iterator &bl)
+{
+ DECODE_START(1, bl);
+ decode(osd, bl);
+ decode(shard, bl);
+ DECODE_FINISH(bl);
+}
+
+ostream &operator<<(ostream &lhs, const pg_shard_t &rhs)
+{
+ if (rhs.is_undefined())
+ return lhs << "?";
+ if (rhs.shard == shard_id_t::NO_SHARD)
+ return lhs << rhs.get_osd();
+ return lhs << rhs.get_osd() << '(' << (unsigned)(rhs.shard) << ')';
+}
+
+void dump(Formatter* f, const osd_alerts_t& alerts)
+{
+ for (auto& a : alerts) {
+ string s0 = " osd: ";
+ s0 += stringify(a.first);
+ string s;
+ for (auto& aa : a.second) {
+ s = s0;
+ s += " ";
+ s += aa.first;
+ s += ":";
+ s += aa.second;
+ f->dump_string("alert", s);
+ }
+ }
+}
+
+// -- osd_reqid_t --
+void osd_reqid_t::dump(Formatter *f) const
+{
+ f->dump_stream("name") << name;
+ f->dump_int("inc", inc);
+ f->dump_unsigned("tid", tid);
+}
+
+void osd_reqid_t::generate_test_instances(list<osd_reqid_t*>& o)
+{
+ o.push_back(new osd_reqid_t);
+ o.push_back(new osd_reqid_t(entity_name_t::CLIENT(123), 1, 45678));
+}
+
+// -- object_locator_t --
+
+void object_locator_t::encode(ceph::buffer::list& bl) const
+{
+ // verify that nobody's corrupted the locator
+ ceph_assert(hash == -1 || key.empty());
+ __u8 encode_compat = 3;
+ ENCODE_START(6, encode_compat, bl);
+ encode(pool, bl);
+ int32_t preferred = -1; // tell old code there is no preferred osd (-1).
+ encode(preferred, bl);
+ encode(key, bl);
+ encode(nspace, bl);
+ encode(hash, bl);
+ if (hash != -1)
+ encode_compat = std::max<std::uint8_t>(encode_compat, 6); // need to interpret the hash
+ ENCODE_FINISH_NEW_COMPAT(bl, encode_compat);
+}
+
+void object_locator_t::decode(ceph::buffer::list::const_iterator& p)
+{
+ DECODE_START_LEGACY_COMPAT_LEN(6, 3, 3, p);
+ if (struct_v < 2) {
+ int32_t op;
+ decode(op, p);
+ pool = op;
+ int16_t pref;
+ decode(pref, p);
+ } else {
+ decode(pool, p);
+ int32_t preferred;
+ decode(preferred, p);
+ }
+ decode(key, p);
+ if (struct_v >= 5)
+ decode(nspace, p);
+ if (struct_v >= 6)
+ decode(hash, p);
+ else
+ hash = -1;
+ DECODE_FINISH(p);
+ // verify that nobody's corrupted the locator
+ ceph_assert(hash == -1 || key.empty());
+}
+
+void object_locator_t::dump(Formatter *f) const
+{
+ f->dump_int("pool", pool);
+ f->dump_string("key", key);
+ f->dump_string("namespace", nspace);
+ f->dump_int("hash", hash);
+}
+
+void object_locator_t::generate_test_instances(list<object_locator_t*>& o)
+{
+ o.push_back(new object_locator_t);
+ o.push_back(new object_locator_t(123));
+ o.push_back(new object_locator_t(123, 876));
+ o.push_back(new object_locator_t(1, "n2"));
+ o.push_back(new object_locator_t(1234, "", "key"));
+ o.push_back(new object_locator_t(12, "n1", "key2"));
+}
+
+// -- request_redirect_t --
+void request_redirect_t::encode(ceph::buffer::list& bl) const
+{
+ ENCODE_START(1, 1, bl);
+ encode(redirect_locator, bl);
+ encode(redirect_object, bl);
+ // legacy of the removed osd_instructions member
+ encode((uint32_t)0, bl);
+ ENCODE_FINISH(bl);
+}
+
+void request_redirect_t::decode(ceph::buffer::list::const_iterator& bl)
+{
+ DECODE_START(1, bl);
+ uint32_t legacy_osd_instructions_len;
+ decode(redirect_locator, bl);
+ decode(redirect_object, bl);
+ decode(legacy_osd_instructions_len, bl);
+ if (legacy_osd_instructions_len) {
+ bl += legacy_osd_instructions_len;
+ }
+ DECODE_FINISH(bl);
+}
+
+void request_redirect_t::dump(Formatter *f) const
+{
+ f->dump_string("object", redirect_object);
+ f->open_object_section("locator");
+ redirect_locator.dump(f);
+ f->close_section(); // locator
+}
+
+void request_redirect_t::generate_test_instances(list<request_redirect_t*>& o)
+{
+ object_locator_t loc(1, "redir_obj");
+ o.push_back(new request_redirect_t());
+ o.push_back(new request_redirect_t(loc, 0));
+ o.push_back(new request_redirect_t(loc, "redir_obj"));
+ o.push_back(new request_redirect_t(loc));
+}
+
+void objectstore_perf_stat_t::dump(Formatter *f) const
+{
+ // *_ms values just for compatibility.
+ f->dump_float("commit_latency_ms", os_commit_latency_ns / 1000000.0);
+ f->dump_float("apply_latency_ms", os_apply_latency_ns / 1000000.0);
+ f->dump_unsigned("commit_latency_ns", os_commit_latency_ns);
+ f->dump_unsigned("apply_latency_ns", os_apply_latency_ns);
+}
+
+void objectstore_perf_stat_t::encode(ceph::buffer::list &bl, uint64_t features) const
+{
+ uint8_t target_v = 2;
+ if (!HAVE_FEATURE(features, OS_PERF_STAT_NS)) {
+ target_v = 1;
+ }
+ ENCODE_START(target_v, target_v, bl);
+ if (target_v >= 2) {
+ encode(os_commit_latency_ns, bl);
+ encode(os_apply_latency_ns, bl);
+ } else {
+ constexpr auto NS_PER_MS = std::chrono::nanoseconds(1ms).count();
+ uint32_t commit_latency_ms = os_commit_latency_ns / NS_PER_MS;
+ uint32_t apply_latency_ms = os_apply_latency_ns / NS_PER_MS;
+ encode(commit_latency_ms, bl); // for compatibility with older monitor.
+ encode(apply_latency_ms, bl); // for compatibility with older monitor.
+ }
+ ENCODE_FINISH(bl);
+}
+
+void objectstore_perf_stat_t::decode(ceph::buffer::list::const_iterator &bl)
+{
+ DECODE_START(2, bl);
+ if (struct_v >= 2) {
+ decode(os_commit_latency_ns, bl);
+ decode(os_apply_latency_ns, bl);
+ } else {
+ uint32_t commit_latency_ms;
+ uint32_t apply_latency_ms;
+ decode(commit_latency_ms, bl);
+ decode(apply_latency_ms, bl);
+ constexpr auto NS_PER_MS = std::chrono::nanoseconds(1ms).count();
+ os_commit_latency_ns = commit_latency_ms * NS_PER_MS;
+ os_apply_latency_ns = apply_latency_ms * NS_PER_MS;
+ }
+ DECODE_FINISH(bl);
+}
+
+void objectstore_perf_stat_t::generate_test_instances(std::list<objectstore_perf_stat_t*>& o)
+{
+ o.push_back(new objectstore_perf_stat_t());
+ o.push_back(new objectstore_perf_stat_t());
+ o.back()->os_commit_latency_ns = 20000000;
+ o.back()->os_apply_latency_ns = 30000000;
+}
+
+// -- osd_stat_t --
+void osd_stat_t::dump(Formatter *f, bool with_net) const
+{
+ f->dump_unsigned("up_from", up_from);
+ f->dump_unsigned("seq", seq);
+ f->dump_unsigned("num_pgs", num_pgs);
+ f->dump_unsigned("num_osds", num_osds);
+ f->dump_unsigned("num_per_pool_osds", num_per_pool_osds);
+ f->dump_unsigned("num_per_pool_omap_osds", num_per_pool_omap_osds);
+
+ /// dump legacy stats fields to ensure backward compatibility.
+ f->dump_unsigned("kb", statfs.kb());
+ f->dump_unsigned("kb_used", statfs.kb_used_raw());
+ f->dump_unsigned("kb_used_data", statfs.kb_used_data());
+ f->dump_unsigned("kb_used_omap", statfs.kb_used_omap());
+ f->dump_unsigned("kb_used_meta", statfs.kb_used_internal_metadata());
+ f->dump_unsigned("kb_avail", statfs.kb_avail());
+ ////////////////////
+
+ f->open_object_section("statfs");
+ statfs.dump(f);
+ f->close_section();
+ f->open_array_section("hb_peers");
+ for (auto p : hb_peers)
+ f->dump_int("osd", p);
+ f->close_section();
+ f->dump_int("snap_trim_queue_len", snap_trim_queue_len);
+ f->dump_int("num_snap_trimming", num_snap_trimming);
+ f->dump_int("num_shards_repaired", num_shards_repaired);
+ f->open_object_section("op_queue_age_hist");
+ op_queue_age_hist.dump(f);
+ f->close_section();
+ f->open_object_section("perf_stat");
+ os_perf_stat.dump(f);
+ f->close_section();
+ f->open_array_section("alerts");
+ ::dump(f, os_alerts);
+ f->close_section();
+ if (with_net) {
+ dump_ping_time(f);
+ }
+}
+
+void osd_stat_t::dump_ping_time(Formatter *f) const
+{
+ f->open_array_section("network_ping_times");
+ for (auto &i : hb_pingtime) {
+ f->open_object_section("entry");
+ f->dump_int("osd", i.first);
+ const time_t lu(i.second.last_update);
+ char buffer[26];
+ string lustr(ctime_r(&lu, buffer));
+ lustr.pop_back(); // Remove trailing \n
+ f->dump_string("last update", lustr);
+ f->open_array_section("interfaces");
+ f->open_object_section("interface");
+ f->dump_string("interface", "back");
+ f->open_object_section("average");
+ f->dump_float("1min", i.second.back_pingtime[0]/1000.0);
+ f->dump_float("5min", i.second.back_pingtime[1]/1000.0);
+ f->dump_float("15min", i.second.back_pingtime[2]/1000.0);
+ f->close_section(); // average
+ f->open_object_section("min");
+ f->dump_float("1min", i.second.back_min[0]/1000.0);
+ f->dump_float("5min", i.second.back_min[1]/1000.0);
+ f->dump_float("15min", i.second.back_min[2]/1000.0);
+ f->close_section(); // min
+ f->open_object_section("max");
+ f->dump_float("1min", i.second.back_max[0]/1000.0);
+ f->dump_float("5min", i.second.back_max[1]/1000.0);
+ f->dump_float("15min", i.second.back_max[2]/1000.0);
+ f->close_section(); // max
+ f->dump_float("last", i.second.back_last/1000.0);
+ f->close_section(); // interface
+
+ if (i.second.front_pingtime[0] != 0) {
+ f->open_object_section("interface");
+ f->dump_string("interface", "front");
+ f->open_object_section("average");
+ f->dump_float("1min", i.second.front_pingtime[0]/1000.0);
+ f->dump_float("5min", i.second.front_pingtime[1]/1000.0);
+ f->dump_float("15min", i.second.front_pingtime[2]/1000.0);
+ f->close_section(); // average
+ f->open_object_section("min");
+ f->dump_float("1min", i.second.front_min[0]/1000.0);
+ f->dump_float("5min", i.second.front_min[1]/1000.0);
+ f->dump_float("15min", i.second.front_min[2]/1000.0);
+ f->close_section(); // min
+ f->open_object_section("max");
+ f->dump_float("1min", i.second.front_max[0]/1000.0);
+ f->dump_float("5min", i.second.front_max[1]/1000.0);
+ f->dump_float("15min", i.second.front_max[2]/1000.0);
+ f->close_section(); // max
+ f->dump_float("last", i.second.front_last/1000.0);
+ f->close_section(); // interface
+ }
+ f->close_section(); // interfaces
+ f->close_section(); // entry
+ }
+  f->close_section(); // network_ping_times
+}
+
+void osd_stat_t::encode(ceph::buffer::list &bl, uint64_t features) const
+{
+ ENCODE_START(14, 2, bl);
+
+ //////// for compatibility ////////
+ int64_t kb = statfs.kb();
+ int64_t kb_used = statfs.kb_used_raw();
+ int64_t kb_avail = statfs.kb_avail();
+ encode(kb, bl);
+ encode(kb_used, bl);
+ encode(kb_avail, bl);
+ ///////////////////////////////////
+
+ encode(snap_trim_queue_len, bl);
+ encode(num_snap_trimming, bl);
+ encode(hb_peers, bl);
+ encode((uint32_t)0, bl);
+ encode(op_queue_age_hist, bl);
+ encode(os_perf_stat, bl, features);
+ encode(up_from, bl);
+ encode(seq, bl);
+ encode(num_pgs, bl);
+
+ //////// for compatibility ////////
+ int64_t kb_used_data = statfs.kb_used_data();
+ int64_t kb_used_omap = statfs.kb_used_omap();
+ int64_t kb_used_meta = statfs.kb_used_internal_metadata();
+ encode(kb_used_data, bl);
+ encode(kb_used_omap, bl);
+ encode(kb_used_meta, bl);
+ encode(statfs, bl);
+ ///////////////////////////////////
+ encode(os_alerts, bl);
+ encode(num_shards_repaired, bl);
+ encode(num_osds, bl);
+ encode(num_per_pool_osds, bl);
+ encode(num_per_pool_omap_osds, bl);
+
+ // hb_pingtime map
+ encode((int)hb_pingtime.size(), bl);
+ for (auto i : hb_pingtime) {
+ encode(i.first, bl); // osd
+ encode(i.second.last_update, bl);
+ encode(i.second.back_pingtime[0], bl);
+ encode(i.second.back_pingtime[1], bl);
+ encode(i.second.back_pingtime[2], bl);
+ encode(i.second.back_min[0], bl);
+ encode(i.second.back_min[1], bl);
+ encode(i.second.back_min[2], bl);
+ encode(i.second.back_max[0], bl);
+ encode(i.second.back_max[1], bl);
+ encode(i.second.back_max[2], bl);
+ encode(i.second.back_last, bl);
+ encode(i.second.front_pingtime[0], bl);
+ encode(i.second.front_pingtime[1], bl);
+ encode(i.second.front_pingtime[2], bl);
+ encode(i.second.front_min[0], bl);
+ encode(i.second.front_min[1], bl);
+ encode(i.second.front_min[2], bl);
+ encode(i.second.front_max[0], bl);
+ encode(i.second.front_max[1], bl);
+ encode(i.second.front_max[2], bl);
+ encode(i.second.front_last, bl);
+ }
+ ENCODE_FINISH(bl);
+}
+
+void osd_stat_t::decode(ceph::buffer::list::const_iterator &bl)
+{
+ int64_t kb, kb_used,kb_avail;
+ int64_t kb_used_data, kb_used_omap, kb_used_meta;
+ DECODE_START_LEGACY_COMPAT_LEN(14, 2, 2, bl);
+ decode(kb, bl);
+ decode(kb_used, bl);
+ decode(kb_avail, bl);
+ decode(snap_trim_queue_len, bl);
+ decode(num_snap_trimming, bl);
+ decode(hb_peers, bl);
+ vector<int> num_hb_out;
+ decode(num_hb_out, bl);
+ if (struct_v >= 3)
+ decode(op_queue_age_hist, bl);
+ if (struct_v >= 4)
+ decode(os_perf_stat, bl);
+ if (struct_v >= 6) {
+ decode(up_from, bl);
+ decode(seq, bl);
+ }
+ if (struct_v >= 7) {
+ decode(num_pgs, bl);
+ }
+ if (struct_v >= 8) {
+ decode(kb_used_data, bl);
+ decode(kb_used_omap, bl);
+ decode(kb_used_meta, bl);
+ } else {
+ kb_used_data = kb_used;
+ kb_used_omap = 0;
+ kb_used_meta = 0;
+ }
+ if (struct_v >= 9) {
+ decode(statfs, bl);
+ } else {
+ statfs.reset();
+ statfs.total = kb << 10;
+ statfs.available = kb_avail << 10;
+    // it's actually unexpected to have statfs.total < statfs.available
+    // here, but the legacy generate_test_instances produced such a
+    // case, hence this handling rather than an assert
+ statfs.internally_reserved =
+ statfs.total > statfs.available ? statfs.total - statfs.available : 0;
+ kb_used <<= 10;
+ if ((int64_t)statfs.internally_reserved > kb_used) {
+ statfs.internally_reserved -= kb_used;
+ } else {
+ statfs.internally_reserved = 0;
+ }
+ statfs.allocated = kb_used_data << 10;
+ statfs.omap_allocated = kb_used_omap << 10;
+ statfs.internal_metadata = kb_used_meta << 10;
+ }
+ if (struct_v >= 10) {
+ decode(os_alerts, bl);
+ } else {
+ os_alerts.clear();
+ }
+ if (struct_v >= 11) {
+ decode(num_shards_repaired, bl);
+ } else {
+ num_shards_repaired = 0;
+ }
+ if (struct_v >= 12) {
+ decode(num_osds, bl);
+ decode(num_per_pool_osds, bl);
+ } else {
+ num_osds = 0;
+ num_per_pool_osds = 0;
+ }
+ if (struct_v >= 13) {
+ decode(num_per_pool_omap_osds, bl);
+ } else {
+ num_per_pool_omap_osds = 0;
+ }
+ hb_pingtime.clear();
+ if (struct_v >= 14) {
+ int count;
+ decode(count, bl);
+ for (int i = 0 ; i < count ; i++) {
+ int osd;
+ decode(osd, bl);
+ struct Interfaces ifs;
+ decode(ifs.last_update, bl);
+ decode(ifs.back_pingtime[0],bl);
+ decode(ifs.back_pingtime[1], bl);
+ decode(ifs.back_pingtime[2], bl);
+ decode(ifs.back_min[0],bl);
+ decode(ifs.back_min[1], bl);
+ decode(ifs.back_min[2], bl);
+ decode(ifs.back_max[0],bl);
+ decode(ifs.back_max[1], bl);
+ decode(ifs.back_max[2], bl);
+ decode(ifs.back_last, bl);
+ decode(ifs.front_pingtime[0], bl);
+ decode(ifs.front_pingtime[1], bl);
+ decode(ifs.front_pingtime[2], bl);
+ decode(ifs.front_min[0], bl);
+ decode(ifs.front_min[1], bl);
+ decode(ifs.front_min[2], bl);
+ decode(ifs.front_max[0], bl);
+ decode(ifs.front_max[1], bl);
+ decode(ifs.front_max[2], bl);
+ decode(ifs.front_last, bl);
+ hb_pingtime[osd] = ifs;
+ }
+ }
+ DECODE_FINISH(bl);
+}
+
+void osd_stat_t::generate_test_instances(std::list<osd_stat_t*>& o)
+{
+ o.push_back(new osd_stat_t);
+
+ o.push_back(new osd_stat_t);
+ list<store_statfs_t*> ll;
+ store_statfs_t::generate_test_instances(ll);
+ o.back()->statfs = *ll.back();
+ o.back()->hb_peers.push_back(7);
+ o.back()->snap_trim_queue_len = 8;
+ o.back()->num_snap_trimming = 99;
+ o.back()->num_shards_repaired = 101;
+ o.back()->os_alerts[0].emplace(
+ "some alert", "some alert details");
+ o.back()->os_alerts[1].emplace(
+ "some alert2", "some alert2 details");
+ struct Interfaces gen_interfaces = {
+ 123456789, { 1000, 900, 800 }, { 990, 890, 790 }, { 1010, 910, 810 }, 1001,
+ { 1100, 1000, 900 }, { 1090, 990, 890 }, { 1110, 1010, 910 }, 1101 };
+ o.back()->hb_pingtime[20] = gen_interfaces;
+ gen_interfaces = {
+ 987654321, { 100, 200, 300 }, { 90, 190, 290 }, { 110, 210, 310 }, 101 };
+ o.back()->hb_pingtime[30] = gen_interfaces;
+}
+
+// -- pg_t --
+
+int pg_t::print(char *o, int maxlen) const
+{
+ return snprintf(o, maxlen, "%llu.%x", (unsigned long long)pool(), ps());
+}
+
+bool pg_t::parse(const char *s)
+{
+ uint64_t ppool;
+ uint32_t pseed;
+ int r = sscanf(s, "%llu.%x", (long long unsigned *)&ppool, &pseed);
+ if (r < 2)
+ return false;
+ m_pool = ppool;
+ m_seed = pseed;
+ return true;
+}
+
+bool spg_t::parse(const char *s)
+{
+ shard = shard_id_t::NO_SHARD;
+ uint64_t ppool;
+ uint32_t pseed;
+ uint32_t pshard;
+ int r = sscanf(s, "%llu.%x", (long long unsigned *)&ppool, &pseed);
+ if (r < 2)
+ return false;
+ pgid.set_pool(ppool);
+ pgid.set_ps(pseed);
+
+ const char *p = strchr(s, 's');
+ if (p) {
+ r = sscanf(p, "s%u", &pshard);
+ if (r == 1) {
+ shard = shard_id_t(pshard);
+ } else {
+ return false;
+ }
+ }
+ return true;
+}
+
+char *spg_t::calc_name(char *buf, const char *suffix_backwords) const
+{
+ while (*suffix_backwords)
+ *--buf = *suffix_backwords++;
+
+ if (!is_no_shard()) {
+ buf = ritoa<uint8_t, 10>((uint8_t)shard.id, buf);
+ *--buf = 's';
+ }
+
+ return pgid.calc_name(buf, "");
+}
+
+ostream& operator<<(ostream& out, const spg_t &pg)
+{
+ char buf[spg_t::calc_name_buf_size];
+ buf[spg_t::calc_name_buf_size - 1] = '\0';
+ out << pg.calc_name(buf + spg_t::calc_name_buf_size - 1, "");
+ return out;
+}
+
+pg_t pg_t::get_ancestor(unsigned old_pg_num) const
+{
+ int old_bits = cbits(old_pg_num);
+ int old_mask = (1 << old_bits) - 1;
+ pg_t ret = *this;
+ ret.m_seed = ceph_stable_mod(m_seed, old_pg_num, old_mask);
+ return ret;
+}
+
+bool pg_t::is_split(unsigned old_pg_num, unsigned new_pg_num, set<pg_t> *children) const
+{
+ //ceph_assert(m_seed < old_pg_num);
+ if (m_seed >= old_pg_num) {
+ // degenerate case
+ return false;
+ }
+ if (new_pg_num <= old_pg_num)
+ return false;
+
+ bool split = false;
+ if (true) {
+ unsigned old_bits = cbits(old_pg_num);
+ unsigned old_mask = (1 << old_bits) - 1;
+ for (unsigned n = 1; ; n++) {
+ unsigned next_bit = (n << (old_bits-1));
+ unsigned s = next_bit | m_seed;
+
+ if (s < old_pg_num || s == m_seed)
+ continue;
+ if (s >= new_pg_num)
+ break;
+ if ((unsigned)ceph_stable_mod(s, old_pg_num, old_mask) == m_seed) {
+ split = true;
+ if (children)
+ children->insert(pg_t(s, m_pool));
+ }
+ }
+ }
+ if (false) {
+ // brute force
+ int old_bits = cbits(old_pg_num);
+ int old_mask = (1 << old_bits) - 1;
+ for (unsigned x = old_pg_num; x < new_pg_num; ++x) {
+ unsigned o = ceph_stable_mod(x, old_pg_num, old_mask);
+ if (o == m_seed) {
+ split = true;
+ children->insert(pg_t(x, m_pool));
+ }
+ }
+ }
+ return split;
+}
+
+unsigned pg_t::get_split_bits(unsigned pg_num) const {
+ if (pg_num == 1)
+ return 0;
+ ceph_assert(pg_num > 1);
+
+ // Find unique p such that pg_num \in [2^(p-1), 2^p)
+ unsigned p = cbits(pg_num);
+ ceph_assert(p); // silence coverity #751330
+
+ if ((m_seed % (1<<(p-1))) < (pg_num % (1<<(p-1))))
+ return p;
+ else
+ return p - 1;
+}
+
+bool pg_t::is_merge_source(
+ unsigned old_pg_num,
+ unsigned new_pg_num,
+ pg_t *parent) const
+{
+ if (m_seed < old_pg_num &&
+ m_seed >= new_pg_num) {
+ if (parent) {
+ pg_t t = *this;
+ while (t.m_seed >= new_pg_num) {
+ t = t.get_parent();
+ }
+ *parent = t;
+ }
+ return true;
+ }
+ return false;
+}
+
+pg_t pg_t::get_parent() const
+{
+ unsigned bits = cbits(m_seed);
+ ceph_assert(bits);
+ pg_t retval = *this;
+ retval.m_seed &= ~((~0)<<(bits - 1));
+ return retval;
+}
+
+hobject_t pg_t::get_hobj_start() const
+{
+ return hobject_t(object_t(), string(), 0, m_seed, m_pool,
+ string());
+}
+
+hobject_t pg_t::get_hobj_end(unsigned pg_num) const
+{
+ // note: this assumes a bitwise sort; with the legacy nibblewise
+ // sort a PG did not always cover a single contiguous range of the
+ // (bit-reversed) hash range.
+ unsigned bits = get_split_bits(pg_num);
+ uint64_t rev_start = hobject_t::_reverse_bits(m_seed);
+ uint64_t rev_end = (rev_start | (0xffffffff >> bits)) + 1;
+ if (rev_end >= 0x100000000) {
+ ceph_assert(rev_end == 0x100000000);
+ return hobject_t::get_max();
+ } else {
+ return hobject_t(object_t(), string(), CEPH_NOSNAP,
+ hobject_t::_reverse_bits(rev_end), m_pool,
+ string());
+ }
+}
+
+void pg_t::dump(Formatter *f) const
+{
+ f->dump_unsigned("pool", m_pool);
+ f->dump_unsigned("seed", m_seed);
+}
+
+void pg_t::generate_test_instances(list<pg_t*>& o)
+{
+ o.push_back(new pg_t);
+ o.push_back(new pg_t(1, 2));
+ o.push_back(new pg_t(13123, 3));
+ o.push_back(new pg_t(131223, 4));
+}
+
+char *pg_t::calc_name(char *buf, const char *suffix_backwords) const
+{
+ while (*suffix_backwords)
+ *--buf = *suffix_backwords++;
+
+ buf = ritoa<uint32_t, 16>(m_seed, buf);
+
+ *--buf = '.';
+
+ return ritoa<uint64_t, 10>(m_pool, buf);
+}
+
+ostream& operator<<(ostream& out, const pg_t &pg)
+{
+ char buf[pg_t::calc_name_buf_size];
+ buf[pg_t::calc_name_buf_size - 1] = '\0';
+ out << pg.calc_name(buf + pg_t::calc_name_buf_size - 1, "");
+ return out;
+}
+
+
+// -- coll_t --
+
+void coll_t::calc_str()
+{
+ switch (type) {
+ case TYPE_META:
+ strcpy(_str_buff, "meta");
+ _str = _str_buff;
+ break;
+ case TYPE_PG:
+ _str_buff[spg_t::calc_name_buf_size - 1] = '\0';
+ _str = pgid.calc_name(_str_buff + spg_t::calc_name_buf_size - 1, "daeh_");
+ break;
+ case TYPE_PG_TEMP:
+ _str_buff[spg_t::calc_name_buf_size - 1] = '\0';
+ _str = pgid.calc_name(_str_buff + spg_t::calc_name_buf_size - 1, "PMET_");
+ break;
+ default:
+ ceph_abort_msg("unknown collection type");
+ }
+}
+
+bool coll_t::parse(const std::string& s)
+{
+ if (s == "meta") {
+ type = TYPE_META;
+ pgid = spg_t();
+ removal_seq = 0;
+ calc_str();
+ ceph_assert(s == _str);
+ return true;
+ }
+ if (s.find("_head") == s.length() - 5 &&
+ pgid.parse(s.substr(0, s.length() - 5))) {
+ type = TYPE_PG;
+ removal_seq = 0;
+ calc_str();
+ ceph_assert(s == _str);
+ return true;
+ }
+ if (s.find("_TEMP") == s.length() - 5 &&
+ pgid.parse(s.substr(0, s.length() - 5))) {
+ type = TYPE_PG_TEMP;
+ removal_seq = 0;
+ calc_str();
+ ceph_assert(s == _str);
+ return true;
+ }
+ return false;
+}
+
+void coll_t::encode(ceph::buffer::list& bl) const
+{
+ using ceph::encode;
+ // when changing this, remember to update encoded_size() too.
+ if (is_temp()) {
+ // can't express this as v2...
+ __u8 struct_v = 3;
+ encode(struct_v, bl);
+ encode(to_str(), bl);
+ } else {
+ __u8 struct_v = 2;
+ encode(struct_v, bl);
+ encode((__u8)type, bl);
+ encode(pgid, bl);
+ snapid_t snap = CEPH_NOSNAP;
+ encode(snap, bl);
+ }
+}
+
+size_t coll_t::encoded_size() const
+{
+ size_t r = sizeof(__u8);
+ if (is_temp()) {
+ // v3
+ r += sizeof(__u32);
+ if (_str) {
+ r += strlen(_str);
+ }
+ } else {
+ // v2
+ // 1. type
+ r += sizeof(__u8);
+ // 2. pgid
+ // - encoding header
+ r += sizeof(ceph_le32) + 2 * sizeof(__u8);
+ // - pg_t
+ r += sizeof(__u8) + sizeof(uint64_t) + 2 * sizeof(uint32_t);
+ // - shard_id_t
+ r += sizeof(int8_t);
+ // 3. snapid_t
+ r += sizeof(uint64_t);
+ }
+
+ return r;
+}
+
+void coll_t::decode(ceph::buffer::list::const_iterator& bl)
+{
+ using ceph::decode;
+ __u8 struct_v;
+ decode(struct_v, bl);
+ switch (struct_v) {
+ case 1:
+ {
+ snapid_t snap;
+ decode(pgid, bl);
+ decode(snap, bl);
+
+ // infer the type
+ if (pgid == spg_t() && snap == 0) {
+ type = TYPE_META;
+ } else {
+ type = TYPE_PG;
+ }
+ removal_seq = 0;
+ }
+ break;
+
+ case 2:
+ {
+ __u8 _type;
+ snapid_t snap;
+ decode(_type, bl);
+ decode(pgid, bl);
+ decode(snap, bl);
+ type = (type_t)_type;
+ removal_seq = 0;
+ }
+ break;
+
+ case 3:
+ {
+ string str;
+ decode(str, bl);
+ bool ok = parse(str);
+ if (!ok)
+ throw std::domain_error(std::string("unable to parse pg ") + str);
+ }
+ break;
+
+ default:
+ {
+ CachedStackStringStream css;
+ *css << "coll_t::decode(): don't know how to decode version "
+ << struct_v;
+ throw std::domain_error(css->str());
+ }
+ }
+}
+
+void coll_t::dump(Formatter *f) const
+{
+ f->dump_unsigned("type_id", (unsigned)type);
+ if (type != TYPE_META)
+ f->dump_stream("pgid") << pgid;
+ f->dump_string("name", to_str());
+}
+
+void coll_t::generate_test_instances(list<coll_t*>& o)
+{
+ o.push_back(new coll_t());
+ o.push_back(new coll_t(spg_t(pg_t(1, 0), shard_id_t::NO_SHARD)));
+ o.push_back(new coll_t(o.back()->get_temp()));
+ o.push_back(new coll_t(spg_t(pg_t(3, 2), shard_id_t(12))));
+ o.push_back(new coll_t(o.back()->get_temp()));
+ o.push_back(new coll_t());
+}
+
+// ---
+
+std::string pg_vector_string(const vector<int32_t> &a)
+{
+ CachedStackStringStream css;
+ *css << "[";
+ for (auto i = a.cbegin(); i != a.cend(); ++i) {
+ if (i != a.begin())
+ *css << ",";
+ if (*i != CRUSH_ITEM_NONE)
+ *css << *i;
+ else
+ *css << "NONE";
+ }
+ *css << "]";
+ return css->str();
+}
+
+std::string pg_state_string(uint64_t state)
+{
+ CachedStackStringStream css;
+ if (state & PG_STATE_STALE)
+ *css << "stale+";
+ if (state & PG_STATE_CREATING)
+ *css << "creating+";
+ if (state & PG_STATE_ACTIVE)
+ *css << "active+";
+ if (state & PG_STATE_ACTIVATING)
+ *css << "activating+";
+ if (state & PG_STATE_CLEAN)
+ *css << "clean+";
+ if (state & PG_STATE_RECOVERY_WAIT)
+ *css << "recovery_wait+";
+ if (state & PG_STATE_RECOVERY_TOOFULL)
+ *css << "recovery_toofull+";
+ if (state & PG_STATE_RECOVERING)
+ *css << "recovering+";
+ if (state & PG_STATE_FORCED_RECOVERY)
+ *css << "forced_recovery+";
+ if (state & PG_STATE_DOWN)
+ *css << "down+";
+ if (state & PG_STATE_RECOVERY_UNFOUND)
+ *css << "recovery_unfound+";
+ if (state & PG_STATE_BACKFILL_UNFOUND)
+ *css << "backfill_unfound+";
+ if (state & PG_STATE_UNDERSIZED)
+ *css << "undersized+";
+ if (state & PG_STATE_DEGRADED)
+ *css << "degraded+";
+ if (state & PG_STATE_REMAPPED)
+ *css << "remapped+";
+ if (state & PG_STATE_PREMERGE)
+ *css << "premerge+";
+ if (state & PG_STATE_SCRUBBING)
+ *css << "scrubbing+";
+ if (state & PG_STATE_DEEP_SCRUB)
+ *css << "deep+";
+ if (state & PG_STATE_INCONSISTENT)
+ *css << "inconsistent+";
+ if (state & PG_STATE_PEERING)
+ *css << "peering+";
+ if (state & PG_STATE_REPAIR)
+ *css << "repair+";
+ if (state & PG_STATE_BACKFILL_WAIT)
+ *css << "backfill_wait+";
+ if (state & PG_STATE_BACKFILLING)
+ *css << "backfilling+";
+ if (state & PG_STATE_FORCED_BACKFILL)
+ *css << "forced_backfill+";
+ if (state & PG_STATE_BACKFILL_TOOFULL)
+ *css << "backfill_toofull+";
+ if (state & PG_STATE_INCOMPLETE)
+ *css << "incomplete+";
+ if (state & PG_STATE_PEERED)
+ *css << "peered+";
+ if (state & PG_STATE_SNAPTRIM)
+ *css << "snaptrim+";
+ if (state & PG_STATE_SNAPTRIM_WAIT)
+ *css << "snaptrim_wait+";
+ if (state & PG_STATE_SNAPTRIM_ERROR)
+ *css << "snaptrim_error+";
+ if (state & PG_STATE_FAILED_REPAIR)
+ *css << "failed_repair+";
+ if (state & PG_STATE_LAGGY)
+ *css << "laggy+";
+ if (state & PG_STATE_WAIT)
+ *css << "wait+";
+ auto ret = css->str();
+ if (ret.length() > 0)
+ ret.resize(ret.length() - 1);
+ else
+ ret = "unknown";
+ return ret;
+}
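+// Example (illustrative): PG_STATE_ACTIVE|PG_STATE_CLEAN renders as
+// "active+clean" (the trailing '+' is trimmed above); a state of 0, or one
+// containing only bits not listed here, renders as "unknown".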
+
+std::optional<uint64_t> pg_string_state(const std::string& state)
+{
+ std::optional<uint64_t> type;
+ if (state == "active")
+ type = PG_STATE_ACTIVE;
+ else if (state == "clean")
+ type = PG_STATE_CLEAN;
+ else if (state == "down")
+ type = PG_STATE_DOWN;
+ else if (state == "recovery_unfound")
+ type = PG_STATE_RECOVERY_UNFOUND;
+ else if (state == "backfill_unfound")
+ type = PG_STATE_BACKFILL_UNFOUND;
+ else if (state == "premerge")
+ type = PG_STATE_PREMERGE;
+ else if (state == "scrubbing")
+ type = PG_STATE_SCRUBBING;
+ else if (state == "degraded")
+ type = PG_STATE_DEGRADED;
+ else if (state == "inconsistent")
+ type = PG_STATE_INCONSISTENT;
+ else if (state == "peering")
+ type = PG_STATE_PEERING;
+ else if (state == "repair")
+ type = PG_STATE_REPAIR;
+ else if (state == "recovering")
+ type = PG_STATE_RECOVERING;
+ else if (state == "forced_recovery")
+ type = PG_STATE_FORCED_RECOVERY;
+ else if (state == "backfill_wait")
+ type = PG_STATE_BACKFILL_WAIT;
+ else if (state == "incomplete")
+ type = PG_STATE_INCOMPLETE;
+ else if (state == "stale")
+ type = PG_STATE_STALE;
+ else if (state == "remapped")
+ type = PG_STATE_REMAPPED;
+ else if (state == "deep")
+ type = PG_STATE_DEEP_SCRUB;
+ else if (state == "backfilling")
+ type = PG_STATE_BACKFILLING;
+ else if (state == "forced_backfill")
+ type = PG_STATE_FORCED_BACKFILL;
+ else if (state == "backfill_toofull")
+ type = PG_STATE_BACKFILL_TOOFULL;
+ else if (state == "recovery_wait")
+ type = PG_STATE_RECOVERY_WAIT;
+ else if (state == "recovery_toofull")
+ type = PG_STATE_RECOVERY_TOOFULL;
+ else if (state == "undersized")
+ type = PG_STATE_UNDERSIZED;
+ else if (state == "activating")
+ type = PG_STATE_ACTIVATING;
+ else if (state == "peered")
+ type = PG_STATE_PEERED;
+ else if (state == "snaptrim")
+ type = PG_STATE_SNAPTRIM;
+ else if (state == "snaptrim_wait")
+ type = PG_STATE_SNAPTRIM_WAIT;
+ else if (state == "snaptrim_error")
+ type = PG_STATE_SNAPTRIM_ERROR;
+ else if (state == "creating")
+ type = PG_STATE_CREATING;
+ else if (state == "failed_repair")
+ type = PG_STATE_FAILED_REPAIR;
+ else if (state == "laggy")
+ type = PG_STATE_LAGGY;
+ else if (state == "wait")
+ type = PG_STATE_WAIT;
+ else if (state == "unknown")
+ type = 0;
+ else
+ type = std::nullopt;
+ return type;
+}
+
+// -- eversion_t --
+string eversion_t::get_key_name() const
+{
+ std::string key(32, ' ');
+ get_key_name(&key[0]);
+ key.resize(31); // remove the null terminator
+ return key;
+}
+
+// -- pool_snap_info_t --
+void pool_snap_info_t::dump(Formatter *f) const
+{
+ f->dump_unsigned("snapid", snapid);
+ f->dump_stream("stamp") << stamp;
+ f->dump_string("name", name);
+}
+
+void pool_snap_info_t::encode(ceph::buffer::list& bl, uint64_t features) const
+{
+ using ceph::encode;
+ if ((features & CEPH_FEATURE_PGPOOL3) == 0) {
+ __u8 struct_v = 1;
+ encode(struct_v, bl);
+ encode(snapid, bl);
+ encode(stamp, bl);
+ encode(name, bl);
+ return;
+ }
+ ENCODE_START(2, 2, bl);
+ encode(snapid, bl);
+ encode(stamp, bl);
+ encode(name, bl);
+ ENCODE_FINISH(bl);
+}
+
+void pool_snap_info_t::decode(ceph::buffer::list::const_iterator& bl)
+{
+ DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
+ decode(snapid, bl);
+ decode(stamp, bl);
+ decode(name, bl);
+ DECODE_FINISH(bl);
+}
+
+void pool_snap_info_t::generate_test_instances(list<pool_snap_info_t*>& o)
+{
+ o.push_back(new pool_snap_info_t);
+ o.push_back(new pool_snap_info_t);
+ o.back()->snapid = 1;
+ o.back()->stamp = utime_t(1, 2);
+ o.back()->name = "foo";
+}
+
+// -- pool_opts_t --
+
+// The order of items in this list is important; therefore, always add
+// new options at the end of the list.
+
+typedef std::map<std::string, pool_opts_t::opt_desc_t> opt_mapping_t;
+static opt_mapping_t opt_mapping = boost::assign::map_list_of
+ ("scrub_min_interval", pool_opts_t::opt_desc_t(
+ pool_opts_t::SCRUB_MIN_INTERVAL, pool_opts_t::DOUBLE))
+ ("scrub_max_interval", pool_opts_t::opt_desc_t(
+ pool_opts_t::SCRUB_MAX_INTERVAL, pool_opts_t::DOUBLE))
+ ("deep_scrub_interval", pool_opts_t::opt_desc_t(
+ pool_opts_t::DEEP_SCRUB_INTERVAL, pool_opts_t::DOUBLE))
+ ("recovery_priority", pool_opts_t::opt_desc_t(
+ pool_opts_t::RECOVERY_PRIORITY, pool_opts_t::INT))
+ ("recovery_op_priority", pool_opts_t::opt_desc_t(
+ pool_opts_t::RECOVERY_OP_PRIORITY, pool_opts_t::INT))
+ ("scrub_priority", pool_opts_t::opt_desc_t(
+ pool_opts_t::SCRUB_PRIORITY, pool_opts_t::INT))
+ ("compression_mode", pool_opts_t::opt_desc_t(
+ pool_opts_t::COMPRESSION_MODE, pool_opts_t::STR))
+ ("compression_algorithm", pool_opts_t::opt_desc_t(
+ pool_opts_t::COMPRESSION_ALGORITHM, pool_opts_t::STR))
+ ("compression_required_ratio", pool_opts_t::opt_desc_t(
+ pool_opts_t::COMPRESSION_REQUIRED_RATIO, pool_opts_t::DOUBLE))
+ ("compression_max_blob_size", pool_opts_t::opt_desc_t(
+ pool_opts_t::COMPRESSION_MAX_BLOB_SIZE, pool_opts_t::INT))
+ ("compression_min_blob_size", pool_opts_t::opt_desc_t(
+ pool_opts_t::COMPRESSION_MIN_BLOB_SIZE, pool_opts_t::INT))
+ ("csum_type", pool_opts_t::opt_desc_t(
+ pool_opts_t::CSUM_TYPE, pool_opts_t::INT))
+ ("csum_max_block", pool_opts_t::opt_desc_t(
+ pool_opts_t::CSUM_MAX_BLOCK, pool_opts_t::INT))
+ ("csum_min_block", pool_opts_t::opt_desc_t(
+ pool_opts_t::CSUM_MIN_BLOCK, pool_opts_t::INT))
+ ("fingerprint_algorithm", pool_opts_t::opt_desc_t(
+ pool_opts_t::FINGERPRINT_ALGORITHM, pool_opts_t::STR))
+ ("pg_num_min", pool_opts_t::opt_desc_t(
+ pool_opts_t::PG_NUM_MIN, pool_opts_t::INT))
+ ("target_size_bytes", pool_opts_t::opt_desc_t(
+ pool_opts_t::TARGET_SIZE_BYTES, pool_opts_t::INT))
+ ("target_size_ratio", pool_opts_t::opt_desc_t(
+ pool_opts_t::TARGET_SIZE_RATIO, pool_opts_t::DOUBLE))
+ ("pg_autoscale_bias", pool_opts_t::opt_desc_t(
+ pool_opts_t::PG_AUTOSCALE_BIAS, pool_opts_t::DOUBLE))
+ ("read_lease_interval", pool_opts_t::opt_desc_t(
+ pool_opts_t::READ_LEASE_INTERVAL, pool_opts_t::DOUBLE))
+ ("dedup_tier", pool_opts_t::opt_desc_t(
+ pool_opts_t::DEDUP_TIER, pool_opts_t::INT))
+ ("dedup_chunk_algorithm", pool_opts_t::opt_desc_t(
+ pool_opts_t::DEDUP_CHUNK_ALGORITHM, pool_opts_t::STR))
+ ("dedup_cdc_chunk_size", pool_opts_t::opt_desc_t(
+ pool_opts_t::DEDUP_CDC_CHUNK_SIZE, pool_opts_t::INT))
+ ("pg_num_max", pool_opts_t::opt_desc_t(
+ pool_opts_t::PG_NUM_MAX, pool_opts_t::INT));
+
+bool pool_opts_t::is_opt_name(const std::string& name)
+{
+ return opt_mapping.count(name);
+}
+
+pool_opts_t::opt_desc_t pool_opts_t::get_opt_desc(const std::string& name)
+{
+ auto i = opt_mapping.find(name);
+ ceph_assert(i != opt_mapping.end());
+ return i->second;
+}
+
+bool pool_opts_t::is_set(pool_opts_t::key_t key) const
+{
+ return opts.count(key);
+}
+
+const pool_opts_t::value_t& pool_opts_t::get(pool_opts_t::key_t key) const
+{
+ auto i = opts.find(key);
+ ceph_assert(i != opts.end());
+ return i->second;
+}
+
+bool pool_opts_t::unset(pool_opts_t::key_t key) {
+ return opts.erase(key) > 0;
+}
+
+class pool_opts_dumper_t : public boost::static_visitor<> {
+public:
+ pool_opts_dumper_t(const std::string& name_, Formatter* f_) :
+ name(name_.c_str()), f(f_) {}
+
+ void operator()(std::string s) const {
+ f->dump_string(name, s);
+ }
+ void operator()(int64_t i) const {
+ f->dump_int(name, i);
+ }
+ void operator()(double d) const {
+ f->dump_float(name, d);
+ }
+
+private:
+ const char* name;
+ Formatter* f;
+};
+
+void pool_opts_t::dump(const std::string& name, Formatter* f) const
+{
+ const opt_desc_t& desc = get_opt_desc(name);
+ auto i = opts.find(desc.key);
+ if (i == opts.end()) {
+ return;
+ }
+ boost::apply_visitor(pool_opts_dumper_t(name, f), i->second);
+}
+
+void pool_opts_t::dump(Formatter* f) const
+{
+ for (auto i = opt_mapping.cbegin(); i != opt_mapping.cend(); ++i) {
+ const std::string& name = i->first;
+ const opt_desc_t& desc = i->second;
+ auto j = opts.find(desc.key);
+ if (j == opts.end()) {
+ continue;
+ }
+ boost::apply_visitor(pool_opts_dumper_t(name, f), j->second);
+ }
+}
+
+class pool_opts_encoder_t : public boost::static_visitor<> {
+public:
+ explicit pool_opts_encoder_t(ceph::buffer::list& bl_, uint64_t features)
+ : bl(bl_),
+ features(features) {}
+
+ void operator()(const std::string &s) const {
+ encode(static_cast<int32_t>(pool_opts_t::STR), bl);
+ encode(s, bl);
+ }
+ void operator()(int64_t i) const {
+ encode(static_cast<int32_t>(pool_opts_t::INT), bl);
+ if (HAVE_FEATURE(features, SERVER_NAUTILUS)) {
+ encode(i, bl);
+ } else {
+ encode(static_cast<int32_t>(i), bl);
+ }
+ }
+ void operator()(double d) const {
+ encode(static_cast<int32_t>(pool_opts_t::DOUBLE), bl);
+ encode(d, bl);
+ }
+
+private:
+ ceph::buffer::list& bl;
+ uint64_t features;
+};
+
+void pool_opts_t::encode(ceph::buffer::list& bl, uint64_t features) const
+{
+ unsigned v = 2;
+ if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {
+ v = 1;
+ }
+ ENCODE_START(v, 1, bl);
+ uint32_t n = static_cast<uint32_t>(opts.size());
+ encode(n, bl);
+ for (auto i = opts.cbegin(); i != opts.cend(); ++i) {
+ encode(static_cast<int32_t>(i->first), bl);
+ boost::apply_visitor(pool_opts_encoder_t(bl, features), i->second);
+ }
+ ENCODE_FINISH(bl);
+}
+
+void pool_opts_t::decode(ceph::buffer::list::const_iterator& bl)
+{
+ DECODE_START(1, bl);
+ __u32 n;
+ decode(n, bl);
+ opts.clear();
+ while (n--) {
+ int32_t k, t;
+ decode(k, bl);
+ decode(t, bl);
+ if (t == STR) {
+ std::string s;
+ decode(s, bl);
+ opts[static_cast<key_t>(k)] = s;
+ } else if (t == INT) {
+ int64_t i;
+ if (struct_v >= 2) {
+ decode(i, bl);
+ } else {
+ int ii;
+ decode(ii, bl);
+ i = ii;
+ }
+ opts[static_cast<key_t>(k)] = i;
+ } else if (t == DOUBLE) {
+ double d;
+ decode(d, bl);
+ opts[static_cast<key_t>(k)] = d;
+ } else {
+ ceph_assert(!"invalid type");
+ }
+ }
+ DECODE_FINISH(bl);
+}
+
+ostream& operator<<(ostream& out, const pool_opts_t& opts)
+{
+ for (auto i = opt_mapping.begin(); i != opt_mapping.end(); ++i) {
+ const std::string& name = i->first;
+ const pool_opts_t::opt_desc_t& desc = i->second;
+ auto j = opts.opts.find(desc.key);
+ if (j == opts.opts.end()) {
+ continue;
+ }
+ out << " " << name << " " << j->second;
+ }
+ return out;
+}
+
+// -- pg_pool_t --
+
+const char *pg_pool_t::APPLICATION_NAME_CEPHFS("cephfs");
+const char *pg_pool_t::APPLICATION_NAME_RBD("rbd");
+const char *pg_pool_t::APPLICATION_NAME_RGW("rgw");
+
+void pg_pool_t::dump(Formatter *f) const
+{
+ f->dump_stream("create_time") << get_create_time();
+ f->dump_unsigned("flags", get_flags());
+ f->dump_string("flags_names", get_flags_string());
+ f->dump_int("type", get_type());
+ f->dump_int("size", get_size());
+ f->dump_int("min_size", get_min_size());
+ f->dump_int("crush_rule", get_crush_rule());
+ f->dump_int("peering_crush_bucket_count", peering_crush_bucket_count);
+ f->dump_int("peering_crush_bucket_target", peering_crush_bucket_target);
+ f->dump_int("peering_crush_bucket_barrier", peering_crush_bucket_barrier);
+ f->dump_int("peering_crush_bucket_mandatory_member", peering_crush_mandatory_member);
+ f->dump_int("object_hash", get_object_hash());
+ f->dump_string("pg_autoscale_mode",
+ get_pg_autoscale_mode_name(pg_autoscale_mode));
+ f->dump_unsigned("pg_num", get_pg_num());
+ f->dump_unsigned("pg_placement_num", get_pgp_num());
+ f->dump_unsigned("pg_placement_num_target", get_pgp_num_target());
+ f->dump_unsigned("pg_num_target", get_pg_num_target());
+ f->dump_unsigned("pg_num_pending", get_pg_num_pending());
+ f->dump_object("last_pg_merge_meta", last_pg_merge_meta);
+ f->dump_stream("last_change") << get_last_change();
+ f->dump_stream("last_force_op_resend") << get_last_force_op_resend();
+ f->dump_stream("last_force_op_resend_prenautilus")
+ << get_last_force_op_resend_prenautilus();
+ f->dump_stream("last_force_op_resend_preluminous")
+ << get_last_force_op_resend_preluminous();
+ f->dump_unsigned("auid", get_auid());
+ f->dump_string("snap_mode", is_pool_snaps_mode() ? "pool" : "selfmanaged");
+ f->dump_unsigned("snap_seq", get_snap_seq());
+ f->dump_unsigned("snap_epoch", get_snap_epoch());
+ f->open_array_section("pool_snaps");
+ for (auto p = snaps.cbegin(); p != snaps.cend(); ++p) {
+ f->open_object_section("pool_snap_info");
+ p->second.dump(f);
+ f->close_section();
+ }
+ f->close_section();
+ f->dump_stream("removed_snaps") << removed_snaps;
+ f->dump_unsigned("quota_max_bytes", quota_max_bytes);
+ f->dump_unsigned("quota_max_objects", quota_max_objects);
+ f->open_array_section("tiers");
+ for (auto p = tiers.cbegin(); p != tiers.cend(); ++p)
+ f->dump_unsigned("pool_id", *p);
+ f->close_section();
+ f->dump_int("tier_of", tier_of);
+ f->dump_int("read_tier", read_tier);
+ f->dump_int("write_tier", write_tier);
+ f->dump_string("cache_mode", get_cache_mode_name());
+ f->dump_unsigned("target_max_bytes", target_max_bytes);
+ f->dump_unsigned("target_max_objects", target_max_objects);
+ f->dump_unsigned("cache_target_dirty_ratio_micro",
+ cache_target_dirty_ratio_micro);
+ f->dump_unsigned("cache_target_dirty_high_ratio_micro",
+ cache_target_dirty_high_ratio_micro);
+ f->dump_unsigned("cache_target_full_ratio_micro",
+ cache_target_full_ratio_micro);
+ f->dump_unsigned("cache_min_flush_age", cache_min_flush_age);
+ f->dump_unsigned("cache_min_evict_age", cache_min_evict_age);
+ f->dump_string("erasure_code_profile", erasure_code_profile);
+ f->open_object_section("hit_set_params");
+ hit_set_params.dump(f);
+ f->close_section(); // hit_set_params
+ f->dump_unsigned("hit_set_period", hit_set_period);
+ f->dump_unsigned("hit_set_count", hit_set_count);
+ f->dump_bool("use_gmt_hitset", use_gmt_hitset);
+ f->dump_unsigned("min_read_recency_for_promote", min_read_recency_for_promote);
+ f->dump_unsigned("min_write_recency_for_promote", min_write_recency_for_promote);
+ f->dump_unsigned("hit_set_grade_decay_rate", hit_set_grade_decay_rate);
+ f->dump_unsigned("hit_set_search_last_n", hit_set_search_last_n);
+ f->open_array_section("grade_table");
+ for (unsigned i = 0; i < hit_set_count; ++i)
+ f->dump_unsigned("value", get_grade(i));
+ f->close_section();
+ f->dump_unsigned("stripe_width", get_stripe_width());
+ f->dump_unsigned("expected_num_objects", expected_num_objects);
+ f->dump_bool("fast_read", fast_read);
+ f->open_object_section("options");
+ opts.dump(f);
+ f->close_section(); // options
+ f->open_object_section("application_metadata");
+ for (auto &app_pair : application_metadata) {
+ f->open_object_section(app_pair.first.c_str());
+ for (auto &kv_pair : app_pair.second) {
+ f->dump_string(kv_pair.first.c_str(), kv_pair.second);
+ }
+ f->close_section(); // application
+ }
+ f->close_section(); // application_metadata
+}
+
+void pg_pool_t::convert_to_pg_shards(const vector<int> &from, set<pg_shard_t>* to) const {
+ for (size_t i = 0; i < from.size(); ++i) {
+ if (from[i] != CRUSH_ITEM_NONE) {
+ to->insert(
+ pg_shard_t(
+ from[i],
+ is_erasure() ? shard_id_t(i) : shard_id_t::NO_SHARD));
+ }
+ }
+}
+
+void pg_pool_t::calc_pg_masks()
+{
+ pg_num_mask = (1 << cbits(pg_num-1)) - 1;
+ pgp_num_mask = (1 << cbits(pgp_num-1)) - 1;
+}
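+// Worked example (illustrative, assuming cbits(n) is the number of bits
+// needed to represent n): the mask is pg_num rounded up to the next power of
+// two, minus one. With pg_num = 6, cbits(5) = 3 and pg_num_mask = 7;
+// pg_num = 8 yields the same mask of 7.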
+
+unsigned pg_pool_t::get_pg_num_divisor(pg_t pgid) const
+{
+ if (pg_num == pg_num_mask + 1)
+ return pg_num; // power-of-2 split
+ unsigned mask = pg_num_mask >> 1;
+ if ((pgid.ps() & mask) < (pg_num & mask))
+ return pg_num_mask + 1; // smaller bin size (already split)
+ else
+ return (pg_num_mask + 1) >> 1; // bigger bin (not yet split)
+}
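+// Worked example (illustrative): with pg_num = 6 (pg_num_mask = 7), PGs whose
+// ps & 3 is 0 or 1 have already been split and get divisor 8 (smaller bins),
+// while PGs whose ps & 3 is 2 or 3 have not yet split and get divisor 4.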
+
+bool pg_pool_t::is_pending_merge(pg_t pgid, bool *target) const
+{
+ if (pg_num_pending >= pg_num) {
+ return false;
+ }
+ if (pgid.ps() >= pg_num_pending && pgid.ps() < pg_num) {
+ if (target) {
+ *target = false;
+ }
+ return true;
+ }
+ for (unsigned ps = pg_num_pending; ps < pg_num; ++ps) {
+ if (pg_t(ps, pgid.pool()).get_parent() == pgid) {
+ if (target) {
+ *target = true;
+ }
+ return true;
+ }
+ }
+ return false;
+}
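+// Example (illustrative): while pg_num_pending < pg_num, a PG whose ps lies
+// in [pg_num_pending, pg_num) is a pending merge source (*target = false),
+// and the PG it folds into, i.e. its get_parent(), is the merge target
+// (*target = true).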
+
+/*
+ * we have two snap modes:
+ * - pool snaps
+ * - snap existence/non-existence defined by snaps[] and snap_seq
+ * - user managed snaps
+ * - existence tracked by librados user
+ */
+bool pg_pool_t::is_pool_snaps_mode() const
+{
+ return has_flag(FLAG_POOL_SNAPS);
+}
+
+bool pg_pool_t::is_unmanaged_snaps_mode() const
+{
+ return has_flag(FLAG_SELFMANAGED_SNAPS);
+}
+
+bool pg_pool_t::is_removed_snap(snapid_t s) const
+{
+ if (is_pool_snaps_mode())
+ return s <= get_snap_seq() && snaps.count(s) == 0;
+ else
+ return removed_snaps.contains(s);
+}
+
+snapid_t pg_pool_t::snap_exists(std::string_view s) const
+{
+ for (auto p = snaps.cbegin(); p != snaps.cend(); ++p)
+ if (p->second.name == s)
+ return p->second.snapid;
+ return 0;
+}
+
+void pg_pool_t::add_snap(const char *n, utime_t stamp)
+{
+ ceph_assert(!is_unmanaged_snaps_mode());
+ flags |= FLAG_POOL_SNAPS;
+ snapid_t s = get_snap_seq() + 1;
+ snap_seq = s;
+ snaps[s].snapid = s;
+ snaps[s].name = n;
+ snaps[s].stamp = stamp;
+}
+
+uint64_t pg_pool_t::add_unmanaged_snap(bool preoctopus_compat)
+{
+ ceph_assert(!is_pool_snaps_mode());
+ if (snap_seq == 0) {
+ if (preoctopus_compat) {
+ // kludge for pre-mimic tracking of pool vs selfmanaged snaps. after
+ // mimic this field is not decoded but our flag is set; pre-mimic, we
+ // have a non-empty removed_snaps to signify a non-pool-snaps pool.
+ removed_snaps.insert(snapid_t(1));
+ }
+ snap_seq = 1;
+ }
+ flags |= FLAG_SELFMANAGED_SNAPS;
+ snap_seq = snap_seq + 1;
+ return snap_seq;
+}
+
+void pg_pool_t::remove_snap(snapid_t s)
+{
+ ceph_assert(snaps.count(s));
+ snaps.erase(s);
+ snap_seq = snap_seq + 1;
+}
+
+void pg_pool_t::remove_unmanaged_snap(snapid_t s, bool preoctopus_compat)
+{
+ ceph_assert(is_unmanaged_snaps_mode());
+ ++snap_seq;
+ if (preoctopus_compat) {
+ removed_snaps.insert(s);
+ // also add the new seq, to try to keep the interval_set contiguous
+ if (!removed_snaps.contains(get_snap_seq())) {
+ removed_snaps.insert(get_snap_seq());
+ }
+ }
+}
+
+SnapContext pg_pool_t::get_snap_context() const
+{
+ vector<snapid_t> s(snaps.size());
+ unsigned i = 0;
+ for (auto p = snaps.crbegin(); p != snaps.crend(); ++p)
+ s[i++] = p->first;
+ return SnapContext(get_snap_seq(), s);
+}
+
+uint32_t pg_pool_t::hash_key(const string& key, const string& ns) const
+{
+ if (ns.empty())
+ return ceph_str_hash(object_hash, key.data(), key.length());
+ int nsl = ns.length();
+ int len = key.length() + nsl + 1;
+ char buf[len];
+ memcpy(&buf[0], ns.data(), nsl);
+ buf[nsl] = '\037';
+ memcpy(&buf[nsl+1], key.data(), key.length());
+ return ceph_str_hash(object_hash, &buf[0], len);
+}
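+// Example (illustrative): with a non-empty namespace the buffer hashed above
+// is "<ns>\037<key>", i.e. the namespace and key joined by the 0x1f unit
+// separator; with an empty namespace the key alone is hashed.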
+
+uint32_t pg_pool_t::raw_hash_to_pg(uint32_t v) const
+{
+ return ceph_stable_mod(v, pg_num, pg_num_mask);
+}
+
+/*
+ * map a raw pg (with full precision ps) into an actual pg, for storage
+ */
+pg_t pg_pool_t::raw_pg_to_pg(pg_t pg) const
+{
+ pg.set_ps(ceph_stable_mod(pg.ps(), pg_num, pg_num_mask));
+ return pg;
+}
+
+/*
+ * map raw pg (full precision ps) into a placement seed. include
+ * pool id in that value so that different pools don't use the same
+ * seeds.
+ */
+ps_t pg_pool_t::raw_pg_to_pps(pg_t pg) const
+{
+ if (flags & FLAG_HASHPSPOOL) {
+ // Hash the pool id so that pool PGs do not overlap.
+ return
+ crush_hash32_2(CRUSH_HASH_RJENKINS1,
+ ceph_stable_mod(pg.ps(), pgp_num, pgp_num_mask),
+ pg.pool());
+ } else {
+ // Legacy behavior; add ps and pool together. This is not a great
+ // idea because the PGs from each pool will essentially overlap on
+ // top of each other: 0.5 == 1.4 == 2.3 == ...
+ return
+ ceph_stable_mod(pg.ps(), pgp_num, pgp_num_mask) +
+ pg.pool();
+ }
+}
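+// Example (illustrative): with FLAG_HASHPSPOOL set, two pools sharing the
+// same raw ps still get different placement seeds because the pool id is
+// mixed in through crush_hash32_2(). Without the flag (and with pgp_num large
+// enough that the stable mod is the identity), pool 0 ps 5, pool 1 ps 4 and
+// pool 2 ps 3 all collapse to seed 5, the overlap noted above.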
+
+uint32_t pg_pool_t::get_random_pg_position(pg_t pg, uint32_t seed) const
+{
+ uint32_t r = crush_hash32_2(CRUSH_HASH_RJENKINS1, seed, 123);
+ if (pg_num == pg_num_mask + 1) {
+ r &= ~pg_num_mask;
+ } else {
+ unsigned smaller_mask = pg_num_mask >> 1;
+ if ((pg.ps() & smaller_mask) < (pg_num & smaller_mask)) {
+ r &= ~pg_num_mask;
+ } else {
+ r &= ~smaller_mask;
+ }
+ }
+ r |= pg.ps();
+ return r;
+}
+
+void pg_pool_t::encode(ceph::buffer::list& bl, uint64_t features) const
+{
+ using ceph::encode;
+ if ((features & CEPH_FEATURE_PGPOOL3) == 0) {
+ // this encoding matches the old struct ceph_pg_pool
+ __u8 struct_v = 2;
+ encode(struct_v, bl);
+ encode(type, bl);
+ encode(size, bl);
+ encode(crush_rule, bl);
+ encode(object_hash, bl);
+ encode(pg_num, bl);
+ encode(pgp_num, bl);
+ __u32 lpg_num = 0, lpgp_num = 0; // tell old code that there are no localized pgs.
+ encode(lpg_num, bl);
+ encode(lpgp_num, bl);
+ encode(last_change, bl);
+ encode(snap_seq, bl);
+ encode(snap_epoch, bl);
+
+ __u32 n = snaps.size();
+ encode(n, bl);
+ n = removed_snaps.num_intervals();
+ encode(n, bl);
+
+ encode(auid, bl);
+
+ encode_nohead(snaps, bl, features);
+ encode_nohead(removed_snaps, bl);
+ return;
+ }
+
+ if ((features & CEPH_FEATURE_OSDENC) == 0) {
+ __u8 struct_v = 4;
+ encode(struct_v, bl);
+ encode(type, bl);
+ encode(size, bl);
+ encode(crush_rule, bl);
+ encode(object_hash, bl);
+ encode(pg_num, bl);
+ encode(pgp_num, bl);
+ __u32 lpg_num = 0, lpgp_num = 0; // tell old code that there are no localized pgs.
+ encode(lpg_num, bl);
+ encode(lpgp_num, bl);
+ encode(last_change, bl);
+ encode(snap_seq, bl);
+ encode(snap_epoch, bl);
+ encode(snaps, bl, features);
+ encode(removed_snaps, bl);
+ encode(auid, bl);
+ encode(flags, bl);
+ encode((uint32_t)0, bl); // crash_replay_interval
+ return;
+ }
+
+ if ((features & CEPH_FEATURE_OSD_POOLRESEND) == 0) {
+ // we simply added last_force_op_resend here, which is a fully
+ // backward compatible change. however, encoding the same map
+ // differently between monitors triggers scrub noise (even though
+ // they are decodable without the feature), so let's be pedantic
+ // about it.
+ ENCODE_START(14, 5, bl);
+ encode(type, bl);
+ encode(size, bl);
+ encode(crush_rule, bl);
+ encode(object_hash, bl);
+ encode(pg_num, bl);
+ encode(pgp_num, bl);
+ __u32 lpg_num = 0, lpgp_num = 0; // tell old code that there are no localized pgs.
+ encode(lpg_num, bl);
+ encode(lpgp_num, bl);
+ encode(last_change, bl);
+ encode(snap_seq, bl);
+ encode(snap_epoch, bl);
+ encode(snaps, bl, features);
+ encode(removed_snaps, bl);
+ encode(auid, bl);
+ encode(flags, bl);
+ encode((uint32_t)0, bl); // crash_replay_interval
+ encode(min_size, bl);
+ encode(quota_max_bytes, bl);
+ encode(quota_max_objects, bl);
+ encode(tiers, bl);
+ encode(tier_of, bl);
+ __u8 c = cache_mode;
+ encode(c, bl);
+ encode(read_tier, bl);
+ encode(write_tier, bl);
+ encode(properties, bl);
+ encode(hit_set_params, bl);
+ encode(hit_set_period, bl);
+ encode(hit_set_count, bl);
+ encode(stripe_width, bl);
+ encode(target_max_bytes, bl);
+ encode(target_max_objects, bl);
+ encode(cache_target_dirty_ratio_micro, bl);
+ encode(cache_target_full_ratio_micro, bl);
+ encode(cache_min_flush_age, bl);
+ encode(cache_min_evict_age, bl);
+ encode(erasure_code_profile, bl);
+ ENCODE_FINISH(bl);
+ return;
+ }
+
+ uint8_t v = 30;
+ // NOTE: any new encoding dependencies must be reflected by
+ // SIGNIFICANT_FEATURES
+ if (!(features & CEPH_FEATURE_NEW_OSDOP_ENCODING)) {
+ // this was the first post-hammer thing we added; if it's missing, encode
+ // like hammer.
+ v = 21;
+ } else if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
+ v = 24;
+ } else if (!HAVE_FEATURE(features, SERVER_MIMIC)) {
+ v = 26;
+ } else if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {
+ v = 27;
+ } else if (!is_stretch_pool()) {
+ v = 29;
+ }
+
+ ENCODE_START(v, 5, bl);
+ encode(type, bl);
+ encode(size, bl);
+ encode(crush_rule, bl);
+ encode(object_hash, bl);
+ encode(pg_num, bl);
+ encode(pgp_num, bl);
+ __u32 lpg_num = 0, lpgp_num = 0; // tell old code that there are no localized pgs.
+ encode(lpg_num, bl);
+ encode(lpgp_num, bl);
+ encode(last_change, bl);
+ encode(snap_seq, bl);
+ encode(snap_epoch, bl);
+ encode(snaps, bl, features);
+ encode(removed_snaps, bl);
+ encode(auid, bl);
+ if (v >= 27) {
+ encode(flags, bl);
+ } else {
+ auto tmp = flags;
+ tmp &= ~(FLAG_SELFMANAGED_SNAPS | FLAG_POOL_SNAPS | FLAG_CREATING);
+ encode(tmp, bl);
+ }
+ encode((uint32_t)0, bl); // crash_replay_interval
+ encode(min_size, bl);
+ encode(quota_max_bytes, bl);
+ encode(quota_max_objects, bl);
+ encode(tiers, bl);
+ encode(tier_of, bl);
+ __u8 c = cache_mode;
+ encode(c, bl);
+ encode(read_tier, bl);
+ encode(write_tier, bl);
+ encode(properties, bl);
+ encode(hit_set_params, bl);
+ encode(hit_set_period, bl);
+ encode(hit_set_count, bl);
+ encode(stripe_width, bl);
+ encode(target_max_bytes, bl);
+ encode(target_max_objects, bl);
+ encode(cache_target_dirty_ratio_micro, bl);
+ encode(cache_target_full_ratio_micro, bl);
+ encode(cache_min_flush_age, bl);
+ encode(cache_min_evict_age, bl);
+ encode(erasure_code_profile, bl);
+ encode(last_force_op_resend_preluminous, bl);
+ encode(min_read_recency_for_promote, bl);
+ encode(expected_num_objects, bl);
+ if (v >= 19) {
+ encode(cache_target_dirty_high_ratio_micro, bl);
+ }
+ if (v >= 20) {
+ encode(min_write_recency_for_promote, bl);
+ }
+ if (v >= 21) {
+ encode(use_gmt_hitset, bl);
+ }
+ if (v >= 22) {
+ encode(fast_read, bl);
+ }
+ if (v >= 23) {
+ encode(hit_set_grade_decay_rate, bl);
+ encode(hit_set_search_last_n, bl);
+ }
+ if (v >= 24) {
+ encode(opts, bl, features);
+ }
+ if (v >= 25) {
+ encode(last_force_op_resend_prenautilus, bl);
+ }
+ if (v >= 26) {
+ encode(application_metadata, bl);
+ }
+ if (v >= 27) {
+ encode(create_time, bl);
+ }
+ if (v >= 28) {
+ encode(pg_num_target, bl);
+ encode(pgp_num_target, bl);
+ encode(pg_num_pending, bl);
+ encode((epoch_t)0, bl); // pg_num_dec_last_epoch_started from 14.1.[01]
+ encode((epoch_t)0, bl); // pg_num_dec_last_epoch_clean from 14.1.[01]
+ encode(last_force_op_resend, bl);
+ encode(pg_autoscale_mode, bl);
+ }
+ if (v >= 29) {
+ encode(last_pg_merge_meta, bl);
+ }
+ if (v >= 30) {
+ encode(peering_crush_bucket_count, bl);
+ encode(peering_crush_bucket_target, bl);
+ encode(peering_crush_bucket_barrier, bl);
+ encode(peering_crush_mandatory_member, bl);
+ }
+ ENCODE_FINISH(bl);
+}
+
+void pg_pool_t::decode(ceph::buffer::list::const_iterator& bl)
+{
+ DECODE_START_LEGACY_COMPAT_LEN(30, 5, 5, bl);
+ decode(type, bl);
+ decode(size, bl);
+ decode(crush_rule, bl);
+ decode(object_hash, bl);
+ decode(pg_num, bl);
+ decode(pgp_num, bl);
+ {
+ __u32 lpg_num, lpgp_num;
+ decode(lpg_num, bl);
+ decode(lpgp_num, bl);
+ }
+ decode(last_change, bl);
+ decode(snap_seq, bl);
+ decode(snap_epoch, bl);
+
+ if (struct_v >= 3) {
+ decode(snaps, bl);
+ decode(removed_snaps, bl);
+ decode(auid, bl);
+ } else {
+ __u32 n, m;
+ decode(n, bl);
+ decode(m, bl);
+ decode(auid, bl);
+ decode_nohead(n, snaps, bl);
+ decode_nohead(m, removed_snaps, bl);
+ }
+
+ if (struct_v >= 4) {
+ decode(flags, bl);
+ uint32_t crash_replay_interval;
+ decode(crash_replay_interval, bl);
+ } else {
+ flags = 0;
+ }
+ // upgrade path for selfmanaged vs pool snaps
+ if (snap_seq > 0 && (flags & (FLAG_SELFMANAGED_SNAPS|FLAG_POOL_SNAPS)) == 0) {
+ if (!removed_snaps.empty()) {
+ flags |= FLAG_SELFMANAGED_SNAPS;
+ } else {
+ flags |= FLAG_POOL_SNAPS;
+ }
+ }
+ if (struct_v >= 7) {
+ decode(min_size, bl);
+ } else {
+ min_size = size - size/2;
+ }
+ if (struct_v >= 8) {
+ decode(quota_max_bytes, bl);
+ decode(quota_max_objects, bl);
+ }
+ if (struct_v >= 9) {
+ decode(tiers, bl);
+ decode(tier_of, bl);
+ __u8 v;
+ decode(v, bl);
+ cache_mode = (cache_mode_t)v;
+ decode(read_tier, bl);
+ decode(write_tier, bl);
+ }
+ if (struct_v >= 10) {
+ decode(properties, bl);
+ }
+ if (struct_v >= 11) {
+ decode(hit_set_params, bl);
+ decode(hit_set_period, bl);
+ decode(hit_set_count, bl);
+ } else {
+ pg_pool_t def;
+ hit_set_period = def.hit_set_period;
+ hit_set_count = def.hit_set_count;
+ }
+ if (struct_v >= 12) {
+ decode(stripe_width, bl);
+ } else {
+ set_stripe_width(0);
+ }
+ if (struct_v >= 13) {
+ decode(target_max_bytes, bl);
+ decode(target_max_objects, bl);
+ decode(cache_target_dirty_ratio_micro, bl);
+ decode(cache_target_full_ratio_micro, bl);
+ decode(cache_min_flush_age, bl);
+ decode(cache_min_evict_age, bl);
+ } else {
+ target_max_bytes = 0;
+ target_max_objects = 0;
+ cache_target_dirty_ratio_micro = 0;
+ cache_target_full_ratio_micro = 0;
+ cache_min_flush_age = 0;
+ cache_min_evict_age = 0;
+ }
+ if (struct_v >= 14) {
+ decode(erasure_code_profile, bl);
+ }
+ if (struct_v >= 15) {
+ decode(last_force_op_resend_preluminous, bl);
+ } else {
+ last_force_op_resend_preluminous = 0;
+ }
+ if (struct_v >= 16) {
+ decode(min_read_recency_for_promote, bl);
+ } else {
+ min_read_recency_for_promote = 1;
+ }
+ if (struct_v >= 17) {
+ decode(expected_num_objects, bl);
+ } else {
+ expected_num_objects = 0;
+ }
+ if (struct_v >= 19) {
+ decode(cache_target_dirty_high_ratio_micro, bl);
+ } else {
+ cache_target_dirty_high_ratio_micro = cache_target_dirty_ratio_micro;
+ }
+ if (struct_v >= 20) {
+ decode(min_write_recency_for_promote, bl);
+ } else {
+ min_write_recency_for_promote = 1;
+ }
+ if (struct_v >= 21) {
+ decode(use_gmt_hitset, bl);
+ } else {
+ use_gmt_hitset = false;
+ }
+ if (struct_v >= 22) {
+ decode(fast_read, bl);
+ } else {
+ fast_read = false;
+ }
+ if (struct_v >= 23) {
+ decode(hit_set_grade_decay_rate, bl);
+ decode(hit_set_search_last_n, bl);
+ } else {
+ hit_set_grade_decay_rate = 0;
+ hit_set_search_last_n = 1;
+ }
+ if (struct_v >= 24) {
+ decode(opts, bl);
+ }
+ if (struct_v >= 25) {
+ decode(last_force_op_resend_prenautilus, bl);
+ } else {
+ last_force_op_resend_prenautilus = last_force_op_resend_preluminous;
+ }
+ if (struct_v >= 26) {
+ decode(application_metadata, bl);
+ }
+ if (struct_v >= 27) {
+ decode(create_time, bl);
+ }
+ if (struct_v >= 28) {
+ decode(pg_num_target, bl);
+ decode(pgp_num_target, bl);
+ decode(pg_num_pending, bl);
+ epoch_t old_merge_last_epoch_clean, old_merge_last_epoch_started;
+ decode(old_merge_last_epoch_started, bl);
+ decode(old_merge_last_epoch_clean, bl);
+ decode(last_force_op_resend, bl);
+ decode(pg_autoscale_mode, bl);
+ if (struct_v >= 29) {
+ decode(last_pg_merge_meta, bl);
+ } else {
+ last_pg_merge_meta.last_epoch_clean = old_merge_last_epoch_clean;
+ last_pg_merge_meta.last_epoch_started = old_merge_last_epoch_started;
+ }
+ } else {
+ pg_num_target = pg_num;
+ pgp_num_target = pgp_num;
+ pg_num_pending = pg_num;
+ last_force_op_resend = last_force_op_resend_prenautilus;
+ pg_autoscale_mode = pg_autoscale_mode_t::WARN; // default to warn on upgrade
+ }
+ if (struct_v >= 30) {
+ decode(peering_crush_bucket_count, bl);
+ decode(peering_crush_bucket_target, bl);
+ decode(peering_crush_bucket_barrier, bl);
+ decode(peering_crush_mandatory_member, bl);
+ }
+ DECODE_FINISH(bl);
+ calc_pg_masks();
+ calc_grade_table();
+}
+
+bool pg_pool_t::stretch_set_can_peer(const set<int>& want, const OSDMap& osdmap,
+ std::ostream * out) const
+{
+ if (!is_stretch_pool()) return true;
+ const uint32_t barrier_id = peering_crush_bucket_barrier;
+ const uint32_t barrier_count = peering_crush_bucket_count;
+ set<int> ancestors;
+ const shared_ptr<CrushWrapper>& crush = osdmap.crush;
+ for (int osdid : want) {
+ int ancestor = crush->get_parent_of_type(osdid, barrier_id,
+ crush_rule);
+ ancestors.insert(ancestor);
+ }
+ if (ancestors.size() < barrier_count) {
+ if (out) {
+ *out << __func__ << ": not enough crush buckets with OSDs in want set "
+ << want;
+ }
+ return false;
+ } else if (peering_crush_mandatory_member != CRUSH_ITEM_NONE &&
+ !ancestors.count(peering_crush_mandatory_member)) {
+ if (out) {
+ *out << __func__ << ": missing mandatory crush bucket member "
+ << peering_crush_mandatory_member;
+ }
+ return false;
+ }
+ return true;
+}
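+// Example (illustrative): for a stretch pool with peering_crush_bucket_count
+// = 2 and peering_crush_bucket_barrier set to, say, the crush type id of
+// "datacenter", a want set whose OSDs all sit under one datacenter produces
+// ancestors.size() == 1 < 2, so peering is refused until the set spans two
+// datacenters (and includes the mandatory member bucket, if one is set).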
+
+void pg_pool_t::generate_test_instances(list<pg_pool_t*>& o)
+{
+ pg_pool_t a;
+ o.push_back(new pg_pool_t(a));
+
+ a.create_time = utime_t(4,5);
+ a.type = TYPE_REPLICATED;
+ a.size = 2;
+ a.crush_rule = 3;
+ a.object_hash = 4;
+ a.pg_num = 6;
+ a.pgp_num = 4;
+ a.pgp_num_target = 4;
+ a.pg_num_target = 5;
+ a.pg_num_pending = 5;
+ a.last_pg_merge_meta.last_epoch_started = 2;
+ a.last_pg_merge_meta.last_epoch_clean = 2;
+ a.last_change = 9;
+ a.last_force_op_resend = 123823;
+ a.last_force_op_resend_preluminous = 123824;
+ a.snap_seq = 10;
+ a.snap_epoch = 11;
+ a.flags = FLAG_POOL_SNAPS;
+ a.auid = 12;
+ a.quota_max_bytes = 473;
+ a.quota_max_objects = 474;
+ o.push_back(new pg_pool_t(a));
+
+ a.snaps[3].name = "asdf";
+ a.snaps[3].snapid = 3;
+ a.snaps[3].stamp = utime_t(123, 4);
+ a.snaps[6].name = "qwer";
+ a.snaps[6].snapid = 6;
+ a.snaps[6].stamp = utime_t(23423, 4);
+ o.push_back(new pg_pool_t(a));
+
+ a.flags = FLAG_SELFMANAGED_SNAPS;
+ a.snaps.clear();
+ a.removed_snaps.insert(2);
+ a.quota_max_bytes = 2473;
+ a.quota_max_objects = 4374;
+ a.tiers.insert(0);
+ a.tiers.insert(1);
+ a.tier_of = 2;
+ a.cache_mode = CACHEMODE_WRITEBACK;
+ a.read_tier = 1;
+ a.write_tier = 1;
+ a.hit_set_params = HitSet::Params(new BloomHitSet::Params);
+ a.hit_set_period = 3600;
+ a.hit_set_count = 8;
+ a.min_read_recency_for_promote = 1;
+ a.min_write_recency_for_promote = 1;
+ a.hit_set_grade_decay_rate = 50;
+ a.hit_set_search_last_n = 1;
+ a.calc_grade_table();
+ a.set_stripe_width(12345);
+ a.target_max_bytes = 1238132132;
+ a.target_max_objects = 1232132;
+ a.cache_target_dirty_ratio_micro = 187232;
+ a.cache_target_dirty_high_ratio_micro = 309856;
+ a.cache_target_full_ratio_micro = 987222;
+ a.cache_min_flush_age = 231;
+ a.cache_min_evict_age = 2321;
+ a.erasure_code_profile = "profile in osdmap";
+ a.expected_num_objects = 123456;
+ a.fast_read = false;
+ a.application_metadata = {{"rbd", {{"key", "value"}}}};
+ o.push_back(new pg_pool_t(a));
+}
+
+ostream& operator<<(ostream& out, const pg_pool_t& p)
+{
+ out << p.get_type_name();
+ if (p.get_type_name() == "erasure") {
+ out << " profile " << p.erasure_code_profile;
+ }
+ out << " size " << p.get_size()
+ << " min_size " << p.get_min_size()
+ << " crush_rule " << p.get_crush_rule()
+ << " object_hash " << p.get_object_hash_name()
+ << " pg_num " << p.get_pg_num()
+ << " pgp_num " << p.get_pgp_num();
+ if (p.get_pg_num_target() != p.get_pg_num()) {
+ out << " pg_num_target " << p.get_pg_num_target();
+ }
+ if (p.get_pgp_num_target() != p.get_pgp_num()) {
+ out << " pgp_num_target " << p.get_pgp_num_target();
+ }
+ if (p.get_pg_num_pending() != p.get_pg_num()) {
+ out << " pg_num_pending " << p.get_pg_num_pending();
+ }
+ if (p.pg_autoscale_mode != pg_pool_t::pg_autoscale_mode_t::UNKNOWN) {
+ out << " autoscale_mode " << p.get_pg_autoscale_mode_name(p.pg_autoscale_mode);
+ }
+ out << " last_change " << p.get_last_change();
+ if (p.get_last_force_op_resend() ||
+ p.get_last_force_op_resend_prenautilus() ||
+ p.get_last_force_op_resend_preluminous())
+ out << " lfor " << p.get_last_force_op_resend() << "/"
+ << p.get_last_force_op_resend_prenautilus() << "/"
+ << p.get_last_force_op_resend_preluminous();
+ if (p.get_auid())
+ out << " owner " << p.get_auid();
+ if (p.flags)
+ out << " flags " << p.get_flags_string();
+ if (p.quota_max_bytes)
+ out << " max_bytes " << p.quota_max_bytes;
+ if (p.quota_max_objects)
+ out << " max_objects " << p.quota_max_objects;
+ if (!p.tiers.empty())
+ out << " tiers " << p.tiers;
+ if (p.is_tier())
+ out << " tier_of " << p.tier_of;
+ if (p.has_read_tier())
+ out << " read_tier " << p.read_tier;
+ if (p.has_write_tier())
+ out << " write_tier " << p.write_tier;
+ if (p.cache_mode)
+ out << " cache_mode " << p.get_cache_mode_name();
+ if (p.target_max_bytes)
+ out << " target_bytes " << p.target_max_bytes;
+ if (p.target_max_objects)
+ out << " target_objects " << p.target_max_objects;
+ if (p.hit_set_params.get_type() != HitSet::TYPE_NONE) {
+ out << " hit_set " << p.hit_set_params
+ << " " << p.hit_set_period << "s"
+ << " x" << p.hit_set_count << " decay_rate "
+ << p.hit_set_grade_decay_rate
+ << " search_last_n " << p.hit_set_search_last_n;
+ }
+ if (p.min_read_recency_for_promote)
+ out << " min_read_recency_for_promote " << p.min_read_recency_for_promote;
+ if (p.min_write_recency_for_promote)
+ out << " min_write_recency_for_promote " << p.min_write_recency_for_promote;
+ out << " stripe_width " << p.get_stripe_width();
+ if (p.expected_num_objects)
+ out << " expected_num_objects " << p.expected_num_objects;
+ if (p.fast_read)
+ out << " fast_read " << p.fast_read;
+ out << p.opts;
+ if (!p.application_metadata.empty()) {
+ out << " application ";
+ for (auto it = p.application_metadata.begin();
+ it != p.application_metadata.end(); ++it) {
+ if (it != p.application_metadata.begin())
+ out << ",";
+ out << it->first;
+ }
+ }
+ return out;
+}
+
+
+// -- object_stat_sum_t --
+
+void object_stat_sum_t::dump(Formatter *f) const
+{
+ f->dump_int("num_bytes", num_bytes);
+ f->dump_int("num_objects", num_objects);
+ f->dump_int("num_object_clones", num_object_clones);
+ f->dump_int("num_object_copies", num_object_copies);
+ f->dump_int("num_objects_missing_on_primary", num_objects_missing_on_primary);
+ f->dump_int("num_objects_missing", num_objects_missing);
+ f->dump_int("num_objects_degraded", num_objects_degraded);
+ f->dump_int("num_objects_misplaced", num_objects_misplaced);
+ f->dump_int("num_objects_unfound", num_objects_unfound);
+ f->dump_int("num_objects_dirty", num_objects_dirty);
+ f->dump_int("num_whiteouts", num_whiteouts);
+ f->dump_int("num_read", num_rd);
+ f->dump_int("num_read_kb", num_rd_kb);
+ f->dump_int("num_write", num_wr);
+ f->dump_int("num_write_kb", num_wr_kb);
+ f->dump_int("num_scrub_errors", num_scrub_errors);
+ f->dump_int("num_shallow_scrub_errors", num_shallow_scrub_errors);
+ f->dump_int("num_deep_scrub_errors", num_deep_scrub_errors);
+ f->dump_int("num_objects_recovered", num_objects_recovered);
+ f->dump_int("num_bytes_recovered", num_bytes_recovered);
+ f->dump_int("num_keys_recovered", num_keys_recovered);
+ f->dump_int("num_objects_omap", num_objects_omap);
+ f->dump_int("num_objects_hit_set_archive", num_objects_hit_set_archive);
+ f->dump_int("num_bytes_hit_set_archive", num_bytes_hit_set_archive);
+ f->dump_int("num_flush", num_flush);
+ f->dump_int("num_flush_kb", num_flush_kb);
+ f->dump_int("num_evict", num_evict);
+ f->dump_int("num_evict_kb", num_evict_kb);
+ f->dump_int("num_promote", num_promote);
+ f->dump_int("num_flush_mode_high", num_flush_mode_high);
+ f->dump_int("num_flush_mode_low", num_flush_mode_low);
+ f->dump_int("num_evict_mode_some", num_evict_mode_some);
+ f->dump_int("num_evict_mode_full", num_evict_mode_full);
+ f->dump_int("num_objects_pinned", num_objects_pinned);
+ f->dump_int("num_legacy_snapsets", num_legacy_snapsets);
+ f->dump_int("num_large_omap_objects", num_large_omap_objects);
+ f->dump_int("num_objects_manifest", num_objects_manifest);
+ f->dump_int("num_omap_bytes", num_omap_bytes);
+ f->dump_int("num_omap_keys", num_omap_keys);
+ f->dump_int("num_objects_repaired", num_objects_repaired);
+}
+
+void object_stat_sum_t::encode(ceph::buffer::list& bl) const
+{
+ ENCODE_START(20, 14, bl);
+#if defined(CEPH_LITTLE_ENDIAN)
+ bl.append((char *)(&num_bytes), sizeof(object_stat_sum_t));
+#else
+ encode(num_bytes, bl);
+ encode(num_objects, bl);
+ encode(num_object_clones, bl);
+ encode(num_object_copies, bl);
+ encode(num_objects_missing_on_primary, bl);
+ encode(num_objects_degraded, bl);
+ encode(num_objects_unfound, bl);
+ encode(num_rd, bl);
+ encode(num_rd_kb, bl);
+ encode(num_wr, bl);
+ encode(num_wr_kb, bl);
+ encode(num_scrub_errors, bl);
+ encode(num_objects_recovered, bl);
+ encode(num_bytes_recovered, bl);
+ encode(num_keys_recovered, bl);
+ encode(num_shallow_scrub_errors, bl);
+ encode(num_deep_scrub_errors, bl);
+ encode(num_objects_dirty, bl);
+ encode(num_whiteouts, bl);
+ encode(num_objects_omap, bl);
+ encode(num_objects_hit_set_archive, bl);
+ encode(num_objects_misplaced, bl);
+ encode(num_bytes_hit_set_archive, bl);
+ encode(num_flush, bl);
+ encode(num_flush_kb, bl);
+ encode(num_evict, bl);
+ encode(num_evict_kb, bl);
+ encode(num_promote, bl);
+ encode(num_flush_mode_high, bl);
+ encode(num_flush_mode_low, bl);
+ encode(num_evict_mode_some, bl);
+ encode(num_evict_mode_full, bl);
+ encode(num_objects_pinned, bl);
+ encode(num_objects_missing, bl);
+ encode(num_legacy_snapsets, bl);
+ encode(num_large_omap_objects, bl);
+ encode(num_objects_manifest, bl);
+ encode(num_omap_bytes, bl);
+ encode(num_omap_keys, bl);
+ encode(num_objects_repaired, bl);
+#endif
+ ENCODE_FINISH(bl);
+}
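+// Note (illustrative): the little-endian fast path above appends the raw
+// struct starting at num_bytes, so it appears to rely on object_stat_sum_t
+// being a packed run of 64-bit counters declared in the same order as the
+// field-by-field encoding used on the big-endian path (and in decode below).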
+
+void object_stat_sum_t::decode(ceph::buffer::list::const_iterator& bl)
+{
+ bool decode_finish = false;
+ static const int STAT_SUM_DECODE_VERSION = 20;
+ DECODE_START(STAT_SUM_DECODE_VERSION, bl);
+#if defined(CEPH_LITTLE_ENDIAN)
+ if (struct_v == STAT_SUM_DECODE_VERSION) {
+ bl.copy(sizeof(object_stat_sum_t), (char*)(&num_bytes));
+ decode_finish = true;
+ }
+#endif
+ if (!decode_finish) {
+ decode(num_bytes, bl);
+ decode(num_objects, bl);
+ decode(num_object_clones, bl);
+ decode(num_object_copies, bl);
+ decode(num_objects_missing_on_primary, bl);
+ decode(num_objects_degraded, bl);
+ decode(num_objects_unfound, bl);
+ decode(num_rd, bl);
+ decode(num_rd_kb, bl);
+ decode(num_wr, bl);
+ decode(num_wr_kb, bl);
+ decode(num_scrub_errors, bl);
+ decode(num_objects_recovered, bl);
+ decode(num_bytes_recovered, bl);
+ decode(num_keys_recovered, bl);
+ decode(num_shallow_scrub_errors, bl);
+ decode(num_deep_scrub_errors, bl);
+ decode(num_objects_dirty, bl);
+ decode(num_whiteouts, bl);
+ decode(num_objects_omap, bl);
+ decode(num_objects_hit_set_archive, bl);
+ decode(num_objects_misplaced, bl);
+ decode(num_bytes_hit_set_archive, bl);
+ decode(num_flush, bl);
+ decode(num_flush_kb, bl);
+ decode(num_evict, bl);
+ decode(num_evict_kb, bl);
+ decode(num_promote, bl);
+ decode(num_flush_mode_high, bl);
+ decode(num_flush_mode_low, bl);
+ decode(num_evict_mode_some, bl);
+ decode(num_evict_mode_full, bl);
+ decode(num_objects_pinned, bl);
+ decode(num_objects_missing, bl);
+ if (struct_v >= 16) {
+ decode(num_legacy_snapsets, bl);
+ } else {
+ num_legacy_snapsets = num_object_clones; // upper bound
+ }
+ if (struct_v >= 17) {
+ decode(num_large_omap_objects, bl);
+ }
+ if (struct_v >= 18) {
+ decode(num_objects_manifest, bl);
+ }
+ if (struct_v >= 19) {
+ decode(num_omap_bytes, bl);
+ decode(num_omap_keys, bl);
+ }
+ if (struct_v >= 20) {
+ decode(num_objects_repaired, bl);
+ }
+ }
+ DECODE_FINISH(bl);
+}
+
+void object_stat_sum_t::generate_test_instances(list<object_stat_sum_t*>& o)
+{
+ object_stat_sum_t a;
+
+ a.num_bytes = 1;
+ a.num_objects = 3;
+ a.num_object_clones = 4;
+ a.num_object_copies = 5;
+ a.num_objects_missing_on_primary = 6;
+ a.num_objects_missing = 123;
+ a.num_objects_degraded = 7;
+ a.num_objects_unfound = 8;
+ a.num_rd = 9; a.num_rd_kb = 10;
+ a.num_wr = 11; a.num_wr_kb = 12;
+ a.num_objects_recovered = 14;
+ a.num_bytes_recovered = 15;
+ a.num_keys_recovered = 16;
+ a.num_deep_scrub_errors = 17;
+ a.num_shallow_scrub_errors = 18;
+ a.num_scrub_errors = a.num_deep_scrub_errors + a.num_shallow_scrub_errors;
+ a.num_objects_dirty = 21;
+ a.num_whiteouts = 22;
+ a.num_objects_misplaced = 1232;
+ a.num_objects_hit_set_archive = 2;
+ a.num_bytes_hit_set_archive = 27;
+ a.num_flush = 5;
+ a.num_flush_kb = 6;
+ a.num_evict = 7;
+ a.num_evict_kb = 8;
+ a.num_promote = 9;
+ a.num_flush_mode_high = 0;
+ a.num_flush_mode_low = 1;
+ a.num_evict_mode_some = 1;
+ a.num_evict_mode_full = 0;
+ a.num_objects_pinned = 20;
+ a.num_large_omap_objects = 5;
+ a.num_objects_manifest = 2;
+ a.num_omap_bytes = 20000;
+ a.num_omap_keys = 200;
+ a.num_objects_repaired = 300;
+ o.push_back(new object_stat_sum_t(a));
+}
+
+void object_stat_sum_t::add(const object_stat_sum_t& o)
+{
+ num_bytes += o.num_bytes;
+ num_objects += o.num_objects;
+ num_object_clones += o.num_object_clones;
+ num_object_copies += o.num_object_copies;
+ num_objects_missing_on_primary += o.num_objects_missing_on_primary;
+ num_objects_missing += o.num_objects_missing;
+ num_objects_degraded += o.num_objects_degraded;
+ num_objects_misplaced += o.num_objects_misplaced;
+ num_rd += o.num_rd;
+ num_rd_kb += o.num_rd_kb;
+ num_wr += o.num_wr;
+ num_wr_kb += o.num_wr_kb;
+ num_objects_unfound += o.num_objects_unfound;
+ num_scrub_errors += o.num_scrub_errors;
+ num_shallow_scrub_errors += o.num_shallow_scrub_errors;
+ num_deep_scrub_errors += o.num_deep_scrub_errors;
+ num_objects_recovered += o.num_objects_recovered;
+ num_bytes_recovered += o.num_bytes_recovered;
+ num_keys_recovered += o.num_keys_recovered;
+ num_objects_dirty += o.num_objects_dirty;
+ num_whiteouts += o.num_whiteouts;
+ num_objects_omap += o.num_objects_omap;
+ num_objects_hit_set_archive += o.num_objects_hit_set_archive;
+ num_bytes_hit_set_archive += o.num_bytes_hit_set_archive;
+ num_flush += o.num_flush;
+ num_flush_kb += o.num_flush_kb;
+ num_evict += o.num_evict;
+ num_evict_kb += o.num_evict_kb;
+ num_promote += o.num_promote;
+ num_flush_mode_high += o.num_flush_mode_high;
+ num_flush_mode_low += o.num_flush_mode_low;
+ num_evict_mode_some += o.num_evict_mode_some;
+ num_evict_mode_full += o.num_evict_mode_full;
+ num_objects_pinned += o.num_objects_pinned;
+ num_legacy_snapsets += o.num_legacy_snapsets;
+ num_large_omap_objects += o.num_large_omap_objects;
+ num_objects_manifest += o.num_objects_manifest;
+ num_omap_bytes += o.num_omap_bytes;
+ num_omap_keys += o.num_omap_keys;
+ num_objects_repaired += o.num_objects_repaired;
+}
+
+void object_stat_sum_t::sub(const object_stat_sum_t& o)
+{
+ num_bytes -= o.num_bytes;
+ num_objects -= o.num_objects;
+ num_object_clones -= o.num_object_clones;
+ num_object_copies -= o.num_object_copies;
+ num_objects_missing_on_primary -= o.num_objects_missing_on_primary;
+ num_objects_missing -= o.num_objects_missing;
+ num_objects_degraded -= o.num_objects_degraded;
+ num_objects_misplaced -= o.num_objects_misplaced;
+ num_rd -= o.num_rd;
+ num_rd_kb -= o.num_rd_kb;
+ num_wr -= o.num_wr;
+ num_wr_kb -= o.num_wr_kb;
+ num_objects_unfound -= o.num_objects_unfound;
+ num_scrub_errors -= o.num_scrub_errors;
+ num_shallow_scrub_errors -= o.num_shallow_scrub_errors;
+ num_deep_scrub_errors -= o.num_deep_scrub_errors;
+ num_objects_recovered -= o.num_objects_recovered;
+ num_bytes_recovered -= o.num_bytes_recovered;
+ num_keys_recovered -= o.num_keys_recovered;
+ num_objects_dirty -= o.num_objects_dirty;
+ num_whiteouts -= o.num_whiteouts;
+ num_objects_omap -= o.num_objects_omap;
+ num_objects_hit_set_archive -= o.num_objects_hit_set_archive;
+ num_bytes_hit_set_archive -= o.num_bytes_hit_set_archive;
+ num_flush -= o.num_flush;
+ num_flush_kb -= o.num_flush_kb;
+ num_evict -= o.num_evict;
+ num_evict_kb -= o.num_evict_kb;
+ num_promote -= o.num_promote;
+ num_flush_mode_high -= o.num_flush_mode_high;
+ num_flush_mode_low -= o.num_flush_mode_low;
+ num_evict_mode_some -= o.num_evict_mode_some;
+ num_evict_mode_full -= o.num_evict_mode_full;
+ num_objects_pinned -= o.num_objects_pinned;
+ num_legacy_snapsets -= o.num_legacy_snapsets;
+ num_large_omap_objects -= o.num_large_omap_objects;
+ num_objects_manifest -= o.num_objects_manifest;
+ num_omap_bytes -= o.num_omap_bytes;
+ num_omap_keys -= o.num_omap_keys;
+ num_objects_repaired -= o.num_objects_repaired;
+}
+
+bool operator==(const object_stat_sum_t& l, const object_stat_sum_t& r)
+{
+ return
+ l.num_bytes == r.num_bytes &&
+ l.num_objects == r.num_objects &&
+ l.num_object_clones == r.num_object_clones &&
+ l.num_object_copies == r.num_object_copies &&
+ l.num_objects_missing_on_primary == r.num_objects_missing_on_primary &&
+ l.num_objects_missing == r.num_objects_missing &&
+ l.num_objects_degraded == r.num_objects_degraded &&
+ l.num_objects_misplaced == r.num_objects_misplaced &&
+ l.num_objects_unfound == r.num_objects_unfound &&
+ l.num_rd == r.num_rd &&
+ l.num_rd_kb == r.num_rd_kb &&
+ l.num_wr == r.num_wr &&
+ l.num_wr_kb == r.num_wr_kb &&
+ l.num_scrub_errors == r.num_scrub_errors &&
+ l.num_shallow_scrub_errors == r.num_shallow_scrub_errors &&
+ l.num_deep_scrub_errors == r.num_deep_scrub_errors &&
+ l.num_objects_recovered == r.num_objects_recovered &&
+ l.num_bytes_recovered == r.num_bytes_recovered &&
+ l.num_keys_recovered == r.num_keys_recovered &&
+ l.num_objects_dirty == r.num_objects_dirty &&
+ l.num_whiteouts == r.num_whiteouts &&
+ l.num_objects_omap == r.num_objects_omap &&
+ l.num_objects_hit_set_archive == r.num_objects_hit_set_archive &&
+ l.num_bytes_hit_set_archive == r.num_bytes_hit_set_archive &&
+ l.num_flush == r.num_flush &&
+ l.num_flush_kb == r.num_flush_kb &&
+ l.num_evict == r.num_evict &&
+ l.num_evict_kb == r.num_evict_kb &&
+ l.num_promote == r.num_promote &&
+ l.num_flush_mode_high == r.num_flush_mode_high &&
+ l.num_flush_mode_low == r.num_flush_mode_low &&
+ l.num_evict_mode_some == r.num_evict_mode_some &&
+ l.num_evict_mode_full == r.num_evict_mode_full &&
+ l.num_objects_pinned == r.num_objects_pinned &&
+ l.num_legacy_snapsets == r.num_legacy_snapsets &&
+ l.num_large_omap_objects == r.num_large_omap_objects &&
+ l.num_objects_manifest == r.num_objects_manifest &&
+ l.num_omap_bytes == r.num_omap_bytes &&
+ l.num_omap_keys == r.num_omap_keys &&
+ l.num_objects_repaired == r.num_objects_repaired;
+}
+
+// -- object_stat_collection_t --
+
+void object_stat_collection_t::dump(Formatter *f) const
+{
+ f->open_object_section("stat_sum");
+ sum.dump(f);
+ f->close_section();
+}
+
+void object_stat_collection_t::encode(ceph::buffer::list& bl) const
+{
+ ENCODE_START(2, 2, bl);
+ encode(sum, bl);
+ encode((__u32)0, bl);
+ ENCODE_FINISH(bl);
+}
+
+void object_stat_collection_t::decode(ceph::buffer::list::const_iterator& bl)
+{
+ DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
+ decode(sum, bl);
+ {
+ map<string,object_stat_sum_t> cat_sum;
+ decode(cat_sum, bl);
+ }
+ DECODE_FINISH(bl);
+}
+
+void object_stat_collection_t::generate_test_instances(list<object_stat_collection_t*>& o)
+{
+ object_stat_collection_t a;
+ o.push_back(new object_stat_collection_t(a));
+ list<object_stat_sum_t*> l;
+ object_stat_sum_t::generate_test_instances(l);
+ for (auto p = l.begin(); p != l.end(); ++p) {
+ a.add(**p);
+ o.push_back(new object_stat_collection_t(a));
+ }
+}
+
+
+// -- pg_stat_t --
+
+bool pg_stat_t::is_acting_osd(int32_t osd, bool primary) const
+{
+ if (primary && osd == acting_primary) {
+ return true;
+ } else if (!primary) {
+ for(auto it = acting.cbegin(); it != acting.cend(); ++it)
+ {
+ if (*it == osd)
+ return true;
+ }
+ }
+ return false;
+}
+
+void pg_stat_t::dump(Formatter *f) const
+{
+ f->dump_stream("version") << version;
+ f->dump_unsigned("reported_seq", reported_seq);
+ f->dump_unsigned("reported_epoch", reported_epoch);
+ f->dump_string("state", pg_state_string(state));
+ f->dump_stream("last_fresh") << last_fresh;
+ f->dump_stream("last_change") << last_change;
+ f->dump_stream("last_active") << last_active;
+ f->dump_stream("last_peered") << last_peered;
+ f->dump_stream("last_clean") << last_clean;
+ f->dump_stream("last_became_active") << last_became_active;
+ f->dump_stream("last_became_peered") << last_became_peered;
+ f->dump_stream("last_unstale") << last_unstale;
+ f->dump_stream("last_undegraded") << last_undegraded;
+ f->dump_stream("last_fullsized") << last_fullsized;
+ f->dump_unsigned("mapping_epoch", mapping_epoch);
+ f->dump_stream("log_start") << log_start;
+ f->dump_stream("ondisk_log_start") << ondisk_log_start;
+ f->dump_unsigned("created", created);
+ f->dump_unsigned("last_epoch_clean", last_epoch_clean);
+ f->dump_stream("parent") << parent;
+ f->dump_unsigned("parent_split_bits", parent_split_bits);
+ f->dump_stream("last_scrub") << last_scrub;
+ f->dump_stream("last_scrub_stamp") << last_scrub_stamp;
+ f->dump_stream("last_deep_scrub") << last_deep_scrub;
+ f->dump_stream("last_deep_scrub_stamp") << last_deep_scrub_stamp;
+ f->dump_stream("last_clean_scrub_stamp") << last_clean_scrub_stamp;
+ f->dump_int("log_size", log_size);
+ f->dump_int("ondisk_log_size", ondisk_log_size);
+ f->dump_bool("stats_invalid", stats_invalid);
+ f->dump_bool("dirty_stats_invalid", dirty_stats_invalid);
+ f->dump_bool("omap_stats_invalid", omap_stats_invalid);
+ f->dump_bool("hitset_stats_invalid", hitset_stats_invalid);
+ f->dump_bool("hitset_bytes_stats_invalid", hitset_bytes_stats_invalid);
+ f->dump_bool("pin_stats_invalid", pin_stats_invalid);
+ f->dump_bool("manifest_stats_invalid", manifest_stats_invalid);
+ f->dump_unsigned("snaptrimq_len", snaptrimq_len);
+ stats.dump(f);
+ f->open_array_section("up");
+ for (auto p = up.cbegin(); p != up.cend(); ++p)
+ f->dump_int("osd", *p);
+ f->close_section();
+ f->open_array_section("acting");
+ for (auto p = acting.cbegin(); p != acting.cend(); ++p)
+ f->dump_int("osd", *p);
+ f->close_section();
+ f->open_array_section("avail_no_missing");
+ for (auto p = avail_no_missing.cbegin(); p != avail_no_missing.cend(); ++p)
+ f->dump_stream("shard") << *p;
+ f->close_section();
+ f->open_array_section("object_location_counts");
+ for (auto p = object_location_counts.cbegin(); p != object_location_counts.cend(); ++p) {
+ f->open_object_section("entry");
+ f->dump_stream("shards") << p->first;
+ f->dump_int("objects", p->second);
+ f->close_section();
+ }
+ f->close_section();
+ f->open_array_section("blocked_by");
+ for (auto p = blocked_by.cbegin(); p != blocked_by.cend(); ++p)
+ f->dump_int("osd", *p);
+ f->close_section();
+ f->dump_int("up_primary", up_primary);
+ f->dump_int("acting_primary", acting_primary);
+ f->open_array_section("purged_snaps");
+ for (auto i = purged_snaps.begin(); i != purged_snaps.end(); ++i) {
+ f->open_object_section("interval");
+ f->dump_stream("start") << i.get_start();
+ f->dump_stream("length") << i.get_len();
+ f->close_section();
+ }
+ f->close_section();
+}
+
+void pg_stat_t::dump_brief(Formatter *f) const
+{
+ f->dump_string("state", pg_state_string(state));
+ f->open_array_section("up");
+ for (auto p = up.cbegin(); p != up.cend(); ++p)
+ f->dump_int("osd", *p);
+ f->close_section();
+ f->open_array_section("acting");
+ for (auto p = acting.cbegin(); p != acting.cend(); ++p)
+ f->dump_int("osd", *p);
+ f->close_section();
+ f->dump_int("up_primary", up_primary);
+ f->dump_int("acting_primary", acting_primary);
+}
+
+void pg_stat_t::encode(ceph::buffer::list &bl) const
+{
+ ENCODE_START(26, 22, bl);
+ encode(version, bl);
+ encode(reported_seq, bl);
+ encode(reported_epoch, bl);
+ encode((__u32)state, bl); // for older peers
+ encode(log_start, bl);
+ encode(ondisk_log_start, bl);
+ encode(created, bl);
+ encode(last_epoch_clean, bl);
+ encode(parent, bl);
+ encode(parent_split_bits, bl);
+ encode(last_scrub, bl);
+ encode(last_scrub_stamp, bl);
+ encode(stats, bl);
+ encode(log_size, bl);
+ encode(ondisk_log_size, bl);
+ encode(up, bl);
+ encode(acting, bl);
+ encode(last_fresh, bl);
+ encode(last_change, bl);
+ encode(last_active, bl);
+ encode(last_clean, bl);
+ encode(last_unstale, bl);
+ encode(mapping_epoch, bl);
+ encode(last_deep_scrub, bl);
+ encode(last_deep_scrub_stamp, bl);
+ encode(stats_invalid, bl);
+ encode(last_clean_scrub_stamp, bl);
+ encode(last_became_active, bl);
+ encode(dirty_stats_invalid, bl);
+ encode(up_primary, bl);
+ encode(acting_primary, bl);
+ encode(omap_stats_invalid, bl);
+ encode(hitset_stats_invalid, bl);
+ encode(blocked_by, bl);
+ encode(last_undegraded, bl);
+ encode(last_fullsized, bl);
+ encode(hitset_bytes_stats_invalid, bl);
+ encode(last_peered, bl);
+ encode(last_became_peered, bl);
+ encode(pin_stats_invalid, bl);
+ encode(snaptrimq_len, bl);
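+  // Only the low 32 bits of the 64-bit state were encoded above (as a
+  // __u32, for older peers). The high 32 bits are appended here so newer
+  // decoders can reassemble the full value; see decode(), which recombines
+  // old_state and top_state.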
+ __u32 top_state = (state >> 32);
+ encode(top_state, bl);
+ encode(purged_snaps, bl);
+ encode(manifest_stats_invalid, bl);
+ encode(avail_no_missing, bl);
+ encode(object_location_counts, bl);
+ ENCODE_FINISH(bl);
+}
+
+void pg_stat_t::decode(ceph::buffer::list::const_iterator &bl)
+{
+ bool tmp;
+ uint32_t old_state;
+ DECODE_START(26, bl);
+ decode(version, bl);
+ decode(reported_seq, bl);
+ decode(reported_epoch, bl);
+ decode(old_state, bl);
+ decode(log_start, bl);
+ decode(ondisk_log_start, bl);
+ decode(created, bl);
+ decode(last_epoch_clean, bl);
+ decode(parent, bl);
+ decode(parent_split_bits, bl);
+ decode(last_scrub, bl);
+ decode(last_scrub_stamp, bl);
+ decode(stats, bl);
+ decode(log_size, bl);
+ decode(ondisk_log_size, bl);
+ decode(up, bl);
+ decode(acting, bl);
+ decode(last_fresh, bl);
+ decode(last_change, bl);
+ decode(last_active, bl);
+ decode(last_clean, bl);
+ decode(last_unstale, bl);
+ decode(mapping_epoch, bl);
+ decode(last_deep_scrub, bl);
+ decode(last_deep_scrub_stamp, bl);
+ decode(tmp, bl);
+ stats_invalid = tmp;
+ decode(last_clean_scrub_stamp, bl);
+ decode(last_became_active, bl);
+ decode(tmp, bl);
+ dirty_stats_invalid = tmp;
+ decode(up_primary, bl);
+ decode(acting_primary, bl);
+ decode(tmp, bl);
+ omap_stats_invalid = tmp;
+ decode(tmp, bl);
+ hitset_stats_invalid = tmp;
+ decode(blocked_by, bl);
+ decode(last_undegraded, bl);
+ decode(last_fullsized, bl);
+ decode(tmp, bl);
+ hitset_bytes_stats_invalid = tmp;
+ decode(last_peered, bl);
+ decode(last_became_peered, bl);
+ decode(tmp, bl);
+ pin_stats_invalid = tmp;
+ if (struct_v >= 23) {
+ decode(snaptrimq_len, bl);
+ if (struct_v >= 24) {
+ __u32 top_state;
+ decode(top_state, bl);
+ state = (uint64_t)old_state | ((uint64_t)top_state << 32);
+ decode(purged_snaps, bl);
+ } else {
+ state = old_state;
+ }
+ if (struct_v >= 25) {
+ decode(tmp, bl);
+ manifest_stats_invalid = tmp;
+ } else {
+ manifest_stats_invalid = true;
+ }
+ if (struct_v >= 26) {
+ decode(avail_no_missing, bl);
+ decode(object_location_counts, bl);
+ }
+ }
+ DECODE_FINISH(bl);
+}
+
+void pg_stat_t::generate_test_instances(list<pg_stat_t*>& o)
+{
+ pg_stat_t a;
+ o.push_back(new pg_stat_t(a));
+
+ a.version = eversion_t(1, 3);
+ a.reported_epoch = 1;
+ a.reported_seq = 2;
+ a.state = 123;
+ a.mapping_epoch = 998;
+ a.last_fresh = utime_t(1002, 1);
+ a.last_change = utime_t(1002, 2);
+ a.last_active = utime_t(1002, 3);
+ a.last_clean = utime_t(1002, 4);
+ a.last_unstale = utime_t(1002, 5);
+ a.last_undegraded = utime_t(1002, 7);
+ a.last_fullsized = utime_t(1002, 8);
+ a.log_start = eversion_t(1, 4);
+ a.ondisk_log_start = eversion_t(1, 5);
+ a.created = 6;
+ a.last_epoch_clean = 7;
+ a.parent = pg_t(1, 2);
+ a.parent_split_bits = 12;
+ a.last_scrub = eversion_t(9, 10);
+ a.last_scrub_stamp = utime_t(11, 12);
+ a.last_deep_scrub = eversion_t(13, 14);
+ a.last_deep_scrub_stamp = utime_t(15, 16);
+ a.last_clean_scrub_stamp = utime_t(17, 18);
+ a.snaptrimq_len = 1048576;
+ list<object_stat_collection_t*> l;
+ object_stat_collection_t::generate_test_instances(l);
+ a.stats = *l.back();
+ a.log_size = 99;
+ a.ondisk_log_size = 88;
+ a.up.push_back(123);
+ a.up_primary = 123;
+ a.acting.push_back(456);
+ a.avail_no_missing.push_back(pg_shard_t(456, shard_id_t::NO_SHARD));
+ set<pg_shard_t> sset = { pg_shard_t(0), pg_shard_t(1) };
+ a.object_location_counts.insert(make_pair(sset, 10));
+ sset.insert(pg_shard_t(2));
+ a.object_location_counts.insert(make_pair(sset, 5));
+ a.acting_primary = 456;
+ o.push_back(new pg_stat_t(a));
+
+ a.up.push_back(124);
+ a.up_primary = 124;
+ a.acting.push_back(124);
+ a.acting_primary = 124;
+ a.blocked_by.push_back(155);
+ a.blocked_by.push_back(156);
+ o.push_back(new pg_stat_t(a));
+}
+
+bool operator==(const pg_stat_t& l, const pg_stat_t& r)
+{
+ return
+ l.version == r.version &&
+ l.reported_seq == r.reported_seq &&
+ l.reported_epoch == r.reported_epoch &&
+ l.state == r.state &&
+ l.last_fresh == r.last_fresh &&
+ l.last_change == r.last_change &&
+ l.last_active == r.last_active &&
+ l.last_peered == r.last_peered &&
+ l.last_clean == r.last_clean &&
+ l.last_unstale == r.last_unstale &&
+ l.last_undegraded == r.last_undegraded &&
+ l.last_fullsized == r.last_fullsized &&
+ l.log_start == r.log_start &&
+ l.ondisk_log_start == r.ondisk_log_start &&
+ l.created == r.created &&
+ l.last_epoch_clean == r.last_epoch_clean &&
+ l.parent == r.parent &&
+ l.parent_split_bits == r.parent_split_bits &&
+ l.last_scrub == r.last_scrub &&
+ l.last_deep_scrub == r.last_deep_scrub &&
+ l.last_scrub_stamp == r.last_scrub_stamp &&
+ l.last_deep_scrub_stamp == r.last_deep_scrub_stamp &&
+ l.last_clean_scrub_stamp == r.last_clean_scrub_stamp &&
+ l.stats == r.stats &&
+ l.stats_invalid == r.stats_invalid &&
+ l.log_size == r.log_size &&
+ l.ondisk_log_size == r.ondisk_log_size &&
+ l.up == r.up &&
+ l.acting == r.acting &&
+ l.avail_no_missing == r.avail_no_missing &&
+ l.object_location_counts == r.object_location_counts &&
+ l.mapping_epoch == r.mapping_epoch &&
+ l.blocked_by == r.blocked_by &&
+ l.last_became_active == r.last_became_active &&
+ l.last_became_peered == r.last_became_peered &&
+ l.dirty_stats_invalid == r.dirty_stats_invalid &&
+ l.omap_stats_invalid == r.omap_stats_invalid &&
+ l.hitset_stats_invalid == r.hitset_stats_invalid &&
+ l.hitset_bytes_stats_invalid == r.hitset_bytes_stats_invalid &&
+ l.up_primary == r.up_primary &&
+ l.acting_primary == r.acting_primary &&
+ l.pin_stats_invalid == r.pin_stats_invalid &&
+ l.manifest_stats_invalid == r.manifest_stats_invalid &&
+ l.purged_snaps == r.purged_snaps &&
+ l.snaptrimq_len == r.snaptrimq_len;
+}
+
+// -- store_statfs_t --
+
+bool store_statfs_t::operator==(const store_statfs_t& other) const
+{
+ return total == other.total
+ && available == other.available
+ && allocated == other.allocated
+ && internally_reserved == other.internally_reserved
+ && data_stored == other.data_stored
+ && data_compressed == other.data_compressed
+ && data_compressed_allocated == other.data_compressed_allocated
+ && data_compressed_original == other.data_compressed_original
+ && omap_allocated == other.omap_allocated
+ && internal_metadata == other.internal_metadata;
+}
+
+void store_statfs_t::dump(Formatter *f) const
+{
+ f->dump_int("total", total);
+ f->dump_int("available", available);
+ f->dump_int("internally_reserved", internally_reserved);
+ f->dump_int("allocated", allocated);
+ f->dump_int("data_stored", data_stored);
+ f->dump_int("data_compressed", data_compressed);
+ f->dump_int("data_compressed_allocated", data_compressed_allocated);
+ f->dump_int("data_compressed_original", data_compressed_original);
+ f->dump_int("omap_allocated", omap_allocated);
+ f->dump_int("internal_metadata", internal_metadata);
+}
+
+ostream& operator<<(ostream& out, const store_statfs_t &s)
+{
+ out << std::hex
+ << "store_statfs(0x" << s.available
+ << "/0x" << s.internally_reserved
+ << "/0x" << s.total
+ << ", data 0x" << s.data_stored
+ << "/0x" << s.allocated
+ << ", compress 0x" << s.data_compressed
+ << "/0x" << s.data_compressed_allocated
+ << "/0x" << s.data_compressed_original
+ << ", omap 0x" << s.omap_allocated
+ << ", meta 0x" << s.internal_metadata
+ << std::dec
+ << ")";
+ return out;
+}
+
+void store_statfs_t::generate_test_instances(list<store_statfs_t*>& o)
+{
+ store_statfs_t a;
+ o.push_back(new store_statfs_t(a));
+ a.total = 234;
+ a.available = 123;
+ a.internally_reserved = 33;
+ a.allocated = 32;
+ a.data_stored = 44;
+ a.data_compressed = 21;
+ a.data_compressed_allocated = 12;
+ a.data_compressed_original = 13;
+ a.omap_allocated = 14;
+ a.internal_metadata = 15;
+ o.push_back(new store_statfs_t(a));
+}
+
+// -- pool_stat_t --
+
+void pool_stat_t::dump(Formatter *f) const
+{
+ stats.dump(f);
+ f->open_object_section("store_stats");
+ store_stats.dump(f);
+ f->close_section();
+ f->dump_int("log_size", log_size);
+ f->dump_int("ondisk_log_size", ondisk_log_size);
+ f->dump_int("up", up);
+ f->dump_int("acting", acting);
+ f->dump_int("num_store_stats", num_store_stats);
+}
+
+void pool_stat_t::encode(ceph::buffer::list &bl, uint64_t features) const
+{
+ using ceph::encode;
+ if ((features & CEPH_FEATURE_OSDENC) == 0) {
+ __u8 v = 4;
+ encode(v, bl);
+ encode(stats, bl);
+ encode(log_size, bl);
+ encode(ondisk_log_size, bl);
+ return;
+ }
+
+ ENCODE_START(7, 5, bl);
+ encode(stats, bl);
+ encode(log_size, bl);
+ encode(ondisk_log_size, bl);
+ encode(up, bl);
+ encode(acting, bl);
+ encode(store_stats, bl);
+ encode(num_store_stats, bl);
+ ENCODE_FINISH(bl);
+}
+
+void pool_stat_t::decode(ceph::buffer::list::const_iterator &bl)
+{
+ DECODE_START_LEGACY_COMPAT_LEN(7, 5, 5, bl);
+ if (struct_v >= 4) {
+ decode(stats, bl);
+ decode(log_size, bl);
+ decode(ondisk_log_size, bl);
+ if (struct_v >= 6) {
+ decode(up, bl);
+ decode(acting, bl);
+ } else {
+ up = 0;
+ acting = 0;
+ }
+ if (struct_v >= 7) {
+ decode(store_stats, bl);
+ decode(num_store_stats, bl);
+ } else {
+ store_stats.reset();
+ num_store_stats = 0;
+ }
+
+ } else {
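+    // Legacy (pre-v4) encoding: a flat list of per-pool sums. Note that
+    // the old num_kb field is decoded and then discarded below.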
+ decode(stats.sum.num_bytes, bl);
+ uint64_t num_kb;
+ decode(num_kb, bl);
+ decode(stats.sum.num_objects, bl);
+ decode(stats.sum.num_object_clones, bl);
+ decode(stats.sum.num_object_copies, bl);
+ decode(stats.sum.num_objects_missing_on_primary, bl);
+ decode(stats.sum.num_objects_degraded, bl);
+ decode(log_size, bl);
+ decode(ondisk_log_size, bl);
+ if (struct_v >= 2) {
+ decode(stats.sum.num_rd, bl);
+ decode(stats.sum.num_rd_kb, bl);
+ decode(stats.sum.num_wr, bl);
+ decode(stats.sum.num_wr_kb, bl);
+ }
+ if (struct_v >= 3) {
+ decode(stats.sum.num_objects_unfound, bl);
+ }
+ }
+ DECODE_FINISH(bl);
+}
+
+void pool_stat_t::generate_test_instances(list<pool_stat_t*>& o)
+{
+ pool_stat_t a;
+ o.push_back(new pool_stat_t(a));
+
+ list<object_stat_collection_t*> l;
+ object_stat_collection_t::generate_test_instances(l);
+ list<store_statfs_t*> ll;
+ store_statfs_t::generate_test_instances(ll);
+ a.stats = *l.back();
+ a.store_stats = *ll.back();
+ a.log_size = 123;
+ a.ondisk_log_size = 456;
+ a.acting = 3;
+ a.up = 4;
+ a.num_store_stats = 1;
+ o.push_back(new pool_stat_t(a));
+}
+
+
+// -- pg_history_t --
+
+void pg_history_t::encode(ceph::buffer::list &bl) const
+{
+ ENCODE_START(10, 4, bl);
+ encode(epoch_created, bl);
+ encode(last_epoch_started, bl);
+ encode(last_epoch_clean, bl);
+ encode(last_epoch_split, bl);
+ encode(same_interval_since, bl);
+ encode(same_up_since, bl);
+ encode(same_primary_since, bl);
+ encode(last_scrub, bl);
+ encode(last_scrub_stamp, bl);
+ encode(last_deep_scrub, bl);
+ encode(last_deep_scrub_stamp, bl);
+ encode(last_clean_scrub_stamp, bl);
+ encode(last_epoch_marked_full, bl);
+ encode(last_interval_started, bl);
+ encode(last_interval_clean, bl);
+ encode(epoch_pool_created, bl);
+ encode(prior_readable_until_ub, bl);
+ ENCODE_FINISH(bl);
+}
+
+void pg_history_t::decode(ceph::buffer::list::const_iterator &bl)
+{
+ DECODE_START_LEGACY_COMPAT_LEN(10, 4, 4, bl);
+ decode(epoch_created, bl);
+ decode(last_epoch_started, bl);
+ if (struct_v >= 3)
+ decode(last_epoch_clean, bl);
+ else
+ last_epoch_clean = last_epoch_started; // careful, it's a lie!
+ decode(last_epoch_split, bl);
+ decode(same_interval_since, bl);
+ decode(same_up_since, bl);
+ decode(same_primary_since, bl);
+ if (struct_v >= 2) {
+ decode(last_scrub, bl);
+ decode(last_scrub_stamp, bl);
+ }
+ if (struct_v >= 5) {
+ decode(last_deep_scrub, bl);
+ decode(last_deep_scrub_stamp, bl);
+ }
+ if (struct_v >= 6) {
+ decode(last_clean_scrub_stamp, bl);
+ }
+ if (struct_v >= 7) {
+ decode(last_epoch_marked_full, bl);
+ }
+ if (struct_v >= 8) {
+ decode(last_interval_started, bl);
+ decode(last_interval_clean, bl);
+ } else {
+ if (last_epoch_started >= same_interval_since) {
+ last_interval_started = same_interval_since;
+ } else {
+ last_interval_started = last_epoch_started; // best guess
+ }
+ if (last_epoch_clean >= same_interval_since) {
+ last_interval_clean = same_interval_since;
+ } else {
+ last_interval_clean = last_epoch_clean; // best guess
+ }
+ }
+ if (struct_v >= 9) {
+ decode(epoch_pool_created, bl);
+ } else {
+ epoch_pool_created = epoch_created;
+ }
+ if (struct_v >= 10) {
+ decode(prior_readable_until_ub, bl);
+ }
+ DECODE_FINISH(bl);
+}
+
+void pg_history_t::dump(Formatter *f) const
+{
+ f->dump_int("epoch_created", epoch_created);
+ f->dump_int("epoch_pool_created", epoch_pool_created);
+ f->dump_int("last_epoch_started", last_epoch_started);
+ f->dump_int("last_interval_started", last_interval_started);
+ f->dump_int("last_epoch_clean", last_epoch_clean);
+ f->dump_int("last_interval_clean", last_interval_clean);
+ f->dump_int("last_epoch_split", last_epoch_split);
+ f->dump_int("last_epoch_marked_full", last_epoch_marked_full);
+ f->dump_int("same_up_since", same_up_since);
+ f->dump_int("same_interval_since", same_interval_since);
+ f->dump_int("same_primary_since", same_primary_since);
+ f->dump_stream("last_scrub") << last_scrub;
+ f->dump_stream("last_scrub_stamp") << last_scrub_stamp;
+ f->dump_stream("last_deep_scrub") << last_deep_scrub;
+ f->dump_stream("last_deep_scrub_stamp") << last_deep_scrub_stamp;
+ f->dump_stream("last_clean_scrub_stamp") << last_clean_scrub_stamp;
+ f->dump_float(
+ "prior_readable_until_ub",
+ std::chrono::duration<double>(prior_readable_until_ub).count());
+}
+
+void pg_history_t::generate_test_instances(list<pg_history_t*>& o)
+{
+ o.push_back(new pg_history_t);
+ o.push_back(new pg_history_t);
+ o.back()->epoch_created = 1;
+ o.back()->epoch_pool_created = 1;
+ o.back()->last_epoch_started = 2;
+ o.back()->last_interval_started = 2;
+ o.back()->last_epoch_clean = 3;
+ o.back()->last_interval_clean = 2;
+ o.back()->last_epoch_split = 4;
+ o.back()->prior_readable_until_ub = make_timespan(3.1415);
+ o.back()->same_up_since = 5;
+ o.back()->same_interval_since = 6;
+ o.back()->same_primary_since = 7;
+ o.back()->last_scrub = eversion_t(8, 9);
+ o.back()->last_scrub_stamp = utime_t(10, 11);
+ o.back()->last_deep_scrub = eversion_t(12, 13);
+ o.back()->last_deep_scrub_stamp = utime_t(14, 15);
+ o.back()->last_clean_scrub_stamp = utime_t(16, 17);
+ o.back()->last_epoch_marked_full = 18;
+}
+
+
+// -- pg_info_t --
+
+void pg_info_t::encode(ceph::buffer::list &bl) const
+{
+ ENCODE_START(32, 26, bl);
+ encode(pgid.pgid, bl);
+ encode(last_update, bl);
+ encode(last_complete, bl);
+ encode(log_tail, bl);
+ encode(hobject_t(), bl); // old (nibblewise) last_backfill
+ encode(stats, bl);
+ history.encode(bl);
+ encode(purged_snaps, bl);
+ encode(last_epoch_started, bl);
+ encode(last_user_version, bl);
+ encode(hit_set, bl);
+ encode(pgid.shard, bl);
+ encode(last_backfill, bl);
+ encode(true, bl); // was last_backfill_bitwise
+ encode(last_interval_started, bl);
+ ENCODE_FINISH(bl);
+}
+
+void pg_info_t::decode(ceph::buffer::list::const_iterator &bl)
+{
+ DECODE_START(32, bl);
+ decode(pgid.pgid, bl);
+ decode(last_update, bl);
+ decode(last_complete, bl);
+ decode(log_tail, bl);
+ {
+ hobject_t old_last_backfill;
+ decode(old_last_backfill, bl);
+ }
+ decode(stats, bl);
+ history.decode(bl);
+ decode(purged_snaps, bl);
+ decode(last_epoch_started, bl);
+ decode(last_user_version, bl);
+ decode(hit_set, bl);
+ decode(pgid.shard, bl);
+ decode(last_backfill, bl);
+ {
+ bool last_backfill_bitwise;
+ decode(last_backfill_bitwise, bl);
+ // note: we may see a false value here since the default value for
+ // the member was false, so it often didn't get set to true until
+ // peering progressed.
+ }
+ if (struct_v >= 32) {
+ decode(last_interval_started, bl);
+ } else {
+ last_interval_started = last_epoch_started;
+ }
+ DECODE_FINISH(bl);
+}
+
+// -- pg_info_t --
+
+void pg_info_t::dump(Formatter *f) const
+{
+ f->dump_stream("pgid") << pgid;
+ f->dump_stream("last_update") << last_update;
+ f->dump_stream("last_complete") << last_complete;
+ f->dump_stream("log_tail") << log_tail;
+ f->dump_int("last_user_version", last_user_version);
+ f->dump_stream("last_backfill") << last_backfill;
+ f->open_array_section("purged_snaps");
+ for (interval_set<snapid_t>::const_iterator i=purged_snaps.begin();
+ i != purged_snaps.end();
+ ++i) {
+ f->open_object_section("purged_snap_interval");
+ f->dump_stream("start") << i.get_start();
+ f->dump_stream("length") << i.get_len();
+ f->close_section();
+ }
+ f->close_section();
+ f->open_object_section("history");
+ history.dump(f);
+ f->close_section();
+ f->open_object_section("stats");
+ stats.dump(f);
+ f->close_section();
+
+ f->dump_int("empty", is_empty());
+ f->dump_int("dne", dne());
+ f->dump_int("incomplete", is_incomplete());
+ f->dump_int("last_epoch_started", last_epoch_started);
+
+ f->open_object_section("hit_set_history");
+ hit_set.dump(f);
+ f->close_section();
+}
+
+void pg_info_t::generate_test_instances(list<pg_info_t*>& o)
+{
+ o.push_back(new pg_info_t);
+ o.push_back(new pg_info_t);
+ list<pg_history_t*> h;
+ pg_history_t::generate_test_instances(h);
+ o.back()->history = *h.back();
+ o.back()->pgid = spg_t(pg_t(1, 2), shard_id_t::NO_SHARD);
+ o.back()->last_update = eversion_t(3, 4);
+ o.back()->last_complete = eversion_t(5, 6);
+ o.back()->last_user_version = 2;
+ o.back()->log_tail = eversion_t(7, 8);
+ o.back()->last_backfill = hobject_t(object_t("objname"), "key", 123, 456, -1, "");
+ {
+ list<pg_stat_t*> s;
+ pg_stat_t::generate_test_instances(s);
+ o.back()->stats = *s.back();
+ }
+ {
+ list<pg_hit_set_history_t*> s;
+ pg_hit_set_history_t::generate_test_instances(s);
+ o.back()->hit_set = *s.back();
+ }
+}
+
+// -- pg_notify_t --
+void pg_notify_t::encode(ceph::buffer::list &bl) const
+{
+ ENCODE_START(3, 2, bl);
+ encode(query_epoch, bl);
+ encode(epoch_sent, bl);
+ encode(info, bl);
+ encode(to, bl);
+ encode(from, bl);
+ encode(past_intervals, bl);
+ ENCODE_FINISH(bl);
+}
+
+void pg_notify_t::decode(ceph::buffer::list::const_iterator &bl)
+{
+ DECODE_START(3, bl);
+ decode(query_epoch, bl);
+ decode(epoch_sent, bl);
+ decode(info, bl);
+ decode(to, bl);
+ decode(from, bl);
+ if (struct_v >= 3) {
+ decode(past_intervals, bl);
+ }
+ DECODE_FINISH(bl);
+}
+
+void pg_notify_t::dump(Formatter *f) const
+{
+ f->dump_int("from", from);
+ f->dump_int("to", to);
+ f->dump_unsigned("query_epoch", query_epoch);
+ f->dump_unsigned("epoch_sent", epoch_sent);
+ {
+ f->open_object_section("info");
+ info.dump(f);
+ f->close_section();
+ }
+ f->dump_object("past_intervals", past_intervals);
+}
+
+void pg_notify_t::generate_test_instances(list<pg_notify_t*>& o)
+{
+ o.push_back(new pg_notify_t(shard_id_t(3), shard_id_t::NO_SHARD, 1, 1,
+ pg_info_t(), PastIntervals()));
+ o.push_back(new pg_notify_t(shard_id_t(0), shard_id_t(0), 3, 10,
+ pg_info_t(), PastIntervals()));
+}
+
+ostream &operator<<(ostream &lhs, const pg_notify_t &notify)
+{
+ lhs << "(query:" << notify.query_epoch
+ << " sent:" << notify.epoch_sent
+ << " " << notify.info;
+ if (notify.from != shard_id_t::NO_SHARD ||
+ notify.to != shard_id_t::NO_SHARD)
+ lhs << " " << (unsigned)notify.from
+ << "->" << (unsigned)notify.to;
+ lhs << " " << notify.past_intervals;
+ return lhs << ")";
+}
+
+// -- pg_interval_t --
+
+void PastIntervals::pg_interval_t::encode(ceph::buffer::list& bl) const
+{
+ ENCODE_START(4, 2, bl);
+ encode(first, bl);
+ encode(last, bl);
+ encode(up, bl);
+ encode(acting, bl);
+ encode(maybe_went_rw, bl);
+ encode(primary, bl);
+ encode(up_primary, bl);
+ ENCODE_FINISH(bl);
+}
+
+void PastIntervals::pg_interval_t::decode(ceph::buffer::list::const_iterator& bl)
+{
+ DECODE_START_LEGACY_COMPAT_LEN(4, 2, 2, bl);
+ decode(first, bl);
+ decode(last, bl);
+ decode(up, bl);
+ decode(acting, bl);
+ decode(maybe_went_rw, bl);
+ if (struct_v >= 3) {
+ decode(primary, bl);
+ } else {
+ if (acting.size())
+ primary = acting[0];
+ }
+ if (struct_v >= 4) {
+ decode(up_primary, bl);
+ } else {
+ if (up.size())
+ up_primary = up[0];
+ }
+ DECODE_FINISH(bl);
+}
+
+void PastIntervals::pg_interval_t::dump(Formatter *f) const
+{
+ f->dump_unsigned("first", first);
+ f->dump_unsigned("last", last);
+ f->dump_int("maybe_went_rw", maybe_went_rw ? 1 : 0);
+ f->open_array_section("up");
+ for (auto p = up.cbegin(); p != up.cend(); ++p)
+ f->dump_int("osd", *p);
+ f->close_section();
+ f->open_array_section("acting");
+ for (auto p = acting.cbegin(); p != acting.cend(); ++p)
+ f->dump_int("osd", *p);
+ f->close_section();
+ f->dump_int("primary", primary);
+ f->dump_int("up_primary", up_primary);
+}
+
+void PastIntervals::pg_interval_t::generate_test_instances(list<pg_interval_t*>& o)
+{
+ o.push_back(new pg_interval_t);
+ o.push_back(new pg_interval_t);
+ o.back()->up.push_back(1);
+ o.back()->acting.push_back(2);
+ o.back()->acting.push_back(3);
+ o.back()->first = 4;
+ o.back()->last = 5;
+ o.back()->maybe_went_rw = true;
+}
+
+WRITE_CLASS_ENCODER(PastIntervals::pg_interval_t)
+
+
+/**
+ * pi_compact_rep
+ *
+ * PastIntervals only needs to be able to answer two questions:
+ * 1) Where should the primary look for unfound objects?
+ * 2) List a set of subsets of the OSDs such that contacting at least
+ * one from each subset guarantees we speak to at least one witness
+ * of any completed write.
+ *
+ * Crucially, 2) does not require keeping *all* past intervals. Certainly,
+ * we don't need to keep any where maybe_went_rw would be false. We also
+ * needn't keep two intervals where the acting set in one is a subset
+ * of the other (only need to keep the smaller of the two sets). In order
+ * to accurately trim the set of intervals as last_epoch_started changes
+ * without rebuilding the set from scratch, we'll retain the larger set
+ * if it is in an older interval.
+ */
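+/*
+ * A hypothetical example (epochs and OSD ids made up for illustration):
+ * suppose the compact representation holds a maybe_went_rw interval
+ * [10,20] with acting {0,1,2}, and a new interval [21,30] with acting
+ * {1,2} is added. Since {1,2} is a subset of {0,1,2}, contacting at least
+ * one of {1,2} also satisfies the requirement for [10,20], so the older
+ * entry is dropped; see compact_interval_t::supersedes() and
+ * pi_compact_rep::add_interval() below.
+ */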
+struct compact_interval_t {
+ epoch_t first;
+ epoch_t last;
+ set<pg_shard_t> acting;
+ bool supersedes(const compact_interval_t &other) {
+ for (auto &&i: acting) {
+ if (!other.acting.count(i))
+ return false;
+ }
+ return true;
+ }
+ void dump(Formatter *f) const {
+ f->open_object_section("compact_interval_t");
+ f->dump_stream("first") << first;
+ f->dump_stream("last") << last;
+ f->dump_stream("acting") << acting;
+ f->close_section();
+ }
+ void encode(ceph::buffer::list &bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(first, bl);
+ encode(last, bl);
+ encode(acting, bl);
+ ENCODE_FINISH(bl);
+ }
+ void decode(ceph::buffer::list::const_iterator &bl) {
+ DECODE_START(1, bl);
+ decode(first, bl);
+ decode(last, bl);
+ decode(acting, bl);
+ DECODE_FINISH(bl);
+ }
+ static void generate_test_instances(list<compact_interval_t*> & o) {
+ /* Not going to be used, we'll generate pi_compact_rep directly */
+ }
+};
+ostream &operator<<(ostream &o, const compact_interval_t &rhs)
+{
+ return o << "([" << rhs.first << "," << rhs.last
+ << "] acting " << rhs.acting << ")";
+}
+WRITE_CLASS_ENCODER(compact_interval_t)
+
+class pi_compact_rep : public PastIntervals::interval_rep {
+ epoch_t first = 0;
+ epoch_t last = 0; // inclusive
+ set<pg_shard_t> all_participants;
+ list<compact_interval_t> intervals;
+ pi_compact_rep(
+ bool ec_pool,
+ std::list<PastIntervals::pg_interval_t> &&intervals) {
+ for (auto &&i: intervals)
+ add_interval(ec_pool, i);
+ }
+public:
+ pi_compact_rep() = default;
+ pi_compact_rep(const pi_compact_rep &) = default;
+ pi_compact_rep(pi_compact_rep &&) = default;
+ pi_compact_rep &operator=(const pi_compact_rep &) = default;
+ pi_compact_rep &operator=(pi_compact_rep &&) = default;
+
+ size_t size() const override { return intervals.size(); }
+ bool empty() const override {
+ return first > last || (first == 0 && last == 0);
+ }
+ void clear() override {
+ *this = pi_compact_rep();
+ }
+ pair<epoch_t, epoch_t> get_bounds() const override {
+ return make_pair(first, last + 1);
+ }
+ void adjust_start_backwards(epoch_t last_epoch_clean) override {
+ first = last_epoch_clean;
+ }
+
+ set<pg_shard_t> get_all_participants(
+ bool ec_pool) const override {
+ return all_participants;
+ }
+ void add_interval(
+ bool ec_pool, const PastIntervals::pg_interval_t &interval) override {
+ if (first == 0)
+ first = interval.first;
+ ceph_assert(interval.last > last);
+ last = interval.last;
+ set<pg_shard_t> acting;
+ for (unsigned i = 0; i < interval.acting.size(); ++i) {
+ if (interval.acting[i] == CRUSH_ITEM_NONE)
+ continue;
+ acting.insert(
+ pg_shard_t(
+ interval.acting[i],
+ ec_pool ? shard_id_t(i) : shard_id_t::NO_SHARD));
+ }
+ all_participants.insert(acting.begin(), acting.end());
+ if (!interval.maybe_went_rw)
+ return;
+ intervals.push_back(
+ compact_interval_t{interval.first, interval.last, acting});
+ auto plast = intervals.end();
+ --plast;
+ for (auto cur = intervals.begin(); cur != plast; ) {
+ if (plast->supersedes(*cur)) {
+ intervals.erase(cur++);
+ } else {
+ ++cur;
+ }
+ }
+ }
+ unique_ptr<PastIntervals::interval_rep> clone() const override {
+ return unique_ptr<PastIntervals::interval_rep>(new pi_compact_rep(*this));
+ }
+ ostream &print(ostream &out) const override {
+ return out << "([" << first << "," << last
+ << "] all_participants=" << all_participants
+ << " intervals=" << intervals << ")";
+ }
+ void encode(ceph::buffer::list &bl) const override {
+ ENCODE_START(1, 1, bl);
+ encode(first, bl);
+ encode(last, bl);
+ encode(all_participants, bl);
+ encode(intervals, bl);
+ ENCODE_FINISH(bl);
+ }
+ void decode(ceph::buffer::list::const_iterator &bl) override {
+ DECODE_START(1, bl);
+ decode(first, bl);
+ decode(last, bl);
+ decode(all_participants, bl);
+ decode(intervals, bl);
+ DECODE_FINISH(bl);
+ }
+ void dump(Formatter *f) const override {
+ f->open_object_section("PastIntervals::compact_rep");
+ f->dump_stream("first") << first;
+ f->dump_stream("last") << last;
+ f->open_array_section("all_participants");
+ for (auto& i : all_participants) {
+ f->dump_object("pg_shard", i);
+ }
+ f->close_section();
+ f->open_array_section("intervals");
+ for (auto &&i: intervals) {
+ i.dump(f);
+ }
+ f->close_section();
+ f->close_section();
+ }
+ static void generate_test_instances(list<pi_compact_rep*> &o) {
+ using ival = PastIntervals::pg_interval_t;
+ using ivallst = std::list<ival>;
+ o.push_back(
+ new pi_compact_rep(
+ true, ivallst
+ { ival{{0, 1, 2}, {0, 1, 2}, 10, 20, true, 0, 0}
+ , ival{{ 1, 2}, { 1, 2}, 21, 30, true, 1, 1}
+ , ival{{ 2}, { 2}, 31, 35, false, 2, 2}
+ , ival{{0, 2}, {0, 2}, 36, 50, true, 0, 0}
+ }));
+ o.push_back(
+ new pi_compact_rep(
+ false, ivallst
+ { ival{{0, 1, 2}, {0, 1, 2}, 10, 20, true, 0, 0}
+ , ival{{ 1, 2}, { 1, 2}, 21, 30, true, 1, 1}
+ , ival{{ 2}, { 2}, 31, 35, false, 2, 2}
+ , ival{{0, 2}, {0, 2}, 36, 50, true, 0, 0}
+ }));
+ o.push_back(
+ new pi_compact_rep(
+ true, ivallst
+ { ival{{2, 1, 0}, {2, 1, 0}, 10, 20, true, 1, 1}
+ , ival{{ 0, 2}, { 0, 2}, 21, 30, true, 0, 0}
+ , ival{{ 0, 2}, {2, 0}, 31, 35, true, 2, 2}
+ , ival{{ 0, 2}, { 0, 2}, 36, 50, true, 0, 0}
+ }));
+ }
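+  // Walks the maybe_went_rw intervals from newest to oldest, invoking f on
+  // each, and stops at the first interval that ended before the given les
+  // (presumably last_epoch_started).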
+ void iterate_mayberw_back_to(
+ epoch_t les,
+ std::function<void(epoch_t, const set<pg_shard_t> &)> &&f) const override {
+ for (auto i = intervals.rbegin(); i != intervals.rend(); ++i) {
+ if (i->last < les)
+ break;
+ f(i->first, i->acting);
+ }
+ }
+ virtual ~pi_compact_rep() override {}
+};
+WRITE_CLASS_ENCODER(pi_compact_rep)
+
+PastIntervals::PastIntervals()
+{
+ past_intervals.reset(new pi_compact_rep);
+}
+
+PastIntervals::PastIntervals(const PastIntervals &rhs)
+ : past_intervals(rhs.past_intervals ?
+ rhs.past_intervals->clone() :
+ nullptr) {}
+
+PastIntervals &PastIntervals::operator=(const PastIntervals &rhs)
+{
+ PastIntervals other(rhs);
+ swap(other);
+ return *this;
+}
+
+ostream& operator<<(ostream& out, const PastIntervals &i)
+{
+ if (i.past_intervals) {
+ return i.past_intervals->print(out);
+ } else {
+ return out << "(empty)";
+ }
+}
+
+ostream& operator<<(ostream& out, const PastIntervals::PriorSet &i)
+{
+ return out << "PriorSet("
+ << "ec_pool: " << i.ec_pool
+ << ", probe: " << i.probe
+ << ", down: " << i.down
+ << ", blocked_by: " << i.blocked_by
+ << ", pg_down: " << i.pg_down
+ << ")";
+}
+
+void PastIntervals::decode(ceph::buffer::list::const_iterator &bl)
+{
+ DECODE_START(1, bl);
+ __u8 type = 0;
+ decode(type, bl);
+ switch (type) {
+ case 0:
+ break;
+ case 1:
+ ceph_abort_msg("pi_simple_rep support removed post-luminous");
+ break;
+ case 2:
+ past_intervals.reset(new pi_compact_rep);
+ past_intervals->decode(bl);
+ break;
+ }
+ DECODE_FINISH(bl);
+}
+
+void PastIntervals::generate_test_instances(list<PastIntervals*> &o)
+{
+ {
+ list<pi_compact_rep *> compact;
+ pi_compact_rep::generate_test_instances(compact);
+ for (auto &&i: compact) {
+ // takes ownership of contents
+ o.push_back(new PastIntervals(i));
+ }
+ }
+ return;
+}
+
+bool PastIntervals::is_new_interval(
+ int old_acting_primary,
+ int new_acting_primary,
+ const vector<int> &old_acting,
+ const vector<int> &new_acting,
+ int old_up_primary,
+ int new_up_primary,
+ const vector<int> &old_up,
+ const vector<int> &new_up,
+ int old_size,
+ int new_size,
+ int old_min_size,
+ int new_min_size,
+ unsigned old_pg_num,
+ unsigned new_pg_num,
+ unsigned old_pg_num_pending,
+ unsigned new_pg_num_pending,
+ bool old_sort_bitwise,
+ bool new_sort_bitwise,
+ bool old_recovery_deletes,
+ bool new_recovery_deletes,
+ uint32_t old_crush_count,
+ uint32_t new_crush_count,
+ uint32_t old_crush_target,
+ uint32_t new_crush_target,
+ uint32_t old_crush_barrier,
+ uint32_t new_crush_barrier,
+ int32_t old_crush_member,
+ int32_t new_crush_member,
+ pg_t pgid) {
+ return old_acting_primary != new_acting_primary ||
+ new_acting != old_acting ||
+ old_up_primary != new_up_primary ||
+ new_up != old_up ||
+ old_min_size != new_min_size ||
+ old_size != new_size ||
+ pgid.is_split(old_pg_num, new_pg_num, 0) ||
+ // (is or was) pre-merge source
+ pgid.is_merge_source(old_pg_num_pending, new_pg_num_pending, 0) ||
+ pgid.is_merge_source(new_pg_num_pending, old_pg_num_pending, 0) ||
+ // merge source
+ pgid.is_merge_source(old_pg_num, new_pg_num, 0) ||
+ // (is or was) pre-merge target
+ pgid.is_merge_target(old_pg_num_pending, new_pg_num_pending) ||
+ pgid.is_merge_target(new_pg_num_pending, old_pg_num_pending) ||
+ // merge target
+ pgid.is_merge_target(old_pg_num, new_pg_num) ||
+ old_sort_bitwise != new_sort_bitwise ||
+ old_recovery_deletes != new_recovery_deletes ||
+ old_crush_count != new_crush_count ||
+ old_crush_target != new_crush_target ||
+ old_crush_barrier != new_crush_barrier ||
+ old_crush_member != new_crush_member;
+}
+
+bool PastIntervals::is_new_interval(
+ int old_acting_primary,
+ int new_acting_primary,
+ const vector<int> &old_acting,
+ const vector<int> &new_acting,
+ int old_up_primary,
+ int new_up_primary,
+ const vector<int> &old_up,
+ const vector<int> &new_up,
+ const OSDMap *osdmap,
+ const OSDMap *lastmap,
+ pg_t pgid)
+{
+ const pg_pool_t *plast = lastmap->get_pg_pool(pgid.pool());
+ if (!plast) {
+    return false; // after the pool is deleted there are no more interval changes
+ }
+ const pg_pool_t *pi = osdmap->get_pg_pool(pgid.pool());
+ if (!pi) {
+ return true; // pool was deleted this epoch -> (final!) interval change
+ }
+ return
+ is_new_interval(old_acting_primary,
+ new_acting_primary,
+ old_acting,
+ new_acting,
+ old_up_primary,
+ new_up_primary,
+ old_up,
+ new_up,
+ plast->size,
+ pi->size,
+ plast->min_size,
+ pi->min_size,
+ plast->get_pg_num(),
+ pi->get_pg_num(),
+ plast->get_pg_num_pending(),
+ pi->get_pg_num_pending(),
+ lastmap->test_flag(CEPH_OSDMAP_SORTBITWISE),
+ osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE),
+ lastmap->test_flag(CEPH_OSDMAP_RECOVERY_DELETES),
+ osdmap->test_flag(CEPH_OSDMAP_RECOVERY_DELETES),
+ plast->peering_crush_bucket_count, pi->peering_crush_bucket_count,
+ plast->peering_crush_bucket_target, pi->peering_crush_bucket_target,
+ plast->peering_crush_bucket_barrier, pi->peering_crush_bucket_barrier,
+ plast->peering_crush_mandatory_member, pi->peering_crush_mandatory_member,
+ pgid);
+}
+
+bool PastIntervals::check_new_interval(
+ int old_acting_primary,
+ int new_acting_primary,
+ const vector<int> &old_acting,
+ const vector<int> &new_acting,
+ int old_up_primary,
+ int new_up_primary,
+ const vector<int> &old_up,
+ const vector<int> &new_up,
+ epoch_t same_interval_since,
+ epoch_t last_epoch_clean,
+ const OSDMap *osdmap,
+ const OSDMap *lastmap,
+ pg_t pgid,
+ const IsPGRecoverablePredicate &could_have_gone_active,
+ PastIntervals *past_intervals,
+ std::ostream *out)
+{
+ /*
+   * We have to be careful to gracefully deal with situations like the
+   * following. Say we have a power outage or something that takes out both
+ * OSDs, but the monitor doesn't mark them down in the same epoch.
+ * The history may look like
+ *
+ * 1: A B
+ * 2: B
+ * 3: let's say B dies for good, too (say, from the power spike)
+ * 4: A
+ *
+ * which makes it look like B may have applied updates to the PG
+ * that we need in order to proceed. This sucks...
+ *
+ * To minimize the risk of this happening, we CANNOT go active if
+ * _any_ OSDs in the prior set are down until we send an MOSDAlive
+ * to the monitor such that the OSDMap sets osd_up_thru to an epoch.
+ * Then, we have something like
+ *
+ * 1: A B
+ * 2: B up_thru[B]=0
+ * 3:
+ * 4: A
+ *
+   * -> we can ignore B, because it couldn't have gone active (up_thru still 0).
+ *
+ * or,
+ *
+ * 1: A B
+ * 2: B up_thru[B]=0
+ * 3: B up_thru[B]=2
+ * 4:
+ * 5: A
+ *
+   * -> we must wait for B, because it was alive through 2, and could have
+ * written to the pg.
+ *
+ * If B is really dead, then an administrator will need to manually
+ * intervene by marking the OSD as "lost."
+ */
+
+ // remember past interval
+ // NOTE: a change in the up set primary triggers an interval
+ // change, even though the interval members in the pg_interval_t
+ // do not change.
+ ceph_assert(past_intervals);
+ ceph_assert(past_intervals->past_intervals);
+ if (is_new_interval(
+ old_acting_primary,
+ new_acting_primary,
+ old_acting,
+ new_acting,
+ old_up_primary,
+ new_up_primary,
+ old_up,
+ new_up,
+ osdmap,
+ lastmap,
+ pgid)) {
+ pg_interval_t i;
+ i.first = same_interval_since;
+ i.last = osdmap->get_epoch() - 1;
+ ceph_assert(i.first <= i.last);
+ i.acting = old_acting;
+ i.up = old_up;
+ i.primary = old_acting_primary;
+ i.up_primary = old_up_primary;
+
+ unsigned num_acting = 0;
+ for (auto p = i.acting.cbegin(); p != i.acting.cend(); ++p)
+ if (*p != CRUSH_ITEM_NONE)
+ ++num_acting;
+
+ ceph_assert(lastmap->get_pools().count(pgid.pool()));
+ const pg_pool_t& old_pg_pool = lastmap->get_pools().find(pgid.pool())->second;
+ set<pg_shard_t> old_acting_shards;
+ old_pg_pool.convert_to_pg_shards(old_acting, &old_acting_shards);
+
+ if (num_acting &&
+ i.primary != -1 &&
+ num_acting >= old_pg_pool.min_size &&
+ (!old_pg_pool.is_stretch_pool() ||
+ old_pg_pool.stretch_set_can_peer(old_acting, *lastmap, out)) &&
+ could_have_gone_active(old_acting_shards)) {
+ if (out)
+ *out << __func__ << " " << i
+ << " up_thru " << lastmap->get_up_thru(i.primary)
+ << " up_from " << lastmap->get_up_from(i.primary)
+ << " last_epoch_clean " << last_epoch_clean;
+ if (lastmap->get_up_thru(i.primary) >= i.first &&
+ lastmap->get_up_from(i.primary) <= i.first) {
+ i.maybe_went_rw = true;
+ if (out)
+ *out << " " << i
+ << " : primary up " << lastmap->get_up_from(i.primary)
+ << "-" << lastmap->get_up_thru(i.primary)
+ << " includes interval"
+ << std::endl;
+ } else if (last_epoch_clean >= i.first &&
+ last_epoch_clean <= i.last) {
+ // If the last_epoch_clean is included in this interval, then
+ // the pg must have been rw (for recovery to have completed).
+ // This is important because we won't know the _real_
+ // first_epoch because we stop at last_epoch_clean, and we
+ // don't want the oldest interval to randomly have
+ // maybe_went_rw false depending on the relative up_thru vs
+ // last_epoch_clean timing.
+ i.maybe_went_rw = true;
+ if (out)
+ *out << " " << i
+ << " : includes last_epoch_clean " << last_epoch_clean
+ << " and presumed to have been rw"
+ << std::endl;
+ } else {
+ i.maybe_went_rw = false;
+ if (out)
+ *out << " " << i
+ << " : primary up " << lastmap->get_up_from(i.primary)
+ << "-" << lastmap->get_up_thru(i.primary)
+ << " does not include interval"
+ << std::endl;
+ }
+ } else {
+ i.maybe_went_rw = false;
+ if (out)
+ *out << __func__ << " " << i << " : acting set is too small" << std::endl;
+ }
+ past_intervals->past_intervals->add_interval(old_pg_pool.is_erasure(), i);
+ return true;
+ } else {
+ return false;
+ }
+}
+
+// true if the given map affects the prior set
+bool PastIntervals::PriorSet::affected_by_map(
+ const OSDMap &osdmap,
+ const DoutPrefixProvider *dpp) const
+{
+ for (auto p = probe.begin(); p != probe.end(); ++p) {
+ int o = p->osd;
+
+ // did someone in the prior set go down?
+ if (osdmap.is_down(o) && down.count(o) == 0) {
+ ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " now down" << dendl;
+ return true;
+ }
+
+ // did a down osd in cur get (re)marked as lost?
+ auto r = blocked_by.find(o);
+ if (r != blocked_by.end()) {
+ if (!osdmap.exists(o)) {
+ ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " no longer exists" << dendl;
+ return true;
+ }
+ if (osdmap.get_info(o).lost_at != r->second) {
+ ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " (re)marked as lost" << dendl;
+ return true;
+ }
+ }
+ }
+
+ // did someone in the prior down set go up?
+ for (auto p = down.cbegin(); p != down.cend(); ++p) {
+ int o = *p;
+
+ if (osdmap.is_up(o)) {
+ ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " now up" << dendl;
+ return true;
+ }
+
+ // did someone in the prior set get lost or destroyed?
+ if (!osdmap.exists(o)) {
+ ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " no longer exists" << dendl;
+ return true;
+ }
+ // did a down osd in down get (re)marked as lost?
+ auto r = blocked_by.find(o);
+ if (r != blocked_by.end()) {
+ if (osdmap.get_info(o).lost_at != r->second) {
+ ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " (re)marked as lost" << dendl;
+ return true;
+ }
+ }
+ }
+
+ return false;
+}
+
+ostream& operator<<(ostream& out, const PastIntervals::pg_interval_t& i)
+{
+ out << "interval(" << i.first << "-" << i.last
+ << " up " << i.up << "(" << i.up_primary << ")"
+ << " acting " << i.acting << "(" << i.primary << ")";
+ if (i.maybe_went_rw)
+ out << " maybe_went_rw";
+ out << ")";
+ return out;
+}
+
+
+
+// -- pg_query_t --
+
+void pg_query_t::encode(ceph::buffer::list &bl, uint64_t features) const {
+ ENCODE_START(3, 3, bl);
+ encode(type, bl);
+ encode(since, bl);
+ history.encode(bl);
+ encode(epoch_sent, bl);
+ encode(to, bl);
+ encode(from, bl);
+ ENCODE_FINISH(bl);
+}
+
+void pg_query_t::decode(ceph::buffer::list::const_iterator &bl) {
+ DECODE_START(3, bl);
+ decode(type, bl);
+ decode(since, bl);
+ history.decode(bl);
+ decode(epoch_sent, bl);
+ decode(to, bl);
+ decode(from, bl);
+ DECODE_FINISH(bl);
+}
+
+void pg_query_t::dump(Formatter *f) const
+{
+ f->dump_int("from", from);
+ f->dump_int("to", to);
+ f->dump_string("type", get_type_name());
+ f->dump_stream("since") << since;
+ f->dump_stream("epoch_sent") << epoch_sent;
+ f->open_object_section("history");
+ history.dump(f);
+ f->close_section();
+}
+void pg_query_t::generate_test_instances(list<pg_query_t*>& o)
+{
+ o.push_back(new pg_query_t());
+ list<pg_history_t*> h;
+ pg_history_t::generate_test_instances(h);
+ o.push_back(new pg_query_t(pg_query_t::INFO, shard_id_t(1), shard_id_t(2), *h.back(), 4));
+ o.push_back(new pg_query_t(pg_query_t::MISSING, shard_id_t(2), shard_id_t(3), *h.back(), 4));
+ o.push_back(new pg_query_t(pg_query_t::LOG, shard_id_t(0), shard_id_t(0),
+ eversion_t(4, 5), *h.back(), 4));
+ o.push_back(new pg_query_t(pg_query_t::FULLLOG,
+ shard_id_t::NO_SHARD, shard_id_t::NO_SHARD,
+ *h.back(), 5));
+}
+
+// -- pg_lease_t --
+
+void pg_lease_t::encode(bufferlist& bl) const
+{
+ ENCODE_START(1, 1, bl);
+ encode(readable_until, bl);
+ encode(readable_until_ub, bl);
+ encode(interval, bl);
+ ENCODE_FINISH(bl);
+}
+
+void pg_lease_t::decode(bufferlist::const_iterator& p)
+{
+ DECODE_START(1, p);
+ decode(readable_until, p);
+ decode(readable_until_ub, p);
+ decode(interval, p);
+ DECODE_FINISH(p);
+}
+
+void pg_lease_t::dump(Formatter *f) const
+{
+ f->dump_stream("readable_until") << readable_until;
+ f->dump_stream("readable_until_ub") << readable_until_ub;
+ f->dump_stream("interval") << interval;
+}
+
+void pg_lease_t::generate_test_instances(std::list<pg_lease_t*>& o)
+{
+ o.push_back(new pg_lease_t());
+ o.push_back(new pg_lease_t());
+ o.back()->readable_until = make_timespan(1.5);
+ o.back()->readable_until_ub = make_timespan(3.4);
+ o.back()->interval = make_timespan(1.0);
+}
+
+// -- pg_lease_ack_t --
+
+void pg_lease_ack_t::encode(bufferlist& bl) const
+{
+ ENCODE_START(1, 1, bl);
+ encode(readable_until_ub, bl);
+ ENCODE_FINISH(bl);
+}
+
+void pg_lease_ack_t::decode(bufferlist::const_iterator& p)
+{
+ DECODE_START(1, p);
+ decode(readable_until_ub, p);
+ DECODE_FINISH(p);
+}
+
+void pg_lease_ack_t::dump(Formatter *f) const
+{
+ f->dump_stream("readable_until_ub") << readable_until_ub;
+}
+
+void pg_lease_ack_t::generate_test_instances(std::list<pg_lease_ack_t*>& o)
+{
+ o.push_back(new pg_lease_ack_t());
+ o.push_back(new pg_lease_ack_t());
+ o.back()->readable_until_ub = make_timespan(3.4);
+}
+
+
+// -- ObjectModDesc --
+void ObjectModDesc::visit(Visitor *visitor) const
+{
+ auto bp = bl.cbegin();
+ try {
+ while (!bp.end()) {
+ DECODE_START(max_required_version, bp);
+ uint8_t code;
+ decode(code, bp);
+ switch (code) {
+ case APPEND: {
+ uint64_t size;
+ decode(size, bp);
+ visitor->append(size);
+ break;
+ }
+ case SETATTRS: {
+ map<string, std::optional<ceph::buffer::list> > attrs;
+ decode(attrs, bp);
+ visitor->setattrs(attrs);
+ break;
+ }
+ case DELETE: {
+ version_t old_version;
+ decode(old_version, bp);
+ visitor->rmobject(old_version);
+ break;
+ }
+ case CREATE: {
+ visitor->create();
+ break;
+ }
+ case UPDATE_SNAPS: {
+ set<snapid_t> snaps;
+ decode(snaps, bp);
+ visitor->update_snaps(snaps);
+ break;
+ }
+ case TRY_DELETE: {
+ version_t old_version;
+ decode(old_version, bp);
+ visitor->try_rmobject(old_version);
+ break;
+ }
+ case ROLLBACK_EXTENTS: {
+ vector<pair<uint64_t, uint64_t> > extents;
+ version_t gen;
+ decode(gen, bp);
+ decode(extents, bp);
+      visitor->rollback_extents(gen, extents);
+ break;
+ }
+ default:
+ ceph_abort_msg("Invalid rollback code");
+ }
+ DECODE_FINISH(bp);
+ }
+ } catch (...) {
+ ceph_abort_msg("Invalid encoding");
+ }
+}
+
+struct DumpVisitor : public ObjectModDesc::Visitor {
+ Formatter *f;
+ explicit DumpVisitor(Formatter *f) : f(f) {}
+ void append(uint64_t old_size) override {
+ f->open_object_section("op");
+ f->dump_string("code", "APPEND");
+ f->dump_unsigned("old_size", old_size);
+ f->close_section();
+ }
+ void setattrs(map<string, std::optional<ceph::buffer::list> > &attrs) override {
+ f->open_object_section("op");
+ f->dump_string("code", "SETATTRS");
+ f->open_array_section("attrs");
+ for (auto i = attrs.begin(); i != attrs.end(); ++i) {
+ f->dump_string("attr_name", i->first);
+ }
+ f->close_section();
+ f->close_section();
+ }
+ void rmobject(version_t old_version) override {
+ f->open_object_section("op");
+ f->dump_string("code", "RMOBJECT");
+ f->dump_unsigned("old_version", old_version);
+ f->close_section();
+ }
+ void try_rmobject(version_t old_version) override {
+ f->open_object_section("op");
+ f->dump_string("code", "TRY_RMOBJECT");
+ f->dump_unsigned("old_version", old_version);
+ f->close_section();
+ }
+ void create() override {
+ f->open_object_section("op");
+ f->dump_string("code", "CREATE");
+ f->close_section();
+ }
+ void update_snaps(const set<snapid_t> &snaps) override {
+ f->open_object_section("op");
+ f->dump_string("code", "UPDATE_SNAPS");
+ f->dump_stream("snaps") << snaps;
+ f->close_section();
+ }
+ void rollback_extents(
+ version_t gen,
+ const vector<pair<uint64_t, uint64_t> > &extents) override {
+ f->open_object_section("op");
+ f->dump_string("code", "ROLLBACK_EXTENTS");
+ f->dump_unsigned("gen", gen);
+ f->dump_stream("snaps") << extents;
+ f->close_section();
+ }
+};
+
+void ObjectModDesc::dump(Formatter *f) const
+{
+ f->open_object_section("object_mod_desc");
+ f->dump_bool("can_local_rollback", can_local_rollback);
+ f->dump_bool("rollback_info_completed", rollback_info_completed);
+ {
+ f->open_array_section("ops");
+ DumpVisitor vis(f);
+ visit(&vis);
+ f->close_section();
+ }
+ f->close_section();
+}
+
+void ObjectModDesc::generate_test_instances(list<ObjectModDesc*>& o)
+{
+ map<string, std::optional<ceph::buffer::list> > attrs;
+ attrs[OI_ATTR];
+ attrs[SS_ATTR];
+ attrs["asdf"];
+ o.push_back(new ObjectModDesc());
+ o.back()->append(100);
+ o.back()->setattrs(attrs);
+ o.push_back(new ObjectModDesc());
+ o.back()->rmobject(1001);
+ o.push_back(new ObjectModDesc());
+ o.back()->create();
+ o.back()->setattrs(attrs);
+ o.push_back(new ObjectModDesc());
+ o.back()->create();
+ o.back()->setattrs(attrs);
+ o.back()->mark_unrollbackable();
+ o.back()->append(1000);
+}
+
+void ObjectModDesc::encode(ceph::buffer::list &_bl) const
+{
+ ENCODE_START(max_required_version, max_required_version, _bl);
+ encode(can_local_rollback, _bl);
+ encode(rollback_info_completed, _bl);
+ encode(bl, _bl);
+ ENCODE_FINISH(_bl);
+}
+void ObjectModDesc::decode(ceph::buffer::list::const_iterator &_bl)
+{
+ DECODE_START(2, _bl);
+ max_required_version = struct_v;
+ decode(can_local_rollback, _bl);
+ decode(rollback_info_completed, _bl);
+ decode(bl, _bl);
+ // ensure bl does not pin a larger ceph::buffer in memory
+ bl.rebuild();
+ bl.reassign_to_mempool(mempool::mempool_osd_pglog);
+ DECODE_FINISH(_bl);
+}
+
+std::atomic<uint32_t> ObjectCleanRegions::max_num_intervals = {10};
+
+void ObjectCleanRegions::set_max_num_intervals(uint32_t num)
+{
+ max_num_intervals = num;
+}
+
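+// trim() caps the number of tracked clean intervals at max_num_intervals by
+// repeatedly discarding the shortest remaining clean interval. Dropping a
+// clean interval only shrinks the clean set (that region is then treated as
+// dirty), so this presumably trades some extra recovery copying for a
+// bounded representation.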
+void ObjectCleanRegions::trim()
+{
+  while (clean_offsets.num_intervals() > max_num_intervals) {
+ typename interval_set<uint64_t>::iterator shortest_interval = clean_offsets.begin();
+ if (shortest_interval == clean_offsets.end())
+ break;
+ for (typename interval_set<uint64_t>::iterator it = clean_offsets.begin();
+ it != clean_offsets.end();
+ ++it) {
+ if (it.get_len() < shortest_interval.get_len())
+ shortest_interval = it;
+ }
+ clean_offsets.erase(shortest_interval);
+ }
+}
+
+void ObjectCleanRegions::merge(const ObjectCleanRegions &other)
+{
+ clean_offsets.intersection_of(other.clean_offsets);
+ clean_omap = clean_omap && other.clean_omap;
+ trim();
+}
+
+void ObjectCleanRegions::mark_data_region_dirty(uint64_t offset, uint64_t len)
+{
+ interval_set<uint64_t> clean_region;
+ clean_region.insert(0, (uint64_t)-1);
+ clean_region.erase(offset, len);
+ clean_offsets.intersection_of(clean_region);
+ trim();
+}
+
+bool ObjectCleanRegions::is_clean_region(uint64_t offset, uint64_t len) const
+{
+ return clean_offsets.contains(offset, len);
+}
+
+void ObjectCleanRegions::mark_omap_dirty()
+{
+ clean_omap = false;
+}
+
+void ObjectCleanRegions::mark_object_new()
+{
+ new_object = true;
+}
+
+void ObjectCleanRegions::mark_fully_dirty()
+{
+ mark_data_region_dirty(0, (uint64_t)-1);
+ mark_omap_dirty();
+ mark_object_new();
+}
+
+interval_set<uint64_t> ObjectCleanRegions::get_dirty_regions() const
+{
+ interval_set<uint64_t> dirty_region;
+ dirty_region.insert(0, (uint64_t)-1);
+ dirty_region.subtract(clean_offsets);
+ return dirty_region;
+}
+
+bool ObjectCleanRegions::omap_is_dirty() const
+{
+ return !clean_omap;
+}
+
+bool ObjectCleanRegions::object_is_exist() const
+{
+ return !new_object;
+}
+
+void ObjectCleanRegions::encode(bufferlist &bl) const
+{
+ ENCODE_START(1, 1, bl);
+ using ceph::encode;
+ encode(clean_offsets, bl);
+ encode(clean_omap, bl);
+ encode(new_object, bl);
+ ENCODE_FINISH(bl);
+}
+
+void ObjectCleanRegions::decode(bufferlist::const_iterator &bl)
+{
+ DECODE_START(1, bl);
+ using ceph::decode;
+ decode(clean_offsets, bl);
+ decode(clean_omap, bl);
+ decode(new_object, bl);
+ DECODE_FINISH(bl);
+}
+
+void ObjectCleanRegions::dump(Formatter *f) const
+{
+ f->open_object_section("object_clean_regions");
+ f->dump_stream("clean_offsets") << clean_offsets;
+ f->dump_bool("clean_omap", clean_omap);
+ f->dump_bool("new_object", new_object);
+ f->close_section();
+}
+
+void ObjectCleanRegions::generate_test_instances(list<ObjectCleanRegions*>& o)
+{
+ o.push_back(new ObjectCleanRegions());
+ o.push_back(new ObjectCleanRegions());
+ o.back()->mark_data_region_dirty(4096, 40960);
+ o.back()->mark_omap_dirty();
+ o.back()->mark_object_new();
+}
+
+ostream& operator<<(ostream& out, const ObjectCleanRegions& ocr)
+{
+ return out << "clean_offsets: " << ocr.clean_offsets
+ << ", clean_omap: " << ocr.clean_omap
+ << ", new_object: " << ocr.new_object;
+}
+
+// -- pg_log_entry_t --
+
+string pg_log_entry_t::get_key_name() const
+{
+ return version.get_key_name();
+}
+
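+// encode_with_checksum() wraps the normal encoding: the entry is first
+// encoded into its own bufferlist, and that bufferlist plus its crc32c are
+// appended to the output; decode_with_checksum() below verifies the crc
+// before decoding the nested payload.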
+void pg_log_entry_t::encode_with_checksum(ceph::buffer::list& bl) const
+{
+ using ceph::encode;
+ ceph::buffer::list ebl(sizeof(*this)*2);
+ this->encode(ebl);
+ __u32 crc = ebl.crc32c(0);
+ encode(ebl, bl);
+ encode(crc, bl);
+}
+
+void pg_log_entry_t::decode_with_checksum(ceph::buffer::list::const_iterator& p)
+{
+ using ceph::decode;
+ ceph::buffer::list bl;
+ decode(bl, p);
+ __u32 crc;
+ decode(crc, p);
+ if (crc != bl.crc32c(0))
+ throw ceph::buffer::malformed_input("bad checksum on pg_log_entry_t");
+ auto q = bl.cbegin();
+ this->decode(q);
+}
+
+void pg_log_entry_t::encode(ceph::buffer::list &bl) const
+{
+ ENCODE_START(14, 4, bl);
+ encode(op, bl);
+ encode(soid, bl);
+ encode(version, bl);
+
+ /**
+ * Added with reverting_to:
+ * Previous code used prior_version to encode
+ * what we now call reverting_to. This will
+ * allow older code to decode reverting_to
+ * into prior_version as expected.
+ */
+ if (op == LOST_REVERT)
+ encode(reverting_to, bl);
+ else
+ encode(prior_version, bl);
+
+ encode(reqid, bl);
+ encode(mtime, bl);
+ if (op == LOST_REVERT)
+ encode(prior_version, bl);
+ encode(snaps, bl);
+ encode(user_version, bl);
+ encode(mod_desc, bl);
+ encode(extra_reqids, bl);
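+  // return_code is encoded in two possible positions for compatibility:
+  // ERROR entries carry it here, where v11+ decoders expect it; all other
+  // ops get it further below, at the v14 position next to op_returns (see
+  // decode()).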
+ if (op == ERROR)
+ encode(return_code, bl);
+ if (!extra_reqids.empty())
+ encode(extra_reqid_return_codes, bl);
+ encode(clean_regions, bl);
+ if (op != ERROR)
+ encode(return_code, bl);
+ encode(op_returns, bl);
+ ENCODE_FINISH(bl);
+}
+
+void pg_log_entry_t::decode(ceph::buffer::list::const_iterator &bl)
+{
+ DECODE_START_LEGACY_COMPAT_LEN(14, 4, 4, bl);
+ decode(op, bl);
+ if (struct_v < 2) {
+ sobject_t old_soid;
+ decode(old_soid, bl);
+ soid.oid = old_soid.oid;
+ soid.snap = old_soid.snap;
+ invalid_hash = true;
+ } else {
+ decode(soid, bl);
+ }
+ if (struct_v < 3)
+ invalid_hash = true;
+ decode(version, bl);
+
+ if (struct_v >= 6 && op == LOST_REVERT)
+ decode(reverting_to, bl);
+ else
+ decode(prior_version, bl);
+
+ decode(reqid, bl);
+
+ decode(mtime, bl);
+ if (struct_v < 5)
+ invalid_pool = true;
+
+ if (op == LOST_REVERT) {
+ if (struct_v >= 6) {
+ decode(prior_version, bl);
+ } else {
+ reverting_to = prior_version;
+ }
+ }
+ if (struct_v >= 7 || // for v >= 7, this is for all ops.
+ op == CLONE) { // for v < 7, it's only present for CLONE.
+ decode(snaps, bl);
+ // ensure snaps does not pin a larger ceph::buffer in memory
+ snaps.rebuild();
+ snaps.reassign_to_mempool(mempool::mempool_osd_pglog);
+ }
+
+ if (struct_v >= 8)
+ decode(user_version, bl);
+ else
+ user_version = version.version;
+
+ if (struct_v >= 9)
+ decode(mod_desc, bl);
+ else
+ mod_desc.mark_unrollbackable();
+ if (struct_v >= 10)
+ decode(extra_reqids, bl);
+ if (struct_v >= 11 && op == ERROR)
+ decode(return_code, bl);
+ if (struct_v >= 12 && !extra_reqids.empty())
+ decode(extra_reqid_return_codes, bl);
+ if (struct_v >= 13)
+ decode(clean_regions, bl);
+ else
+ clean_regions.mark_fully_dirty();
+ if (struct_v >= 14) {
+ if (op != ERROR) {
+ decode(return_code, bl);
+ }
+ decode(op_returns, bl);
+ }
+ DECODE_FINISH(bl);
+}
+
+void pg_log_entry_t::dump(Formatter *f) const
+{
+ f->dump_string("op", get_op_name());
+ f->dump_stream("object") << soid;
+ f->dump_stream("version") << version;
+ f->dump_stream("prior_version") << prior_version;
+ f->dump_stream("reqid") << reqid;
+ f->open_array_section("extra_reqids");
+ uint32_t idx = 0;
+ for (auto p = extra_reqids.begin();
+ p != extra_reqids.end();
+ ++idx, ++p) {
+ f->open_object_section("extra_reqid");
+ f->dump_stream("reqid") << p->first;
+ f->dump_stream("user_version") << p->second;
+ auto it = extra_reqid_return_codes.find(idx);
+ if (it != extra_reqid_return_codes.end()) {
+ f->dump_int("return_code", it->second);
+ }
+ f->close_section();
+ }
+ f->close_section();
+ f->dump_stream("mtime") << mtime;
+ f->dump_int("return_code", return_code);
+ if (!op_returns.empty()) {
+ f->open_array_section("op_returns");
+ for (auto& i : op_returns) {
+ f->dump_object("op", i);
+ }
+ f->close_section();
+ }
+ if (snaps.length() > 0) {
+ vector<snapid_t> v;
+ ceph::buffer::list c = snaps;
+ auto p = c.cbegin();
+ try {
+ using ceph::decode;
+ decode(v, p);
+ } catch (...) {
+ v.clear();
+ }
+ f->open_object_section("snaps");
+ for (auto p = v.begin(); p != v.end(); ++p)
+ f->dump_unsigned("snap", *p);
+ f->close_section();
+ }
+ {
+ f->open_object_section("mod_desc");
+ mod_desc.dump(f);
+ f->close_section();
+ }
+ {
+ f->open_object_section("clean_regions");
+ clean_regions.dump(f);
+ f->close_section();
+ }
+}
+
+void pg_log_entry_t::generate_test_instances(list<pg_log_entry_t*>& o)
+{
+ o.push_back(new pg_log_entry_t());
+ hobject_t oid(object_t("objname"), "key", 123, 456, 0, "");
+ o.push_back(new pg_log_entry_t(MODIFY, oid, eversion_t(1,2), eversion_t(3,4),
+ 1, osd_reqid_t(entity_name_t::CLIENT(777), 8, 999),
+ utime_t(8,9), 0));
+ o.push_back(new pg_log_entry_t(ERROR, oid, eversion_t(1,2), eversion_t(3,4),
+ 1, osd_reqid_t(entity_name_t::CLIENT(777), 8, 999),
+ utime_t(8,9), -ENOENT));
+}
+
+ostream& operator<<(ostream& out, const pg_log_entry_t& e)
+{
+ out << e.version << " (" << e.prior_version << ") "
+ << std::left << std::setw(8) << e.get_op_name() << ' '
+ << e.soid << " by " << e.reqid << " " << e.mtime
+ << " " << e.return_code;
+ if (!e.op_returns.empty()) {
+ out << " " << e.op_returns;
+ }
+ if (e.snaps.length()) {
+ vector<snapid_t> snaps;
+ ceph::buffer::list c = e.snaps;
+ auto p = c.cbegin();
+ try {
+ decode(snaps, p);
+ } catch (...) {
+ snaps.clear();
+ }
+ out << " snaps " << snaps;
+ }
+ out << " ObjectCleanRegions " << e.clean_regions;
+ return out;
+}
+
+// -- pg_log_dup_t --
+
+std::string pg_log_dup_t::get_key_name() const
+{
+ static const char prefix[] = "dup_";
+ std::string key(36, ' ');
+ memcpy(&key[0], prefix, 4);
+ version.get_key_name(&key[4]);
+ key.resize(35); // remove the null terminator
+ return key;
+}
+
+void pg_log_dup_t::encode(ceph::buffer::list &bl) const
+{
+ ENCODE_START(2, 1, bl);
+ encode(reqid, bl);
+ encode(version, bl);
+ encode(user_version, bl);
+ encode(return_code, bl);
+ encode(op_returns, bl);
+ ENCODE_FINISH(bl);
+}
+
+void pg_log_dup_t::decode(ceph::buffer::list::const_iterator &bl)
+{
+ DECODE_START(2, bl);
+ decode(reqid, bl);
+ decode(version, bl);
+ decode(user_version, bl);
+ decode(return_code, bl);
+ if (struct_v >= 2) {
+ decode(op_returns, bl);
+ }
+ DECODE_FINISH(bl);
+}
+
+void pg_log_dup_t::dump(Formatter *f) const
+{
+ f->dump_stream("reqid") << reqid;
+ f->dump_stream("version") << version;
+ f->dump_stream("user_version") << user_version;
+ f->dump_stream("return_code") << return_code;
+ if (!op_returns.empty()) {
+ f->open_array_section("op_returns");
+ for (auto& i : op_returns) {
+ f->dump_object("op", i);
+ }
+ f->close_section();
+ }
+}
+
+void pg_log_dup_t::generate_test_instances(list<pg_log_dup_t*>& o)
+{
+ o.push_back(new pg_log_dup_t());
+ o.push_back(new pg_log_dup_t(eversion_t(1,2),
+ 1,
+ osd_reqid_t(entity_name_t::CLIENT(777), 8, 999),
+ 0));
+ o.push_back(new pg_log_dup_t(eversion_t(1,2),
+ 2,
+ osd_reqid_t(entity_name_t::CLIENT(777), 8, 999),
+ -ENOENT));
+}
+
+
+std::ostream& operator<<(std::ostream& out, const pg_log_dup_t& e) {
+ out << "log_dup(reqid=" << e.reqid <<
+ " v=" << e.version << " uv=" << e.user_version <<
+ " rc=" << e.return_code;
+ if (!e.op_returns.empty()) {
+ out << " " << e.op_returns;
+ }
+ return out << ")";
+}
+
+
+// -- pg_log_t --
+
+// out: pg_log_t containing only the entries from "in" that apply to import_pgid under curmap
+// reject: entries from "in" that were rejected are placed in reject.log; other fields are not set
+void pg_log_t::filter_log(spg_t import_pgid, const OSDMap &curmap,
+ const string &hit_set_namespace, const pg_log_t &in,
+ pg_log_t &out, pg_log_t &reject)
+{
+ out = in;
+ out.log.clear();
+ reject.log.clear();
+
+ for (auto i = in.log.cbegin(); i != in.log.cend(); ++i) {
+
+ // Reject pg log entries for temporary objects
+ if (i->soid.is_temp()) {
+ reject.log.push_back(*i);
+ continue;
+ }
+
+ if (i->soid.nspace != hit_set_namespace) {
+ object_t oid = i->soid.oid;
+ object_locator_t loc(i->soid);
+ pg_t raw_pgid = curmap.object_locator_to_pg(oid, loc);
+ pg_t pgid = curmap.raw_pg_to_pg(raw_pgid);
+
+ if (import_pgid.pgid == pgid) {
+ out.log.push_back(*i);
+ } else {
+ reject.log.push_back(*i);
+ }
+ } else {
+ out.log.push_back(*i);
+ }
+ }
+}
+
+void pg_log_t::encode(ceph::buffer::list& bl) const
+{
+ ENCODE_START(7, 3, bl);
+ encode(head, bl);
+ encode(tail, bl);
+ encode(log, bl);
+ encode(can_rollback_to, bl);
+ encode(rollback_info_trimmed_to, bl);
+ encode(dups, bl);
+ ENCODE_FINISH(bl);
+}
+
+void pg_log_t::decode(ceph::buffer::list::const_iterator &bl, int64_t pool)
+{
+ DECODE_START_LEGACY_COMPAT_LEN(7, 3, 3, bl);
+ decode(head, bl);
+ decode(tail, bl);
+ if (struct_v < 2) {
+ bool backlog;
+ decode(backlog, bl);
+ }
+ decode(log, bl);
+ if (struct_v >= 5)
+ decode(can_rollback_to, bl);
+
+ if (struct_v >= 6)
+ decode(rollback_info_trimmed_to, bl);
+ else
+ rollback_info_trimmed_to = tail;
+
+ if (struct_v >= 7)
+ decode(dups, bl);
+
+ DECODE_FINISH(bl);
+
+ // handle hobject_t format change
+ if (struct_v < 4) {
+ for (auto i = log.begin(); i != log.end(); ++i) {
+ if (!i->soid.is_max() && i->soid.pool == -1)
+ i->soid.pool = pool;
+ }
+ }
+}
+
+void pg_log_t::dump(Formatter *f) const
+{
+ f->dump_stream("head") << head;
+ f->dump_stream("tail") << tail;
+ f->open_array_section("log");
+ for (auto p = log.cbegin(); p != log.cend(); ++p) {
+ f->open_object_section("entry");
+ p->dump(f);
+ f->close_section();
+ }
+ f->close_section();
+ f->open_array_section("dups");
+ for (const auto& entry : dups) {
+ f->open_object_section("entry");
+ entry.dump(f);
+ f->close_section();
+ }
+ f->close_section();
+}
+
+void pg_log_t::generate_test_instances(list<pg_log_t*>& o)
+{
+ o.push_back(new pg_log_t);
+
+ // this is nonsensical:
+ o.push_back(new pg_log_t);
+ o.back()->head = eversion_t(1,2);
+ o.back()->tail = eversion_t(3,4);
+ list<pg_log_entry_t*> e;
+ pg_log_entry_t::generate_test_instances(e);
+ for (auto p = e.begin(); p != e.end(); ++p)
+ o.back()->log.push_back(**p);
+}
+
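+// Append to 'target' the dup entries from 'other' (and entries from other's
+// log at or below target.tail) whose version lies within the most recent
+// 'maxdups' versions of target.head.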
+static void _handle_dups(CephContext* cct, pg_log_t &target, const pg_log_t &other, unsigned maxdups)
+{
+ auto earliest_dup_version =
+ target.head.version < maxdups ? 0u : target.head.version - maxdups + 1;
+ lgeneric_subdout(cct, osd, 20) << __func__ << " earliest_dup_version "
+ << earliest_dup_version << dendl;
+
+ for (auto d = other.dups.cbegin(); d != other.dups.cend(); ++d) {
+ if (d->version.version >= earliest_dup_version) {
+ lgeneric_subdout(cct, osd, 20)
+ << "copy_up_to/copy_after copy dup version "
+ << d->version << dendl;
+ target.dups.push_back(pg_log_dup_t(*d));
+ }
+ }
+
+ for (auto i = other.log.cbegin(); i != other.log.cend(); ++i) {
+ ceph_assert(i->version > other.tail);
+ if (i->version > target.tail)
+ break;
+ if (i->version.version >= earliest_dup_version) {
+ lgeneric_subdout(cct, osd, 20)
+ << "copy_up_to/copy_after copy dup from log version "
+ << i->version << dendl;
+ target.dups.push_back(pg_log_dup_t(*i));
+ }
+ }
+}
+
+
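+// Copy from 'other' the log entries strictly newer than v (adjusting tail
+// accordingly), along with the relevant dup entries.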
+void pg_log_t::copy_after(CephContext* cct, const pg_log_t &other, eversion_t v)
+{
+ can_rollback_to = other.can_rollback_to;
+ head = other.head;
+ tail = other.tail;
+ lgeneric_subdout(cct, osd, 20) << __func__ << " v " << v
+ << " dups.size()=" << dups.size()
+ << " other.dups.size()=" << other.dups.size() << dendl;
+ for (auto i = other.log.crbegin(); i != other.log.crend(); ++i) {
+ ceph_assert(i->version > other.tail);
+ if (i->version <= v) {
+ // make tail accurate.
+ tail = i->version;
+ break;
+ }
+ lgeneric_subdout(cct, osd, 20) << __func__ << " copy log version " << i->version << dendl;
+ log.push_front(*i);
+ }
+ _handle_dups(cct, *this, other, cct->_conf->osd_pg_log_dups_tracked);
+ lgeneric_subdout(cct, osd, 20) << __func__ << " END v " << v
+ << " dups.size()=" << dups.size()
+ << " other.dups.size()=" << other.dups.size() << dendl;
+}
+
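+// Copy from 'other' at most 'max' of the newest log entries (adjusting tail
+// accordingly), along with the relevant dup entries.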
+void pg_log_t::copy_up_to(CephContext* cct, const pg_log_t &other, int max)
+{
+ can_rollback_to = other.can_rollback_to;
+ int n = 0;
+ head = other.head;
+ tail = other.tail;
+ lgeneric_subdout(cct, osd, 20) << __func__ << " max " << max
+ << " dups.size()=" << dups.size()
+ << " other.dups.size()=" << other.dups.size() << dendl;
+ for (auto i = other.log.crbegin(); i != other.log.crend(); ++i) {
+ ceph_assert(i->version > other.tail);
+ if (n++ >= max) {
+ tail = i->version;
+ break;
+ }
+ lgeneric_subdout(cct, osd, 20) << __func__ << " copy log version " << i->version << dendl;
+ log.push_front(*i);
+ }
+ _handle_dups(cct, *this, other, cct->_conf->osd_pg_log_dups_tracked);
+ lgeneric_subdout(cct, osd, 20) << __func__ << " END max " << max
+ << " dups.size()=" << dups.size()
+ << " other.dups.size()=" << other.dups.size() << dendl;
+}
+
+ostream& pg_log_t::print(ostream& out) const
+{
+ out << *this << std::endl;
+ for (auto p = log.cbegin(); p != log.cend(); ++p)
+ out << *p << std::endl;
+ for (const auto& entry : dups) {
+ out << " dup entry: " << entry << std::endl;
+ }
+ return out;
+}
+
+// -- pg_missing_t --
+
+ostream& operator<<(ostream& out, const pg_missing_item& i)
+{
+ out << i.need;
+ if (i.have != eversion_t())
+ out << "(" << i.have << ")";
+ out << " flags = " << i.flag_str()
+ << " " << i.clean_regions;
+ return out;
+}
+
+// -- object_copy_cursor_t --
+
+void object_copy_cursor_t::encode(ceph::buffer::list& bl) const
+{
+ ENCODE_START(1, 1, bl);
+ encode(attr_complete, bl);
+ encode(data_offset, bl);
+ encode(data_complete, bl);
+ encode(omap_offset, bl);
+ encode(omap_complete, bl);
+ ENCODE_FINISH(bl);
+}
+
+void object_copy_cursor_t::decode(ceph::buffer::list::const_iterator &bl)
+{
+ DECODE_START(1, bl);
+ decode(attr_complete, bl);
+ decode(data_offset, bl);
+ decode(data_complete, bl);
+ decode(omap_offset, bl);
+ decode(omap_complete, bl);
+ DECODE_FINISH(bl);
+}
+
+void object_copy_cursor_t::dump(Formatter *f) const
+{
+ f->dump_unsigned("attr_complete", (int)attr_complete);
+ f->dump_unsigned("data_offset", data_offset);
+ f->dump_unsigned("data_complete", (int)data_complete);
+ f->dump_string("omap_offset", omap_offset);
+ f->dump_unsigned("omap_complete", (int)omap_complete);
+}
+
+void object_copy_cursor_t::generate_test_instances(list<object_copy_cursor_t*>& o)
+{
+ o.push_back(new object_copy_cursor_t);
+ o.push_back(new object_copy_cursor_t);
+ o.back()->attr_complete = true;
+ o.back()->data_offset = 123;
+ o.push_back(new object_copy_cursor_t);
+ o.back()->attr_complete = true;
+ o.back()->data_complete = true;
+ o.back()->omap_offset = "foo";
+ o.push_back(new object_copy_cursor_t);
+ o.back()->attr_complete = true;
+ o.back()->data_complete = true;
+ o.back()->omap_complete = true;
+}
+
+// -- object_copy_data_t --
+
+void object_copy_data_t::encode(ceph::buffer::list& bl, uint64_t features) const
+{
+ ENCODE_START(8, 5, bl);
+ encode(size, bl);
+ encode(mtime, bl);
+ encode(attrs, bl);
+ encode(data, bl);
+ encode(omap_data, bl);
+ encode(cursor, bl);
+ encode(omap_header, bl);
+ encode(snaps, bl);
+ encode(snap_seq, bl);
+ encode(flags, bl);
+ encode(data_digest, bl);
+ encode(omap_digest, bl);
+ encode(reqids, bl);
+ encode(truncate_seq, bl);
+ encode(truncate_size, bl);
+ encode(reqid_return_codes, bl);
+ ENCODE_FINISH(bl);
+}
+
+void object_copy_data_t::decode(ceph::buffer::list::const_iterator& bl)
+{
+ DECODE_START(8, bl);
+ if (struct_v < 5) {
+ // old
+ decode(size, bl);
+ decode(mtime, bl);
+ {
+ string category;
+ decode(category, bl); // no longer used
+ }
+ decode(attrs, bl);
+ decode(data, bl);
+ {
+ map<string,ceph::buffer::list> omap;
+ decode(omap, bl);
+ omap_data.clear();
+ if (!omap.empty()) {
+ using ceph::encode;
+ encode(omap, omap_data);
+ }
+ }
+ decode(cursor, bl);
+ if (struct_v >= 2)
+ decode(omap_header, bl);
+ if (struct_v >= 3) {
+ decode(snaps, bl);
+ decode(snap_seq, bl);
+ } else {
+ snaps.clear();
+ snap_seq = 0;
+ }
+ if (struct_v >= 4) {
+ decode(flags, bl);
+ decode(data_digest, bl);
+ decode(omap_digest, bl);
+ }
+ } else {
+ // current
+ decode(size, bl);
+ decode(mtime, bl);
+ decode(attrs, bl);
+ decode(data, bl);
+ decode(omap_data, bl);
+ decode(cursor, bl);
+ decode(omap_header, bl);
+ decode(snaps, bl);
+ decode(snap_seq, bl);
+ if (struct_v >= 4) {
+ decode(flags, bl);
+ decode(data_digest, bl);
+ decode(omap_digest, bl);
+ }
+ if (struct_v >= 6) {
+ decode(reqids, bl);
+ }
+ if (struct_v >= 7) {
+ decode(truncate_seq, bl);
+ decode(truncate_size, bl);
+ }
+ if (struct_v >= 8) {
+ decode(reqid_return_codes, bl);
+ }
+ }
+ DECODE_FINISH(bl);
+}
+
+void object_copy_data_t::generate_test_instances(list<object_copy_data_t*>& o)
+{
+ o.push_back(new object_copy_data_t());
+
+ list<object_copy_cursor_t*> cursors;
+ object_copy_cursor_t::generate_test_instances(cursors);
+ auto ci = cursors.begin();
+ o.back()->cursor = **(ci++);
+
+ o.push_back(new object_copy_data_t());
+ o.back()->cursor = **(ci++);
+
+ o.push_back(new object_copy_data_t());
+ o.back()->size = 1234;
+ o.back()->mtime.set_from_double(1234);
+ ceph::buffer::ptr bp("there", 5);
+ ceph::buffer::list bl;
+ bl.push_back(bp);
+ o.back()->attrs["hello"] = bl;
+ ceph::buffer::ptr bp2("not", 3);
+ ceph::buffer::list bl2;
+ bl2.push_back(bp2);
+ map<string,ceph::buffer::list> omap;
+ omap["why"] = bl2;
+ using ceph::encode;
+ encode(omap, o.back()->omap_data);
+ ceph::buffer::ptr databp("iamsomedatatocontain", 20);
+ o.back()->data.push_back(databp);
+ o.back()->omap_header.append("this is an omap header");
+ o.back()->snaps.push_back(123);
+ o.back()->reqids.push_back(make_pair(osd_reqid_t(), version_t()));
+}
+
+void object_copy_data_t::dump(Formatter *f) const
+{
+ f->open_object_section("cursor");
+ cursor.dump(f);
+ f->close_section(); // cursor
+ f->dump_int("size", size);
+ f->dump_stream("mtime") << mtime;
+ /* we should really print out the attrs here, but ceph::buffer::list
+ const-correctness prevents that */
+ f->dump_int("attrs_size", attrs.size());
+ f->dump_int("flags", flags);
+ f->dump_unsigned("data_digest", data_digest);
+ f->dump_unsigned("omap_digest", omap_digest);
+ f->dump_int("omap_data_length", omap_data.length());
+ f->dump_int("omap_header_length", omap_header.length());
+ f->dump_int("data_length", data.length());
+ f->open_array_section("snaps");
+ for (auto p = snaps.cbegin(); p != snaps.cend(); ++p)
+ f->dump_unsigned("snap", *p);
+ f->close_section();
+ f->open_array_section("reqids");
+ uint32_t idx = 0;
+ for (auto p = reqids.begin();
+ p != reqids.end();
+ ++idx, ++p) {
+ f->open_object_section("extra_reqid");
+ f->dump_stream("reqid") << p->first;
+ f->dump_stream("user_version") << p->second;
+ auto it = reqid_return_codes.find(idx);
+ if (it != reqid_return_codes.end()) {
+ f->dump_int("return_code", it->second);
+ }
+ f->close_section();
+ }
+ f->close_section();
+}
+
+// -- pg_create_t --
+
+void pg_create_t::encode(ceph::buffer::list &bl) const
+{
+ ENCODE_START(1, 1, bl);
+ encode(created, bl);
+ encode(parent, bl);
+ encode(split_bits, bl);
+ ENCODE_FINISH(bl);
+}
+
+void pg_create_t::decode(ceph::buffer::list::const_iterator &bl)
+{
+ DECODE_START(1, bl);
+ decode(created, bl);
+ decode(parent, bl);
+ decode(split_bits, bl);
+ DECODE_FINISH(bl);
+}
+
+void pg_create_t::dump(Formatter *f) const
+{
+ f->dump_unsigned("created", created);
+ f->dump_stream("parent") << parent;
+ f->dump_int("split_bits", split_bits);
+}
+
+void pg_create_t::generate_test_instances(list<pg_create_t*>& o)
+{
+ o.push_back(new pg_create_t);
+ o.push_back(new pg_create_t(1, pg_t(3, 4), 2));
+}
+
+
+// -- pg_hit_set_info_t --
+
+void pg_hit_set_info_t::encode(ceph::buffer::list& bl) const
+{
+ ENCODE_START(2, 1, bl);
+ encode(begin, bl);
+ encode(end, bl);
+ encode(version, bl);
+ encode(using_gmt, bl);
+ ENCODE_FINISH(bl);
+}
+
+void pg_hit_set_info_t::decode(ceph::buffer::list::const_iterator& p)
+{
+ DECODE_START(2, p);
+ decode(begin, p);
+ decode(end, p);
+ decode(version, p);
+ if (struct_v >= 2) {
+ decode(using_gmt, p);
+ } else {
+ using_gmt = false;
+ }
+ DECODE_FINISH(p);
+}
+
+void pg_hit_set_info_t::dump(Formatter *f) const
+{
+ f->dump_stream("begin") << begin;
+ f->dump_stream("end") << end;
+ f->dump_stream("version") << version;
+ f->dump_stream("using_gmt") << using_gmt;
+}
+
+void pg_hit_set_info_t::generate_test_instances(list<pg_hit_set_info_t*>& ls)
+{
+ ls.push_back(new pg_hit_set_info_t);
+ ls.push_back(new pg_hit_set_info_t);
+ ls.back()->begin = utime_t(1, 2);
+ ls.back()->end = utime_t(3, 4);
+}
+
+
+// -- pg_hit_set_history_t --
+
+void pg_hit_set_history_t::encode(ceph::buffer::list& bl) const
+{
+ ENCODE_START(1, 1, bl);
+ encode(current_last_update, bl);
+ {
+ utime_t dummy_stamp;
+ encode(dummy_stamp, bl);
+ }
+ {
+ pg_hit_set_info_t dummy_info;
+ encode(dummy_info, bl);
+ }
+ encode(history, bl);
+ ENCODE_FINISH(bl);
+}
+
+void pg_hit_set_history_t::decode(ceph::buffer::list::const_iterator& p)
+{
+ DECODE_START(1, p);
+ decode(current_last_update, p);
+ {
+ utime_t dummy_stamp;
+ decode(dummy_stamp, p);
+ }
+ {
+ pg_hit_set_info_t dummy_info;
+ decode(dummy_info, p);
+ }
+ decode(history, p);
+ DECODE_FINISH(p);
+}
+
+void pg_hit_set_history_t::dump(Formatter *f) const
+{
+ f->dump_stream("current_last_update") << current_last_update;
+ f->open_array_section("history");
+ for (auto p = history.cbegin(); p != history.cend(); ++p) {
+ f->open_object_section("info");
+ p->dump(f);
+ f->close_section();
+ }
+ f->close_section();
+}
+
+void pg_hit_set_history_t::generate_test_instances(list<pg_hit_set_history_t*>& ls)
+{
+ ls.push_back(new pg_hit_set_history_t);
+ ls.push_back(new pg_hit_set_history_t);
+ ls.back()->current_last_update = eversion_t(1, 2);
+ ls.back()->history.push_back(pg_hit_set_info_t());
+}
+
+// -- OSDSuperblock --
+
+void OSDSuperblock::encode(ceph::buffer::list &bl) const
+{
+ ENCODE_START(9, 5, bl);
+ encode(cluster_fsid, bl);
+ encode(whoami, bl);
+ encode(current_epoch, bl);
+ encode(oldest_map, bl);
+ encode(newest_map, bl);
+ encode(weight, bl);
+ compat_features.encode(bl);
+ encode(clean_thru, bl);
+ encode(mounted, bl);
+ encode(osd_fsid, bl);
+ encode((epoch_t)0, bl); // epoch_t last_epoch_marked_full
+ encode((uint32_t)0, bl); // map<int64_t,epoch_t> pool_last_epoch_marked_full
+ encode(purged_snaps_last, bl);
+ encode(last_purged_snaps_scrub, bl);
+ ENCODE_FINISH(bl);
+}
+
+void OSDSuperblock::decode(ceph::buffer::list::const_iterator &bl)
+{
+ DECODE_START_LEGACY_COMPAT_LEN(9, 5, 5, bl);
+ if (struct_v < 3) {
+ string magic;
+ decode(magic, bl);
+ }
+ decode(cluster_fsid, bl);
+ decode(whoami, bl);
+ decode(current_epoch, bl);
+ decode(oldest_map, bl);
+ decode(newest_map, bl);
+ decode(weight, bl);
+ if (struct_v >= 2) {
+ compat_features.decode(bl);
+ } else { //upgrade it!
+ compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
+ }
+ decode(clean_thru, bl);
+ decode(mounted, bl);
+ if (struct_v >= 4)
+ decode(osd_fsid, bl);
+ if (struct_v >= 6) {
+ epoch_t last_map_marked_full;
+ decode(last_map_marked_full, bl);
+ }
+ if (struct_v >= 7) {
+ map<int64_t,epoch_t> pool_last_map_marked_full;
+ decode(pool_last_map_marked_full, bl);
+ }
+ if (struct_v >= 9) {
+ decode(purged_snaps_last, bl);
+ decode(last_purged_snaps_scrub, bl);
+ } else {
+ purged_snaps_last = 0;
+ }
+ DECODE_FINISH(bl);
+}
+
+void OSDSuperblock::dump(Formatter *f) const
+{
+ f->dump_stream("cluster_fsid") << cluster_fsid;
+ f->dump_stream("osd_fsid") << osd_fsid;
+ f->dump_int("whoami", whoami);
+ f->dump_int("current_epoch", current_epoch);
+ f->dump_int("oldest_map", oldest_map);
+ f->dump_int("newest_map", newest_map);
+ f->dump_float("weight", weight);
+ f->open_object_section("compat");
+ compat_features.dump(f);
+ f->close_section();
+ f->dump_int("clean_thru", clean_thru);
+ f->dump_int("last_epoch_mounted", mounted);
+ f->dump_unsigned("purged_snaps_last", purged_snaps_last);
+ f->dump_stream("last_purged_snaps_scrub") << last_purged_snaps_scrub;
+}
+
+void OSDSuperblock::generate_test_instances(list<OSDSuperblock*>& o)
+{
+ OSDSuperblock z;
+ o.push_back(new OSDSuperblock(z));
+ z.cluster_fsid.parse("01010101-0101-0101-0101-010101010101");
+ z.osd_fsid.parse("02020202-0202-0202-0202-020202020202");
+ z.whoami = 3;
+ z.current_epoch = 4;
+ z.oldest_map = 5;
+ z.newest_map = 9;
+ z.mounted = 8;
+ z.clean_thru = 7;
+ o.push_back(new OSDSuperblock(z));
+ o.push_back(new OSDSuperblock(z));
+}
+
+// -- SnapSet --
+
+void SnapSet::encode(ceph::buffer::list& bl) const
+{
+ ENCODE_START(3, 2, bl);
+ encode(seq, bl);
+ encode(true, bl); // head_exists
+ encode(snaps, bl);
+ encode(clones, bl);
+ encode(clone_overlap, bl);
+ encode(clone_size, bl);
+ encode(clone_snaps, bl);
+ ENCODE_FINISH(bl);
+}
+
+void SnapSet::decode(ceph::buffer::list::const_iterator& bl)
+{
+ DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl);
+ decode(seq, bl);
+ bl += 1u; // skip legacy head_exists (always true)
+ decode(snaps, bl);
+ decode(clones, bl);
+ decode(clone_overlap, bl);
+ decode(clone_size, bl);
+ if (struct_v >= 3) {
+ decode(clone_snaps, bl);
+ } else {
+ clone_snaps.clear();
+ }
+ DECODE_FINISH(bl);
+}
+
+void SnapSet::dump(Formatter *f) const
+{
+ f->dump_unsigned("seq", seq);
+ f->open_array_section("clones");
+ for (auto p = clones.cbegin(); p != clones.cend(); ++p) {
+ f->open_object_section("clone");
+ f->dump_unsigned("snap", *p);
+ auto cs = clone_size.find(*p);
+ if (cs != clone_size.end())
+ f->dump_unsigned("size", cs->second);
+ else
+ f->dump_string("size", "????");
+ auto co = clone_overlap.find(*p);
+ if (co != clone_overlap.end())
+ f->dump_stream("overlap") << co->second;
+ else
+ f->dump_stream("overlap") << "????";
+ auto q = clone_snaps.find(*p);
+ if (q != clone_snaps.end()) {
+ f->open_array_section("snaps");
+ for (auto s : q->second) {
+ f->dump_unsigned("snap", s);
+ }
+ f->close_section();
+ }
+ f->close_section();
+ }
+ f->close_section();
+}
+
+void SnapSet::generate_test_instances(list<SnapSet*>& o)
+{
+ o.push_back(new SnapSet);
+ o.push_back(new SnapSet);
+ o.back()->seq = 123;
+ o.back()->snaps.push_back(123);
+ o.back()->snaps.push_back(12);
+ o.push_back(new SnapSet);
+ o.back()->seq = 123;
+ o.back()->snaps.push_back(123);
+ o.back()->snaps.push_back(12);
+ o.back()->clones.push_back(12);
+ o.back()->clone_size[12] = 12345;
+ o.back()->clone_overlap[12];
+ o.back()->clone_snaps[12] = {12, 10, 8};
+}
+
+ostream& operator<<(ostream& out, const SnapSet& cs)
+{
+ return out << cs.seq << "=" << cs.snaps << ":"
+ << cs.clone_snaps;
+}
+
+void SnapSet::from_snap_set(const librados::snap_set_t& ss, bool legacy)
+{
+ // NOTE: our reconstruction of snaps (and the snapc) is not strictly
+ // correct: it will not include snaps that still logically exist
+ // but for which no clone is defined. For all
+ // practical purposes this doesn't matter, since we only use that
+ // information to clone on the OSD, and we have already moved
+ // forward past that part of the object history.
+
+ seq = ss.seq;
+ set<snapid_t> _snaps;
+ set<snapid_t> _clones;
+ for (auto p = ss.clones.cbegin(); p != ss.clones.cend(); ++p) {
+ if (p->cloneid != librados::SNAP_HEAD) {
+ _clones.insert(p->cloneid);
+ _snaps.insert(p->snaps.begin(), p->snaps.end());
+ clone_size[p->cloneid] = p->size;
+ clone_overlap[p->cloneid]; // the entry must exist, even if it's empty.
+ for (auto q = p->overlap.cbegin(); q != p->overlap.cend(); ++q)
+ clone_overlap[p->cloneid].insert(q->first, q->second);
+ if (!legacy) {
+ // p->snaps is ascending; clone_snaps is descending
+ vector<snapid_t>& v = clone_snaps[p->cloneid];
+ for (auto q = p->snaps.rbegin(); q != p->snaps.rend(); ++q) {
+ v.push_back(*q);
+ }
+ }
+ }
+ }
+
+ // ascending
+ clones.clear();
+ clones.reserve(_clones.size());
+ for (auto p = _clones.begin(); p != _clones.end(); ++p)
+ clones.push_back(*p);
+
+ // descending
+ snaps.clear();
+ snaps.reserve(_snaps.size());
+ for (auto p = _snaps.rbegin();
+ p != _snaps.rend(); ++p)
+ snaps.push_back(*p);
+}
+
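+// Space attributable to this clone alone: its size minus the bytes recorded
+// in clone_overlap (i.e. shared with the next newer clone).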
+uint64_t SnapSet::get_clone_bytes(snapid_t clone) const
+{
+ ceph_assert(clone_size.count(clone));
+ uint64_t size = clone_size.find(clone)->second;
+ ceph_assert(clone_overlap.count(clone));
+ const interval_set<uint64_t> &overlap = clone_overlap.find(clone)->second;
+ ceph_assert(size >= (uint64_t)overlap.size());
+ return size - overlap.size();
+}
+
+void SnapSet::filter(const pg_pool_t &pinfo)
+{
+ vector<snapid_t> oldsnaps;
+ oldsnaps.swap(snaps);
+ for (auto i = oldsnaps.cbegin(); i != oldsnaps.cend(); ++i) {
+ if (!pinfo.is_removed_snap(*i))
+ snaps.push_back(*i);
+ }
+}
+
+SnapSet SnapSet::get_filtered(const pg_pool_t &pinfo) const
+{
+ SnapSet ss = *this;
+ ss.filter(pinfo);
+ return ss;
+}
+
+// -- watch_info_t --
+
+void watch_info_t::encode(ceph::buffer::list& bl, uint64_t features) const
+{
+ ENCODE_START(4, 3, bl);
+ encode(cookie, bl);
+ encode(timeout_seconds, bl);
+ encode(addr, bl, features);
+ ENCODE_FINISH(bl);
+}
+
+void watch_info_t::decode(ceph::buffer::list::const_iterator& bl)
+{
+ DECODE_START_LEGACY_COMPAT_LEN(4, 3, 3, bl);
+ decode(cookie, bl);
+ if (struct_v < 2) {
+ uint64_t ver;
+ decode(ver, bl);
+ }
+ decode(timeout_seconds, bl);
+ if (struct_v >= 4) {
+ decode(addr, bl);
+ }
+ DECODE_FINISH(bl);
+}
+
+void watch_info_t::dump(Formatter *f) const
+{
+ f->dump_unsigned("cookie", cookie);
+ f->dump_unsigned("timeout_seconds", timeout_seconds);
+ f->open_object_section("addr");
+ addr.dump(f);
+ f->close_section();
+}
+
+void watch_info_t::generate_test_instances(list<watch_info_t*>& o)
+{
+ o.push_back(new watch_info_t);
+ o.push_back(new watch_info_t);
+ o.back()->cookie = 123;
+ o.back()->timeout_seconds = 99;
+ entity_addr_t ea;
+ ea.set_type(entity_addr_t::TYPE_LEGACY);
+ ea.set_nonce(1);
+ ea.set_family(AF_INET);
+ ea.set_in4_quad(0, 127);
+ ea.set_in4_quad(1, 0);
+ ea.set_in4_quad(2, 1);
+ ea.set_in4_quad(3, 2);
+ ea.set_port(2);
+ o.back()->addr = ea;
+}
+
+// -- chunk_info_t --
+
+void chunk_info_t::encode(ceph::buffer::list& bl) const
+{
+ ENCODE_START(1, 1, bl);
+ encode(offset, bl);
+ encode(length, bl);
+ encode(oid, bl);
+ __u32 _flags = flags;
+ encode(_flags, bl);
+ ENCODE_FINISH(bl);
+}
+
+void chunk_info_t::decode(ceph::buffer::list::const_iterator& bl)
+{
+ DECODE_START(1, bl);
+ decode(offset, bl);
+ decode(length, bl);
+ decode(oid, bl);
+ __u32 _flags;
+ decode(_flags, bl);
+ flags = (cflag_t)_flags;
+ DECODE_FINISH(bl);
+}
+
+void chunk_info_t::dump(Formatter *f) const
+{
+ f->dump_unsigned("length", length);
+ f->open_object_section("oid");
+ oid.dump(f);
+ f->close_section();
+ f->dump_unsigned("flags", flags);
+}
+
+
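+// When a fingerprint is present, equality is decided by the oid name alone;
+// otherwise offset, length and oid name must all match.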
+bool chunk_info_t::operator==(const chunk_info_t& cit) const
+{
+ if (has_fingerprint()) {
+ if (oid.oid.name == cit.oid.oid.name) {
+ return true;
+ }
+ } else {
+ if (offset == cit.offset && length == cit.length &&
+ oid.oid.name == cit.oid.oid.name) {
+ return true;
+ }
+
+ }
+ return false;
+}
+
+bool operator==(const std::pair<const long unsigned int, chunk_info_t> & l,
+ const std::pair<const long unsigned int, chunk_info_t> & r)
+{
+ return l.first == r.first &&
+ l.second == r.second;
+}
+
+ostream& operator<<(ostream& out, const chunk_info_t& ci)
+{
+ return out << "(len: " << ci.length << " oid: " << ci.oid
+ << " offset: " << ci.offset
+ << " flags: " << ci.get_flag_string(ci.flags) << ")";
+}
+
+// -- object_manifest_t --
+
+std::ostream& operator<<(std::ostream& out, const object_ref_delta_t & ci)
+{
+ return out << ci.ref_delta << std::endl;
+}
+
+void object_manifest_t::calc_refs_to_inc_on_set(
+ const object_manifest_t* _g,
+ const object_manifest_t* _l,
+ object_ref_delta_t &refs) const
+{
+ /* avoid incrementing the same reference on adjacent clones */
+ auto iter = chunk_map.begin();
+ auto find_chunk = [](decltype(iter) &i, const object_manifest_t* cur)
+ -> bool {
+ if (cur) {
+ auto c = cur->chunk_map.find(i->first);
+ if (c != cur->chunk_map.end() && c->second == i->second) {
+ return true;
+
+ }
+ }
+ return false;
+ };
+
+ /* If an identical chunk already exists on either _g or _l, do not increment
+ * the reference
+ *
+ * head: [0, 2) ccc, [6, 2) bbb, [8, 2) ccc
+ * 20: [0, 2) aaa, <- set_chunk
+ * 30: [0, 2) abc, [6, 2) bbb, [8, 2) ccc
+ * --> increment the reference
+ *
+ * head: [0, 2) ccc, [6, 2) bbb, [8, 2) ccc
+ * 20: [0, 2) ccc, <- set_chunk
+ * 30: [0, 2) abc, [6, 2) bbb, [8, 2) ccc
+ * --> do not need to increment
+ *
+ * head: [0, 2) ccc, [6, 2) bbb, [8, 2) ccc
+ * 20: [0, 2) ccc, <- set_chunk
+ * 30: [0, 2) ccc, [6, 2) bbb, [8, 2) ccc
+ * --> decrement the reference of ccc
+ *
+ */
+ for (; iter != chunk_map.end(); ++iter) {
+ auto found_g = find_chunk(iter, _g);
+ auto found_l = find_chunk(iter, _l);
+ if (!found_g && !found_l) {
+ refs.inc_ref(iter->second.oid);
+ } else if (found_g && found_l) {
+ refs.dec_ref(iter->second.oid);
+ }
+ }
+}
+
+void object_manifest_t::calc_refs_to_drop_on_modify(
+ const object_manifest_t* _l,
+ const ObjectCleanRegions& clean_regions,
+ object_ref_delta_t &refs) const
+{
+ for (auto &p : chunk_map) {
+ if (!clean_regions.is_clean_region(p.first, p.second.length)) {
+ // has previous snapshot
+ if (_l) {
+ /*
+ * Suppose there is a snapshotted manifest object with three chunks:
+ * head: [0, 2) aaa, [6, 2) bbb, [8, 2) ccc
+ * 20: [0, 2) aaa, [6, 2) bbb, [8, 2) ccc
+ *
+ * If we modify [6, 2) at head, we shouldn't decrement bbb's refcount because
+ * 20 still holds a reference to bbb. Therefore, we only drop the reference if the two
+ * chunks (head: [6, 2) and 20: [6, 2)) differ.
+ *
+ */
+ auto c = _l->chunk_map.find(p.first);
+ if (c != _l->chunk_map.end()) {
+ if (p.second == c->second) {
+ continue;
+ }
+ }
+ refs.dec_ref(p.second.oid);
+ } else {
+ // decrement the reference of the updated chunks if the manifest object has no snapshot
+ refs.dec_ref(p.second.oid);
+ }
+ }
+ }
+}
+
+void object_manifest_t::calc_refs_to_drop_on_removal(
+ const object_manifest_t* _g,
+ const object_manifest_t* _l,
+ object_ref_delta_t &refs) const
+{
+ /* At a high level, the rule is that consecutive clones with the same reference
+ * at the same offset share a reference. As such, removing *this may result
+ * in removing references in two cases:
+ * 1) *this has a reference which it shares with neither _g nor _l
+ * 2) _g and _l have a reference which they share with each other but not
+ * *this.
+ *
+ * For a particular offset, both 1 and 2 can happen.
+ *
+ * Notably, this means that to evaluate the reference change from removing
+ * the object with *this, we only need to look at the two adjacent clones.
+ */
+
+ // Paper over possibly missing _g or _l -- a null pointer is semantically the
+ // same as an empty chunk_map
+ static const object_manifest_t empty;
+ const object_manifest_t &g = _g ? *_g : empty;
+ const object_manifest_t &l = _l ? *_l : empty;
+
+ auto giter = g.chunk_map.begin();
+ auto iter = chunk_map.begin();
+ auto liter = l.chunk_map.begin();
+
+ // Translate iter, map pair to the current offset, end() -> max
+ auto get_offset = [](decltype(iter) &i, const object_manifest_t &manifest)
+ -> uint64_t {
+ return i == manifest.chunk_map.end() ?
+ std::numeric_limits<uint64_t>::max() : i->first;
+ };
+
+ /* If current matches the offset at iter, returns the chunk at *iter
+ * and increments iter. Otherwise, returns nullptr.
+ *
+ * current will always be derived from the min of *giter, *iter, and
+ * *liter on each cycle, so the result will be that each loop iteration
+ * will pick up all chunks at the offset being considered, each offset
+ * will be considered once, and all offsets will be considered.
+ */
+ auto get_chunk = [](
+ uint64_t current, decltype(iter) &i, const object_manifest_t &manifest)
+ -> const chunk_info_t * {
+ if (i == manifest.chunk_map.end() || current != i->first) {
+ return nullptr;
+ } else {
+ return &(i++)->second;
+ }
+ };
+
+ while (giter != g.chunk_map.end() ||
+ iter != chunk_map.end() ||
+ liter != l.chunk_map.end()) {
+ auto current = std::min(
+ std::min(get_offset(giter, g), get_offset(iter, *this)),
+ get_offset(liter, l));
+
+ auto gchunk = get_chunk(current, giter, g);
+ auto chunk = get_chunk(current, iter, *this);
+ auto lchunk = get_chunk(current, liter, l);
+
+ if (gchunk && lchunk && *gchunk == *lchunk &&
+ (!chunk || *gchunk != *chunk)) {
+ // case 2 from above: l and g match, chunk does not
+ refs.dec_ref(gchunk->oid);
+ }
+
+ if (chunk &&
+ (!gchunk || chunk->oid != gchunk->oid) &&
+ (!lchunk || chunk->oid != lchunk->oid)) {
+ // case 1 from above: *this matches neither
+ refs.dec_ref(chunk->oid);
+ }
+ }
+}
+
+void object_manifest_t::encode(ceph::buffer::list& bl) const
+{
+ ENCODE_START(1, 1, bl);
+ encode(type, bl);
+ switch (type) {
+ case TYPE_NONE: break;
+ case TYPE_REDIRECT:
+ encode(redirect_target, bl);
+ break;
+ case TYPE_CHUNKED:
+ encode(chunk_map, bl);
+ break;
+ default:
+ ceph_abort();
+ }
+ ENCODE_FINISH(bl);
+}
+
+void object_manifest_t::decode(ceph::buffer::list::const_iterator& bl)
+{
+ DECODE_START(1, bl);
+ decode(type, bl);
+ switch (type) {
+ case TYPE_NONE: break;
+ case TYPE_REDIRECT:
+ decode(redirect_target, bl);
+ break;
+ case TYPE_CHUNKED:
+ decode(chunk_map, bl);
+ break;
+ default:
+ ceph_abort();
+ }
+ DECODE_FINISH(bl);
+}
+
+void object_manifest_t::dump(Formatter *f) const
+{
+ f->dump_unsigned("type", type);
+ if (type == TYPE_REDIRECT) {
+ f->open_object_section("redirect_target");
+ redirect_target.dump(f);
+ f->close_section();
+ } else if (type == TYPE_CHUNKED) {
+ f->open_array_section("chunk_map");
+ for (auto& p : chunk_map) {
+ f->open_object_section("chunk");
+ f->dump_unsigned("offset", p.first);
+ p.second.dump(f);
+ f->close_section();
+ }
+ f->close_section();
+ }
+}
+
+void object_manifest_t::generate_test_instances(list<object_manifest_t*>& o)
+{
+ o.push_back(new object_manifest_t());
+ o.back()->type = TYPE_REDIRECT;
+}
+
+ostream& operator<<(ostream& out, const object_manifest_t& om)
+{
+ out << "manifest(" << om.get_type_name();
+ if (om.is_redirect()) {
+ out << " " << om.redirect_target;
+ } else if (om.is_chunked()) {
+ out << " " << om.chunk_map;
+ }
+ out << ")";
+ return out;
+}
+
+// -- object_info_t --
+
+void object_info_t::copy_user_bits(const object_info_t& other)
+{
+ // these bits are copied from head->clone.
+ size = other.size;
+ mtime = other.mtime;
+ local_mtime = other.local_mtime;
+ last_reqid = other.last_reqid;
+ truncate_seq = other.truncate_seq;
+ truncate_size = other.truncate_size;
+ flags = other.flags;
+ user_version = other.user_version;
+ data_digest = other.data_digest;
+ omap_digest = other.omap_digest;
+}
+
+void object_info_t::encode(ceph::buffer::list& bl, uint64_t features) const
+{
+ object_locator_t myoloc(soid);
+ map<entity_name_t, watch_info_t> old_watchers;
+ for (auto i = watchers.cbegin(); i != watchers.cend(); ++i) {
+ old_watchers.insert(make_pair(i->first.second, i->second));
+ }
+ ENCODE_START(17, 8, bl);
+ encode(soid, bl);
+ encode(myoloc, bl); //Retained for compatibility
+ encode((__u32)0, bl); // was category, no longer used
+ encode(version, bl);
+ encode(prior_version, bl);
+ encode(last_reqid, bl);
+ encode(size, bl);
+ encode(mtime, bl);
+ if (soid.snap == CEPH_NOSNAP)
+ encode(osd_reqid_t(), bl); // used to be wrlock_by
+ else
+ encode((uint32_t)0, bl); // was legacy_snaps
+ encode(truncate_seq, bl);
+ encode(truncate_size, bl);
+ encode(is_lost(), bl);
+ encode(old_watchers, bl, features);
+ /* shenanigans to avoid breaking backwards compatibility in the disk format.
+ * When we can, switch this out for simply putting the version_t on disk. */
+ eversion_t user_eversion(0, user_version);
+ encode(user_eversion, bl);
+ encode(test_flag(FLAG_USES_TMAP), bl);
+ encode(watchers, bl, features);
+ __u32 _flags = flags;
+ encode(_flags, bl);
+ encode(local_mtime, bl);
+ encode(data_digest, bl);
+ encode(omap_digest, bl);
+ encode(expected_object_size, bl);
+ encode(expected_write_size, bl);
+ encode(alloc_hint_flags, bl);
+ if (has_manifest()) {
+ encode(manifest, bl);
+ }
+ ENCODE_FINISH(bl);
+}
+
+void object_info_t::decode(ceph::buffer::list::const_iterator& bl)
+{
+ object_locator_t myoloc;
+ DECODE_START_LEGACY_COMPAT_LEN(17, 8, 8, bl);
+ map<entity_name_t, watch_info_t> old_watchers;
+ decode(soid, bl);
+ decode(myoloc, bl);
+ {
+ string category;
+ decode(category, bl); // no longer used
+ }
+ decode(version, bl);
+ decode(prior_version, bl);
+ decode(last_reqid, bl);
+ decode(size, bl);
+ decode(mtime, bl);
+ if (soid.snap == CEPH_NOSNAP) {
+ osd_reqid_t wrlock_by;
+ decode(wrlock_by, bl);
+ } else {
+ vector<snapid_t> legacy_snaps;
+ decode(legacy_snaps, bl);
+ }
+ decode(truncate_seq, bl);
+ decode(truncate_size, bl);
+
+ // if this is struct_v >= 13, we will overwrite this
+ // below since this field is just here for backwards
+ // compatibility
+ __u8 lo;
+ decode(lo, bl);
+ flags = (flag_t)lo;
+
+ decode(old_watchers, bl);
+ eversion_t user_eversion;
+ decode(user_eversion, bl);
+ user_version = user_eversion.version;
+
+ if (struct_v >= 9) {
+ bool uses_tmap = false;
+ decode(uses_tmap, bl);
+ if (uses_tmap)
+ set_flag(FLAG_USES_TMAP);
+ } else {
+ set_flag(FLAG_USES_TMAP);
+ }
+ if (struct_v < 10)
+ soid.pool = myoloc.pool;
+ if (struct_v >= 11) {
+ decode(watchers, bl);
+ } else {
+ for (auto i = old_watchers.begin(); i != old_watchers.end(); ++i) {
+ watchers.insert(
+ make_pair(
+ make_pair(i->second.cookie, i->first), i->second));
+ }
+ }
+ if (struct_v >= 13) {
+ __u32 _flags;
+ decode(_flags, bl);
+ flags = (flag_t)_flags;
+ }
+ if (struct_v >= 14) {
+ decode(local_mtime, bl);
+ } else {
+ local_mtime = utime_t();
+ }
+ if (struct_v >= 15) {
+ decode(data_digest, bl);
+ decode(omap_digest, bl);
+ } else {
+ data_digest = omap_digest = -1;
+ clear_flag(FLAG_DATA_DIGEST);
+ clear_flag(FLAG_OMAP_DIGEST);
+ }
+ if (struct_v >= 16) {
+ decode(expected_object_size, bl);
+ decode(expected_write_size, bl);
+ decode(alloc_hint_flags, bl);
+ } else {
+ expected_object_size = 0;
+ expected_write_size = 0;
+ alloc_hint_flags = 0;
+ }
+ if (struct_v >= 17) {
+ if (has_manifest()) {
+ decode(manifest, bl);
+ }
+ }
+ DECODE_FINISH(bl);
+}
+
+void object_info_t::dump(Formatter *f) const
+{
+ f->open_object_section("oid");
+ soid.dump(f);
+ f->close_section();
+ f->dump_stream("version") << version;
+ f->dump_stream("prior_version") << prior_version;
+ f->dump_stream("last_reqid") << last_reqid;
+ f->dump_unsigned("user_version", user_version);
+ f->dump_unsigned("size", size);
+ f->dump_stream("mtime") << mtime;
+ f->dump_stream("local_mtime") << local_mtime;
+ f->dump_unsigned("lost", (int)is_lost());
+ vector<string> sv = get_flag_vector(flags);
+ f->open_array_section("flags");
+ for (auto str: sv)
+ f->dump_string("flags", str);
+ f->close_section();
+ f->dump_unsigned("truncate_seq", truncate_seq);
+ f->dump_unsigned("truncate_size", truncate_size);
+ f->dump_format("data_digest", "0x%08x", data_digest);
+ f->dump_format("omap_digest", "0x%08x", omap_digest);
+ f->dump_unsigned("expected_object_size", expected_object_size);
+ f->dump_unsigned("expected_write_size", expected_write_size);
+ f->dump_unsigned("alloc_hint_flags", alloc_hint_flags);
+ f->dump_object("manifest", manifest);
+ f->open_object_section("watchers");
+ for (auto p = watchers.cbegin(); p != watchers.cend(); ++p) {
+ CachedStackStringStream css;
+ *css << p->first.second;
+ f->open_object_section(css->strv());
+ p->second.dump(f);
+ f->close_section();
+ }
+ f->close_section();
+}
+
+void object_info_t::generate_test_instances(list<object_info_t*>& o)
+{
+ o.push_back(new object_info_t());
+
+ // fixme
+}
+
+
+ostream& operator<<(ostream& out, const object_info_t& oi)
+{
+ out << oi.soid << "(" << oi.version
+ << " " << oi.last_reqid;
+ if (oi.flags)
+ out << " " << oi.get_flag_string();
+ out << " s " << oi.size;
+ out << " uv " << oi.user_version;
+ if (oi.is_data_digest())
+ out << " dd " << std::hex << oi.data_digest << std::dec;
+ if (oi.is_omap_digest())
+ out << " od " << std::hex << oi.omap_digest << std::dec;
+ out << " alloc_hint [" << oi.expected_object_size
+ << " " << oi.expected_write_size
+ << " " << oi.alloc_hint_flags << "]";
+ if (oi.has_manifest())
+ out << " " << oi.manifest;
+ out << ")";
+ return out;
+}
+
+// -- ObjectRecovery --
+void ObjectRecoveryProgress::encode(ceph::buffer::list &bl) const
+{
+ ENCODE_START(1, 1, bl);
+ encode(first, bl);
+ encode(data_complete, bl);
+ encode(data_recovered_to, bl);
+ encode(omap_recovered_to, bl);
+ encode(omap_complete, bl);
+ ENCODE_FINISH(bl);
+}
+
+void ObjectRecoveryProgress::decode(ceph::buffer::list::const_iterator &bl)
+{
+ DECODE_START(1, bl);
+ decode(first, bl);
+ decode(data_complete, bl);
+ decode(data_recovered_to, bl);
+ decode(omap_recovered_to, bl);
+ decode(omap_complete, bl);
+ DECODE_FINISH(bl);
+}
+
+ostream &operator<<(ostream &out, const ObjectRecoveryProgress &prog)
+{
+ return prog.print(out);
+}
+
+void ObjectRecoveryProgress::generate_test_instances(
+ list<ObjectRecoveryProgress*>& o)
+{
+ o.push_back(new ObjectRecoveryProgress);
+ o.back()->first = false;
+ o.back()->data_complete = true;
+ o.back()->omap_complete = true;
+ o.back()->data_recovered_to = 100;
+
+ o.push_back(new ObjectRecoveryProgress);
+ o.back()->first = true;
+ o.back()->data_complete = false;
+ o.back()->omap_complete = false;
+ o.back()->data_recovered_to = 0;
+}
+
+ostream &ObjectRecoveryProgress::print(ostream &out) const
+{
+ return out << "ObjectRecoveryProgress("
+ << ( first ? "" : "!" ) << "first, "
+ << "data_recovered_to:" << data_recovered_to
+ << ", data_complete:" << ( data_complete ? "true" : "false" )
+ << ", omap_recovered_to:" << omap_recovered_to
+ << ", omap_complete:" << ( omap_complete ? "true" : "false" )
+ << ", error:" << ( error ? "true" : "false" )
+ << ")";
+}
+
+void ObjectRecoveryProgress::dump(Formatter *f) const
+{
+ f->dump_int("first?", first);
+ f->dump_int("data_complete?", data_complete);
+ f->dump_unsigned("data_recovered_to", data_recovered_to);
+ f->dump_int("omap_complete?", omap_complete);
+ f->dump_string("omap_recovered_to", omap_recovered_to);
+}
+
+void ObjectRecoveryInfo::encode(ceph::buffer::list &bl, uint64_t features) const
+{
+ ENCODE_START(3, 1, bl);
+ encode(soid, bl);
+ encode(version, bl);
+ encode(size, bl);
+ encode(oi, bl, features);
+ encode(ss, bl);
+ encode(copy_subset, bl);
+ encode(clone_subset, bl);
+ encode(object_exist, bl);
+ ENCODE_FINISH(bl);
+}
+
+void ObjectRecoveryInfo::decode(ceph::buffer::list::const_iterator &bl,
+ int64_t pool)
+{
+ DECODE_START(3, bl);
+ decode(soid, bl);
+ decode(version, bl);
+ decode(size, bl);
+ decode(oi, bl);
+ decode(ss, bl);
+ decode(copy_subset, bl);
+ decode(clone_subset, bl);
+ if (struct_v > 2)
+ decode(object_exist, bl);
+ else
+ object_exist = false;
+ DECODE_FINISH(bl);
+ if (struct_v < 2) {
+ if (!soid.is_max() && soid.pool == -1)
+ soid.pool = pool;
+ map<hobject_t, interval_set<uint64_t>> tmp;
+ tmp.swap(clone_subset);
+ for (auto i = tmp.begin(); i != tmp.end(); ++i) {
+ hobject_t first(i->first);
+ if (!first.is_max() && first.pool == -1)
+ first.pool = pool;
+ clone_subset[first].swap(i->second);
+ }
+ }
+}
+
+void ObjectRecoveryInfo::generate_test_instances(
+ list<ObjectRecoveryInfo*>& o)
+{
+ o.push_back(new ObjectRecoveryInfo);
+ o.back()->soid = hobject_t(sobject_t("key", CEPH_NOSNAP));
+ o.back()->version = eversion_t(0,0);
+ o.back()->size = 100;
+ o.back()->object_exist = false;
+}
+
+
+void ObjectRecoveryInfo::dump(Formatter *f) const
+{
+ f->dump_stream("object") << soid;
+ f->dump_stream("at_version") << version;
+ f->dump_stream("size") << size;
+ {
+ f->open_object_section("object_info");
+ oi.dump(f);
+ f->close_section();
+ }
+ {
+ f->open_object_section("snapset");
+ ss.dump(f);
+ f->close_section();
+ }
+ f->dump_stream("copy_subset") << copy_subset;
+ f->dump_stream("clone_subset") << clone_subset;
+ f->dump_stream("object_exist") << object_exist;
+}
+
+ostream& operator<<(ostream& out, const ObjectRecoveryInfo &inf)
+{
+ return inf.print(out);
+}
+
+ostream &ObjectRecoveryInfo::print(ostream &out) const
+{
+ return out << "ObjectRecoveryInfo("
+ << soid << "@" << version
+ << ", size: " << size
+ << ", copy_subset: " << copy_subset
+ << ", clone_subset: " << clone_subset
+ << ", snapset: " << ss
+ << ", object_exist: " << object_exist
+ << ")";
+}
+
+// -- PushReplyOp --
+void PushReplyOp::generate_test_instances(list<PushReplyOp*> &o)
+{
+ o.push_back(new PushReplyOp);
+ o.push_back(new PushReplyOp);
+ o.back()->soid = hobject_t(sobject_t("asdf", 2));
+ o.push_back(new PushReplyOp);
+ o.back()->soid = hobject_t(sobject_t("asdf", CEPH_NOSNAP));
+}
+
+void PushReplyOp::encode(ceph::buffer::list &bl) const
+{
+ ENCODE_START(1, 1, bl);
+ encode(soid, bl);
+ ENCODE_FINISH(bl);
+}
+
+void PushReplyOp::decode(ceph::buffer::list::const_iterator &bl)
+{
+ DECODE_START(1, bl);
+ decode(soid, bl);
+ DECODE_FINISH(bl);
+}
+
+void PushReplyOp::dump(Formatter *f) const
+{
+ f->dump_stream("soid") << soid;
+}
+
+ostream &PushReplyOp::print(ostream &out) const
+{
+ return out
+ << "PushReplyOp(" << soid
+ << ")";
+}
+
+ostream& operator<<(ostream& out, const PushReplyOp &op)
+{
+ return op.print(out);
+}
+
+uint64_t PushReplyOp::cost(CephContext *cct) const
+{
+
+ return cct->_conf->osd_push_per_object_cost +
+ cct->_conf->osd_recovery_max_chunk;
+}
+
+// -- PullOp --
+void PullOp::generate_test_instances(list<PullOp*> &o)
+{
+ o.push_back(new PullOp);
+ o.push_back(new PullOp);
+ o.back()->soid = hobject_t(sobject_t("asdf", 2));
+ o.back()->recovery_info.version = eversion_t(3, 10);
+ o.push_back(new PullOp);
+ o.back()->soid = hobject_t(sobject_t("asdf", CEPH_NOSNAP));
+ o.back()->recovery_info.version = eversion_t(0, 0);
+}
+
+void PullOp::encode(ceph::buffer::list &bl, uint64_t features) const
+{
+ ENCODE_START(1, 1, bl);
+ encode(soid, bl);
+ encode(recovery_info, bl, features);
+ encode(recovery_progress, bl);
+ ENCODE_FINISH(bl);
+}
+
+void PullOp::decode(ceph::buffer::list::const_iterator &bl)
+{
+ DECODE_START(1, bl);
+ decode(soid, bl);
+ decode(recovery_info, bl);
+ decode(recovery_progress, bl);
+ DECODE_FINISH(bl);
+}
+
+void PullOp::dump(Formatter *f) const
+{
+ f->dump_stream("soid") << soid;
+ {
+ f->open_object_section("recovery_info");
+ recovery_info.dump(f);
+ f->close_section();
+ }
+ {
+ f->open_object_section("recovery_progress");
+ recovery_progress.dump(f);
+ f->close_section();
+ }
+}
+
+ostream &PullOp::print(ostream &out) const
+{
+ return out
+ << "PullOp(" << soid
+ << ", recovery_info: " << recovery_info
+ << ", recovery_progress: " << recovery_progress
+ << ")";
+}
+
+ostream& operator<<(ostream& out, const PullOp &op)
+{
+ return op.print(out);
+}
+
+uint64_t PullOp::cost(CephContext *cct) const
+{
+ return cct->_conf->osd_push_per_object_cost +
+ cct->_conf->osd_recovery_max_chunk;
+}
+
+// -- PushOp --
+void PushOp::generate_test_instances(list<PushOp*> &o)
+{
+ o.push_back(new PushOp);
+ o.push_back(new PushOp);
+ o.back()->soid = hobject_t(sobject_t("asdf", 2));
+ o.back()->version = eversion_t(3, 10);
+ o.push_back(new PushOp);
+ o.back()->soid = hobject_t(sobject_t("asdf", CEPH_NOSNAP));
+ o.back()->version = eversion_t(0, 0);
+}
+
+void PushOp::encode(ceph::buffer::list &bl, uint64_t features) const
+{
+ ENCODE_START(1, 1, bl);
+ encode(soid, bl);
+ encode(version, bl);
+ encode(data, bl);
+ encode(data_included, bl);
+ encode(omap_header, bl);
+ encode(omap_entries, bl);
+ encode(attrset, bl);
+ encode(recovery_info, bl, features);
+ encode(after_progress, bl);
+ encode(before_progress, bl);
+ ENCODE_FINISH(bl);
+}
+
+void PushOp::decode(ceph::buffer::list::const_iterator &bl)
+{
+ DECODE_START(1, bl);
+ decode(soid, bl);
+ decode(version, bl);
+ decode(data, bl);
+ decode(data_included, bl);
+ decode(omap_header, bl);
+ decode(omap_entries, bl);
+ decode(attrset, bl);
+ decode(recovery_info, bl);
+ decode(after_progress, bl);
+ decode(before_progress, bl);
+ DECODE_FINISH(bl);
+}
+
+void PushOp::dump(Formatter *f) const
+{
+ f->dump_stream("soid") << soid;
+ f->dump_stream("version") << version;
+ f->dump_int("data_len", data.length());
+ f->dump_stream("data_included") << data_included;
+ f->dump_int("omap_header_len", omap_header.length());
+ f->dump_int("omap_entries_len", omap_entries.size());
+ f->dump_int("attrset_len", attrset.size());
+ {
+ f->open_object_section("recovery_info");
+ recovery_info.dump(f);
+ f->close_section();
+ }
+ {
+ f->open_object_section("after_progress");
+ after_progress.dump(f);
+ f->close_section();
+ }
+ {
+ f->open_object_section("before_progress");
+ before_progress.dump(f);
+ f->close_section();
+ }
+}
+
+ostream &PushOp::print(ostream &out) const
+{
+ return out
+ << "PushOp(" << soid
+ << ", version: " << version
+ << ", data_included: " << data_included
+ << ", data_size: " << data.length()
+ << ", omap_header_size: " << omap_header.length()
+ << ", omap_entries_size: " << omap_entries.size()
+ << ", attrset_size: " << attrset.size()
+ << ", recovery_info: " << recovery_info
+ << ", after_progress: " << after_progress
+ << ", before_progress: " << before_progress
+ << ")";
+}
+
+ostream& operator<<(ostream& out, const PushOp &op)
+{
+ return op.print(out);
+}
+
+uint64_t PushOp::cost(CephContext *cct) const
+{
+ uint64_t cost = data_included.size();
+ for (auto i = omap_entries.cbegin(); i != omap_entries.cend(); ++i) {
+ cost += i->second.length();
+ }
+ cost += cct->_conf->osd_push_per_object_cost;
+ return cost;
+}
+
+// -- ScrubMap --
+
+void ScrubMap::merge_incr(const ScrubMap &l)
+{
+ ceph_assert(valid_through == l.incr_since);
+ valid_through = l.valid_through;
+
+ for (auto p = l.objects.cbegin(); p != l.objects.cend(); ++p){
+ if (p->second.negative) {
+ auto q = objects.find(p->first);
+ if (q != objects.end()) {
+ objects.erase(q);
+ }
+ } else {
+ objects[p->first] = p->second;
+ }
+ }
+}
+
+void ScrubMap::encode(ceph::buffer::list& bl) const
+{
+ ENCODE_START(3, 2, bl);
+ encode(objects, bl);
+ encode((__u32)0, bl); // used to be attrs; now deprecated
+ ceph::buffer::list old_logbl; // not used
+ encode(old_logbl, bl);
+ encode(valid_through, bl);
+ encode(incr_since, bl);
+ ENCODE_FINISH(bl);
+}
+
+void ScrubMap::decode(ceph::buffer::list::const_iterator& bl, int64_t pool)
+{
+ DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl);
+ decode(objects, bl);
+ {
+ map<string,string> attrs; // deprecated
+ decode(attrs, bl);
+ }
+ ceph::buffer::list old_logbl; // not used
+ decode(old_logbl, bl);
+ decode(valid_through, bl);
+ decode(incr_since, bl);
+ DECODE_FINISH(bl);
+
+ // handle hobject_t upgrade
+ if (struct_v < 3) {
+ map<hobject_t, object> tmp;
+ tmp.swap(objects);
+ for (auto i = tmp.begin(); i != tmp.end(); ++i) {
+ hobject_t first(i->first);
+ if (!first.is_max() && first.pool == -1)
+ first.pool = pool;
+ objects[first] = i->second;
+ }
+ }
+}
+
+void ScrubMap::dump(Formatter *f) const
+{
+ f->dump_stream("valid_through") << valid_through;
+ f->dump_stream("incremental_since") << incr_since;
+ f->open_array_section("objects");
+ for (auto p = objects.cbegin(); p != objects.cend(); ++p) {
+ f->open_object_section("object");
+ f->dump_string("name", p->first.oid.name);
+ f->dump_unsigned("hash", p->first.get_hash());
+ f->dump_string("key", p->first.get_key());
+ f->dump_int("snapid", p->first.snap);
+ p->second.dump(f);
+ f->close_section();
+ }
+ f->close_section();
+}
+
+void ScrubMap::generate_test_instances(list<ScrubMap*>& o)
+{
+ o.push_back(new ScrubMap);
+ o.push_back(new ScrubMap);
+ o.back()->valid_through = eversion_t(1, 2);
+ o.back()->incr_since = eversion_t(3, 4);
+ list<object*> obj;
+ object::generate_test_instances(obj);
+ o.back()->objects[hobject_t(object_t("foo"), "fookey", 123, 456, 0, "")] = *obj.back();
+ obj.pop_back();
+ o.back()->objects[hobject_t(object_t("bar"), string(), 123, 456, 0, "")] = *obj.back();
+}
+
+// -- ScrubMap::object --
+
+void ScrubMap::object::encode(ceph::buffer::list& bl) const
+{
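+ // older decoders only understand a single read_error flag, so fold the
+ // EC-specific errors into it for compatibility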
+ bool compat_read_error = read_error || ec_hash_mismatch || ec_size_mismatch;
+ ENCODE_START(10, 7, bl);
+ encode(size, bl);
+ encode(negative, bl);
+ encode(attrs, bl);
+ encode(digest, bl);
+ encode(digest_present, bl);
+ encode((uint32_t)0, bl); // obsolete nlinks
+ encode((uint32_t)0, bl); // snapcolls
+ encode(omap_digest, bl);
+ encode(omap_digest_present, bl);
+ encode(compat_read_error, bl);
+ encode(stat_error, bl);
+ encode(read_error, bl);
+ encode(ec_hash_mismatch, bl);
+ encode(ec_size_mismatch, bl);
+ encode(large_omap_object_found, bl);
+ encode(large_omap_object_key_count, bl);
+ encode(large_omap_object_value_size, bl);
+ encode(object_omap_bytes, bl);
+ encode(object_omap_keys, bl);
+ ENCODE_FINISH(bl);
+}
+
+void ScrubMap::object::decode(ceph::buffer::list::const_iterator& bl)
+{
+ DECODE_START(10, bl);
+ decode(size, bl);
+ bool tmp, compat_read_error = false;
+ decode(tmp, bl);
+ negative = tmp;
+ decode(attrs, bl);
+ decode(digest, bl);
+ decode(tmp, bl);
+ digest_present = tmp;
+ {
+ uint32_t nlinks;
+ decode(nlinks, bl);
+ set<snapid_t> snapcolls;
+ decode(snapcolls, bl);
+ }
+ decode(omap_digest, bl);
+ decode(tmp, bl);
+ omap_digest_present = tmp;
+ decode(compat_read_error, bl);
+ decode(tmp, bl);
+ stat_error = tmp;
+ if (struct_v >= 8) {
+ decode(tmp, bl);
+ read_error = tmp;
+ decode(tmp, bl);
+ ec_hash_mismatch = tmp;
+ decode(tmp, bl);
+ ec_size_mismatch = tmp;
+ }
+ // If older encoder found a read_error, set read_error
+ if (compat_read_error && !read_error && !ec_hash_mismatch && !ec_size_mismatch)
+ read_error = true;
+ if (struct_v >= 9) {
+ decode(tmp, bl);
+ large_omap_object_found = tmp;
+ decode(large_omap_object_key_count, bl);
+ decode(large_omap_object_value_size, bl);
+ }
+ if (struct_v >= 10) {
+ decode(object_omap_bytes, bl);
+ decode(object_omap_keys, bl);
+ }
+ DECODE_FINISH(bl);
+}
+
+void ScrubMap::object::dump(Formatter *f) const
+{
+ f->dump_int("size", size);
+ f->dump_int("negative", negative);
+ f->open_array_section("attrs");
+ for (auto p = attrs.cbegin(); p != attrs.cend(); ++p) {
+ f->open_object_section("attr");
+ f->dump_string("name", p->first);
+ f->dump_int("length", p->second.length());
+ f->close_section();
+ }
+ f->close_section();
+}
+
+void ScrubMap::object::generate_test_instances(list<object*>& o)
+{
+ o.push_back(new object);
+ o.push_back(new object);
+ o.back()->negative = true;
+ o.push_back(new object);
+ o.back()->size = 123;
+ o.back()->attrs["foo"] = ceph::buffer::copy("foo", 3);
+ o.back()->attrs["bar"] = ceph::buffer::copy("barval", 6);
+}
+
+// -- OSDOp --
+
+ostream& operator<<(ostream& out, const OSDOp& op)
+{
+ out << ceph_osd_op_name(op.op.op);
+ if (ceph_osd_op_type_data(op.op.op)) {
+ // data extent
+ switch (op.op.op) {
+ case CEPH_OSD_OP_ASSERT_VER:
+ out << " v" << op.op.assert_ver.ver;
+ break;
+ case CEPH_OSD_OP_TRUNCATE:
+ out << " " << op.op.extent.offset;
+ break;
+ case CEPH_OSD_OP_MASKTRUNC:
+ case CEPH_OSD_OP_TRIMTRUNC:
+ out << " " << op.op.extent.truncate_seq << "@"
+ << (int64_t)op.op.extent.truncate_size;
+ break;
+ case CEPH_OSD_OP_ROLLBACK:
+ out << " " << snapid_t(op.op.snap.snapid);
+ break;
+ case CEPH_OSD_OP_WATCH:
+ out << " " << ceph_osd_watch_op_name(op.op.watch.op)
+ << " cookie " << op.op.watch.cookie;
+ if (op.op.watch.gen)
+ out << " gen " << op.op.watch.gen;
+ break;
+ case CEPH_OSD_OP_NOTIFY:
+ out << " cookie " << op.op.notify.cookie;
+ break;
+ case CEPH_OSD_OP_COPY_GET:
+ out << " max " << op.op.copy_get.max;
+ break;
+ case CEPH_OSD_OP_COPY_FROM:
+ out << " ver " << op.op.copy_from.src_version;
+ break;
+ case CEPH_OSD_OP_SETALLOCHINT:
+ out << " object_size " << op.op.alloc_hint.expected_object_size
+ << " write_size " << op.op.alloc_hint.expected_write_size;
+ break;
+ case CEPH_OSD_OP_READ:
+ case CEPH_OSD_OP_SPARSE_READ:
+ case CEPH_OSD_OP_SYNC_READ:
+ case CEPH_OSD_OP_WRITE:
+ case CEPH_OSD_OP_WRITEFULL:
+ case CEPH_OSD_OP_ZERO:
+ case CEPH_OSD_OP_APPEND:
+ case CEPH_OSD_OP_MAPEXT:
+ case CEPH_OSD_OP_CMPEXT:
+ out << " " << op.op.extent.offset << "~" << op.op.extent.length;
+ if (op.op.extent.truncate_seq)
+ out << " [" << op.op.extent.truncate_seq << "@"
+ << (int64_t)op.op.extent.truncate_size << "]";
+ if (op.op.flags)
+ out << " [" << ceph_osd_op_flag_string(op.op.flags) << "]";
+ default:
+ // don't show any arg info
+ break;
+ }
+ } else if (ceph_osd_op_type_attr(op.op.op)) {
+ // xattr name
+ if (op.op.xattr.name_len && op.indata.length()) {
+ out << " ";
+ op.indata.write(0, op.op.xattr.name_len, out);
+ }
+ if (op.op.xattr.value_len)
+ out << " (" << op.op.xattr.value_len << ")";
+ if (op.op.op == CEPH_OSD_OP_CMPXATTR)
+ out << " op " << (int)op.op.xattr.cmp_op
+ << " mode " << (int)op.op.xattr.cmp_mode;
+ } else if (ceph_osd_op_type_exec(op.op.op)) {
+ // class.method
+ if (op.op.cls.class_len && op.indata.length()) {
+ out << " ";
+ op.indata.write(0, op.op.cls.class_len, out);
+ out << ".";
+ op.indata.write(op.op.cls.class_len, op.op.cls.method_len, out);
+ }
+ } else if (ceph_osd_op_type_pg(op.op.op)) {
+ switch (op.op.op) {
+ case CEPH_OSD_OP_PGLS:
+ case CEPH_OSD_OP_PGLS_FILTER:
+ case CEPH_OSD_OP_PGNLS:
+ case CEPH_OSD_OP_PGNLS_FILTER:
+ out << " start_epoch " << op.op.pgls.start_epoch;
+ break;
+ case CEPH_OSD_OP_PG_HITSET_LS:
+ break;
+ case CEPH_OSD_OP_PG_HITSET_GET:
+ out << " " << utime_t(op.op.hit_set_get.stamp);
+ break;
+ case CEPH_OSD_OP_SCRUBLS:
+ break;
+ }
+ }
+ if (op.indata.length()) {
+ out << " in=" << op.indata.length() << "b";
+ }
+ if (op.outdata.length()) {
+ out << " out=" << op.outdata.length() << "b";
+ }
+ return out;
+}
+
+
+void OSDOp::split_osd_op_vector_out_data(vector<OSDOp>& ops, ceph::buffer::list& in)
+{
+ auto datap = in.begin();
+ for (unsigned i = 0; i < ops.size(); i++) {
+ if (ops[i].op.payload_len) {
+ datap.copy(ops[i].op.payload_len, ops[i].outdata);
+ }
+ }
+}
+
+void OSDOp::merge_osd_op_vector_out_data(vector<OSDOp>& ops, ceph::buffer::list& out)
+{
+ for (unsigned i = 0; i < ops.size(); i++) {
+ ops[i].op.payload_len = ops[i].outdata.length();
+ if (ops[i].outdata.length()) {
+ out.append(ops[i].outdata);
+ }
+ }
+}
+
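+// Build the omap key/value updates used to persist PG metadata: the epoch
+// key when dirty, a compact fastinfo key when only the fast-changing info
+// fields advanced, and otherwise the full info key (plus the biginfo key
+// for past_intervals and purged_snaps when dirty_big_info is set).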
+int prepare_info_keymap(
+ CephContext* cct,
+ map<string,bufferlist> *km,
+ string *key_to_remove,
+ epoch_t epoch,
+ pg_info_t &info,
+ pg_info_t &last_written_info,
+ PastIntervals &past_intervals,
+ bool dirty_big_info,
+ bool dirty_epoch,
+ bool try_fast_info,
+ PerfCounters *logger,
+ DoutPrefixProvider *dpp)
+{
+ if (dirty_epoch) {
+ encode(epoch, (*km)[string(epoch_key)]);
+ }
+
+ if (logger)
+ logger->inc(l_osd_pg_info);
+
+ // try to do info efficiently?
+ if (!dirty_big_info && try_fast_info &&
+ info.last_update > last_written_info.last_update) {
+ pg_fast_info_t fast;
+ fast.populate_from(info);
+ bool did = fast.try_apply_to(&last_written_info);
+ ceph_assert(did); // we verified last_update increased above
+ if (info == last_written_info) {
+ encode(fast, (*km)[string(fastinfo_key)]);
+ if (logger)
+ logger->inc(l_osd_pg_fastinfo);
+ return 0;
+ }
+ if (dpp) {
+ ldpp_dout(dpp, 30) << __func__ << " fastinfo failed, info:\n";
+ {
+ JSONFormatter jf(true);
+ jf.dump_object("info", info);
+ jf.flush(*_dout);
+ }
+ {
+ *_dout << "\nlast_written_info:\n";
+ JSONFormatter jf(true);
+ jf.dump_object("last_written_info", last_written_info);
+ jf.flush(*_dout);
+ }
+ *_dout << dendl;
+ }
+ } else if (info.last_update <= last_written_info.last_update) {
+ // clean up any potentially stale fastinfo key resulting from last_update
+ // not moving forwards (e.g., a backwards jump during peering)
+ *key_to_remove = fastinfo_key;
+ }
+
+ last_written_info = info;
+
+ // info. store purged_snaps separately.
+ interval_set<snapid_t> purged_snaps;
+ purged_snaps.swap(info.purged_snaps);
+ encode(info, (*km)[string(info_key)]);
+ purged_snaps.swap(info.purged_snaps);
+
+ if (dirty_big_info) {
+ // potentially big stuff
+ bufferlist& bigbl = (*km)[string(biginfo_key)];
+ encode(past_intervals, bigbl);
+ encode(info.purged_snaps, bigbl);
+ //dout(20) << "write_info bigbl " << bigbl.length() << dendl;
+ if (logger)
+ logger->inc(l_osd_pg_biginfo);
+ }
+
+ return 0;
+}
+
+void create_pg_collection(
+ ceph::os::Transaction& t, spg_t pgid, int bits)
+{
+ coll_t coll(pgid);
+ t.create_collection(coll, bits);
+}
+
+void init_pg_ondisk(
+ ceph::os::Transaction& t,
+ spg_t pgid,
+ const pg_pool_t *pool)
+{
+ coll_t coll(pgid);
+ if (pool) {
+ // Give a hint to the PG collection
+ bufferlist hint;
+ uint32_t pg_num = pool->get_pg_num();
+ uint64_t expected_num_objects_pg = pool->expected_num_objects / pg_num;
+ encode(pg_num, hint);
+ encode(expected_num_objects_pg, hint);
+ uint32_t hint_type = ceph::os::Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS;
+ t.collection_hint(coll, hint_type, hint);
+ }
+
+ ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
+ t.touch(coll, pgmeta_oid);
+ map<string,bufferlist> values;
+ __u8 struct_v = pg_latest_struct_v;
+ encode(struct_v, values[string(infover_key)]);
+ t.omap_setkeys(coll, pgmeta_oid, values);
+}
+
+PGLSFilter::PGLSFilter() : cct(nullptr)
+{
+}
+
+PGLSFilter::~PGLSFilter()
+{
+}
+
+int PGLSPlainFilter::init(ceph::bufferlist::const_iterator &params)
+{
+ try {
+ decode(xattr, params);
+ decode(val, params);
+ } catch (ceph::buffer::error &e) {
+ return -EINVAL;
+ }
+ return 0;
+}
+
+bool PGLSPlainFilter::filter(const hobject_t& obj,
+ const ceph::bufferlist& xattr_data) const
+{
+ return xattr_data.contents_equal(val.c_str(), val.size());
+}
diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h
new file mode 100644
index 000000000..93645c5f2
--- /dev/null
+++ b/src/osd/osd_types.h
@@ -0,0 +1,6568 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
+ *
+ * Author: Loic Dachary <loic@dachary.org>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_OSD_TYPES_H
+#define CEPH_OSD_TYPES_H
+
+#include <atomic>
+#include <sstream>
+#include <cstdio>
+#include <memory>
+#include <string_view>
+
+#include <boost/scoped_ptr.hpp>
+#include <boost/optional/optional_io.hpp>
+#include <boost/variant.hpp>
+#include <boost/smart_ptr/local_shared_ptr.hpp>
+
+#include "include/rados/rados_types.hpp"
+#include "include/mempool.h"
+
+#include "msg/msg_types.h"
+#include "include/compat.h"
+#include "include/types.h"
+#include "include/utime.h"
+#include "include/CompatSet.h"
+#include "common/ceph_context.h"
+#include "common/histogram.h"
+#include "include/interval_set.h"
+#include "include/inline_memory.h"
+#include "common/Formatter.h"
+#include "common/bloom_filter.hpp"
+#include "common/hobject.h"
+#include "common/snap_types.h"
+#include "HitSet.h"
+#include "Watch.h"
+#include "include/cmp.h"
+#include "librados/ListObjectImpl.h"
+#include "compressor/Compressor.h"
+#include "osd_perf_counters.h"
+
+#define CEPH_OSD_ONDISK_MAGIC "ceph osd volume v026"
+
+#define CEPH_OSD_FEATURE_INCOMPAT_BASE CompatSet::Feature(1, "initial feature set(~v.18)")
+#define CEPH_OSD_FEATURE_INCOMPAT_PGINFO CompatSet::Feature(2, "pginfo object")
+#define CEPH_OSD_FEATURE_INCOMPAT_OLOC CompatSet::Feature(3, "object locator")
+#define CEPH_OSD_FEATURE_INCOMPAT_LEC CompatSet::Feature(4, "last_epoch_clean")
+#define CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES CompatSet::Feature(5, "categories")
+#define CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL CompatSet::Feature(6, "hobjectpool")
+#define CEPH_OSD_FEATURE_INCOMPAT_BIGINFO CompatSet::Feature(7, "biginfo")
+#define CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO CompatSet::Feature(8, "leveldbinfo")
+#define CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG CompatSet::Feature(9, "leveldblog")
+#define CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER CompatSet::Feature(10, "snapmapper")
+#define CEPH_OSD_FEATURE_INCOMPAT_SHARDS CompatSet::Feature(11, "sharded objects")
+#define CEPH_OSD_FEATURE_INCOMPAT_HINTS CompatSet::Feature(12, "transaction hints")
+#define CEPH_OSD_FEATURE_INCOMPAT_PGMETA CompatSet::Feature(13, "pg meta object")
+#define CEPH_OSD_FEATURE_INCOMPAT_MISSING CompatSet::Feature(14, "explicit missing set")
+#define CEPH_OSD_FEATURE_INCOMPAT_FASTINFO CompatSet::Feature(15, "fastinfo pg attr")
+#define CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES CompatSet::Feature(16, "deletes in missing set")
+#define CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER2 CompatSet::Feature(17, "new snapmapper key structure")
+
+
+/// pool priority range set by user
+#define OSD_POOL_PRIORITY_MAX 10
+#define OSD_POOL_PRIORITY_MIN -OSD_POOL_PRIORITY_MAX
+
+/// min recovery priority for MBackfillReserve
+#define OSD_RECOVERY_PRIORITY_MIN 0
+
+/// base backfill priority for MBackfillReserve
+#define OSD_BACKFILL_PRIORITY_BASE 100
+
+/// base backfill priority for MBackfillReserve (degraded PG)
+#define OSD_BACKFILL_DEGRADED_PRIORITY_BASE 140
+
+/// base recovery priority for MBackfillReserve
+#define OSD_RECOVERY_PRIORITY_BASE 180
+
+/// base backfill priority for MBackfillReserve (inactive PG)
+#define OSD_BACKFILL_INACTIVE_PRIORITY_BASE 220
+
+/// base recovery priority for MRecoveryReserve (inactive PG)
+#define OSD_RECOVERY_INACTIVE_PRIORITY_BASE 220
+
+/// max manually/automatically set recovery priority for MBackfillReserve
+#define OSD_RECOVERY_PRIORITY_MAX 253
+
+/// backfill priority for MBackfillReserve, when forced manually
+#define OSD_BACKFILL_PRIORITY_FORCED 254
+
+/// recovery priority for MRecoveryReserve, when forced manually
+#define OSD_RECOVERY_PRIORITY_FORCED 255
+
+/// priority for pg deletion when osd is not fullish
+#define OSD_DELETE_PRIORITY_NORMAL 179
+
+/// priority for pg deletion when osd is approaching full
+#define OSD_DELETE_PRIORITY_FULLISH 219
+
+/// priority when more full
+#define OSD_DELETE_PRIORITY_FULL 255
+
+static std::map<int, int> max_prio_map = {
+ {OSD_BACKFILL_PRIORITY_BASE, OSD_BACKFILL_DEGRADED_PRIORITY_BASE - 1},
+ {OSD_BACKFILL_DEGRADED_PRIORITY_BASE, OSD_RECOVERY_PRIORITY_BASE - 1},
+ {OSD_RECOVERY_PRIORITY_BASE, OSD_BACKFILL_INACTIVE_PRIORITY_BASE - 1},
+ {OSD_RECOVERY_INACTIVE_PRIORITY_BASE, OSD_RECOVERY_PRIORITY_MAX},
+ {OSD_BACKFILL_INACTIVE_PRIORITY_BASE, OSD_RECOVERY_PRIORITY_MAX}
+};
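+// Illustrative reading of the map above: each entry pairs a priority band's
+// base value with the highest priority allowed in that band; e.g. priorities
+// derived from OSD_BACKFILL_PRIORITY_BASE (100) are presumably capped at
+// OSD_BACKFILL_DEGRADED_PRIORITY_BASE - 1 (139), while the inactive bands are
+// capped at OSD_RECOVERY_PRIORITY_MAX (253).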
+
+typedef hobject_t collection_list_handle_t;
+
+/// convert a single CEPH_OSD_FLAG_* to a std::string
+const char *ceph_osd_flag_name(unsigned flag);
+/// convert a single CEPH_OSD_OP_FLAG_* to a std::string
+const char *ceph_osd_op_flag_name(unsigned flag);
+
+/// convert CEPH_OSD_FLAG_* op flags to a std::string
+std::string ceph_osd_flag_string(unsigned flags);
+/// convert CEPH_OSD_OP_FLAG_* op flags to a std::string
+std::string ceph_osd_op_flag_string(unsigned flags);
+/// convert CEPH_OSD_ALLOC_HINT_FLAG_* op flags to a std::string
+std::string ceph_osd_alloc_hint_flag_string(unsigned flags);
+
+typedef std::map<std::string,std::string> osd_alert_list_t;
+/// map osd id -> alert_list_t
+typedef std::map<int, osd_alert_list_t> osd_alerts_t;
+void dump(ceph::Formatter* f, const osd_alerts_t& alerts);
+
+
+typedef interval_set<
+ snapid_t,
+ mempool::osdmap::flat_map> snap_interval_set_t;
+
+
+/**
+ * osd request identifier
+ *
+ * caller name + incarnation# + tid to uniquely identify this request.
+ */
+struct osd_reqid_t {
+ entity_name_t name; // who
+ ceph_tid_t tid;
+ int32_t inc; // incarnation
+
+ osd_reqid_t()
+ : tid(0), inc(0)
+ {}
+ osd_reqid_t(const entity_name_t& a, int i, ceph_tid_t t)
+ : name(a), tid(t), inc(i)
+ {}
+
+ DENC(osd_reqid_t, v, p) {
+ DENC_START(2, 2, p);
+ denc(v.name, p);
+ denc(v.tid, p);
+ denc(v.inc, p);
+ DENC_FINISH(p);
+ }
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<osd_reqid_t*>& o);
+};
+WRITE_CLASS_DENC(osd_reqid_t)
+
+
+
+struct pg_shard_t {
+ static const int32_t NO_OSD = 0x7fffffff;
+ int32_t osd;
+ shard_id_t shard;
+ pg_shard_t() : osd(-1), shard(shard_id_t::NO_SHARD) {}
+ explicit pg_shard_t(int osd) : osd(osd), shard(shard_id_t::NO_SHARD) {}
+ pg_shard_t(int osd, shard_id_t shard) : osd(osd), shard(shard) {}
+ bool is_undefined() const {
+ return osd == -1;
+ }
+ std::string get_osd() const { return (osd == NO_OSD ? "NONE" : std::to_string(osd)); }
+ void encode(ceph::buffer::list &bl) const;
+ void decode(ceph::buffer::list::const_iterator &bl);
+ void dump(ceph::Formatter *f) const {
+ f->dump_unsigned("osd", osd);
+ if (shard != shard_id_t::NO_SHARD) {
+ f->dump_unsigned("shard", shard);
+ }
+ }
+};
+WRITE_CLASS_ENCODER(pg_shard_t)
+WRITE_EQ_OPERATORS_2(pg_shard_t, osd, shard)
+WRITE_CMP_OPERATORS_2(pg_shard_t, osd, shard)
+std::ostream& operator<<(std::ostream &lhs, const pg_shard_t &rhs);
+
+using HobjToShardSetMapping = std::map<hobject_t, std::set<pg_shard_t>>;
+
+class IsPGRecoverablePredicate {
+public:
+ /**
+ * the 'have' argument encodes which shards are available
+ */
+ virtual bool operator()(const std::set<pg_shard_t> &have) const = 0;
+ virtual ~IsPGRecoverablePredicate() {}
+};
+
+class IsPGReadablePredicate {
+public:
+ /**
+ * the 'have' argument encodes which shards are available
+ */
+ virtual bool operator()(const std::set<pg_shard_t> &have) const = 0;
+ virtual ~IsPGReadablePredicate() {}
+};
+
+inline std::ostream& operator<<(std::ostream& out, const osd_reqid_t& r) {
+ return out << r.name << "." << r.inc << ":" << r.tid;
+}
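+// Illustrative: an osd_reqid_t with a client name such as "client.4123",
+// inc 0 and tid 17 prints as "client.4123.0:17" with the operator above.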
+
+inline bool operator==(const osd_reqid_t& l, const osd_reqid_t& r) {
+ return (l.name == r.name) && (l.inc == r.inc) && (l.tid == r.tid);
+}
+inline bool operator!=(const osd_reqid_t& l, const osd_reqid_t& r) {
+ return (l.name != r.name) || (l.inc != r.inc) || (l.tid != r.tid);
+}
+inline bool operator<(const osd_reqid_t& l, const osd_reqid_t& r) {
+ // strict weak ordering: compare name, then inc, then tid
+ return (l.name < r.name) ||
+ (l.name == r.name && (l.inc < r.inc ||
+ (l.inc == r.inc && l.tid < r.tid)));
+}
+inline bool operator<=(const osd_reqid_t& l, const osd_reqid_t& r) {
+ return (l.name < r.name) ||
+ (l.name == r.name && (l.inc < r.inc ||
+ (l.inc == r.inc && l.tid <= r.tid)));
+}
+inline bool operator>(const osd_reqid_t& l, const osd_reqid_t& r) { return !(l <= r); }
+inline bool operator>=(const osd_reqid_t& l, const osd_reqid_t& r) { return !(l < r); }
+
+namespace std {
+ template<> struct hash<osd_reqid_t> {
+ size_t operator()(const osd_reqid_t &r) const {
+ static hash<uint64_t> H;
+ return H(r.name.num() ^ r.tid ^ r.inc);
+ }
+ };
+} // namespace std
+
+
+// -----
+
+// a locator constrains the placement of an object: mainly, which pool
+// it goes in.
+struct object_locator_t {
+ // You specify either the hash or the key -- not both
+ std::int64_t pool; ///< pool id
+ std::string key; ///< key string (if non-empty)
+ std::string nspace; ///< namespace
+ std::int64_t hash; ///< hash position (if >= 0)
+
+ explicit object_locator_t()
+ : pool(-1), hash(-1) {}
+ explicit object_locator_t(int64_t po)
+ : pool(po), hash(-1) {}
+ explicit object_locator_t(int64_t po, int64_t ps)
+ : pool(po), hash(ps) {}
+ explicit object_locator_t(int64_t po, std::string_view ns)
+ : pool(po), nspace(ns), hash(-1) {}
+ explicit object_locator_t(int64_t po, std::string_view ns, int64_t ps)
+ : pool(po), nspace(ns), hash(ps) {}
+ explicit object_locator_t(int64_t po, std::string_view ns, std::string_view s)
+ : pool(po), key(s), nspace(ns), hash(-1) {}
+ explicit object_locator_t(const hobject_t& soid)
+ : pool(soid.pool), key(soid.get_key()), nspace(soid.nspace), hash(-1) {}
+
+ int64_t get_pool() const {
+ return pool;
+ }
+
+ void clear() {
+ pool = -1;
+ key = "";
+ nspace = "";
+ hash = -1;
+ }
+
+ bool empty() const {
+ return pool == -1;
+ }
+
+ void encode(ceph::buffer::list& bl) const;
+ void decode(ceph::buffer::list::const_iterator& p);
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<object_locator_t*>& o);
+};
+WRITE_CLASS_ENCODER(object_locator_t)
+
+inline bool operator==(const object_locator_t& l, const object_locator_t& r) {
+ return l.pool == r.pool && l.key == r.key && l.nspace == r.nspace && l.hash == r.hash;
+}
+inline bool operator!=(const object_locator_t& l, const object_locator_t& r) {
+ return !(l == r);
+}
+
+inline std::ostream& operator<<(std::ostream& out, const object_locator_t& loc)
+{
+ out << "@" << loc.pool;
+ if (loc.nspace.length())
+ out << ";" << loc.nspace;
+ if (loc.key.length())
+ out << ":" << loc.key;
+ return out;
+}
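+// Illustrative usage of the locator above (values are made up):
+//   object_locator_t loc(7, "myns");   // pool 7, namespace "myns", no key/hash
+//   // streamed with the operator above this prints "@7;myns"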
+
+struct request_redirect_t {
+private:
+ object_locator_t redirect_locator; ///< this is authoritative
+ std::string redirect_object; ///< If non-empty, the request goes to this object name
+
+ friend std::ostream& operator<<(std::ostream& out, const request_redirect_t& redir);
+public:
+
+ request_redirect_t() {}
+ explicit request_redirect_t(const object_locator_t& orig, int64_t rpool) :
+ redirect_locator(orig) { redirect_locator.pool = rpool; }
+ explicit request_redirect_t(const object_locator_t& rloc) :
+ redirect_locator(rloc) {}
+ explicit request_redirect_t(const object_locator_t& orig,
+ const std::string& robj) :
+ redirect_locator(orig), redirect_object(robj) {}
+
+ bool empty() const { return redirect_locator.empty() &&
+ redirect_object.empty(); }
+
+ void combine_with_locator(object_locator_t& orig, std::string& obj) const {
+ orig = redirect_locator;
+ if (!redirect_object.empty())
+ obj = redirect_object;
+ }
+
+ void encode(ceph::buffer::list& bl) const;
+ void decode(ceph::buffer::list::const_iterator& bl);
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<request_redirect_t*>& o);
+};
+WRITE_CLASS_ENCODER(request_redirect_t)
+
+inline std::ostream& operator<<(std::ostream& out, const request_redirect_t& redir) {
+ out << "object " << redir.redirect_object << ", locator{" << redir.redirect_locator << "}";
+ return out;
+}
+
+// Internal OSD op flags - set by the OSD based on the op types
+enum {
+ CEPH_OSD_RMW_FLAG_READ = (1 << 1),
+ CEPH_OSD_RMW_FLAG_WRITE = (1 << 2),
+ CEPH_OSD_RMW_FLAG_CLASS_READ = (1 << 3),
+ CEPH_OSD_RMW_FLAG_CLASS_WRITE = (1 << 4),
+ CEPH_OSD_RMW_FLAG_PGOP = (1 << 5),
+ CEPH_OSD_RMW_FLAG_CACHE = (1 << 6),
+ CEPH_OSD_RMW_FLAG_FORCE_PROMOTE = (1 << 7),
+ CEPH_OSD_RMW_FLAG_SKIP_HANDLE_CACHE = (1 << 8),
+ CEPH_OSD_RMW_FLAG_SKIP_PROMOTE = (1 << 9),
+ CEPH_OSD_RMW_FLAG_RWORDERED = (1 << 10),
+ CEPH_OSD_RMW_FLAG_RETURNVEC = (1 << 11),
+};
+
+
+// pg stuff
+
+#define OSD_SUPERBLOCK_GOBJECT ghobject_t(hobject_t(sobject_t(object_t("osd_superblock"), 0)))
+
+// placement seed (a hash value)
+typedef uint32_t ps_t;
+
+// old (v1) pg_t encoding (wrap old struct ceph_pg)
+struct old_pg_t {
+ ceph_pg v;
+ void encode(ceph::buffer::list& bl) const {
+ ceph::encode_raw(v, bl);
+ }
+ void decode(ceph::buffer::list::const_iterator& bl) {
+ ceph::decode_raw(v, bl);
+ }
+};
+WRITE_CLASS_ENCODER(old_pg_t)
+
+// placement group id
+struct pg_t {
+ uint64_t m_pool;
+ uint32_t m_seed;
+
+ pg_t() : m_pool(0), m_seed(0) {}
+ pg_t(ps_t seed, uint64_t pool) :
+ m_pool(pool), m_seed(seed) {}
+ // cppcheck-suppress noExplicitConstructor
+ pg_t(const ceph_pg& cpg) :
+ m_pool(cpg.pool), m_seed(cpg.ps) {}
+
+ // cppcheck-suppress noExplicitConstructor
+ pg_t(const old_pg_t& opg) {
+ *this = opg.v;
+ }
+
+ old_pg_t get_old_pg() const {
+ old_pg_t o;
+ ceph_assert(m_pool < 0xffffffffull);
+ o.v.pool = m_pool;
+ o.v.ps = m_seed;
+ o.v.preferred = (__s16)-1;
+ return o;
+ }
+
+ ps_t ps() const {
+ return m_seed;
+ }
+ int64_t pool() const {
+ return m_pool;
+ }
+
+ static const uint8_t calc_name_buf_size = 36; // max length for max values len("18446744073709551615.ffffffff") + future suffix len("_head") + '\0'
+ char *calc_name(char *buf, const char *suffix_backwords) const;
+
+ void set_ps(ps_t p) {
+ m_seed = p;
+ }
+ void set_pool(uint64_t p) {
+ m_pool = p;
+ }
+
+ pg_t get_parent() const;
+ pg_t get_ancestor(unsigned old_pg_num) const;
+
+ int print(char *o, int maxlen) const;
+ bool parse(const char *s);
+
+ bool is_split(unsigned old_pg_num, unsigned new_pg_num, std::set<pg_t> *pchildren) const;
+
+ bool is_merge_source(unsigned old_pg_num, unsigned new_pg_num, pg_t *parent) const;
+ bool is_merge_target(unsigned old_pg_num, unsigned new_pg_num) const {
+ return ps() < new_pg_num && is_split(new_pg_num, old_pg_num, nullptr);
+ }
+
+ /**
+ * Returns b such that for all object o:
+ * (~((~0)<<b) & o.hash) == 0 iff o is in the pg for *this
+ */
+ unsigned get_split_bits(unsigned pg_num) const;
+
+ bool contains(int bits, const ghobject_t& oid) const {
+ return
+ (int64_t)m_pool == oid.hobj.get_logical_pool() &&
+ oid.match(bits, ps());
+ }
+ bool contains(int bits, const hobject_t& oid) const {
+ return
+ (int64_t)m_pool == oid.get_logical_pool() &&
+ oid.match(bits, ps());
+ }
+
+ hobject_t get_hobj_start() const;
+ hobject_t get_hobj_end(unsigned pg_num) const;
+
+ // strong ordering is supported
+ inline int compare(const pg_t& p) const noexcept {
+ if (auto delta = pool() - p.pool(); delta != 0) {
+ return delta;
+ } else if (ps() < p.ps()) {
+ return -1;
+ } else if (ps() > p.ps()) {
+ return 1;
+ } else {
+ return 0;
+ }
+ }
+
+ void encode(ceph::buffer::list& bl) const {
+ using ceph::encode;
+ __u8 v = 1;
+ encode(v, bl);
+ encode(m_pool, bl);
+ encode(m_seed, bl);
+ encode((int32_t)-1, bl); // was preferred
+ }
+ void decode(ceph::buffer::list::const_iterator& bl) {
+ using ceph::decode;
+ __u8 v;
+ decode(v, bl);
+ decode(m_pool, bl);
+ decode(m_seed, bl);
+ bl += sizeof(int32_t); // was preferred
+ }
+ void decode_old(ceph::buffer::list::const_iterator& bl) {
+ using ceph::decode;
+ old_pg_t opg;
+ decode(opg, bl);
+ *this = opg;
+ }
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<pg_t*>& o);
+};
+WRITE_CLASS_ENCODER(pg_t)
+
+inline bool operator<(const pg_t& l, const pg_t& r) {
+ return l.compare(r) < 0;
+}
+inline bool operator<=(const pg_t& l, const pg_t& r) {
+ return l.compare(r) <= 0;
+}
+inline bool operator==(const pg_t& l, const pg_t& r) {
+ return l.compare(r) == 0;
+}
+inline bool operator!=(const pg_t& l, const pg_t& r) {
+ return l.compare(r) != 0;
+}
+inline bool operator>(const pg_t& l, const pg_t& r) {
+ return l.compare(r) > 0;
+}
+inline bool operator>=(const pg_t& l, const pg_t& r) {
+ return l.compare(r) >= 0;
+}
+
+std::ostream& operator<<(std::ostream& out, const pg_t &pg);
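+// Illustrative: a pg_t with pool 3 and seed 0x7f conventionally prints as
+// "3.7f" (pool id, then the seed in hex); parse() is expected to accept the
+// same form.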
+
+namespace std {
+ template<> struct hash< pg_t >
+ {
+ size_t operator()( const pg_t& x ) const
+ {
+ static hash<uint32_t> H;
+ // xor (s32)-1 in there to preserve original m_preferred result (paranoia!)
+ return H((x.pool() & 0xffffffff) ^ (x.pool() >> 32) ^ x.ps() ^ (int32_t)(-1));
+ }
+ };
+} // namespace std
+
+struct spg_t {
+ pg_t pgid;
+ shard_id_t shard;
+ spg_t() : shard(shard_id_t::NO_SHARD) {}
+ spg_t(pg_t pgid, shard_id_t shard) : pgid(pgid), shard(shard) {}
+ explicit spg_t(pg_t pgid) : pgid(pgid), shard(shard_id_t::NO_SHARD) {}
+ unsigned get_split_bits(unsigned pg_num) const {
+ return pgid.get_split_bits(pg_num);
+ }
+ spg_t get_parent() const {
+ return spg_t(pgid.get_parent(), shard);
+ }
+ ps_t ps() const {
+ return pgid.ps();
+ }
+ uint64_t pool() const {
+ return pgid.pool();
+ }
+ void reset_shard(shard_id_t s) {
+ shard = s;
+ }
+
+ static const uint8_t calc_name_buf_size = pg_t::calc_name_buf_size + 4; // 36 + len('s') + len("255");
+ char *calc_name(char *buf, const char *suffix_backwords) const;
+
+ bool parse(const char *s);
+ bool parse(const std::string& s) {
+ return parse(s.c_str());
+ }
+
+ spg_t get_ancestor(unsigned old_pg_num) const {
+ return spg_t(pgid.get_ancestor(old_pg_num), shard);
+ }
+
+ bool is_split(unsigned old_pg_num, unsigned new_pg_num,
+ std::set<spg_t> *pchildren) const {
+ std::set<pg_t> _children;
+ std::set<pg_t> *children = pchildren ? &_children : NULL;
+ bool is_split = pgid.is_split(old_pg_num, new_pg_num, children);
+ if (pchildren && is_split) {
+ for (std::set<pg_t>::iterator i = _children.begin();
+ i != _children.end();
+ ++i) {
+ pchildren->insert(spg_t(*i, shard));
+ }
+ }
+ return is_split;
+ }
+ bool is_merge_target(unsigned old_pg_num, unsigned new_pg_num) const {
+ return pgid.is_merge_target(old_pg_num, new_pg_num);
+ }
+ bool is_merge_source(unsigned old_pg_num, unsigned new_pg_num,
+ spg_t *parent) const {
+ spg_t out = *this;
+ bool r = pgid.is_merge_source(old_pg_num, new_pg_num, &out.pgid);
+ if (r && parent) {
+ *parent = out;
+ }
+ return r;
+ }
+
+ bool is_no_shard() const {
+ return shard == shard_id_t::NO_SHARD;
+ }
+
+ ghobject_t make_pgmeta_oid() const {
+ return ghobject_t::make_pgmeta(pgid.pool(), pgid.ps(), shard);
+ }
+
+ void encode(ceph::buffer::list &bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(pgid, bl);
+ encode(shard, bl);
+ ENCODE_FINISH(bl);
+ }
+ void decode(ceph::buffer::list::const_iterator& bl) {
+ DECODE_START(1, bl);
+ decode(pgid, bl);
+ decode(shard, bl);
+ DECODE_FINISH(bl);
+ }
+
+ ghobject_t make_temp_ghobject(const std::string& name) const {
+ return ghobject_t(
+ hobject_t(object_t(name), "", CEPH_NOSNAP,
+ pgid.ps(),
+ hobject_t::get_temp_pool(pgid.pool()),
+ ""),
+ ghobject_t::NO_GEN,
+ shard);
+ }
+
+ unsigned hash_to_shard(unsigned num_shards) const {
+ return ps() % num_shards;
+ }
+};
+WRITE_CLASS_ENCODER(spg_t)
+WRITE_EQ_OPERATORS_2(spg_t, pgid, shard)
+WRITE_CMP_OPERATORS_2(spg_t, pgid, shard)
+
+namespace std {
+ template<> struct hash< spg_t >
+ {
+ size_t operator()( const spg_t& x ) const
+ {
+ static hash<uint32_t> H;
+ return H(hash<pg_t>()(x.pgid) ^ x.shard);
+ }
+ };
+} // namespace std
+
+std::ostream& operator<<(std::ostream& out, const spg_t &pg);
+
+// ----------------------
+
+class coll_t {
+ enum type_t {
+ TYPE_META = 0,
+ TYPE_LEGACY_TEMP = 1, /* no longer used */
+ TYPE_PG = 2,
+ TYPE_PG_TEMP = 3,
+ };
+ type_t type;
+ spg_t pgid;
+ uint64_t removal_seq; // note: deprecated, not encoded
+
+ char _str_buff[spg_t::calc_name_buf_size];
+ char *_str;
+
+ void calc_str();
+
+ coll_t(type_t t, spg_t p, uint64_t r)
+ : type(t), pgid(p), removal_seq(r) {
+ calc_str();
+ }
+
+public:
+ coll_t() : type(TYPE_META), removal_seq(0)
+ {
+ calc_str();
+ }
+
+ coll_t(const coll_t& other)
+ : type(other.type), pgid(other.pgid), removal_seq(other.removal_seq) {
+ calc_str();
+ }
+
+ explicit coll_t(spg_t pgid)
+ : type(TYPE_PG), pgid(pgid), removal_seq(0)
+ {
+ calc_str();
+ }
+
+ coll_t& operator=(const coll_t& rhs)
+ {
+ this->type = rhs.type;
+ this->pgid = rhs.pgid;
+ this->removal_seq = rhs.removal_seq;
+ this->calc_str();
+ return *this;
+ }
+
+ // named constructors
+ static coll_t meta() {
+ return coll_t();
+ }
+ static coll_t pg(spg_t p) {
+ return coll_t(p);
+ }
+
+ const std::string to_str() const {
+ return std::string(_str);
+ }
+ const char *c_str() const {
+ return _str;
+ }
+
+ bool parse(const std::string& s);
+
+ int operator<(const coll_t &rhs) const {
+ return type < rhs.type ||
+ (type == rhs.type && pgid < rhs.pgid);
+ }
+
+ bool is_meta() const {
+ return type == TYPE_META;
+ }
+ bool is_pg_prefix(spg_t *pgid_) const {
+ if (type == TYPE_PG || type == TYPE_PG_TEMP) {
+ *pgid_ = pgid;
+ return true;
+ }
+ return false;
+ }
+ bool is_pg() const {
+ return type == TYPE_PG;
+ }
+ bool is_pg(spg_t *pgid_) const {
+ if (type == TYPE_PG) {
+ *pgid_ = pgid;
+ return true;
+ }
+ return false;
+ }
+ bool is_temp() const {
+ return type == TYPE_PG_TEMP;
+ }
+ bool is_temp(spg_t *pgid_) const {
+ if (type == TYPE_PG_TEMP) {
+ *pgid_ = pgid;
+ return true;
+ }
+ return false;
+ }
+ int64_t pool() const {
+ return pgid.pool();
+ }
+
+ void encode(ceph::buffer::list& bl) const;
+ void decode(ceph::buffer::list::const_iterator& bl);
+ size_t encoded_size() const;
+
+ inline bool operator==(const coll_t& rhs) const {
+ // only compare type if meta
+ if (type != rhs.type)
+ return false;
+ if (type == TYPE_META)
+ return true;
+ return type == rhs.type && pgid == rhs.pgid;
+ }
+ inline bool operator!=(const coll_t& rhs) const {
+ return !(*this == rhs);
+ }
+
+ // get a TEMP collection that corresponds to the current collection,
+ // which we presume is a pg collection.
+ coll_t get_temp() const {
+ ceph_assert(type == TYPE_PG);
+ return coll_t(TYPE_PG_TEMP, pgid, 0);
+ }
+
+ ghobject_t get_min_hobj() const {
+ ghobject_t o;
+ switch (type) {
+ case TYPE_PG:
+ o.hobj.pool = pgid.pool();
+ o.set_shard(pgid.shard);
+ break;
+ case TYPE_META:
+ o.hobj.pool = -1;
+ break;
+ default:
+ break;
+ }
+ return o;
+ }
+
+ unsigned hash_to_shard(unsigned num_shards) const {
+ if (type == TYPE_PG)
+ return pgid.hash_to_shard(num_shards);
+ return 0; // whatever.
+ }
+
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<coll_t*>& o);
+};
+
+WRITE_CLASS_ENCODER(coll_t)
+
+inline std::ostream& operator<<(std::ostream& out, const coll_t& c) {
+ out << c.to_str();
+ return out;
+}
+
+namespace std {
+ template<> struct hash<coll_t> {
+ size_t operator()(const coll_t &c) const {
+ size_t h = 0;
+ std::string str(c.to_str());
+ std::string::const_iterator end(str.end());
+ for (std::string::const_iterator s = str.begin(); s != end; ++s) {
+ h += *s;
+ h += (h << 10);
+ h ^= (h >> 6);
+ }
+ h += (h << 3);
+ h ^= (h >> 11);
+ h += (h << 15);
+ return h;
+ }
+ };
+} // namespace std
+
+inline std::ostream& operator<<(std::ostream& out, const ceph_object_layout &ol)
+{
+ out << pg_t(ol.ol_pgid);
+ int su = ol.ol_stripe_unit;
+ if (su)
+ out << ".su=" << su;
+ return out;
+}
+
+
+
+// compound rados version type
+/* WARNING: if you add a member to eversion_t, please make sure the encode/decode
+ * functions still work correctly. On little-endian machines the raw-memory
+ * encoding below relies on there being no padding, on both 32-bit and 64-bit
+ * machines.
+ */
+class eversion_t {
+public:
+ version_t version;
+ epoch_t epoch;
+ __u32 __pad;
+ eversion_t() : version(0), epoch(0), __pad(0) {}
+ eversion_t(epoch_t e, version_t v) : version(v), epoch(e), __pad(0) {}
+
+ // cppcheck-suppress noExplicitConstructor
+ eversion_t(const ceph_eversion& ce) :
+ version(ce.version),
+ epoch(ce.epoch),
+ __pad(0) { }
+
+ explicit eversion_t(ceph::buffer::list& bl) : __pad(0) { decode(bl); }
+
+ static const eversion_t& max() {
+ static const eversion_t max(-1,-1);
+ return max;
+ }
+
+ operator ceph_eversion() {
+ ceph_eversion c;
+ c.epoch = epoch;
+ c.version = version;
+ return c;
+ }
+
+ std::string get_key_name() const;
+
+ // key must point to the beginning of a block of 32 chars
+ inline void get_key_name(char* key) const {
+ // Below is equivalent to sprintf("%010u.%020llu");
+ key[31] = 0;
+ ritoa<uint64_t, 10, 20>(version, key + 31);
+ key[10] = '.';
+ ritoa<uint32_t, 10, 10>(epoch, key + 10);
+ }
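+ // Illustrative: epoch 123, version 45 produces the key
+ // "0000000123.00000000000000000045" (10-digit epoch, '.', 20-digit version).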
+
+ void encode(ceph::buffer::list &bl) const {
+#if defined(CEPH_LITTLE_ENDIAN)
+ bl.append((char *)this, sizeof(version_t) + sizeof(epoch_t));
+#else
+ using ceph::encode;
+ encode(version, bl);
+ encode(epoch, bl);
+#endif
+ }
+ void decode(ceph::buffer::list::const_iterator &bl) {
+#if defined(CEPH_LITTLE_ENDIAN)
+ bl.copy(sizeof(version_t) + sizeof(epoch_t), (char *)this);
+#else
+ using ceph::decode;
+ decode(version, bl);
+ decode(epoch, bl);
+#endif
+ }
+ void decode(ceph::buffer::list& bl) {
+ auto p = std::cbegin(bl);
+ decode(p);
+ }
+};
+WRITE_CLASS_ENCODER(eversion_t)
+
+inline bool operator==(const eversion_t& l, const eversion_t& r) {
+ return (l.epoch == r.epoch) && (l.version == r.version);
+}
+inline bool operator!=(const eversion_t& l, const eversion_t& r) {
+ return (l.epoch != r.epoch) || (l.version != r.version);
+}
+inline bool operator<(const eversion_t& l, const eversion_t& r) {
+ return (l.epoch == r.epoch) ? (l.version < r.version):(l.epoch < r.epoch);
+}
+inline bool operator<=(const eversion_t& l, const eversion_t& r) {
+ return (l.epoch == r.epoch) ? (l.version <= r.version):(l.epoch <= r.epoch);
+}
+inline bool operator>(const eversion_t& l, const eversion_t& r) {
+ return (l.epoch == r.epoch) ? (l.version > r.version):(l.epoch > r.epoch);
+}
+inline bool operator>=(const eversion_t& l, const eversion_t& r) {
+ return (l.epoch == r.epoch) ? (l.version >= r.version):(l.epoch >= r.epoch);
+}
+inline std::ostream& operator<<(std::ostream& out, const eversion_t& e) {
+ return out << e.epoch << "'" << e.version;
+}
+
+/**
+ * objectstore_perf_stat_t
+ *
+ * current perf information about the osd
+ */
+struct objectstore_perf_stat_t {
+ // cur_op_latency is in ns since double add/sub are not associative
+ uint64_t os_commit_latency_ns;
+ uint64_t os_apply_latency_ns;
+
+ objectstore_perf_stat_t() :
+ os_commit_latency_ns(0), os_apply_latency_ns(0) {}
+
+ bool operator==(const objectstore_perf_stat_t &r) const {
+ return os_commit_latency_ns == r.os_commit_latency_ns &&
+ os_apply_latency_ns == r.os_apply_latency_ns;
+ }
+
+ void add(const objectstore_perf_stat_t &o) {
+ os_commit_latency_ns += o.os_commit_latency_ns;
+ os_apply_latency_ns += o.os_apply_latency_ns;
+ }
+ void sub(const objectstore_perf_stat_t &o) {
+ os_commit_latency_ns -= o.os_commit_latency_ns;
+ os_apply_latency_ns -= o.os_apply_latency_ns;
+ }
+ void dump(ceph::Formatter *f) const;
+ void encode(ceph::buffer::list &bl, uint64_t features) const;
+ void decode(ceph::buffer::list::const_iterator &bl);
+ static void generate_test_instances(std::list<objectstore_perf_stat_t*>& o);
+};
+WRITE_CLASS_ENCODER_FEATURES(objectstore_perf_stat_t)
+
+/*
+ * pg states
+ */
+#define PG_STATE_CREATING (1ULL << 0) // creating
+#define PG_STATE_ACTIVE (1ULL << 1) // i am active. (primary: replicas too)
+#define PG_STATE_CLEAN (1ULL << 2) // peers are complete, clean of stray replicas.
+#define PG_STATE_DOWN (1ULL << 4) // a needed replica is down, PG offline
+#define PG_STATE_RECOVERY_UNFOUND (1ULL << 5) // recovery stopped due to unfound
+#define PG_STATE_BACKFILL_UNFOUND (1ULL << 6) // backfill stopped due to unfound
+#define PG_STATE_PREMERGE (1ULL << 7) // i am preparing to merge
+#define PG_STATE_SCRUBBING (1ULL << 8) // scrubbing
+//#define PG_STATE_SCRUBQ (1ULL << 9) // queued for scrub
+#define PG_STATE_DEGRADED (1ULL << 10) // pg contains objects with reduced redundancy
+#define PG_STATE_INCONSISTENT (1ULL << 11) // pg replicas are inconsistent (but shouldn't be)
+#define PG_STATE_PEERING (1ULL << 12) // pg is (re)peering
+#define PG_STATE_REPAIR (1ULL << 13) // pg should repair on next scrub
+#define PG_STATE_RECOVERING (1ULL << 14) // pg is recovering/migrating objects
+#define PG_STATE_BACKFILL_WAIT (1ULL << 15) // [active] reserving backfill
+#define PG_STATE_INCOMPLETE (1ULL << 16) // incomplete content, peering failed.
+#define PG_STATE_STALE (1ULL << 17) // our state for this pg is stale, unknown.
+#define PG_STATE_REMAPPED (1ULL << 18) // pg is explicitly remapped to different OSDs than CRUSH
+#define PG_STATE_DEEP_SCRUB (1ULL << 19) // deep scrub: check CRC32 on files
+#define PG_STATE_BACKFILLING (1ULL << 20) // [active] backfilling pg content
+#define PG_STATE_BACKFILL_TOOFULL (1ULL << 21) // backfill can't proceed: too full
+#define PG_STATE_RECOVERY_WAIT (1ULL << 22) // waiting for recovery reservations
+#define PG_STATE_UNDERSIZED (1ULL << 23) // pg acting < pool size
+#define PG_STATE_ACTIVATING (1ULL << 24) // pg is peered but not yet active
+#define PG_STATE_PEERED (1ULL << 25) // peered, cannot go active, can recover
+#define PG_STATE_SNAPTRIM (1ULL << 26) // trimming snaps
+#define PG_STATE_SNAPTRIM_WAIT (1ULL << 27) // queued to trim snaps
+#define PG_STATE_RECOVERY_TOOFULL (1ULL << 28) // recovery can't proceed: too full
+#define PG_STATE_SNAPTRIM_ERROR (1ULL << 29) // error stopped trimming snaps
+#define PG_STATE_FORCED_RECOVERY (1ULL << 30) // force recovery of this pg before any other
+#define PG_STATE_FORCED_BACKFILL (1ULL << 31) // force backfill of this pg before any other
+#define PG_STATE_FAILED_REPAIR (1ULL << 32) // A repair failed to fix all errors
+#define PG_STATE_LAGGY (1ULL << 33) // PG is laggy/unreadable due to slow/delayed pings
+#define PG_STATE_WAIT (1ULL << 34) // PG is waiting for prior intervals' readable period to expire
+
+std::string pg_state_string(uint64_t state);
+std::string pg_vector_string(const std::vector<int32_t> &a);
+std::optional<uint64_t> pg_string_state(const std::string& state);
+
+
+/*
+ * pool_snap_info_t
+ *
+ * attributes for a single pool snapshot.
+ */
+struct pool_snap_info_t {
+ snapid_t snapid;
+ utime_t stamp;
+ std::string name;
+
+ void dump(ceph::Formatter *f) const;
+ void encode(ceph::buffer::list& bl, uint64_t features) const;
+ void decode(ceph::buffer::list::const_iterator& bl);
+ static void generate_test_instances(std::list<pool_snap_info_t*>& o);
+};
+WRITE_CLASS_ENCODER_FEATURES(pool_snap_info_t)
+
+inline std::ostream& operator<<(std::ostream& out, const pool_snap_info_t& si) {
+ return out << si.snapid << '(' << si.name << ' ' << si.stamp << ')';
+}
+
+
+/*
+ * pool_opts_t
+ *
+ * pool options.
+ */
+
+// The order of items in the list is important, therefore,
+// you should always add to the end of the list when adding new options.
+
+class pool_opts_t {
+public:
+ enum key_t {
+ SCRUB_MIN_INTERVAL,
+ SCRUB_MAX_INTERVAL,
+ DEEP_SCRUB_INTERVAL,
+ RECOVERY_PRIORITY,
+ RECOVERY_OP_PRIORITY,
+ SCRUB_PRIORITY,
+ COMPRESSION_MODE,
+ COMPRESSION_ALGORITHM,
+ COMPRESSION_REQUIRED_RATIO,
+ COMPRESSION_MAX_BLOB_SIZE,
+ COMPRESSION_MIN_BLOB_SIZE,
+ CSUM_TYPE,
+ CSUM_MAX_BLOCK,
+ CSUM_MIN_BLOCK,
+ FINGERPRINT_ALGORITHM,
+ PG_NUM_MIN, // min pg_num
+ TARGET_SIZE_BYTES, // total bytes in pool
+ TARGET_SIZE_RATIO, // fraction of total cluster
+ PG_AUTOSCALE_BIAS,
+ READ_LEASE_INTERVAL,
+ DEDUP_TIER,
+ DEDUP_CHUNK_ALGORITHM,
+ DEDUP_CDC_CHUNK_SIZE,
+ PG_NUM_MAX, // max pg_num
+ };
+
+ enum type_t {
+ STR,
+ INT,
+ DOUBLE,
+ };
+
+ struct opt_desc_t {
+ key_t key;
+ type_t type;
+
+ opt_desc_t(key_t k, type_t t) : key(k), type(t) {}
+
+ bool operator==(const opt_desc_t& rhs) const {
+ return key == rhs.key && type == rhs.type;
+ }
+ };
+
+ typedef boost::variant<std::string,int64_t,double> value_t;
+
+ static bool is_opt_name(const std::string& name);
+ static opt_desc_t get_opt_desc(const std::string& name);
+
+ pool_opts_t() : opts() {}
+
+ bool is_set(key_t key) const;
+
+ template<typename T>
+ void set(key_t key, const T &val) {
+ value_t value = val;
+ opts[key] = value;
+ }
+
+ template<typename T>
+ bool get(key_t key, T *val) const {
+ opts_t::const_iterator i = opts.find(key);
+ if (i == opts.end()) {
+ return false;
+ }
+ *val = boost::get<T>(i->second);
+ return true;
+ }
+
+ template<typename T>
+ T value_or(key_t key, T&& default_value) const {
+ auto i = opts.find(key);
+ if (i == opts.end()) {
+ return std::forward<T>(default_value);
+ }
+ return boost::get<T>(i->second);
+ }
+
+ const value_t& get(key_t key) const;
+
+ bool unset(key_t key);
+
+ void dump(const std::string& name, ceph::Formatter *f) const;
+
+ void dump(ceph::Formatter *f) const;
+ void encode(ceph::buffer::list &bl, uint64_t features) const;
+ void decode(ceph::buffer::list::const_iterator &bl);
+
+private:
+ typedef std::map<key_t, value_t> opts_t;
+ opts_t opts;
+
+ friend std::ostream& operator<<(std::ostream& out, const pool_opts_t& opts);
+};
+WRITE_CLASS_ENCODER_FEATURES(pool_opts_t)
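+// A minimal usage sketch for the option accessors declared above (the key and
+// value here are only examples):
+//   pool_opts_t opts;
+//   opts.set(pool_opts_t::PG_NUM_MIN, static_cast<int64_t>(16));
+//   int64_t v = 0;
+//   if (opts.get(pool_opts_t::PG_NUM_MIN, &v)) {
+//     // v == 16; value_or() offers the same lookup with a fallback
+//   }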
+
+struct pg_merge_meta_t {
+ pg_t source_pgid;
+ epoch_t ready_epoch = 0;
+ epoch_t last_epoch_started = 0;
+ epoch_t last_epoch_clean = 0;
+ eversion_t source_version;
+ eversion_t target_version;
+
+ void encode(ceph::buffer::list& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(source_pgid, bl);
+ encode(ready_epoch, bl);
+ encode(last_epoch_started, bl);
+ encode(last_epoch_clean, bl);
+ encode(source_version, bl);
+ encode(target_version, bl);
+ ENCODE_FINISH(bl);
+ }
+ void decode(ceph::buffer::list::const_iterator& p) {
+ DECODE_START(1, p);
+ decode(source_pgid, p);
+ decode(ready_epoch, p);
+ decode(last_epoch_started, p);
+ decode(last_epoch_clean, p);
+ decode(source_version, p);
+ decode(target_version, p);
+ DECODE_FINISH(p);
+ }
+ void dump(ceph::Formatter *f) const {
+ f->dump_stream("source_pgid") << source_pgid;
+ f->dump_unsigned("ready_epoch", ready_epoch);
+ f->dump_unsigned("last_epoch_started", last_epoch_started);
+ f->dump_unsigned("last_epoch_clean", last_epoch_clean);
+ f->dump_stream("source_version") << source_version;
+ f->dump_stream("target_version") << target_version;
+ }
+};
+WRITE_CLASS_ENCODER(pg_merge_meta_t)
+
+class OSDMap;
+
+/*
+ * pg_pool
+ */
+struct pg_pool_t {
+ static const char *APPLICATION_NAME_CEPHFS;
+ static const char *APPLICATION_NAME_RBD;
+ static const char *APPLICATION_NAME_RGW;
+
+ enum {
+ TYPE_REPLICATED = 1, // replication
+ //TYPE_RAID4 = 2, // raid4 (never implemented)
+ TYPE_ERASURE = 3, // erasure-coded
+ };
+ static constexpr uint32_t pg_CRUSH_ITEM_NONE = 0x7fffffff; /* can't import crush.h here */
+ static std::string_view get_type_name(int t) {
+ switch (t) {
+ case TYPE_REPLICATED: return "replicated";
+ //case TYPE_RAID4: return "raid4";
+ case TYPE_ERASURE: return "erasure";
+ default: return "???";
+ }
+ }
+ std::string_view get_type_name() const {
+ return get_type_name(type);
+ }
+
+ enum {
+ FLAG_HASHPSPOOL = 1<<0, // hash pg seed and pool together (instead of adding)
+ FLAG_FULL = 1<<1, // pool is full
+ FLAG_EC_OVERWRITES = 1<<2, // enables overwrites, once enabled, cannot be disabled
+ FLAG_INCOMPLETE_CLONES = 1<<3, // may have incomplete clones (bc we are/were an overlay)
+ FLAG_NODELETE = 1<<4, // pool can't be deleted
+ FLAG_NOPGCHANGE = 1<<5, // pool's pg and pgp num can't be changed
+ FLAG_NOSIZECHANGE = 1<<6, // pool's size and min size can't be changed
+ FLAG_WRITE_FADVISE_DONTNEED = 1<<7, // write mode with LIBRADOS_OP_FLAG_FADVISE_DONTNEED
+ FLAG_NOSCRUB = 1<<8, // block periodic scrub
+ FLAG_NODEEP_SCRUB = 1<<9, // block periodic deep-scrub
+ FLAG_FULL_QUOTA = 1<<10, // pool is currently running out of quota, will set FLAG_FULL too
+ FLAG_NEARFULL = 1<<11, // pool is nearfull
+ FLAG_BACKFILLFULL = 1<<12, // pool is backfillfull
+ FLAG_SELFMANAGED_SNAPS = 1<<13, // pool uses selfmanaged snaps
+ FLAG_POOL_SNAPS = 1<<14, // pool has pool snaps
+ FLAG_CREATING = 1<<15, // initial pool PGs are being created
+ FLAG_BULK = 1<<17, //pool is large
+ };
+
+ static const char *get_flag_name(int f) {
+ switch (f) {
+ case FLAG_HASHPSPOOL: return "hashpspool";
+ case FLAG_FULL: return "full";
+ case FLAG_EC_OVERWRITES: return "ec_overwrites";
+ case FLAG_INCOMPLETE_CLONES: return "incomplete_clones";
+ case FLAG_NODELETE: return "nodelete";
+ case FLAG_NOPGCHANGE: return "nopgchange";
+ case FLAG_NOSIZECHANGE: return "nosizechange";
+ case FLAG_WRITE_FADVISE_DONTNEED: return "write_fadvise_dontneed";
+ case FLAG_NOSCRUB: return "noscrub";
+ case FLAG_NODEEP_SCRUB: return "nodeep-scrub";
+ case FLAG_FULL_QUOTA: return "full_quota";
+ case FLAG_NEARFULL: return "nearfull";
+ case FLAG_BACKFILLFULL: return "backfillfull";
+ case FLAG_SELFMANAGED_SNAPS: return "selfmanaged_snaps";
+ case FLAG_POOL_SNAPS: return "pool_snaps";
+ case FLAG_CREATING: return "creating";
+ case FLAG_BULK: return "bulk";
+ default: return "???";
+ }
+ }
+ static std::string get_flags_string(uint64_t f) {
+ std::string s;
+ for (unsigned n=0; f && n<64; ++n) {
+ if (f & (1ull << n)) {
+ if (s.length())
+ s += ",";
+ s += get_flag_name(1ull << n);
+ }
+ }
+ return s;
+ }
+ std::string get_flags_string() const {
+ return get_flags_string(flags);
+ }
+ static uint64_t get_flag_by_name(const std::string& name) {
+ if (name == "hashpspool")
+ return FLAG_HASHPSPOOL;
+ if (name == "full")
+ return FLAG_FULL;
+ if (name == "ec_overwrites")
+ return FLAG_EC_OVERWRITES;
+ if (name == "incomplete_clones")
+ return FLAG_INCOMPLETE_CLONES;
+ if (name == "nodelete")
+ return FLAG_NODELETE;
+ if (name == "nopgchange")
+ return FLAG_NOPGCHANGE;
+ if (name == "nosizechange")
+ return FLAG_NOSIZECHANGE;
+ if (name == "write_fadvise_dontneed")
+ return FLAG_WRITE_FADVISE_DONTNEED;
+ if (name == "noscrub")
+ return FLAG_NOSCRUB;
+ if (name == "nodeep-scrub")
+ return FLAG_NODEEP_SCRUB;
+ if (name == "full_quota")
+ return FLAG_FULL_QUOTA;
+ if (name == "nearfull")
+ return FLAG_NEARFULL;
+ if (name == "backfillfull")
+ return FLAG_BACKFILLFULL;
+ if (name == "selfmanaged_snaps")
+ return FLAG_SELFMANAGED_SNAPS;
+ if (name == "pool_snaps")
+ return FLAG_POOL_SNAPS;
+ if (name == "creating")
+ return FLAG_CREATING;
+ if (name == "bulk")
+ return FLAG_BULK;
+ return 0;
+ }
+
+ /// converts the acting/up vector to a set of pg shards
+ void convert_to_pg_shards(const std::vector<int> &from, std::set<pg_shard_t>* to) const;
+
+ typedef enum {
+ CACHEMODE_NONE = 0, ///< no caching
+ CACHEMODE_WRITEBACK = 1, ///< write to cache, flush later
+ CACHEMODE_FORWARD = 2, ///< forward if not in cache
+ CACHEMODE_READONLY = 3, ///< handle reads, forward writes [not strongly consistent]
+ CACHEMODE_READFORWARD = 4, ///< forward reads, write to cache flush later
+ CACHEMODE_READPROXY = 5, ///< proxy reads, write to cache flush later
+ CACHEMODE_PROXY = 6, ///< proxy if not in cache
+ } cache_mode_t;
+ static const char *get_cache_mode_name(cache_mode_t m) {
+ switch (m) {
+ case CACHEMODE_NONE: return "none";
+ case CACHEMODE_WRITEBACK: return "writeback";
+ case CACHEMODE_FORWARD: return "forward";
+ case CACHEMODE_READONLY: return "readonly";
+ case CACHEMODE_READFORWARD: return "readforward";
+ case CACHEMODE_READPROXY: return "readproxy";
+ case CACHEMODE_PROXY: return "proxy";
+ default: return "unknown";
+ }
+ }
+ static cache_mode_t get_cache_mode_from_str(const std::string& s) {
+ if (s == "none")
+ return CACHEMODE_NONE;
+ if (s == "writeback")
+ return CACHEMODE_WRITEBACK;
+ if (s == "forward")
+ return CACHEMODE_FORWARD;
+ if (s == "readonly")
+ return CACHEMODE_READONLY;
+ if (s == "readforward")
+ return CACHEMODE_READFORWARD;
+ if (s == "readproxy")
+ return CACHEMODE_READPROXY;
+ if (s == "proxy")
+ return CACHEMODE_PROXY;
+ return (cache_mode_t)-1;
+ }
+ const char *get_cache_mode_name() const {
+ return get_cache_mode_name(cache_mode);
+ }
+ bool cache_mode_requires_hit_set() const {
+ switch (cache_mode) {
+ case CACHEMODE_NONE:
+ case CACHEMODE_FORWARD:
+ case CACHEMODE_READONLY:
+ case CACHEMODE_PROXY:
+ return false;
+ case CACHEMODE_WRITEBACK:
+ case CACHEMODE_READFORWARD:
+ case CACHEMODE_READPROXY:
+ return true;
+ default:
+ ceph_abort_msg("implement me");
+ }
+ }
+
+ enum class pg_autoscale_mode_t : uint8_t {
+ OFF = 0,
+ WARN = 1,
+ ON = 2,
+ UNKNOWN = UINT8_MAX,
+ };
+ static const char *get_pg_autoscale_mode_name(pg_autoscale_mode_t m) {
+ switch (m) {
+ case pg_autoscale_mode_t::OFF: return "off";
+ case pg_autoscale_mode_t::ON: return "on";
+ case pg_autoscale_mode_t::WARN: return "warn";
+ default: return "???";
+ }
+ }
+ static pg_autoscale_mode_t get_pg_autoscale_mode_by_name(const std::string& m) {
+ if (m == "off") {
+ return pg_autoscale_mode_t::OFF;
+ }
+ if (m == "warn") {
+ return pg_autoscale_mode_t::WARN;
+ }
+ if (m == "on") {
+ return pg_autoscale_mode_t::ON;
+ }
+ return pg_autoscale_mode_t::UNKNOWN;
+ }
+
+ utime_t create_time;
+ uint64_t flags = 0; ///< FLAG_*
+ __u8 type = 0; ///< TYPE_*
+ __u8 size = 0, min_size = 0; ///< number of osds in each pg
+ __u8 crush_rule = 0; ///< crush placement rule
+ __u8 object_hash = 0; ///< hash mapping object name to ps
+ pg_autoscale_mode_t pg_autoscale_mode = pg_autoscale_mode_t::UNKNOWN;
+
+private:
+ __u32 pg_num = 0, pgp_num = 0; ///< number of pgs
+ __u32 pg_num_pending = 0; ///< pg_num we are about to merge down to
+ __u32 pg_num_target = 0; ///< pg_num we should converge toward
+ __u32 pgp_num_target = 0; ///< pgp_num we should converge toward
+
+public:
+ std::map<std::string, std::string> properties; ///< OBSOLETE
+ std::string erasure_code_profile; ///< name of the erasure code profile in OSDMap
+ epoch_t last_change = 0; ///< most recent epoch changed, excluding snapshot changes
+ // If non-zero, require OSDs in at least this many different instances...
+ uint32_t peering_crush_bucket_count = 0;
+ // of this bucket type...
+ uint32_t peering_crush_bucket_barrier = 0;
+ // including this one
+ int32_t peering_crush_mandatory_member = pg_CRUSH_ITEM_NONE;
+ // The per-bucket replica count is calculated with this "target"
+ // instead of the above crush_bucket_count. This means we can maintain a
+ // target size of 4 without attempting to place them all in 1 DC
+ uint32_t peering_crush_bucket_target = 0;
+ /// last epoch that forced clients to resend
+ epoch_t last_force_op_resend = 0;
+ /// last epoch that forced clients to resend (pre-nautilus clients only)
+ epoch_t last_force_op_resend_prenautilus = 0;
+ /// last epoch that forced clients to resend (pre-luminous clients only)
+ epoch_t last_force_op_resend_preluminous = 0;
+
+ /// metadata for the most recent PG merge
+ pg_merge_meta_t last_pg_merge_meta;
+
+ snapid_t snap_seq = 0; ///< seq for per-pool snapshot
+ epoch_t snap_epoch = 0; ///< osdmap epoch of last snap
+ uint64_t auid = 0; ///< who owns the pg
+
+ uint64_t quota_max_bytes = 0; ///< maximum number of bytes for this pool
+ uint64_t quota_max_objects = 0; ///< maximum number of objects for this pool
+
+ /*
+ * Pool snaps (global to this pool). These define a SnapContext for
+ * the pool, unless the client manually specifies an alternate
+ * context.
+ */
+ std::map<snapid_t, pool_snap_info_t> snaps;
+ /*
+ * Alternatively, if we are defining non-pool snaps (e.g. via the
+ * Ceph MDS), we must track @removed_snaps (since @snaps is not
+ * used). Snaps and removed_snaps are to be used exclusive of each
+ * other!
+ */
+ interval_set<snapid_t> removed_snaps;
+
+ unsigned pg_num_mask = 0, pgp_num_mask = 0;
+
+ std::set<uint64_t> tiers; ///< pools that are tiers of us
+ int64_t tier_of = -1; ///< pool for which we are a tier
+ // Note that write wins for read+write ops
+ int64_t read_tier = -1; ///< pool/tier for objecter to direct reads to
+ int64_t write_tier = -1; ///< pool/tier for objecter to direct writes to
+ cache_mode_t cache_mode = CACHEMODE_NONE; ///< cache pool mode
+
+ bool is_tier() const { return tier_of >= 0; }
+ bool has_tiers() const { return !tiers.empty(); }
+ void clear_tier() {
+ tier_of = -1;
+ clear_read_tier();
+ clear_write_tier();
+ clear_tier_tunables();
+ }
+ bool has_read_tier() const { return read_tier >= 0; }
+ void clear_read_tier() { read_tier = -1; }
+ bool has_write_tier() const { return write_tier >= 0; }
+ void clear_write_tier() { write_tier = -1; }
+ void clear_tier_tunables() {
+ if (cache_mode != CACHEMODE_NONE)
+ flags |= FLAG_INCOMPLETE_CLONES;
+ cache_mode = CACHEMODE_NONE;
+
+ target_max_bytes = 0;
+ target_max_objects = 0;
+ cache_target_dirty_ratio_micro = 0;
+ cache_target_dirty_high_ratio_micro = 0;
+ cache_target_full_ratio_micro = 0;
+ hit_set_params = HitSet::Params();
+ hit_set_period = 0;
+ hit_set_count = 0;
+ hit_set_grade_decay_rate = 0;
+ hit_set_search_last_n = 0;
+ grade_table.resize(0);
+ }
+
+ bool is_stretch_pool() const {
+ return peering_crush_bucket_count != 0;
+ }
+
+ bool stretch_set_can_peer(const set<int>& want, const OSDMap& osdmap,
+ std::ostream *out) const;
+ bool stretch_set_can_peer(const vector<int>& want, const OSDMap& osdmap,
+ std::ostream *out) const {
+ if (!is_stretch_pool()) return true;
+ set<int> swant;
+ for (auto i : want) swant.insert(i);
+ return stretch_set_can_peer(swant, osdmap, out);
+ }
+
+ uint64_t target_max_bytes = 0; ///< tiering: target max pool size
+ uint64_t target_max_objects = 0; ///< tiering: target max pool size
+
+ uint32_t cache_target_dirty_ratio_micro = 0; ///< cache: fraction of target to leave dirty
+ uint32_t cache_target_dirty_high_ratio_micro = 0; ///< cache: fraction of target to flush with high speed
+ uint32_t cache_target_full_ratio_micro = 0; ///< cache: fraction of target to fill before we evict in earnest
+
+ uint32_t cache_min_flush_age = 0; ///< minimum age (seconds) before we can flush
+ uint32_t cache_min_evict_age = 0; ///< minimum age (seconds) before we can evict
+
+ HitSet::Params hit_set_params; ///< The HitSet params to use on this pool
+ uint32_t hit_set_period = 0; ///< periodicity of HitSet segments (seconds)
+ uint32_t hit_set_count = 0; ///< number of periods to retain
+ bool use_gmt_hitset = true; ///< use gmt to name the hitset archive object
+ uint32_t min_read_recency_for_promote = 0; ///< minimum number of HitSet to check before promote on read
+ uint32_t min_write_recency_for_promote = 0; ///< minimum number of HitSet to check before promote on write
+ uint32_t hit_set_grade_decay_rate = 0; ///< current hit_set has the highest weight for object
+ ///< temperature; each older hit_set's weight decays
+ ///< by this percentage relative to the next newer one
+ uint32_t hit_set_search_last_n = 0; ///< accumulate at most N hit_sets for temperature
+
+ uint32_t stripe_width = 0; ///< erasure coded stripe size in bytes
+
+ uint64_t expected_num_objects = 0; ///< expected number of objects on this pool; a value of 0 indicates
+ ///< that the user did not specify an expected value
+ bool fast_read = false; ///< whether fast read is enabled on the pool
+
+ pool_opts_t opts; ///< options
+
+ typedef enum {
+ TYPE_FINGERPRINT_NONE = 0,
+ TYPE_FINGERPRINT_SHA1 = 1,
+ TYPE_FINGERPRINT_SHA256 = 2,
+ TYPE_FINGERPRINT_SHA512 = 3,
+ } fingerprint_t;
+ static fingerprint_t get_fingerprint_from_str(const std::string& s) {
+ if (s == "none")
+ return TYPE_FINGERPRINT_NONE;
+ if (s == "sha1")
+ return TYPE_FINGERPRINT_SHA1;
+ if (s == "sha256")
+ return TYPE_FINGERPRINT_SHA256;
+ if (s == "sha512")
+ return TYPE_FINGERPRINT_SHA512;
+ return (fingerprint_t)-1;
+ }
+ const fingerprint_t get_fingerprint_type() const {
+ std::string fp_str;
+ opts.get(pool_opts_t::FINGERPRINT_ALGORITHM, &fp_str);
+ return get_fingerprint_from_str(fp_str);
+ }
+ const char *get_fingerprint_name() const {
+ std::string fp_str;
+ fingerprint_t fp_t;
+ opts.get(pool_opts_t::FINGERPRINT_ALGORITHM, &fp_str);
+ fp_t = get_fingerprint_from_str(fp_str);
+ return get_fingerprint_name(fp_t);
+ }
+ static const char *get_fingerprint_name(fingerprint_t m) {
+ switch (m) {
+ case TYPE_FINGERPRINT_NONE: return "none";
+ case TYPE_FINGERPRINT_SHA1: return "sha1";
+ case TYPE_FINGERPRINT_SHA256: return "sha256";
+ case TYPE_FINGERPRINT_SHA512: return "sha512";
+ default: return "unknown";
+ }
+ }
+
+ typedef enum {
+ TYPE_DEDUP_CHUNK_NONE = 0,
+ TYPE_DEDUP_CHUNK_FASTCDC = 1,
+ TYPE_DEDUP_CHUNK_FIXEDCDC = 2,
+ } dedup_chunk_algo_t;
+ static dedup_chunk_algo_t get_dedup_chunk_algorithm_from_str(const std::string& s) {
+ if (s == "none")
+ return TYPE_DEDUP_CHUNK_NONE;
+ if (s == "fastcdc")
+ return TYPE_DEDUP_CHUNK_FASTCDC;
+ if (s == "fixed")
+ return TYPE_DEDUP_CHUNK_FIXEDCDC;
+ return (dedup_chunk_algo_t)-1;
+ }
+ const dedup_chunk_algo_t get_dedup_chunk_algorithm_type() const {
+ std::string algo_str;
+ opts.get(pool_opts_t::DEDUP_CHUNK_ALGORITHM, &algo_str);
+ return get_dedup_chunk_algorithm_from_str(algo_str);
+ }
+ const char *get_dedup_chunk_algorithm_name() const {
+ std::string dedup_chunk_algo_str;
+ dedup_chunk_algo_t dedup_chunk_algo_t;
+ opts.get(pool_opts_t::DEDUP_CHUNK_ALGORITHM, &dedup_chunk_algo_str);
+ dedup_chunk_algo_t = get_dedup_chunk_algorithm_from_str(dedup_chunk_algo_str);
+ return get_dedup_chunk_algorithm_name(dedup_chunk_algo_t);
+ }
+ static const char *get_dedup_chunk_algorithm_name(dedup_chunk_algo_t m) {
+ switch (m) {
+ case TYPE_DEDUP_CHUNK_NONE: return "none";
+ case TYPE_DEDUP_CHUNK_FASTCDC: return "fastcdc";
+ case TYPE_DEDUP_CHUNK_FIXEDCDC: return "fixed";
+ default: return "unknown";
+ }
+ }
+
+ int64_t get_dedup_tier() const {
+ int64_t tier_id = 0;
+ opts.get(pool_opts_t::DEDUP_TIER, &tier_id);
+ return tier_id;
+ }
+ int64_t get_dedup_cdc_chunk_size() const {
+ int64_t chunk_size = 0;
+ opts.get(pool_opts_t::DEDUP_CDC_CHUNK_SIZE, &chunk_size);
+ return chunk_size;
+ }
+
+ /// application -> key/value metadata
+ std::map<std::string, std::map<std::string, std::string>> application_metadata;
+
+private:
+ std::vector<uint32_t> grade_table;
+
+public:
+ uint32_t get_grade(unsigned i) const {
+ if (grade_table.size() <= i)
+ return 0;
+ return grade_table[i];
+ }
+ void calc_grade_table() {
+ unsigned v = 1000000;
+ grade_table.resize(hit_set_count);
+ for (unsigned i = 0; i < hit_set_count; i++) {
+ v = v * (1 - (hit_set_grade_decay_rate / 100.0));
+ grade_table[i] = v;
+ }
+ }
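+ // Illustrative: with hit_set_grade_decay_rate = 20 and hit_set_count = 3 the
+ // loop above yields grade_table = {800000, 640000, 512000}.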
+
+ pg_pool_t() = default;
+
+ void dump(ceph::Formatter *f) const;
+
+ const utime_t &get_create_time() const { return create_time; }
+ uint64_t get_flags() const { return flags; }
+ bool has_flag(uint64_t f) const { return flags & f; }
+ void set_flag(uint64_t f) { flags |= f; }
+ void unset_flag(uint64_t f) { flags &= ~f; }
+
+ bool require_rollback() const {
+ return is_erasure();
+ }
+
+ /// true if incomplete clones may be present
+ bool allow_incomplete_clones() const {
+ return cache_mode != CACHEMODE_NONE || has_flag(FLAG_INCOMPLETE_CLONES);
+ }
+
+ unsigned get_type() const { return type; }
+ unsigned get_size() const { return size; }
+ unsigned get_min_size() const { return min_size; }
+ int get_crush_rule() const { return crush_rule; }
+ int get_object_hash() const { return object_hash; }
+ const char *get_object_hash_name() const {
+ return ceph_str_hash_name(get_object_hash());
+ }
+ epoch_t get_last_change() const { return last_change; }
+ epoch_t get_last_force_op_resend() const { return last_force_op_resend; }
+ epoch_t get_last_force_op_resend_prenautilus() const {
+ return last_force_op_resend_prenautilus;
+ }
+ epoch_t get_last_force_op_resend_preluminous() const {
+ return last_force_op_resend_preluminous;
+ }
+ epoch_t get_snap_epoch() const { return snap_epoch; }
+ snapid_t get_snap_seq() const { return snap_seq; }
+ uint64_t get_auid() const { return auid; }
+
+ void set_snap_seq(snapid_t s) { snap_seq = s; }
+ void set_snap_epoch(epoch_t e) { snap_epoch = e; }
+
+ void set_stripe_width(uint32_t s) { stripe_width = s; }
+ uint32_t get_stripe_width() const { return stripe_width; }
+
+ bool is_replicated() const { return get_type() == TYPE_REPLICATED; }
+ bool is_erasure() const { return get_type() == TYPE_ERASURE; }
+
+ bool supports_omap() const {
+ return !(get_type() == TYPE_ERASURE);
+ }
+
+ bool requires_aligned_append() const {
+ return is_erasure() && !has_flag(FLAG_EC_OVERWRITES);
+ }
+ uint64_t required_alignment() const { return stripe_width; }
+
+ bool allows_ecoverwrites() const {
+ return has_flag(FLAG_EC_OVERWRITES);
+ }
+
+ bool can_shift_osds() const {
+ switch (get_type()) {
+ case TYPE_REPLICATED:
+ return true;
+ case TYPE_ERASURE:
+ return false;
+ default:
+ ceph_abort_msg("unhandled pool type");
+ }
+ }
+
+ unsigned get_pg_num() const { return pg_num; }
+ unsigned get_pgp_num() const { return pgp_num; }
+ unsigned get_pg_num_target() const { return pg_num_target; }
+ unsigned get_pgp_num_target() const { return pgp_num_target; }
+ unsigned get_pg_num_pending() const { return pg_num_pending; }
+
+ unsigned get_pg_num_mask() const { return pg_num_mask; }
+ unsigned get_pgp_num_mask() const { return pgp_num_mask; }
+
+ // if pg_num is not a power of two, pgs are not equally sized.
+ // return, for a given pg, the denominator of the fraction of the
+ // total pool size that it represents.
+ unsigned get_pg_num_divisor(pg_t pgid) const;
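+ // Illustrative: with pg_num = 12 (not a power of two) some pgs cover 1/16 of
+ // the hash space and the rest 1/8, so the returned divisor is presumably 16
+ // or 8 for the respective pgs.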
+
+ bool is_pending_merge(pg_t pgid, bool *target) const;
+
+ void set_pg_num(int p) {
+ pg_num = p;
+ pg_num_pending = p;
+ calc_pg_masks();
+ }
+ void set_pgp_num(int p) {
+ pgp_num = p;
+ calc_pg_masks();
+ }
+ void set_pg_num_pending(int p) {
+ pg_num_pending = p;
+ calc_pg_masks();
+ }
+ void set_pg_num_target(int p) {
+ pg_num_target = p;
+ }
+ void set_pgp_num_target(int p) {
+ pgp_num_target = p;
+ }
+ void dec_pg_num(pg_t source_pgid,
+ epoch_t ready_epoch,
+ eversion_t source_version,
+ eversion_t target_version,
+ epoch_t last_epoch_started,
+ epoch_t last_epoch_clean) {
+ --pg_num;
+ last_pg_merge_meta.source_pgid = source_pgid;
+ last_pg_merge_meta.ready_epoch = ready_epoch;
+ last_pg_merge_meta.source_version = source_version;
+ last_pg_merge_meta.target_version = target_version;
+ last_pg_merge_meta.last_epoch_started = last_epoch_started;
+ last_pg_merge_meta.last_epoch_clean = last_epoch_clean;
+ calc_pg_masks();
+ }
+
+ void set_quota_max_bytes(uint64_t m) {
+ quota_max_bytes = m;
+ }
+ uint64_t get_quota_max_bytes() {
+ return quota_max_bytes;
+ }
+
+ void set_quota_max_objects(uint64_t m) {
+ quota_max_objects = m;
+ }
+ uint64_t get_quota_max_objects() {
+ return quota_max_objects;
+ }
+
+ void set_last_force_op_resend(uint64_t t) {
+ last_force_op_resend = t;
+ last_force_op_resend_prenautilus = t;
+ last_force_op_resend_preluminous = t;
+ }
+
+ void calc_pg_masks();
+
+ /*
+ * we have two snap modes:
+ * - pool global snaps
+ * - snap existence/non-existence defined by snaps[] and snap_seq
+ * - user managed snaps
+ * - removal governed by removed_snaps
+ *
+ * we know which mode we're using based on whether removed_snaps is empty.
+ * If nothing has been created, both functions report false.
+ */
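+ // Illustrative: pool snapshots made with "ceph osd pool mksnap" use the first
+ // mode, while self-managed snapshots (e.g. RBD/CephFS snaps) use the second.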
+ bool is_pool_snaps_mode() const;
+ bool is_unmanaged_snaps_mode() const;
+ bool is_removed_snap(snapid_t s) const;
+
+ snapid_t snap_exists(std::string_view s) const;
+ void add_snap(const char *n, utime_t stamp);
+ uint64_t add_unmanaged_snap(bool preoctopus_compat);
+ void remove_snap(snapid_t s);
+ void remove_unmanaged_snap(snapid_t s, bool preoctopus_compat);
+
+ SnapContext get_snap_context() const;
+
+ /// hash an object name+namespace key to a hash position
+ uint32_t hash_key(const std::string& key, const std::string& ns) const;
+
+ /// round a hash position down to a pg num
+ uint32_t raw_hash_to_pg(uint32_t v) const;
+
+ /*
+ * map a raw pg (with full precision ps) into an actual pg, for storage
+ */
+ pg_t raw_pg_to_pg(pg_t pg) const;
+
+ /*
+ * map raw pg (full precision ps) into a placement seed. include
+ * pool id in that value so that different pools don't use the same
+ * seeds.
+ */
+ ps_t raw_pg_to_pps(pg_t pg) const;
+
+ /// choose a random hash position within a pg
+ uint32_t get_random_pg_position(pg_t pgid, uint32_t seed) const;
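+
+ // Sketch of the object -> PG mapping pipeline these helpers implement
+ // (illustrative only; "p" is a pg_pool_t, "poolid" and the object name are
+ // made up):
+ //   uint32_t h = p.hash_key("rbd_data.abc.0001", "" /* namespace */);
+ //   pg_t raw(p.raw_hash_to_pg(h), poolid);   // full-precision ps
+ //   pg_t pg = p.raw_pg_to_pg(raw);           // fold ps into [0, pg_num)
+ //   ps_t pps = p.raw_pg_to_pps(raw);         // pool-unique CRUSH seed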
+
+ void encode(ceph::buffer::list& bl, uint64_t features) const;
+ void decode(ceph::buffer::list::const_iterator& bl);
+
+ static void generate_test_instances(std::list<pg_pool_t*>& o);
+};
+WRITE_CLASS_ENCODER_FEATURES(pg_pool_t)
+
+std::ostream& operator<<(std::ostream& out, const pg_pool_t& p);
+
+
+/**
+ * a summation of object stats
+ *
+ * This is just a container for object stats; we don't know what for.
+ *
+ * If you add members to object_stat_sum_t, make sure there is no padding
+ * among these members.
+ * You should also modify the padding_check function.
+ */
+struct object_stat_sum_t {
+ /**************************************************************************
+ * WARNING: be sure to update operator==, floor, and split when
+ * adding/removing fields!
+ **************************************************************************/
+ int64_t num_bytes; // in bytes
+ int64_t num_objects;
+ int64_t num_object_clones;
+ int64_t num_object_copies; // num_objects * num_replicas
+ int64_t num_objects_missing_on_primary;
+ int64_t num_objects_degraded;
+ int64_t num_objects_unfound;
+ int64_t num_rd;
+ int64_t num_rd_kb;
+ int64_t num_wr;
+ int64_t num_wr_kb;
+ int64_t num_scrub_errors; // total deep and shallow scrub errors
+ int64_t num_objects_recovered;
+ int64_t num_bytes_recovered;
+ int64_t num_keys_recovered;
+ int64_t num_shallow_scrub_errors;
+ int64_t num_deep_scrub_errors;
+ int64_t num_objects_dirty;
+ int64_t num_whiteouts;
+ int64_t num_objects_omap;
+ int64_t num_objects_hit_set_archive;
+ int64_t num_objects_misplaced;
+ int64_t num_bytes_hit_set_archive;
+ int64_t num_flush;
+ int64_t num_flush_kb;
+ int64_t num_evict;
+ int64_t num_evict_kb;
+ int64_t num_promote;
+ int32_t num_flush_mode_high; // 1 when in high flush mode, otherwise 0
+ int32_t num_flush_mode_low; // 1 when in low flush mode, otherwise 0
+ int32_t num_evict_mode_some; // 1 when in evict some mode, otherwise 0
+ int32_t num_evict_mode_full; // 1 when in evict full mode, otherwise 0
+ int64_t num_objects_pinned;
+ int64_t num_objects_missing;
+ int64_t num_legacy_snapsets; ///< upper bound on pre-luminous-style SnapSets
+ int64_t num_large_omap_objects = 0;
+ int64_t num_objects_manifest = 0;
+ int64_t num_omap_bytes = 0;
+ int64_t num_omap_keys = 0;
+ int64_t num_objects_repaired = 0;
+
+ object_stat_sum_t()
+ : num_bytes(0),
+ num_objects(0), num_object_clones(0), num_object_copies(0),
+ num_objects_missing_on_primary(0), num_objects_degraded(0),
+ num_objects_unfound(0),
+ num_rd(0), num_rd_kb(0), num_wr(0), num_wr_kb(0),
+ num_scrub_errors(0),
+ num_objects_recovered(0),
+ num_bytes_recovered(0),
+ num_keys_recovered(0),
+ num_shallow_scrub_errors(0),
+ num_deep_scrub_errors(0),
+ num_objects_dirty(0),
+ num_whiteouts(0),
+ num_objects_omap(0),
+ num_objects_hit_set_archive(0),
+ num_objects_misplaced(0),
+ num_bytes_hit_set_archive(0),
+ num_flush(0),
+ num_flush_kb(0),
+ num_evict(0),
+ num_evict_kb(0),
+ num_promote(0),
+ num_flush_mode_high(0), num_flush_mode_low(0),
+ num_evict_mode_some(0), num_evict_mode_full(0),
+ num_objects_pinned(0),
+ num_objects_missing(0),
+ num_legacy_snapsets(0)
+ {}
+
+ void floor(int64_t f) {
+#define FLOOR(x) if (x < f) x = f
+ FLOOR(num_bytes);
+ FLOOR(num_objects);
+ FLOOR(num_object_clones);
+ FLOOR(num_object_copies);
+ FLOOR(num_objects_missing_on_primary);
+ FLOOR(num_objects_missing);
+ FLOOR(num_objects_degraded);
+ FLOOR(num_objects_misplaced);
+ FLOOR(num_objects_unfound);
+ FLOOR(num_rd);
+ FLOOR(num_rd_kb);
+ FLOOR(num_wr);
+ FLOOR(num_wr_kb);
+ FLOOR(num_large_omap_objects);
+ FLOOR(num_objects_manifest);
+ FLOOR(num_omap_bytes);
+ FLOOR(num_omap_keys);
+ FLOOR(num_shallow_scrub_errors);
+ FLOOR(num_deep_scrub_errors);
+ num_scrub_errors = num_shallow_scrub_errors + num_deep_scrub_errors;
+ FLOOR(num_objects_recovered);
+ FLOOR(num_bytes_recovered);
+ FLOOR(num_keys_recovered);
+ FLOOR(num_objects_dirty);
+ FLOOR(num_whiteouts);
+ FLOOR(num_objects_omap);
+ FLOOR(num_objects_hit_set_archive);
+ FLOOR(num_bytes_hit_set_archive);
+ FLOOR(num_flush);
+ FLOOR(num_flush_kb);
+ FLOOR(num_evict);
+ FLOOR(num_evict_kb);
+ FLOOR(num_promote);
+ FLOOR(num_flush_mode_high);
+ FLOOR(num_flush_mode_low);
+ FLOOR(num_evict_mode_some);
+ FLOOR(num_evict_mode_full);
+ FLOOR(num_objects_pinned);
+ FLOOR(num_legacy_snapsets);
+ FLOOR(num_objects_repaired);
+#undef FLOOR
+ }
+
+ void split(std::vector<object_stat_sum_t> &out) const {
+#define SPLIT(PARAM) \
+ for (unsigned i = 0; i < out.size(); ++i) { \
+ out[i].PARAM = PARAM / out.size(); \
+ if (i < (PARAM % out.size())) { \
+ out[i].PARAM++; \
+ } \
+ }
+#define SPLIT_PRESERVE_NONZERO(PARAM) \
+ for (unsigned i = 0; i < out.size(); ++i) { \
+ if (PARAM) \
+ out[i].PARAM = 1 + PARAM / out.size(); \
+ else \
+ out[i].PARAM = 0; \
+ }
+
+ SPLIT(num_bytes);
+ SPLIT(num_objects);
+ SPLIT(num_object_clones);
+ SPLIT(num_object_copies);
+ SPLIT(num_objects_missing_on_primary);
+ SPLIT(num_objects_missing);
+ SPLIT(num_objects_degraded);
+ SPLIT(num_objects_misplaced);
+ SPLIT(num_objects_unfound);
+ SPLIT(num_rd);
+ SPLIT(num_rd_kb);
+ SPLIT(num_wr);
+ SPLIT(num_wr_kb);
+ SPLIT(num_large_omap_objects);
+ SPLIT(num_objects_manifest);
+ SPLIT(num_omap_bytes);
+ SPLIT(num_omap_keys);
+ SPLIT(num_objects_repaired);
+ SPLIT_PRESERVE_NONZERO(num_shallow_scrub_errors);
+ SPLIT_PRESERVE_NONZERO(num_deep_scrub_errors);
+ for (unsigned i = 0; i < out.size(); ++i) {
+ out[i].num_scrub_errors = out[i].num_shallow_scrub_errors +
+ out[i].num_deep_scrub_errors;
+ }
+ SPLIT(num_objects_recovered);
+ SPLIT(num_bytes_recovered);
+ SPLIT(num_keys_recovered);
+ SPLIT(num_objects_dirty);
+ SPLIT(num_whiteouts);
+ SPLIT(num_objects_omap);
+ SPLIT(num_objects_hit_set_archive);
+ SPLIT(num_bytes_hit_set_archive);
+ SPLIT(num_flush);
+ SPLIT(num_flush_kb);
+ SPLIT(num_evict);
+ SPLIT(num_evict_kb);
+ SPLIT(num_promote);
+ SPLIT(num_flush_mode_high);
+ SPLIT(num_flush_mode_low);
+ SPLIT(num_evict_mode_some);
+ SPLIT(num_evict_mode_full);
+ SPLIT(num_objects_pinned);
+ SPLIT_PRESERVE_NONZERO(num_legacy_snapsets);
+#undef SPLIT
+#undef SPLIT_PRESERVE_NONZERO
+ }
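+
+ // Illustrative note on the rounding above: splitting num_objects = 10
+ // across out.size() = 3 children yields 4, 3, 3 (the remainder goes to the
+ // first children), while SPLIT_PRESERVE_NONZERO keeps a nonzero
+ // num_shallow_scrub_errors = 1 visible as 1 in every child rather than
+ // rounding it down to 0.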
+
+ void clear() {
+ // FIPS zeroization audit 20191117: this memset is not security related.
+ memset(this, 0, sizeof(*this));
+ }
+
+ void calc_copies(int nrep) {
+ num_object_copies = nrep * num_objects;
+ }
+
+ bool is_zero() const {
+ return mem_is_zero((char*)this, sizeof(*this));
+ }
+
+ void add(const object_stat_sum_t& o);
+ void sub(const object_stat_sum_t& o);
+
+ void dump(ceph::Formatter *f) const;
+ void padding_check() {
+ static_assert(
+ sizeof(object_stat_sum_t) ==
+ sizeof(num_bytes) +
+ sizeof(num_objects) +
+ sizeof(num_object_clones) +
+ sizeof(num_object_copies) +
+ sizeof(num_objects_missing_on_primary) +
+ sizeof(num_objects_degraded) +
+ sizeof(num_objects_unfound) +
+ sizeof(num_rd) +
+ sizeof(num_rd_kb) +
+ sizeof(num_wr) +
+ sizeof(num_wr_kb) +
+ sizeof(num_scrub_errors) +
+ sizeof(num_large_omap_objects) +
+ sizeof(num_objects_manifest) +
+ sizeof(num_omap_bytes) +
+ sizeof(num_omap_keys) +
+ sizeof(num_objects_repaired) +
+ sizeof(num_objects_recovered) +
+ sizeof(num_bytes_recovered) +
+ sizeof(num_keys_recovered) +
+ sizeof(num_shallow_scrub_errors) +
+ sizeof(num_deep_scrub_errors) +
+ sizeof(num_objects_dirty) +
+ sizeof(num_whiteouts) +
+ sizeof(num_objects_omap) +
+ sizeof(num_objects_hit_set_archive) +
+ sizeof(num_objects_misplaced) +
+ sizeof(num_bytes_hit_set_archive) +
+ sizeof(num_flush) +
+ sizeof(num_flush_kb) +
+ sizeof(num_evict) +
+ sizeof(num_evict_kb) +
+ sizeof(num_promote) +
+ sizeof(num_flush_mode_high) +
+ sizeof(num_flush_mode_low) +
+ sizeof(num_evict_mode_some) +
+ sizeof(num_evict_mode_full) +
+ sizeof(num_objects_pinned) +
+ sizeof(num_objects_missing) +
+ sizeof(num_legacy_snapsets)
+ ,
+ "object_stat_sum_t have padding");
+ }
+ void encode(ceph::buffer::list& bl) const;
+ void decode(ceph::buffer::list::const_iterator& bl);
+ static void generate_test_instances(std::list<object_stat_sum_t*>& o);
+};
+WRITE_CLASS_ENCODER(object_stat_sum_t)
+
+bool operator==(const object_stat_sum_t& l, const object_stat_sum_t& r);
+
+/**
+ * a collection of object stat sums
+ *
+ * This is a collection of stat sums over different categories.
+ */
+struct object_stat_collection_t {
+ /**************************************************************************
+ * WARNING: be sure to update the operator== when adding/removing fields! *
+ **************************************************************************/
+ object_stat_sum_t sum;
+
+ void calc_copies(int nrep) {
+ sum.calc_copies(nrep);
+ }
+
+ void dump(ceph::Formatter *f) const;
+ void encode(ceph::buffer::list& bl) const;
+ void decode(ceph::buffer::list::const_iterator& bl);
+ static void generate_test_instances(std::list<object_stat_collection_t*>& o);
+
+ bool is_zero() const {
+ return sum.is_zero();
+ }
+
+ void clear() {
+ sum.clear();
+ }
+
+ void floor(int64_t f) {
+ sum.floor(f);
+ }
+
+ void add(const object_stat_sum_t& o) {
+ sum.add(o);
+ }
+
+ void add(const object_stat_collection_t& o) {
+ sum.add(o.sum);
+ }
+ void sub(const object_stat_collection_t& o) {
+ sum.sub(o.sum);
+ }
+};
+WRITE_CLASS_ENCODER(object_stat_collection_t)
+
+inline bool operator==(const object_stat_collection_t& l,
+ const object_stat_collection_t& r) {
+ return l.sum == r.sum;
+}
+
+
+/** pg_stat
+ * aggregate stats for a single PG.
+ */
+struct pg_stat_t {
+ /**************************************************************************
+ * WARNING: be sure to update the operator== when adding/removing fields! *
+ **************************************************************************/
+ eversion_t version;
+ version_t reported_seq; // sequence number
+ epoch_t reported_epoch; // epoch of this report
+ uint64_t state;
+ utime_t last_fresh; // last reported
+ utime_t last_change; // new state != previous state
+ utime_t last_active; // state & PG_STATE_ACTIVE
+ utime_t last_peered; // state & PG_STATE_ACTIVE || state & PG_STATE_PEERED
+ utime_t last_clean; // state & PG_STATE_CLEAN
+ utime_t last_unstale; // (state & PG_STATE_STALE) == 0
+ utime_t last_undegraded; // (state & PG_STATE_DEGRADED) == 0
+ utime_t last_fullsized; // (state & PG_STATE_UNDERSIZED) == 0
+
+ eversion_t log_start; // (log_start,version]
+ eversion_t ondisk_log_start; // there may be more on disk
+
+ epoch_t created;
+ epoch_t last_epoch_clean;
+ pg_t parent;
+ __u32 parent_split_bits;
+
+ eversion_t last_scrub;
+ eversion_t last_deep_scrub;
+ utime_t last_scrub_stamp;
+ utime_t last_deep_scrub_stamp;
+ utime_t last_clean_scrub_stamp;
+
+ object_stat_collection_t stats;
+
+ int64_t log_size;
+ int64_t ondisk_log_size; // >= active_log_size
+
+ std::vector<int32_t> up, acting;
+ std::vector<pg_shard_t> avail_no_missing;
+ std::map< std::set<pg_shard_t>, int32_t > object_location_counts;
+ epoch_t mapping_epoch;
+
+ std::vector<int32_t> blocked_by; ///< osds on which the pg is blocked
+
+ interval_set<snapid_t> purged_snaps; ///< recently removed snaps that we've purged
+
+ utime_t last_became_active;
+ utime_t last_became_peered;
+
+ /// up, acting primaries
+ int32_t up_primary;
+ int32_t acting_primary;
+
+ // snaptrimq.size() is 64bit, but let's be serious - anything over 50k is
+ // absurd already, so store it in 32 bits (saving 4 bytes) and let the
+ // aggregation in add() saturate at 2^31
+ uint32_t snaptrimq_len;
+
+ bool stats_invalid:1;
+ /// true if num_objects_dirty is not accurate (because it was not
+ /// maintained starting from pool creation)
+ bool dirty_stats_invalid:1;
+ bool omap_stats_invalid:1;
+ bool hitset_stats_invalid:1;
+ bool hitset_bytes_stats_invalid:1;
+ bool pin_stats_invalid:1;
+ bool manifest_stats_invalid:1;
+
+ pg_stat_t()
+ : reported_seq(0),
+ reported_epoch(0),
+ state(0),
+ created(0), last_epoch_clean(0),
+ parent_split_bits(0),
+ log_size(0), ondisk_log_size(0),
+ mapping_epoch(0),
+ up_primary(-1),
+ acting_primary(-1),
+ snaptrimq_len(0),
+ stats_invalid(false),
+ dirty_stats_invalid(false),
+ omap_stats_invalid(false),
+ hitset_stats_invalid(false),
+ hitset_bytes_stats_invalid(false),
+ pin_stats_invalid(false),
+ manifest_stats_invalid(false)
+ { }
+
+ epoch_t get_effective_last_epoch_clean() const {
+ if (state & PG_STATE_CLEAN) {
+ // we are clean as of this report, and should thus take the
+ // reported epoch
+ return reported_epoch;
+ } else {
+ return last_epoch_clean;
+ }
+ }
+
+ std::pair<epoch_t, version_t> get_version_pair() const {
+ return { reported_epoch, reported_seq };
+ }
+
+ void floor(int64_t f) {
+ stats.floor(f);
+ if (log_size < f)
+ log_size = f;
+ if (ondisk_log_size < f)
+ ondisk_log_size = f;
+ if (snaptrimq_len < f)
+ snaptrimq_len = f;
+ }
+
+ void add_sub_invalid_flags(const pg_stat_t& o) {
+ // adding (or subtracting!) invalid stats renders our stats invalid too
+ stats_invalid |= o.stats_invalid;
+ dirty_stats_invalid |= o.dirty_stats_invalid;
+ omap_stats_invalid |= o.omap_stats_invalid;
+ hitset_stats_invalid |= o.hitset_stats_invalid;
+ hitset_bytes_stats_invalid |= o.hitset_bytes_stats_invalid;
+ pin_stats_invalid |= o.pin_stats_invalid;
+ manifest_stats_invalid |= o.manifest_stats_invalid;
+ }
+ void add(const pg_stat_t& o) {
+ stats.add(o.stats);
+ log_size += o.log_size;
+ ondisk_log_size += o.ondisk_log_size;
+ snaptrimq_len = std::min((uint64_t)snaptrimq_len + o.snaptrimq_len,
+ (uint64_t)(1ull << 31));
+ add_sub_invalid_flags(o);
+ }
+ void sub(const pg_stat_t& o) {
+ stats.sub(o.stats);
+ log_size -= o.log_size;
+ ondisk_log_size -= o.ondisk_log_size;
+ if (o.snaptrimq_len < snaptrimq_len) {
+ snaptrimq_len -= o.snaptrimq_len;
+ } else {
+ snaptrimq_len = 0;
+ }
+ add_sub_invalid_flags(o);
+ }
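+
+ // Note on the clamping above (descriptive, not normative): add() saturates
+ // snaptrimq_len at 2^31 instead of letting the 32-bit counter wrap, and
+ // sub() floors it at 0, so pool- and cluster-wide sums built from many PG
+ // reports can neither overflow nor underflow.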
+
+ bool is_acting_osd(int32_t osd, bool primary) const;
+ void dump(ceph::Formatter *f) const;
+ void dump_brief(ceph::Formatter *f) const;
+ void encode(ceph::buffer::list &bl) const;
+ void decode(ceph::buffer::list::const_iterator &bl);
+ static void generate_test_instances(std::list<pg_stat_t*>& o);
+};
+WRITE_CLASS_ENCODER(pg_stat_t)
+
+bool operator==(const pg_stat_t& l, const pg_stat_t& r);
+
+/** store_statfs_t
+ * ObjectStore full statfs information
+ */
+struct store_statfs_t
+{
+ uint64_t total = 0; ///< Total bytes
+ uint64_t available = 0; ///< Free bytes available
+ uint64_t internally_reserved = 0; ///< Bytes reserved for internal purposes
+
+ int64_t allocated = 0; ///< Bytes allocated by the store
+
+ int64_t data_stored = 0; ///< Bytes actually stored by the user
+ int64_t data_compressed = 0; ///< Bytes stored after compression
+ int64_t data_compressed_allocated = 0; ///< Bytes allocated for compressed data
+ int64_t data_compressed_original = 0; ///< Bytes that were compressed
+
+ int64_t omap_allocated = 0; ///< approx usage of omap data
+ int64_t internal_metadata = 0; ///< approx usage of internal metadata
+
+ void reset() {
+ *this = store_statfs_t();
+ }
+ void floor(int64_t f) {
+#define FLOOR(x) if (int64_t(x) < f) x = f
+ FLOOR(total);
+ FLOOR(available);
+ FLOOR(internally_reserved);
+ FLOOR(allocated);
+ FLOOR(data_stored);
+ FLOOR(data_compressed);
+ FLOOR(data_compressed_allocated);
+ FLOOR(data_compressed_original);
+
+ FLOOR(omap_allocated);
+ FLOOR(internal_metadata);
+#undef FLOOR
+ }
+
+ bool operator ==(const store_statfs_t& other) const;
+ bool is_zero() const {
+ return *this == store_statfs_t();
+ }
+
+ uint64_t get_used() const {
+ return total - available - internally_reserved;
+ }
+
+ // this accumulates both actually used and statfs's internally_reserved
+ uint64_t get_used_raw() const {
+ return total - available;
+ }
+
+ float get_used_raw_ratio() const {
+ if (total) {
+ return (float)get_used_raw() / (float)total;
+ } else {
+ return 0.0;
+ }
+ }
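+
+ // Worked example with made-up numbers: total = 100 GiB, available = 40 GiB,
+ // internally_reserved = 10 GiB gives
+ //   get_used()           = 100 - 40 - 10 = 50 GiB (user-visible usage)
+ //   get_used_raw()       = 100 - 40      = 60 GiB (includes the reservation)
+ //   get_used_raw_ratio() = 60 / 100      = 0.6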
+
+ // helpers to ease legacy code porting
+ uint64_t kb_avail() const {
+ return available >> 10;
+ }
+ uint64_t kb() const {
+ return total >> 10;
+ }
+ uint64_t kb_used() const {
+ return (total - available - internally_reserved) >> 10;
+ }
+ uint64_t kb_used_raw() const {
+ return get_used_raw() >> 10;
+ }
+
+ uint64_t kb_used_data() const {
+ return allocated >> 10;
+ }
+ uint64_t kb_used_omap() const {
+ return omap_allocated >> 10;
+ }
+
+ uint64_t kb_used_internal_metadata() const {
+ return internal_metadata >> 10;
+ }
+
+ void add(const store_statfs_t& o) {
+ total += o.total;
+ available += o.available;
+ internally_reserved += o.internally_reserved;
+ allocated += o.allocated;
+ data_stored += o.data_stored;
+ data_compressed += o.data_compressed;
+ data_compressed_allocated += o.data_compressed_allocated;
+ data_compressed_original += o.data_compressed_original;
+ omap_allocated += o.omap_allocated;
+ internal_metadata += o.internal_metadata;
+ }
+ void sub(const store_statfs_t& o) {
+ total -= o.total;
+ available -= o.available;
+ internally_reserved -= o.internally_reserved;
+ allocated -= o.allocated;
+ data_stored -= o.data_stored;
+ data_compressed -= o.data_compressed;
+ data_compressed_allocated -= o.data_compressed_allocated;
+ data_compressed_original -= o.data_compressed_original;
+ omap_allocated -= o.omap_allocated;
+ internal_metadata -= o.internal_metadata;
+ }
+ void dump(ceph::Formatter *f) const;
+ DENC(store_statfs_t, v, p) {
+ DENC_START(1, 1, p);
+ denc(v.total, p);
+ denc(v.available, p);
+ denc(v.internally_reserved, p);
+ denc(v.allocated, p);
+ denc(v.data_stored, p);
+ denc(v.data_compressed, p);
+ denc(v.data_compressed_allocated, p);
+ denc(v.data_compressed_original, p);
+ denc(v.omap_allocated, p);
+ denc(v.internal_metadata, p);
+ DENC_FINISH(p);
+ }
+ static void generate_test_instances(std::list<store_statfs_t*>& o);
+};
+WRITE_CLASS_DENC(store_statfs_t)
+
+std::ostream &operator<<(std::ostream &lhs, const store_statfs_t &rhs);
+
+/** osd_stat
+ * aggregate stats for an osd
+ */
+struct osd_stat_t {
+ store_statfs_t statfs;
+ std::vector<int> hb_peers;
+ int32_t snap_trim_queue_len, num_snap_trimming;
+ uint64_t num_shards_repaired;
+
+ pow2_hist_t op_queue_age_hist;
+
+ objectstore_perf_stat_t os_perf_stat;
+ osd_alerts_t os_alerts;
+
+ epoch_t up_from = 0;
+ uint64_t seq = 0;
+
+ uint32_t num_pgs = 0;
+
+ uint32_t num_osds = 0;
+ uint32_t num_per_pool_osds = 0;
+ uint32_t num_per_pool_omap_osds = 0;
+
+ struct Interfaces {
+ uint32_t last_update; // in seconds
+ uint32_t back_pingtime[3];
+ uint32_t back_min[3];
+ uint32_t back_max[3];
+ uint32_t back_last;
+ uint32_t front_pingtime[3];
+ uint32_t front_min[3];
+ uint32_t front_max[3];
+ uint32_t front_last;
+ };
+ std::map<int, Interfaces> hb_pingtime; ///< map of osd id to Interfaces
+
+ osd_stat_t() : snap_trim_queue_len(0), num_snap_trimming(0),
+ num_shards_repaired(0) {}
+
+ void add(const osd_stat_t& o) {
+ statfs.add(o.statfs);
+ snap_trim_queue_len += o.snap_trim_queue_len;
+ num_snap_trimming += o.num_snap_trimming;
+ num_shards_repaired += o.num_shards_repaired;
+ op_queue_age_hist.add(o.op_queue_age_hist);
+ os_perf_stat.add(o.os_perf_stat);
+ num_pgs += o.num_pgs;
+ num_osds += o.num_osds;
+ num_per_pool_osds += o.num_per_pool_osds;
+ num_per_pool_omap_osds += o.num_per_pool_omap_osds;
+ for (const auto& a : o.os_alerts) {
+ auto& target = os_alerts[a.first];
+ for (auto& i : a.second) {
+ target.emplace(i.first, i.second);
+ }
+ }
+ }
+ void sub(const osd_stat_t& o) {
+ statfs.sub(o.statfs);
+ snap_trim_queue_len -= o.snap_trim_queue_len;
+ num_snap_trimming -= o.num_snap_trimming;
+ num_shards_repaired -= o.num_shards_repaired;
+ op_queue_age_hist.sub(o.op_queue_age_hist);
+ os_perf_stat.sub(o.os_perf_stat);
+ num_pgs -= o.num_pgs;
+ num_osds -= o.num_osds;
+ num_per_pool_osds -= o.num_per_pool_osds;
+ num_per_pool_omap_osds -= o.num_per_pool_omap_osds;
+ for (const auto& a : o.os_alerts) {
+ auto& target = os_alerts[a.first];
+ for (auto& i : a.second) {
+ target.erase(i.first);
+ }
+ if (target.empty()) {
+ os_alerts.erase(a.first);
+ }
+ }
+ }
+ void dump(ceph::Formatter *f, bool with_net = true) const;
+ void dump_ping_time(ceph::Formatter *f) const;
+ void encode(ceph::buffer::list &bl, uint64_t features) const;
+ void decode(ceph::buffer::list::const_iterator &bl);
+ static void generate_test_instances(std::list<osd_stat_t*>& o);
+};
+WRITE_CLASS_ENCODER_FEATURES(osd_stat_t)
+
+inline bool operator==(const osd_stat_t& l, const osd_stat_t& r) {
+ return l.statfs == r.statfs &&
+ l.snap_trim_queue_len == r.snap_trim_queue_len &&
+ l.num_snap_trimming == r.num_snap_trimming &&
+ l.num_shards_repaired == r.num_shards_repaired &&
+ l.hb_peers == r.hb_peers &&
+ l.op_queue_age_hist == r.op_queue_age_hist &&
+ l.os_perf_stat == r.os_perf_stat &&
+ l.num_pgs == r.num_pgs &&
+ l.num_osds == r.num_osds &&
+ l.num_per_pool_osds == r.num_per_pool_osds &&
+ l.num_per_pool_omap_osds == r.num_per_pool_omap_osds;
+}
+inline bool operator!=(const osd_stat_t& l, const osd_stat_t& r) {
+ return !(l == r);
+}
+
+inline std::ostream& operator<<(std::ostream& out, const osd_stat_t& s) {
+ return out << "osd_stat(" << s.statfs << ", "
+ << "peers " << s.hb_peers
+ << " op hist " << s.op_queue_age_hist.h
+ << ")";
+}
+
+/*
+ * summation over an entire pool
+ */
+struct pool_stat_t {
+ object_stat_collection_t stats;
+ store_statfs_t store_stats;
+ int64_t log_size;
+ int64_t ondisk_log_size; // >= active_log_size
+ int32_t up; ///< number of up replicas or shards
+ int32_t acting; ///< number of acting replicas or shards
+ int32_t num_store_stats; ///< number of store_stats accumulated
+
+ pool_stat_t() : log_size(0), ondisk_log_size(0), up(0), acting(0),
+ num_store_stats(0)
+ { }
+
+ void floor(int64_t f) {
+ stats.floor(f);
+ store_stats.floor(f);
+ if (log_size < f)
+ log_size = f;
+ if (ondisk_log_size < f)
+ ondisk_log_size = f;
+ if (up < f)
+ up = f;
+ if (acting < f)
+ acting = f;
+ if (num_store_stats < f)
+ num_store_stats = f;
+ }
+
+ void add(const store_statfs_t& o) {
+ store_stats.add(o);
+ ++num_store_stats;
+ }
+ void sub(const store_statfs_t& o) {
+ store_stats.sub(o);
+ --num_store_stats;
+ }
+
+ void add(const pg_stat_t& o) {
+ stats.add(o.stats);
+ log_size += o.log_size;
+ ondisk_log_size += o.ondisk_log_size;
+ up += o.up.size();
+ acting += o.acting.size();
+ }
+ void sub(const pg_stat_t& o) {
+ stats.sub(o.stats);
+ log_size -= o.log_size;
+ ondisk_log_size -= o.ondisk_log_size;
+ up -= o.up.size();
+ acting -= o.acting.size();
+ }
+
+ bool is_zero() const {
+ return (stats.is_zero() &&
+ store_stats.is_zero() &&
+ log_size == 0 &&
+ ondisk_log_size == 0 &&
+ up == 0 &&
+ acting == 0 &&
+ num_store_stats == 0);
+ }
+
+ // helper accessors to retrieve used/net bytes depending on the
+ // collection method: new per-pool objectstore report or legacy PG
+ // summation at the OSD.
+ // In legacy mode the used and net values are the same. But for the new
+ // per-pool collection, 'used' is the amount of space ALLOCATED at all
+ // related OSDs and 'net' is the amount of stored user data.
+ uint64_t get_allocated_data_bytes(bool per_pool) const {
+ if (per_pool) {
+ return store_stats.allocated;
+ } else {
+ // legacy mode, use numbers from 'stats'
+ return stats.sum.num_bytes + stats.sum.num_bytes_hit_set_archive;
+ }
+ }
+ uint64_t get_allocated_omap_bytes(bool per_pool_omap) const {
+ if (per_pool_omap) {
+ return store_stats.omap_allocated;
+ } else {
+ // omap is not broken out per pool by nautilus-era bluestore; report the
+ // scrub value. this will be imprecise in that it won't account for
+ // any storage overhead/efficiency.
+ return stats.sum.num_omap_bytes;
+ }
+ }
+ uint64_t get_user_data_bytes(float raw_used_rate, ///< space amp factor
+ bool per_pool) const {
+ // NOTE: we need the space amp factor so that we can work backwards from
+ // the raw utilization to the amount of data that the user actually stored.
+ if (per_pool) {
+ return raw_used_rate ? store_stats.data_stored / raw_used_rate : 0;
+ } else {
+ // legacy mode, use numbers from 'stats'. note that we do NOT use the
+ // raw_used_rate factor here because we are working from the PG stats
+ // directly.
+ return stats.sum.num_bytes + stats.sum.num_bytes_hit_set_archive;
+ }
+ }
+ uint64_t get_user_omap_bytes(float raw_used_rate, ///< space amp factor
+ bool per_pool_omap) const {
+ if (per_pool_omap) {
+ return raw_used_rate ? store_stats.omap_allocated / raw_used_rate : 0;
+ } else {
+ // omap usage is lazily reported during scrub; this value may lag.
+ return stats.sum.num_omap_bytes;
+ }
+ }
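+
+ // Worked example (made-up numbers): a 3x replicated pool with per-pool
+ // stats enabled and store_stats.data_stored = 300 GiB summed across OSDs:
+ //   get_user_data_bytes(3.0, true)  -> 300 / 3 = 100 GiB of user data
+ //   get_user_data_bytes(3.0, false) -> falls back to stats.sum.num_bytes,
+ //   which is already per-object and needs no raw_used_rate correction.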
+
+ void dump(ceph::Formatter *f) const;
+ void encode(ceph::buffer::list &bl, uint64_t features) const;
+ void decode(ceph::buffer::list::const_iterator &bl);
+ static void generate_test_instances(std::list<pool_stat_t*>& o);
+};
+WRITE_CLASS_ENCODER_FEATURES(pool_stat_t)
+
+
+// -----------------------------------------
+
+/**
+ * pg_hit_set_info_t - information about a single recorded HitSet
+ *
+ * Track basic metadata about a HitSet, like the number of insertions
+ * and the time range it covers.
+ */
+struct pg_hit_set_info_t {
+ utime_t begin, end; ///< time interval
+ eversion_t version; ///< version this HitSet object was written
+ bool using_gmt; ///< use gmt for creating the hit_set archive object name
+
+ friend bool operator==(const pg_hit_set_info_t& l,
+ const pg_hit_set_info_t& r) {
+ return
+ l.begin == r.begin &&
+ l.end == r.end &&
+ l.version == r.version &&
+ l.using_gmt == r.using_gmt;
+ }
+
+ explicit pg_hit_set_info_t(bool using_gmt = true)
+ : using_gmt(using_gmt) {}
+
+ void encode(ceph::buffer::list &bl) const;
+ void decode(ceph::buffer::list::const_iterator &bl);
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<pg_hit_set_info_t*>& o);
+};
+WRITE_CLASS_ENCODER(pg_hit_set_info_t)
+
+/**
+ * pg_hit_set_history_t - information about a history of hitsets
+ *
+ * Include information about the currently accumulating hit set as well
+ * as archived/historical ones.
+ */
+struct pg_hit_set_history_t {
+ eversion_t current_last_update; ///< last version inserted into current set
+ std::list<pg_hit_set_info_t> history; ///< archived sets, sorted oldest -> newest
+
+ friend bool operator==(const pg_hit_set_history_t& l,
+ const pg_hit_set_history_t& r) {
+ return
+ l.current_last_update == r.current_last_update &&
+ l.history == r.history;
+ }
+
+ void encode(ceph::buffer::list &bl) const;
+ void decode(ceph::buffer::list::const_iterator &bl);
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<pg_hit_set_history_t*>& o);
+};
+WRITE_CLASS_ENCODER(pg_hit_set_history_t)
+
+
+// -----------------------------------------
+
+/**
+ * pg_history_t - information about recent pg peering/mapping history
+ *
+ * This is aggressively shared between OSDs to bound the amount of past
+ * history they need to worry about.
+ */
+struct pg_history_t {
+ epoch_t epoch_created = 0; // epoch in which *pg* was created (pool or pg)
+ epoch_t epoch_pool_created = 0; // epoch in which *pool* was created
+ // (note: may be pg creation epoch for
+ // pre-luminous clusters)
+ epoch_t last_epoch_started = 0; // lower bound on last epoch started (anywhere, not necessarily locally)
+ // https://docs.ceph.com/docs/master/dev/osd_internals/last_epoch_started/
+ epoch_t last_interval_started = 0; // first epoch of last_epoch_started interval
+ epoch_t last_epoch_clean = 0; // lower bound on last epoch the PG was completely clean.
+ epoch_t last_interval_clean = 0; // first epoch of last_epoch_clean interval
+ epoch_t last_epoch_split = 0; // as parent or child
+ epoch_t last_epoch_marked_full = 0; // pool or cluster
+
+ /**
+ * In the event of a map discontinuity, same_*_since may reflect the first
+ * map the osd has seen in the new map sequence rather than the actual start
+ * of the interval. This is ok since a discontinuity at epoch e means there
+ * must have been a clean interval between e and now and that we cannot be
+ * in the active set during the interval containing e.
+ */
+ epoch_t same_up_since = 0; // same up set since
+ epoch_t same_interval_since = 0; // same acting AND up set since
+ epoch_t same_primary_since = 0; // same primary at least back through this epoch.
+
+ eversion_t last_scrub;
+ eversion_t last_deep_scrub;
+ utime_t last_scrub_stamp;
+ utime_t last_deep_scrub_stamp;
+ utime_t last_clean_scrub_stamp;
+
+ /// upper bound on how long prior interval readable (relative to encode time)
+ ceph::timespan prior_readable_until_ub = ceph::timespan::zero();
+
+ friend bool operator==(const pg_history_t& l, const pg_history_t& r) {
+ return
+ l.epoch_created == r.epoch_created &&
+ l.epoch_pool_created == r.epoch_pool_created &&
+ l.last_epoch_started == r.last_epoch_started &&
+ l.last_interval_started == r.last_interval_started &&
+ l.last_epoch_clean == r.last_epoch_clean &&
+ l.last_interval_clean == r.last_interval_clean &&
+ l.last_epoch_split == r.last_epoch_split &&
+ l.last_epoch_marked_full == r.last_epoch_marked_full &&
+ l.same_up_since == r.same_up_since &&
+ l.same_interval_since == r.same_interval_since &&
+ l.same_primary_since == r.same_primary_since &&
+ l.last_scrub == r.last_scrub &&
+ l.last_deep_scrub == r.last_deep_scrub &&
+ l.last_scrub_stamp == r.last_scrub_stamp &&
+ l.last_deep_scrub_stamp == r.last_deep_scrub_stamp &&
+ l.last_clean_scrub_stamp == r.last_clean_scrub_stamp &&
+ l.prior_readable_until_ub == r.prior_readable_until_ub;
+ }
+
+ pg_history_t() {}
+ pg_history_t(epoch_t created, utime_t stamp)
+ : epoch_created(created),
+ epoch_pool_created(created),
+ same_up_since(created),
+ same_interval_since(created),
+ same_primary_since(created),
+ last_scrub_stamp(stamp),
+ last_deep_scrub_stamp(stamp),
+ last_clean_scrub_stamp(stamp) {}
+
+ bool merge(const pg_history_t &other) {
+ // Here, we only update the fields which cannot be calculated from the OSDmap.
+ bool modified = false;
+ if (epoch_created < other.epoch_created) {
+ epoch_created = other.epoch_created;
+ modified = true;
+ }
+ if (epoch_pool_created < other.epoch_pool_created) {
+ // FIXME: for jewel compat only; this should either be 0 or always the
+ // same value across all pg instances.
+ epoch_pool_created = other.epoch_pool_created;
+ modified = true;
+ }
+ if (last_epoch_started < other.last_epoch_started) {
+ last_epoch_started = other.last_epoch_started;
+ modified = true;
+ }
+ if (last_interval_started < other.last_interval_started) {
+ last_interval_started = other.last_interval_started;
+ // if we are learning about a newer *started* interval, our
+ // readable_until_ub is obsolete
+ prior_readable_until_ub = other.prior_readable_until_ub;
+ modified = true;
+ } else if (other.last_interval_started == last_interval_started &&
+ other.prior_readable_until_ub < prior_readable_until_ub) {
+ // if other is the *same* interval, then pull our upper bound in
+ // if they have a tighter bound.
+ prior_readable_until_ub = other.prior_readable_until_ub;
+ modified = true;
+ }
+ if (last_epoch_clean < other.last_epoch_clean) {
+ last_epoch_clean = other.last_epoch_clean;
+ modified = true;
+ }
+ if (last_interval_clean < other.last_interval_clean) {
+ last_interval_clean = other.last_interval_clean;
+ modified = true;
+ }
+ if (last_epoch_split < other.last_epoch_split) {
+ last_epoch_split = other.last_epoch_split;
+ modified = true;
+ }
+ if (last_epoch_marked_full < other.last_epoch_marked_full) {
+ last_epoch_marked_full = other.last_epoch_marked_full;
+ modified = true;
+ }
+ if (other.last_scrub > last_scrub) {
+ last_scrub = other.last_scrub;
+ modified = true;
+ }
+ if (other.last_scrub_stamp > last_scrub_stamp) {
+ last_scrub_stamp = other.last_scrub_stamp;
+ modified = true;
+ }
+ if (other.last_deep_scrub > last_deep_scrub) {
+ last_deep_scrub = other.last_deep_scrub;
+ modified = true;
+ }
+ if (other.last_deep_scrub_stamp > last_deep_scrub_stamp) {
+ last_deep_scrub_stamp = other.last_deep_scrub_stamp;
+ modified = true;
+ }
+ if (other.last_clean_scrub_stamp > last_clean_scrub_stamp) {
+ last_clean_scrub_stamp = other.last_clean_scrub_stamp;
+ modified = true;
+ }
+ return modified;
+ }
+
+ void encode(ceph::buffer::list& bl) const;
+ void decode(ceph::buffer::list::const_iterator& p);
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<pg_history_t*>& o);
+
+ ceph::signedspan refresh_prior_readable_until_ub(
+ ceph::signedspan now, ///< now, relative to osd startup_time
+ ceph::signedspan ub) { ///< ub, relative to osd startup_time
+ if (now >= ub) {
+ // prior interval(s) are unreadable; we can zero the upper bound
+ prior_readable_until_ub = ceph::signedspan::zero();
+ return ceph::signedspan::zero();
+ } else {
+ prior_readable_until_ub = ub - now;
+ return ub;
+ }
+ }
+ ceph::signedspan get_prior_readable_until_ub(ceph::signedspan now) {
+ if (prior_readable_until_ub == ceph::signedspan::zero()) {
+ return ceph::signedspan::zero();
+ }
+ return now + prior_readable_until_ub;
+ }
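+
+ // Sketch with made-up values (times relative to osd startup_time): if
+ // now = 100s and the prior interval is readable until ub = 130s,
+ // refresh_prior_readable_until_ub() stores the 30s delta (a duration
+ // survives encode/decode across daemons better than an absolute time) and
+ // returns ub; a later get_prior_readable_until_ub(now') re-adds now' to
+ // recover an absolute bound. Once now >= ub the stored delta collapses to
+ // zero and the prior interval is treated as unreadable.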
+};
+WRITE_CLASS_ENCODER(pg_history_t)
+
+inline std::ostream& operator<<(std::ostream& out, const pg_history_t& h) {
+ out << "ec=" << h.epoch_created << "/" << h.epoch_pool_created
+ << " lis/c=" << h.last_interval_started
+ << "/" << h.last_interval_clean
+ << " les/c/f=" << h.last_epoch_started << "/" << h.last_epoch_clean
+ << "/" << h.last_epoch_marked_full
+ << " sis=" << h.same_interval_since;
+ if (h.prior_readable_until_ub != ceph::timespan::zero()) {
+ out << " pruub=" << h.prior_readable_until_ub;
+ }
+ return out;
+}
+
+
+/**
+ * pg_info_t - summary of PG statistics.
+ *
+ * some notes:
+ * - last_complete implies we have all objects that existed as of that
+ * stamp, OR a newer object, OR have already applied a later delete.
+ * - if last_complete >= log.tail, then we know pg contents thru log.head.
+ * otherwise, we have no idea what the pg is supposed to contain.
+ */
+struct pg_info_t {
+ spg_t pgid;
+ eversion_t last_update; ///< last object version applied to store.
+ eversion_t last_complete; ///< last version pg was complete through.
+ epoch_t last_epoch_started; ///< last epoch at which this pg started on this osd
+ epoch_t last_interval_started; ///< first epoch of last_epoch_started interval
+
+ version_t last_user_version; ///< last user object version applied to store
+
+ eversion_t log_tail; ///< oldest log entry.
+
+ hobject_t last_backfill; ///< objects >= this and < last_complete may be missing
+
+ interval_set<snapid_t> purged_snaps;
+
+ pg_stat_t stats;
+
+ pg_history_t history;
+ pg_hit_set_history_t hit_set;
+
+ friend bool operator==(const pg_info_t& l, const pg_info_t& r) {
+ return
+ l.pgid == r.pgid &&
+ l.last_update == r.last_update &&
+ l.last_complete == r.last_complete &&
+ l.last_epoch_started == r.last_epoch_started &&
+ l.last_interval_started == r.last_interval_started &&
+ l.last_user_version == r.last_user_version &&
+ l.log_tail == r.log_tail &&
+ l.last_backfill == r.last_backfill &&
+ l.purged_snaps == r.purged_snaps &&
+ l.stats == r.stats &&
+ l.history == r.history &&
+ l.hit_set == r.hit_set;
+ }
+
+ pg_info_t()
+ : last_epoch_started(0),
+ last_interval_started(0),
+ last_user_version(0),
+ last_backfill(hobject_t::get_max())
+ { }
+ // cppcheck-suppress noExplicitConstructor
+ pg_info_t(spg_t p)
+ : pgid(p),
+ last_epoch_started(0),
+ last_interval_started(0),
+ last_user_version(0),
+ last_backfill(hobject_t::get_max())
+ { }
+
+ void set_last_backfill(hobject_t pos) {
+ last_backfill = pos;
+ }
+
+ bool is_empty() const { return last_update.version == 0; }
+ bool dne() const { return history.epoch_created == 0; }
+
+ bool has_missing() const { return last_complete != last_update; }
+ bool is_incomplete() const { return !last_backfill.is_max(); }
+
+ void encode(ceph::buffer::list& bl) const;
+ void decode(ceph::buffer::list::const_iterator& p);
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<pg_info_t*>& o);
+};
+WRITE_CLASS_ENCODER(pg_info_t)
+
+inline std::ostream& operator<<(std::ostream& out, const pg_info_t& pgi)
+{
+ out << pgi.pgid << "(";
+ if (pgi.dne())
+ out << " DNE";
+ if (pgi.is_empty())
+ out << " empty";
+ else {
+ out << " v " << pgi.last_update;
+ if (pgi.last_complete != pgi.last_update)
+ out << " lc " << pgi.last_complete;
+ out << " (" << pgi.log_tail << "," << pgi.last_update << "]";
+ }
+ if (pgi.is_incomplete())
+ out << " lb " << pgi.last_backfill;
+ //out << " c " << pgi.epoch_created;
+ out << " local-lis/les=" << pgi.last_interval_started
+ << "/" << pgi.last_epoch_started;
+ out << " n=" << pgi.stats.stats.sum.num_objects;
+ out << " " << pgi.history
+ << ")";
+ return out;
+}
+
+/**
+ * pg_fast_info_t - common pg_info_t fields
+ *
+ * These are the fields of pg_info_t (and children) that are updated for
+ * most IO operations.
+ *
+ * ** WARNING **
+ * Because we rely on these fields to be applied to the normal
+ * info struct, adding a new field here that is not also new in info
+ * means that we must set an incompat OSD feature bit!
+ */
+struct pg_fast_info_t {
+ eversion_t last_update;
+ eversion_t last_complete;
+ version_t last_user_version;
+ struct { // pg_stat_t stats
+ eversion_t version;
+ version_t reported_seq;
+ utime_t last_fresh;
+ utime_t last_active;
+ utime_t last_peered;
+ utime_t last_clean;
+ utime_t last_unstale;
+ utime_t last_undegraded;
+ utime_t last_fullsized;
+ int64_t log_size; // (also ondisk_log_size, which has the same value)
+ struct { // object_stat_collection_t stats;
+ struct { // object_stat_sum_t sum
+ int64_t num_bytes; // in bytes
+ int64_t num_objects;
+ int64_t num_object_copies;
+ int64_t num_rd;
+ int64_t num_rd_kb;
+ int64_t num_wr;
+ int64_t num_wr_kb;
+ int64_t num_objects_dirty;
+ } sum;
+ } stats;
+ } stats;
+
+ void populate_from(const pg_info_t& info) {
+ last_update = info.last_update;
+ last_complete = info.last_complete;
+ last_user_version = info.last_user_version;
+ stats.version = info.stats.version;
+ stats.reported_seq = info.stats.reported_seq;
+ stats.last_fresh = info.stats.last_fresh;
+ stats.last_active = info.stats.last_active;
+ stats.last_peered = info.stats.last_peered;
+ stats.last_clean = info.stats.last_clean;
+ stats.last_unstale = info.stats.last_unstale;
+ stats.last_undegraded = info.stats.last_undegraded;
+ stats.last_fullsized = info.stats.last_fullsized;
+ stats.log_size = info.stats.log_size;
+ stats.stats.sum.num_bytes = info.stats.stats.sum.num_bytes;
+ stats.stats.sum.num_objects = info.stats.stats.sum.num_objects;
+ stats.stats.sum.num_object_copies = info.stats.stats.sum.num_object_copies;
+ stats.stats.sum.num_rd = info.stats.stats.sum.num_rd;
+ stats.stats.sum.num_rd_kb = info.stats.stats.sum.num_rd_kb;
+ stats.stats.sum.num_wr = info.stats.stats.sum.num_wr;
+ stats.stats.sum.num_wr_kb = info.stats.stats.sum.num_wr_kb;
+ stats.stats.sum.num_objects_dirty = info.stats.stats.sum.num_objects_dirty;
+ }
+
+ bool try_apply_to(pg_info_t* info) {
+ if (last_update <= info->last_update)
+ return false;
+ info->last_update = last_update;
+ info->last_complete = last_complete;
+ info->last_user_version = last_user_version;
+ info->stats.version = stats.version;
+ info->stats.reported_seq = stats.reported_seq;
+ info->stats.last_fresh = stats.last_fresh;
+ info->stats.last_active = stats.last_active;
+ info->stats.last_peered = stats.last_peered;
+ info->stats.last_clean = stats.last_clean;
+ info->stats.last_unstale = stats.last_unstale;
+ info->stats.last_undegraded = stats.last_undegraded;
+ info->stats.last_fullsized = stats.last_fullsized;
+ info->stats.log_size = stats.log_size;
+ info->stats.ondisk_log_size = stats.log_size;
+ info->stats.stats.sum.num_bytes = stats.stats.sum.num_bytes;
+ info->stats.stats.sum.num_objects = stats.stats.sum.num_objects;
+ info->stats.stats.sum.num_object_copies = stats.stats.sum.num_object_copies;
+ info->stats.stats.sum.num_rd = stats.stats.sum.num_rd;
+ info->stats.stats.sum.num_rd_kb = stats.stats.sum.num_rd_kb;
+ info->stats.stats.sum.num_wr = stats.stats.sum.num_wr;
+ info->stats.stats.sum.num_wr_kb = stats.stats.sum.num_wr_kb;
+ info->stats.stats.sum.num_objects_dirty = stats.stats.sum.num_objects_dirty;
+ return true;
+ }
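+
+ // Hedged usage sketch (call sites live elsewhere in the OSD): after a
+ // typical write, populate_from(info) captures just these hot fields so a
+ // small pg_fast_info_t can be persisted instead of the full pg_info_t; on
+ // load, try_apply_to(&info) layers them back on top of the last full info
+ // and returns false when the stored copy is not newer than info.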
+
+ void encode(ceph::buffer::list& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(last_update, bl);
+ encode(last_complete, bl);
+ encode(last_user_version, bl);
+ encode(stats.version, bl);
+ encode(stats.reported_seq, bl);
+ encode(stats.last_fresh, bl);
+ encode(stats.last_active, bl);
+ encode(stats.last_peered, bl);
+ encode(stats.last_clean, bl);
+ encode(stats.last_unstale, bl);
+ encode(stats.last_undegraded, bl);
+ encode(stats.last_fullsized, bl);
+ encode(stats.log_size, bl);
+ encode(stats.stats.sum.num_bytes, bl);
+ encode(stats.stats.sum.num_objects, bl);
+ encode(stats.stats.sum.num_object_copies, bl);
+ encode(stats.stats.sum.num_rd, bl);
+ encode(stats.stats.sum.num_rd_kb, bl);
+ encode(stats.stats.sum.num_wr, bl);
+ encode(stats.stats.sum.num_wr_kb, bl);
+ encode(stats.stats.sum.num_objects_dirty, bl);
+ ENCODE_FINISH(bl);
+ }
+ void decode(ceph::buffer::list::const_iterator& p) {
+ DECODE_START(1, p);
+ decode(last_update, p);
+ decode(last_complete, p);
+ decode(last_user_version, p);
+ decode(stats.version, p);
+ decode(stats.reported_seq, p);
+ decode(stats.last_fresh, p);
+ decode(stats.last_active, p);
+ decode(stats.last_peered, p);
+ decode(stats.last_clean, p);
+ decode(stats.last_unstale, p);
+ decode(stats.last_undegraded, p);
+ decode(stats.last_fullsized, p);
+ decode(stats.log_size, p);
+ decode(stats.stats.sum.num_bytes, p);
+ decode(stats.stats.sum.num_objects, p);
+ decode(stats.stats.sum.num_object_copies, p);
+ decode(stats.stats.sum.num_rd, p);
+ decode(stats.stats.sum.num_rd_kb, p);
+ decode(stats.stats.sum.num_wr, p);
+ decode(stats.stats.sum.num_wr_kb, p);
+ decode(stats.stats.sum.num_objects_dirty, p);
+ DECODE_FINISH(p);
+ }
+};
+WRITE_CLASS_ENCODER(pg_fast_info_t)
+
+
+/**
+ * PastIntervals -- information needed to determine the PriorSet and
+ * the might_have_unfound set
+ */
+class PastIntervals {
+#ifdef WITH_SEASTAR
+ using OSDMapRef = boost::local_shared_ptr<const OSDMap>;
+#else
+ using OSDMapRef = std::shared_ptr<const OSDMap>;
+#endif
+public:
+ struct pg_interval_t {
+ std::vector<int32_t> up, acting;
+ epoch_t first, last;
+ bool maybe_went_rw;
+ int32_t primary;
+ int32_t up_primary;
+
+ pg_interval_t()
+ : first(0), last(0),
+ maybe_went_rw(false),
+ primary(-1),
+ up_primary(-1)
+ {}
+
+ pg_interval_t(
+ std::vector<int32_t> &&up,
+ std::vector<int32_t> &&acting,
+ epoch_t first,
+ epoch_t last,
+ bool maybe_went_rw,
+ int32_t primary,
+ int32_t up_primary)
+ : up(up), acting(acting), first(first), last(last),
+ maybe_went_rw(maybe_went_rw), primary(primary), up_primary(up_primary)
+ {}
+
+ void encode(ceph::buffer::list& bl) const;
+ void decode(ceph::buffer::list::const_iterator& bl);
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<pg_interval_t*>& o);
+ };
+
+ PastIntervals();
+ PastIntervals(PastIntervals &&rhs) = default;
+ PastIntervals &operator=(PastIntervals &&rhs) = default;
+
+ PastIntervals(const PastIntervals &rhs);
+ PastIntervals &operator=(const PastIntervals &rhs);
+
+ class interval_rep {
+ public:
+ virtual size_t size() const = 0;
+ virtual bool empty() const = 0;
+ virtual void clear() = 0;
+ virtual std::pair<epoch_t, epoch_t> get_bounds() const = 0;
+ virtual std::set<pg_shard_t> get_all_participants(
+ bool ec_pool) const = 0;
+ virtual void add_interval(bool ec_pool, const pg_interval_t &interval) = 0;
+ virtual std::unique_ptr<interval_rep> clone() const = 0;
+ virtual std::ostream &print(std::ostream &out) const = 0;
+ virtual void encode(ceph::buffer::list &bl) const = 0;
+ virtual void decode(ceph::buffer::list::const_iterator &bl) = 0;
+ virtual void dump(ceph::Formatter *f) const = 0;
+ virtual void iterate_mayberw_back_to(
+ epoch_t les,
+ std::function<void(epoch_t, const std::set<pg_shard_t> &)> &&f) const = 0;
+
+ virtual bool has_full_intervals() const { return false; }
+ virtual void iterate_all_intervals(
+ std::function<void(const pg_interval_t &)> &&f) const {
+ ceph_assert(!has_full_intervals());
+ ceph_abort_msg("not valid for this implementation");
+ }
+ virtual void adjust_start_backwards(epoch_t last_epoch_clean) = 0;
+
+ virtual ~interval_rep() {}
+ };
+ friend class pi_compact_rep;
+private:
+
+ std::unique_ptr<interval_rep> past_intervals;
+
+ explicit PastIntervals(interval_rep *rep) : past_intervals(rep) {}
+
+public:
+ void add_interval(bool ec_pool, const pg_interval_t &interval) {
+ ceph_assert(past_intervals);
+ return past_intervals->add_interval(ec_pool, interval);
+ }
+
+ void encode(ceph::buffer::list &bl) const {
+ ENCODE_START(1, 1, bl);
+ if (past_intervals) {
+ __u8 type = 2;
+ encode(type, bl);
+ past_intervals->encode(bl);
+ } else {
+ encode((__u8)0, bl);
+ }
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(ceph::buffer::list::const_iterator &bl);
+
+ void dump(ceph::Formatter *f) const {
+ ceph_assert(past_intervals);
+ past_intervals->dump(f);
+ }
+ static void generate_test_instances(std::list<PastIntervals *> & o);
+
+ /**
+ * Determines whether there is an interval change
+ */
+ static bool is_new_interval(
+ int old_acting_primary,
+ int new_acting_primary,
+ const std::vector<int> &old_acting,
+ const std::vector<int> &new_acting,
+ int old_up_primary,
+ int new_up_primary,
+ const std::vector<int> &old_up,
+ const std::vector<int> &new_up,
+ int old_size,
+ int new_size,
+ int old_min_size,
+ int new_min_size,
+ unsigned old_pg_num,
+ unsigned new_pg_num,
+ unsigned old_pg_num_pending,
+ unsigned new_pg_num_pending,
+ bool old_sort_bitwise,
+ bool new_sort_bitwise,
+ bool old_recovery_deletes,
+ bool new_recovery_deletes,
+ uint32_t old_crush_count,
+ uint32_t new_crush_count,
+ uint32_t old_crush_target,
+ uint32_t new_crush_target,
+ uint32_t old_crush_barrier,
+ uint32_t new_crush_barrier,
+ int32_t old_crush_member,
+ int32_t new_crush_member,
+ pg_t pgid
+ );
+
+ /**
+ * Determines whether there is an interval change
+ */
+ static bool is_new_interval(
+ int old_acting_primary, ///< [in] primary as of lastmap
+ int new_acting_primary, ///< [in] primary as of osdmap
+ const std::vector<int> &old_acting, ///< [in] acting as of lastmap
+ const std::vector<int> &new_acting, ///< [in] acting as of osdmap
+ int old_up_primary, ///< [in] up primary of lastmap
+ int new_up_primary, ///< [in] up primary of osdmap
+ const std::vector<int> &old_up, ///< [in] up as of lastmap
+ const std::vector<int> &new_up, ///< [in] up as of osdmap
+ const OSDMap *osdmap, ///< [in] current map
+ const OSDMap *lastmap, ///< [in] last map
+ pg_t pgid ///< [in] pgid for pg
+ );
+
+ /**
+ * Integrates a new map into *past_intervals, returns true
+ * if an interval was closed out.
+ */
+ static bool check_new_interval(
+ int old_acting_primary, ///< [in] primary as of lastmap
+ int new_acting_primary, ///< [in] primary as of osdmap
+ const std::vector<int> &old_acting, ///< [in] acting as of lastmap
+ const std::vector<int> &new_acting, ///< [in] acting as of osdmap
+ int old_up_primary, ///< [in] up primary of lastmap
+ int new_up_primary, ///< [in] up primary of osdmap
+ const std::vector<int> &old_up, ///< [in] up as of lastmap
+ const std::vector<int> &new_up, ///< [in] up as of osdmap
+ epoch_t same_interval_since, ///< [in] as of osdmap
+ epoch_t last_epoch_clean, ///< [in] current
+ const OSDMap *osdmap, ///< [in] current map
+ const OSDMap *lastmap, ///< [in] last map
+ pg_t pgid, ///< [in] pgid for pg
+ const IsPGRecoverablePredicate &could_have_gone_active, ///< [in] predicate whether the pg can be active
+ PastIntervals *past_intervals, ///< [out] intervals
+ std::ostream *out = 0 ///< [out] debug ostream
+ );
+ static bool check_new_interval(
+ int old_acting_primary, ///< [in] primary as of lastmap
+ int new_acting_primary, ///< [in] primary as of osdmap
+ const std::vector<int> &old_acting, ///< [in] acting as of lastmap
+ const std::vector<int> &new_acting, ///< [in] acting as of osdmap
+ int old_up_primary, ///< [in] up primary of lastmap
+ int new_up_primary, ///< [in] up primary of osdmap
+ const std::vector<int> &old_up, ///< [in] up as of lastmap
+ const std::vector<int> &new_up, ///< [in] up as of osdmap
+ epoch_t same_interval_since, ///< [in] as of osdmap
+ epoch_t last_epoch_clean, ///< [in] current
+ OSDMapRef osdmap, ///< [in] current map
+ OSDMapRef lastmap, ///< [in] last map
+ pg_t pgid, ///< [in] pgid for pg
+ const IsPGRecoverablePredicate &could_have_gone_active, ///< [in] predicate whether the pg can be active
+ PastIntervals *past_intervals, ///< [out] intervals
+ std::ostream *out = 0 ///< [out] debug ostream
+ ) {
+ return check_new_interval(
+ old_acting_primary, new_acting_primary,
+ old_acting, new_acting,
+ old_up_primary, new_up_primary,
+ old_up, new_up,
+ same_interval_since, last_epoch_clean,
+ osdmap.get(), lastmap.get(),
+ pgid,
+ could_have_gone_active,
+ past_intervals,
+ out);
+ }
+
+ friend std::ostream& operator<<(std::ostream& out, const PastIntervals &i);
+
+ template <typename F>
+ void iterate_mayberw_back_to(
+ epoch_t les,
+ F &&f) const {
+ ceph_assert(past_intervals);
+ past_intervals->iterate_mayberw_back_to(les, std::forward<F>(f));
+ }
+ void clear() {
+ ceph_assert(past_intervals);
+ past_intervals->clear();
+ }
+
+ /**
+ * Should return a value which gives an indication of the amount
+ * of state contained
+ */
+ size_t size() const {
+ ceph_assert(past_intervals);
+ return past_intervals->size();
+ }
+
+ bool empty() const {
+ ceph_assert(past_intervals);
+ return past_intervals->empty();
+ }
+
+ void swap(PastIntervals &other) {
+ using std::swap;
+ swap(other.past_intervals, past_intervals);
+ }
+
+ /**
+ * Return all shards which have been in the acting set back to the
+ * latest epoch to which we have trimmed except for pg_whoami
+ */
+ std::set<pg_shard_t> get_might_have_unfound(
+ pg_shard_t pg_whoami,
+ bool ec_pool) const {
+ ceph_assert(past_intervals);
+ auto ret = past_intervals->get_all_participants(ec_pool);
+ ret.erase(pg_whoami);
+ return ret;
+ }
+
+ /**
+ * Return all shards which we might want to talk to for peering
+ */
+ std::set<pg_shard_t> get_all_probe(
+ bool ec_pool) const {
+ ceph_assert(past_intervals);
+ return past_intervals->get_all_participants(ec_pool);
+ }
+
+ /* Return the set of epochs [start, end) represented by the
+ * past_interval set.
+ */
+ std::pair<epoch_t, epoch_t> get_bounds() const {
+ ceph_assert(past_intervals);
+ return past_intervals->get_bounds();
+ }
+
+ void adjust_start_backwards(epoch_t last_epoch_clean) {
+ ceph_assert(past_intervals);
+ past_intervals->adjust_start_backwards(last_epoch_clean);
+ }
+
+ enum osd_state_t {
+ UP,
+ DOWN,
+ DNE,
+ LOST
+ };
+ struct PriorSet {
+ bool ec_pool = false;
+ std::set<pg_shard_t> probe; ///< current+prior OSDs we need to probe.
+ std::set<int> down; ///< down osds that would normally be in @a probe and might be interesting.
+ std::map<int, epoch_t> blocked_by; ///< current lost_at values for any OSDs in cur set for which (re)marking them lost would affect cur set
+
+ bool pg_down = false; ///< some down osds are included in @a cur; the DOWN pg state bit should be set.
+ const IsPGRecoverablePredicate* pcontdec = nullptr;
+
+ PriorSet() = default;
+ PriorSet(PriorSet &&) = default;
+ PriorSet &operator=(PriorSet &&) = default;
+
+ PriorSet &operator=(const PriorSet &) = delete;
+ PriorSet(const PriorSet &) = delete;
+
+ bool operator==(const PriorSet &rhs) const {
+ return (ec_pool == rhs.ec_pool) &&
+ (probe == rhs.probe) &&
+ (down == rhs.down) &&
+ (blocked_by == rhs.blocked_by) &&
+ (pg_down == rhs.pg_down);
+ }
+
+ bool affected_by_map(
+ const OSDMap &osdmap,
+ const DoutPrefixProvider *dpp) const;
+
+ // For verifying tests
+ PriorSet(
+ bool ec_pool,
+ std::set<pg_shard_t> probe,
+ std::set<int> down,
+ std::map<int, epoch_t> blocked_by,
+ bool pg_down,
+ const IsPGRecoverablePredicate *pcontdec)
+ : ec_pool(ec_pool), probe(probe), down(down), blocked_by(blocked_by),
+ pg_down(pg_down), pcontdec(pcontdec) {}
+
+ private:
+ template <typename F>
+ PriorSet(
+ const PastIntervals &past_intervals,
+ bool ec_pool,
+ epoch_t last_epoch_started,
+ const IsPGRecoverablePredicate *c,
+ F f,
+ const std::vector<int> &up,
+ const std::vector<int> &acting,
+ const DoutPrefixProvider *dpp);
+
+ friend class PastIntervals;
+ };
+
+ template <typename... Args>
+ PriorSet get_prior_set(Args&&... args) const {
+ return PriorSet(*this, std::forward<Args>(args)...);
+ }
+};
+WRITE_CLASS_ENCODER(PastIntervals)
+
+std::ostream& operator<<(std::ostream& out, const PastIntervals::pg_interval_t& i);
+std::ostream& operator<<(std::ostream& out, const PastIntervals &i);
+std::ostream& operator<<(std::ostream& out, const PastIntervals::PriorSet &i);
+
+template <typename F>
+PastIntervals::PriorSet::PriorSet(
+ const PastIntervals &past_intervals,
+ bool ec_pool,
+ epoch_t last_epoch_started,
+ const IsPGRecoverablePredicate *c,
+ F f,
+ const std::vector<int> &up,
+ const std::vector<int> &acting,
+ const DoutPrefixProvider *dpp)
+ : ec_pool(ec_pool), pg_down(false), pcontdec(c)
+{
+ /*
+ * We have to be careful to gracefully deal with situations like
+ * so. Say we have a power outage or something that takes out both
+ * OSDs, but the monitor doesn't mark them down in the same epoch.
+ * The history may look like
+ *
+ * 1: A B
+ * 2: B
+ * 3: let's say B dies for good, too (say, from the power spike)
+ * 4: A
+ *
+ * which makes it look like B may have applied updates to the PG
+ * that we need in order to proceed. This sucks...
+ *
+ * To minimize the risk of this happening, we CANNOT go active if
+ * _any_ OSDs in the prior set are down until we send an MOSDAlive
+ * to the monitor such that the OSDMap sets osd_up_thru to an epoch.
+ * Then, we have something like
+ *
+ * 1: A B
+ * 2: B up_thru[B]=0
+ * 3:
+ * 4: A
+ *
+ * -> we can ignore B, bc it couldn't have gone active (alive_thru
+ * still 0).
+ *
+ * or,
+ *
+ * 1: A B
+ * 2: B up_thru[B]=0
+ * 3: B up_thru[B]=2
+ * 4:
+ * 5: A
+ *
+ * -> we must wait for B, bc it was alive through 2, and could have
+ * written to the pg.
+ *
+ * If B is really dead, then an administrator will need to manually
+ * intervene by marking the OSD as "lost."
+ */
+
+ // Include current acting and up nodes... not because they may
+ // contain old data (this interval hasn't gone active, obviously),
+ // but because we want their pg_info to inform choose_acting(), and
+ // so that we know what they do/do not have explicitly before
+ // sending them any new info/logs/whatever.
+ for (unsigned i = 0; i < acting.size(); i++) {
+ if (acting[i] != pg_pool_t::pg_CRUSH_ITEM_NONE)
+ probe.insert(pg_shard_t(acting[i], ec_pool ? shard_id_t(i) : shard_id_t::NO_SHARD));
+ }
+ // It may be possible to exclude the up nodes, but let's keep them in
+ // there for now.
+ for (unsigned i = 0; i < up.size(); i++) {
+ if (up[i] != pg_pool_t::pg_CRUSH_ITEM_NONE)
+ probe.insert(pg_shard_t(up[i], ec_pool ? shard_id_t(i) : shard_id_t::NO_SHARD));
+ }
+
+ std::set<pg_shard_t> all_probe = past_intervals.get_all_probe(ec_pool);
+ ldpp_dout(dpp, 10) << "build_prior all_probe " << all_probe << dendl;
+ for (auto &&i: all_probe) {
+ switch (f(0, i.osd, nullptr)) {
+ case UP: {
+ probe.insert(i);
+ break;
+ }
+ case DNE:
+ case LOST:
+ case DOWN: {
+ down.insert(i.osd);
+ break;
+ }
+ }
+ }
+
+ past_intervals.iterate_mayberw_back_to(
+ last_epoch_started,
+ [&](epoch_t start, const std::set<pg_shard_t> &acting) {
+ ldpp_dout(dpp, 10) << "build_prior maybe_rw interval:" << start
+ << ", acting: " << acting << dendl;
+
+ // look at candidate osds during this interval. each falls into
+ // one of three categories: up, down (but potentially
+ // interesting), or lost (down, but we won't wait for it).
+ std::set<pg_shard_t> up_now;
+ std::map<int, epoch_t> candidate_blocked_by;
+ // any candidates down now (that might have useful data)
+ bool any_down_now = false;
+
+ // consider ACTING osds
+ for (auto &&so: acting) {
+ epoch_t lost_at = 0;
+ switch (f(start, so.osd, &lost_at)) {
+ case UP: {
+ // include past acting osds if they are up.
+ up_now.insert(so);
+ break;
+ }
+ case DNE: {
+ ldpp_dout(dpp, 10) << "build_prior prior osd." << so.osd
+ << " no longer exists" << dendl;
+ break;
+ }
+ case LOST: {
+ ldpp_dout(dpp, 10) << "build_prior prior osd." << so.osd
+ << " is down, but lost_at " << lost_at << dendl;
+ up_now.insert(so);
+ break;
+ }
+ case DOWN: {
+ ldpp_dout(dpp, 10) << "build_prior prior osd." << so.osd
+ << " is down" << dendl;
+ candidate_blocked_by[so.osd] = lost_at;
+ any_down_now = true;
+ break;
+ }
+ }
+ }
+
+ // if not enough osds survived this interval, and we may have gone rw,
+ // then we need to wait for one of those osds to recover to
+ // ensure that we haven't lost any information.
+ if (!(*pcontdec)(up_now) && any_down_now) {
+ // fixme: how do we identify a "clean" shutdown anyway?
+ ldpp_dout(dpp, 10) << "build_prior possibly went active+rw,"
+ << " insufficient up; including down osds" << dendl;
+ ceph_assert(!candidate_blocked_by.empty());
+ pg_down = true;
+ blocked_by.insert(
+ candidate_blocked_by.begin(),
+ candidate_blocked_by.end());
+ }
+ });
+
+ ldpp_dout(dpp, 10) << "build_prior final: probe " << probe
+ << " down " << down
+ << " blocked_by " << blocked_by
+ << (pg_down ? " pg_down":"")
+ << dendl;
+}
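+
+// Illustrative sketch (editor's addition, not part of the upstream source):
+// the core up_thru rule from the comment above, expressed as a standalone
+// predicate. The interval start epoch and the OSD's recorded up_thru are
+// assumed to come from the relevant OSDMaps; the real decision lives in
+// PastIntervals::check_new_interval() and this constructor.
+inline bool example_prior_interval_may_have_gone_rw(epoch_t interval_start,
+                                                    epoch_t up_thru) {
+  // If the OSD's up_thru never reached the start of the interval, it could
+  // not have completed peering in that interval, so it cannot hold writes
+  // we would otherwise have to wait for.
+  return up_thru >= interval_start;
+}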
+
+struct pg_notify_t {
+ epoch_t query_epoch;
+ epoch_t epoch_sent;
+ pg_info_t info;
+ shard_id_t to;
+ shard_id_t from;
+ PastIntervals past_intervals;
+ pg_notify_t() :
+ query_epoch(0), epoch_sent(0), to(shard_id_t::NO_SHARD),
+ from(shard_id_t::NO_SHARD) {}
+ pg_notify_t(
+ shard_id_t to,
+ shard_id_t from,
+ epoch_t query_epoch,
+ epoch_t epoch_sent,
+ const pg_info_t &info,
+ const PastIntervals& pi)
+ : query_epoch(query_epoch),
+ epoch_sent(epoch_sent),
+ info(info), to(to), from(from),
+ past_intervals(pi) {
+ ceph_assert(from == info.pgid.shard);
+ }
+ void encode(ceph::buffer::list &bl) const;
+ void decode(ceph::buffer::list::const_iterator &p);
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<pg_notify_t*> &o);
+};
+WRITE_CLASS_ENCODER(pg_notify_t)
+std::ostream &operator<<(std::ostream &lhs, const pg_notify_t &notify);
+
+
+/**
+ * pg_query_t - used to ask a peer for information about a pg.
+ *
+ * note: if version=0, type=LOG, then we just provide our full log.
+ */
+struct pg_query_t {
+ enum {
+ INFO = 0,
+ LOG = 1,
+ MISSING = 4,
+ FULLLOG = 5,
+ };
+ std::string_view get_type_name() const {
+ switch (type) {
+ case INFO: return "info";
+ case LOG: return "log";
+ case MISSING: return "missing";
+ case FULLLOG: return "fulllog";
+ default: return "???";
+ }
+ }
+
+ __s32 type;
+ eversion_t since;
+ pg_history_t history;
+ epoch_t epoch_sent;
+ shard_id_t to;
+ shard_id_t from;
+
+ pg_query_t() : type(-1), epoch_sent(0), to(shard_id_t::NO_SHARD),
+ from(shard_id_t::NO_SHARD) {}
+ pg_query_t(
+ int t,
+ shard_id_t to,
+ shard_id_t from,
+ const pg_history_t& h,
+ epoch_t epoch_sent)
+ : type(t),
+ history(h),
+ epoch_sent(epoch_sent),
+ to(to), from(from) {
+ ceph_assert(t != LOG);
+ }
+ pg_query_t(
+ int t,
+ shard_id_t to,
+ shard_id_t from,
+ eversion_t s,
+ const pg_history_t& h,
+ epoch_t epoch_sent)
+ : type(t), since(s), history(h),
+ epoch_sent(epoch_sent), to(to), from(from) {
+ ceph_assert(t == LOG);
+ }
+
+ void encode(ceph::buffer::list &bl, uint64_t features) const;
+ void decode(ceph::buffer::list::const_iterator &bl);
+
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<pg_query_t*>& o);
+};
+WRITE_CLASS_ENCODER_FEATURES(pg_query_t)
+
+inline std::ostream& operator<<(std::ostream& out, const pg_query_t& q) {
+ out << "query(" << q.get_type_name() << " " << q.since;
+ if (q.type == pg_query_t::LOG)
+ out << " " << q.history;
+ out << " epoch_sent " << q.epoch_sent;
+ out << ")";
+ return out;
+}
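+
+// Illustrative sketch (editor's addition): constructing the two query
+// flavours described above. The shard ids, history and epoch values are
+// placeholders for whatever the caller's peering state provides.
+inline pg_query_t example_info_query(const pg_history_t& h, epoch_t sent) {
+  return pg_query_t(pg_query_t::INFO, shard_id_t::NO_SHARD,
+                    shard_id_t::NO_SHARD, h, sent);
+}
+inline pg_query_t example_log_query(eversion_t since, const pg_history_t& h,
+                                    epoch_t sent) {
+  // LOG queries must use the constructor that takes 'since'; a 'since' of
+  // eversion_t() (0'0) asks the peer for its full log, per the note above.
+  return pg_query_t(pg_query_t::LOG, shard_id_t::NO_SHARD,
+                    shard_id_t::NO_SHARD, since, h, sent);
+}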
+
+/**
+ * pg_lease_t - readable lease metadata, from primary -> non-primary
+ *
+ * This metadata serves to increase either or both of the lease expiration
+ * and upper bound on the non-primary.
+ */
+struct pg_lease_t {
+ /// pg readable_until value; replicas must not be readable beyond this
+ ceph::signedspan readable_until = ceph::signedspan::zero();
+
+ /// upper bound on any acting osd's readable_until
+ ceph::signedspan readable_until_ub = ceph::signedspan::zero();
+
+ /// duration of the lease (in case clock deltas aren't available)
+ ceph::signedspan interval = ceph::signedspan::zero();
+
+ pg_lease_t() {}
+ pg_lease_t(ceph::signedspan ru, ceph::signedspan ruub,
+ ceph::signedspan i)
+ : readable_until(ru),
+ readable_until_ub(ruub),
+ interval(i) {}
+
+ void encode(ceph::buffer::list &bl) const;
+ void decode(ceph::buffer::list::const_iterator &bl);
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<pg_lease_t*>& o);
+
+ friend std::ostream& operator<<(std::ostream& out, const pg_lease_t& l) {
+ return out << "pg_lease(ru " << l.readable_until
+ << " ub " << l.readable_until_ub
+ << " int " << l.interval << ")";
+ }
+};
+WRITE_CLASS_ENCODER(pg_lease_t)
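+
+// Illustrative sketch (editor's addition): how a primary might fill in a
+// lease from a monotonic "now" plus the configured lease interval. 'now'
+// is assumed to already be expressed as a ceph::signedspan offset; the
+// actual lease arithmetic lives in PeeringState.
+inline pg_lease_t example_make_lease(ceph::signedspan now,
+                                     ceph::signedspan lease_interval) {
+  return pg_lease_t(now + lease_interval,   // readable_until
+                    now + lease_interval,   // readable_until_ub
+                    lease_interval);
+}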
+
+/**
+ * pg_lease_ack_t - lease ack, from non-primary -> primary
+ *
+ * This metadata acknowledges to the primary what a non-primary's noted
+ * upper bound is.
+ */
+struct pg_lease_ack_t {
+ /// highest upper bound non-primary has recorded (primary's clock)
+ ceph::signedspan readable_until_ub = ceph::signedspan::zero();
+
+ pg_lease_ack_t() {}
+ pg_lease_ack_t(ceph::signedspan ub)
+ : readable_until_ub(ub) {}
+
+ void encode(ceph::buffer::list &bl) const;
+ void decode(ceph::buffer::list::const_iterator &bl);
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<pg_lease_ack_t*>& o);
+
+ friend std::ostream& operator<<(std::ostream& out, const pg_lease_ack_t& l) {
+ return out << "pg_lease_ack(ruub " << l.readable_until_ub << ")";
+ }
+};
+WRITE_CLASS_ENCODER(pg_lease_ack_t)
+
+
+
+class PGBackend;
+class ObjectModDesc {
+ bool can_local_rollback;
+ bool rollback_info_completed;
+
+ // version required to decode, reflected in encode/decode version
+ __u8 max_required_version = 1;
+public:
+ class Visitor {
+ public:
+ virtual void append(uint64_t old_offset) {}
+ virtual void setattrs(std::map<std::string, std::optional<ceph::buffer::list>> &attrs) {}
+ virtual void rmobject(version_t old_version) {}
+ /**
+ * Used to support the unfound_lost_delete log event: if the stashed
+ * version exists, we unstash it; otherwise, we do nothing. This way
+ * each replica rolls back to whatever state it had prior to the
+ * attempted mark-unfound-lost-delete.
+ */
+ virtual void try_rmobject(version_t old_version) {
+ rmobject(old_version);
+ }
+ virtual void create() {}
+ virtual void update_snaps(const std::set<snapid_t> &old_snaps) {}
+ virtual void rollback_extents(
+ version_t gen,
+ const std::vector<std::pair<uint64_t, uint64_t> > &extents) {}
+ virtual ~Visitor() {}
+ };
+ void visit(Visitor *visitor) const;
+ mutable ceph::buffer::list bl;
+ enum ModID {
+ APPEND = 1,
+ SETATTRS = 2,
+ DELETE = 3,
+ CREATE = 4,
+ UPDATE_SNAPS = 5,
+ TRY_DELETE = 6,
+ ROLLBACK_EXTENTS = 7
+ };
+ ObjectModDesc() : can_local_rollback(true), rollback_info_completed(false) {
+ bl.reassign_to_mempool(mempool::mempool_osd_pglog);
+ }
+ void claim(ObjectModDesc &other) {
+ bl = std::move(other.bl);
+ can_local_rollback = other.can_local_rollback;
+ rollback_info_completed = other.rollback_info_completed;
+ }
+ void claim_append(ObjectModDesc &other) {
+ if (!can_local_rollback || rollback_info_completed)
+ return;
+ if (!other.can_local_rollback) {
+ mark_unrollbackable();
+ return;
+ }
+ bl.claim_append(other.bl);
+ rollback_info_completed = other.rollback_info_completed;
+ }
+ void swap(ObjectModDesc &other) {
+ bl.swap(other.bl);
+
+ using std::swap;
+ swap(other.can_local_rollback, can_local_rollback);
+ swap(other.rollback_info_completed, rollback_info_completed);
+ swap(other.max_required_version, max_required_version);
+ }
+ void append_id(ModID id) {
+ using ceph::encode;
+ uint8_t _id(id);
+ encode(_id, bl);
+ }
+ void append(uint64_t old_size) {
+ if (!can_local_rollback || rollback_info_completed)
+ return;
+ ENCODE_START(1, 1, bl);
+ append_id(APPEND);
+ encode(old_size, bl);
+ ENCODE_FINISH(bl);
+ }
+ void setattrs(std::map<std::string, std::optional<ceph::buffer::list>> &old_attrs) {
+ if (!can_local_rollback || rollback_info_completed)
+ return;
+ ENCODE_START(1, 1, bl);
+ append_id(SETATTRS);
+ encode(old_attrs, bl);
+ ENCODE_FINISH(bl);
+ }
+ bool rmobject(version_t deletion_version) {
+ if (!can_local_rollback || rollback_info_completed)
+ return false;
+ ENCODE_START(1, 1, bl);
+ append_id(DELETE);
+ encode(deletion_version, bl);
+ ENCODE_FINISH(bl);
+ rollback_info_completed = true;
+ return true;
+ }
+ bool try_rmobject(version_t deletion_version) {
+ if (!can_local_rollback || rollback_info_completed)
+ return false;
+ ENCODE_START(1, 1, bl);
+ append_id(TRY_DELETE);
+ encode(deletion_version, bl);
+ ENCODE_FINISH(bl);
+ rollback_info_completed = true;
+ return true;
+ }
+ void create() {
+ if (!can_local_rollback || rollback_info_completed)
+ return;
+ rollback_info_completed = true;
+ ENCODE_START(1, 1, bl);
+ append_id(CREATE);
+ ENCODE_FINISH(bl);
+ }
+ void update_snaps(const std::set<snapid_t> &old_snaps) {
+ if (!can_local_rollback || rollback_info_completed)
+ return;
+ ENCODE_START(1, 1, bl);
+ append_id(UPDATE_SNAPS);
+ encode(old_snaps, bl);
+ ENCODE_FINISH(bl);
+ }
+ void rollback_extents(
+ version_t gen, const std::vector<std::pair<uint64_t, uint64_t> > &extents) {
+ ceph_assert(can_local_rollback);
+ ceph_assert(!rollback_info_completed);
+ if (max_required_version < 2)
+ max_required_version = 2;
+ ENCODE_START(2, 2, bl);
+ append_id(ROLLBACK_EXTENTS);
+ encode(gen, bl);
+ encode(extents, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ // cannot be rolled back
+ void mark_unrollbackable() {
+ can_local_rollback = false;
+ bl.clear();
+ }
+ bool can_rollback() const {
+ return can_local_rollback;
+ }
+ bool empty() const {
+ return can_local_rollback && (bl.length() == 0);
+ }
+
+ bool requires_kraken() const {
+ return max_required_version >= 2;
+ }
+
+ /**
+ * Create fresh copy of bl bytes to avoid keeping large buffers around
+ * in the case that bl contains ptrs which point into a much larger
+ * message buffer
+ */
+ void trim_bl() const {
+ if (bl.length() > 0)
+ bl.rebuild();
+ }
+ void encode(ceph::buffer::list &bl) const;
+ void decode(ceph::buffer::list::const_iterator &bl);
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<ObjectModDesc*>& o);
+};
+WRITE_CLASS_ENCODER(ObjectModDesc)
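+
+// Illustrative sketch (editor's addition): recording a rollback-able append
+// and replaying it through a custom Visitor, as described above. The
+// visitor type is purely an example and is not used anywhere in the OSD.
+struct ExampleAppendVisitor : public ObjectModDesc::Visitor {
+  uint64_t rollback_to_size = 0;
+  void append(uint64_t old_size) override { rollback_to_size = old_size; }
+};
+inline uint64_t example_objectmoddesc_roundtrip() {
+  ObjectModDesc desc;
+  desc.append(4096);            // object was 4096 bytes before the write
+  ExampleAppendVisitor v;
+  desc.visit(&v);               // replays the recorded APPEND entry
+  return v.rollback_to_size;    // 4096
+}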
+
+class ObjectCleanRegions {
+private:
+ bool new_object;
+ bool clean_omap;
+ interval_set<uint64_t> clean_offsets;
+ static std::atomic<uint32_t> max_num_intervals;
+
+ /**
+ * trim the number of intervals if clean_offsets.num_intervals()
+ * exceeds the given upper bound max_num_intervals.
+ * e.g. with max_num_intervals=2 and clean_offsets:{[5~10], [20~5]},
+ * a new interval [30~10] will evict the shortest one [20~5],
+ * so clean_offsets becomes {[5~10], [30~10]}
+ */
+ void trim();
+ friend std::ostream& operator<<(std::ostream& out, const ObjectCleanRegions& ocr);
+public:
+ ObjectCleanRegions() : new_object(false), clean_omap(true) {
+ clean_offsets.insert(0, (uint64_t)-1);
+ }
+ ObjectCleanRegions(uint64_t offset, uint64_t len, bool co)
+ : new_object(false), clean_omap(co) {
+ clean_offsets.insert(offset, len);
+ }
+ bool operator==(const ObjectCleanRegions &orc) const {
+ return new_object == orc.new_object && clean_omap == orc.clean_omap && clean_offsets == orc.clean_offsets;
+ }
+ static void set_max_num_intervals(uint32_t num);
+ void merge(const ObjectCleanRegions &other);
+ void mark_data_region_dirty(uint64_t offset, uint64_t len);
+ void mark_omap_dirty();
+ void mark_object_new();
+ void mark_fully_dirty();
+ interval_set<uint64_t> get_dirty_regions() const;
+ bool omap_is_dirty() const;
+ bool object_is_exist() const;
+ bool is_clean_region(uint64_t offset, uint64_t len) const;
+
+ void encode(ceph::buffer::list &bl) const;
+ void decode(ceph::buffer::list::const_iterator &bl);
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<ObjectCleanRegions*>& o);
+};
+WRITE_CLASS_ENCODER(ObjectCleanRegions)
+std::ostream& operator<<(std::ostream& out, const ObjectCleanRegions& ocr);
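+
+// Illustrative sketch (editor's addition): typical ObjectCleanRegions
+// bookkeeping. The offsets and lengths are arbitrary example values.
+inline bool example_clean_regions_query() {
+  ObjectCleanRegions cr;                  // everything starts out clean
+  cr.mark_data_region_dirty(4096, 8192);  // a write touched [4096, 12288)
+  cr.mark_omap_dirty();
+  // byte ranges never written can still be skipped during recovery
+  return cr.is_clean_region(0, 4096) && !cr.is_clean_region(4096, 100);
+}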
+
+
+struct OSDOp {
+ ceph_osd_op op;
+ sobject_t soid;
+
+ ceph::buffer::list indata, outdata;
+ errorcode32_t rval = 0;
+
+ OSDOp() {
+ // FIPS zeroization audit 20191115: this memset clean for security
+ memset(&op, 0, sizeof(ceph_osd_op));
+ }
+
+ OSDOp(const int op_code) {
+ // FIPS zeroization audit 20191115: this memset clean for security
+ memset(&op, 0, sizeof(ceph_osd_op));
+ op.op = op_code;
+ }
+
+ /**
+ * split a ceph::buffer::list into constituent indata members of a vector of OSDOps
+ *
+ * @param ops [out] vector of OSDOps
+ * @param in [in] combined data buffer
+ */
+ template<typename V>
+ static void split_osd_op_vector_in_data(V& ops,
+ ceph::buffer::list& in) {
+ ceph::buffer::list::iterator datap = in.begin();
+ for (unsigned i = 0; i < ops.size(); i++) {
+ if (ops[i].op.payload_len) {
+ datap.copy(ops[i].op.payload_len, ops[i].indata);
+ }
+ }
+ }
+
+ /**
+ * merge indata members of a vector of OSDOp into a single ceph::buffer::list
+ *
+ * Notably this also encodes certain other OSDOp data into the data
+ * buffer, including the sobject_t soid.
+ *
+ * @param ops [in] vector of OSDOps
+ * @param out [out] combined data buffer
+ */
+ template<typename V>
+ static void merge_osd_op_vector_in_data(V& ops, ceph::buffer::list& out) {
+ for (unsigned i = 0; i < ops.size(); i++) {
+ if (ops[i].indata.length()) {
+ ops[i].op.payload_len = ops[i].indata.length();
+ out.append(ops[i].indata);
+ }
+ }
+ }
+
+ /**
+ * split a ceph::buffer::list into constituent outdata members of a vector of OSDOps
+ *
+ * @param ops [out] vector of OSDOps
+ * @param in [in] combined data buffer
+ */
+ static void split_osd_op_vector_out_data(std::vector<OSDOp>& ops, ceph::buffer::list& in);
+
+ /**
+ * merge outdata members of a vector of OSDOps into a single ceph::buffer::list
+ *
+ * @param ops [in] vector of OSDOps
+ * @param out [out] combined data buffer
+ */
+ static void merge_osd_op_vector_out_data(std::vector<OSDOp>& ops, ceph::buffer::list& out);
+
+ /**
+ * Clear data as much as possible, leave minimal data for historical op dump
+ *
+ * @param ops [in] vector of OSDOps
+ */
+ template<typename V>
+ static void clear_data(V& ops) {
+ for (unsigned i = 0; i < ops.size(); i++) {
+ OSDOp& op = ops[i];
+ op.outdata.clear();
+ if (ceph_osd_op_type_attr(op.op.op) &&
+ op.op.xattr.name_len &&
+ op.indata.length() >= op.op.xattr.name_len) {
+ ceph::buffer::list bl;
+ bl.push_back(ceph::buffer::ptr_node::create(op.op.xattr.name_len));
+ bl.begin().copy_in(op.op.xattr.name_len, op.indata);
+ op.indata = std::move(bl);
+ } else if (ceph_osd_op_type_exec(op.op.op) &&
+ op.op.cls.class_len &&
+ op.indata.length() >
+ (op.op.cls.class_len + op.op.cls.method_len)) {
+ __u8 len = op.op.cls.class_len + op.op.cls.method_len;
+ ceph::buffer::list bl;
+ bl.push_back(ceph::buffer::ptr_node::create(len));
+ bl.begin().copy_in(len, op.indata);
+ op.indata = std::move(bl);
+ } else {
+ op.indata.clear();
+ }
+ }
+ }
+};
+std::ostream& operator<<(std::ostream& out, const OSDOp& op);
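+
+// Illustrative sketch (editor's addition): round-tripping per-op input
+// payloads through the merge/split helpers documented above. The payload
+// contents are placeholders.
+inline void example_osd_op_payload_roundtrip() {
+  std::vector<OSDOp> ops(2);
+  ops[0].indata.append("hello", 5);
+  ops[1].indata.append("world", 5);
+  ceph::buffer::list combined;
+  OSDOp::merge_osd_op_vector_in_data(ops, combined);  // also sets payload_len
+  std::vector<OSDOp> decoded(2);
+  decoded[0].op.payload_len = ops[0].op.payload_len;
+  decoded[1].op.payload_len = ops[1].op.payload_len;
+  OSDOp::split_osd_op_vector_in_data(decoded, combined);
+  // decoded[i].indata now holds the same bytes as ops[i].indata
+}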
+
+struct pg_log_op_return_item_t {
+ int32_t rval;
+ ceph::buffer::list bl;
+ void encode(ceph::buffer::list& p) const {
+ using ceph::encode;
+ encode(rval, p);
+ encode(bl, p);
+ }
+ void decode(ceph::buffer::list::const_iterator& p) {
+ using ceph::decode;
+ decode(rval, p);
+ decode(bl, p);
+ }
+ void dump(ceph::Formatter *f) const {
+ f->dump_int("rval", rval);
+ f->dump_unsigned("bl_length", bl.length());
+ }
+ friend bool operator==(const pg_log_op_return_item_t& lhs,
+ const pg_log_op_return_item_t& rhs) {
+ return lhs.rval == rhs.rval &&
+ lhs.bl.contents_equal(rhs.bl);
+ }
+ friend bool operator!=(const pg_log_op_return_item_t& lhs,
+ const pg_log_op_return_item_t& rhs) {
+ return !(lhs == rhs);
+ }
+ friend std::ostream& operator<<(std::ostream& out, const pg_log_op_return_item_t& i) {
+ return out << "r=" << i.rval << "+" << i.bl.length() << "b";
+ }
+};
+WRITE_CLASS_ENCODER(pg_log_op_return_item_t)
+
+/**
+ * pg_log_entry_t - single entry/event in pg log
+ *
+ */
+struct pg_log_entry_t {
+ enum {
+ MODIFY = 1, // some unspecified modification (but not *all* modifications)
+ CLONE = 2, // cloned object from head
+ DELETE = 3, // deleted object
+ //BACKLOG = 4, // event invented by generate_backlog [obsolete]
+ LOST_REVERT = 5, // lost new version, revert to an older version.
+ LOST_DELETE = 6, // lost new version, revert to no object (deleted).
+ LOST_MARK = 7, // lost new version, now EIO
+ PROMOTE = 8, // promoted object from another tier
+ CLEAN = 9, // mark an object clean
+ ERROR = 10, // write that returned an error
+ };
+ static const char *get_op_name(int op) {
+ switch (op) {
+ case MODIFY:
+ return "modify";
+ case PROMOTE:
+ return "promote";
+ case CLONE:
+ return "clone";
+ case DELETE:
+ return "delete";
+ case LOST_REVERT:
+ return "l_revert";
+ case LOST_DELETE:
+ return "l_delete";
+ case LOST_MARK:
+ return "l_mark";
+ case CLEAN:
+ return "clean";
+ case ERROR:
+ return "error";
+ default:
+ return "unknown";
+ }
+ }
+ const char *get_op_name() const {
+ return get_op_name(op);
+ }
+
+ // describes state for a locally-rollbackable entry
+ ObjectModDesc mod_desc;
+ ceph::buffer::list snaps; // only for clone entries
+ hobject_t soid;
+ osd_reqid_t reqid; // caller+tid to uniquely identify request
+ mempool::osd_pglog::vector<std::pair<osd_reqid_t, version_t> > extra_reqids;
+
+ /// map extra_reqids by index to error return code (if any)
+ mempool::osd_pglog::map<uint32_t, int> extra_reqid_return_codes;
+
+ eversion_t version, prior_version, reverting_to;
+ version_t user_version; // the user version for this entry
+ utime_t mtime; // this is the _user_ mtime, mind you
+ int32_t return_code; // only stored for ERRORs for dup detection
+
+ std::vector<pg_log_op_return_item_t> op_returns;
+
+ __s32 op;
+ bool invalid_hash; // only when decoding sobject_t based entries
+ bool invalid_pool; // only when decoding pool-less hobject based entries
+ ObjectCleanRegions clean_regions;
+
+ pg_log_entry_t()
+ : user_version(0), return_code(0), op(0),
+ invalid_hash(false), invalid_pool(false) {
+ snaps.reassign_to_mempool(mempool::mempool_osd_pglog);
+ }
+ pg_log_entry_t(int _op, const hobject_t& _soid,
+ const eversion_t& v, const eversion_t& pv,
+ version_t uv,
+ const osd_reqid_t& rid, const utime_t& mt,
+ int return_code)
+ : soid(_soid), reqid(rid), version(v), prior_version(pv), user_version(uv),
+ mtime(mt), return_code(return_code), op(_op),
+ invalid_hash(false), invalid_pool(false) {
+ snaps.reassign_to_mempool(mempool::mempool_osd_pglog);
+ }
+
+ bool is_clone() const { return op == CLONE; }
+ bool is_modify() const { return op == MODIFY; }
+ bool is_promote() const { return op == PROMOTE; }
+ bool is_clean() const { return op == CLEAN; }
+ bool is_lost_revert() const { return op == LOST_REVERT; }
+ bool is_lost_delete() const { return op == LOST_DELETE; }
+ bool is_lost_mark() const { return op == LOST_MARK; }
+ bool is_error() const { return op == ERROR; }
+
+ bool is_update() const {
+ return
+ is_clone() || is_modify() || is_promote() || is_clean() ||
+ is_lost_revert() || is_lost_mark();
+ }
+ bool is_delete() const {
+ return op == DELETE || op == LOST_DELETE;
+ }
+
+ bool can_rollback() const {
+ return mod_desc.can_rollback();
+ }
+
+ void mark_unrollbackable() {
+ mod_desc.mark_unrollbackable();
+ }
+
+ bool requires_kraken() const {
+ return mod_desc.requires_kraken();
+ }
+
+ // Errors are only used for dup detection, whereas
+ // the index by objects is used by recovery, copy_get,
+ // and other facilities that don't expect or need to
+ // be aware of error entries.
+ bool object_is_indexed() const {
+ return !is_error();
+ }
+
+ bool reqid_is_indexed() const {
+ return reqid != osd_reqid_t() &&
+ (op == MODIFY || op == DELETE || op == ERROR);
+ }
+
+ void set_op_returns(const std::vector<OSDOp>& ops) {
+ op_returns.resize(ops.size());
+ for (unsigned i = 0; i < ops.size(); ++i) {
+ op_returns[i].rval = ops[i].rval;
+ op_returns[i].bl = ops[i].outdata;
+ }
+ }
+
+ std::string get_key_name() const;
+ void encode_with_checksum(ceph::buffer::list& bl) const;
+ void decode_with_checksum(ceph::buffer::list::const_iterator& p);
+
+ void encode(ceph::buffer::list &bl) const;
+ void decode(ceph::buffer::list::const_iterator &bl);
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<pg_log_entry_t*>& o);
+
+};
+WRITE_CLASS_ENCODER(pg_log_entry_t)
+
+std::ostream& operator<<(std::ostream& out, const pg_log_entry_t& e);
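+
+// Illustrative sketch (editor's addition): building a MODIFY log entry the
+// way the OSD conceptually does when it applies a write. All version
+// numbers, the reqid and the mtime are placeholders.
+inline pg_log_entry_t example_modify_entry(const hobject_t& oid) {
+  return pg_log_entry_t(pg_log_entry_t::MODIFY, oid,
+                        eversion_t(10, 5),   // version of this write
+                        eversion_t(10, 4),   // prior version of the object
+                        42,                  // user-visible version
+                        osd_reqid_t(), utime_t(), 0 /* return code */);
+}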
+
+struct pg_log_dup_t {
+ osd_reqid_t reqid; // caller+tid to uniquely identify request
+ eversion_t version;
+ version_t user_version; // the user version for this entry
+ int32_t return_code; // only stored for ERRORs for dup detection
+
+ std::vector<pg_log_op_return_item_t> op_returns;
+
+ pg_log_dup_t()
+ : user_version(0), return_code(0)
+ {}
+ explicit pg_log_dup_t(const pg_log_entry_t& entry)
+ : reqid(entry.reqid), version(entry.version),
+ user_version(entry.user_version),
+ return_code(entry.return_code),
+ op_returns(entry.op_returns)
+ {}
+ pg_log_dup_t(const eversion_t& v, version_t uv,
+ const osd_reqid_t& rid, int return_code)
+ : reqid(rid), version(v), user_version(uv),
+ return_code(return_code)
+ {}
+
+ std::string get_key_name() const;
+ void encode(ceph::buffer::list &bl) const;
+ void decode(ceph::buffer::list::const_iterator &bl);
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<pg_log_dup_t*>& o);
+
+ bool operator==(const pg_log_dup_t &rhs) const {
+ return reqid == rhs.reqid &&
+ version == rhs.version &&
+ user_version == rhs.user_version &&
+ return_code == rhs.return_code &&
+ op_returns == rhs.op_returns;
+ }
+ bool operator!=(const pg_log_dup_t &rhs) const {
+ return !(*this == rhs);
+ }
+
+ friend std::ostream& operator<<(std::ostream& out, const pg_log_dup_t& e);
+};
+WRITE_CLASS_ENCODER(pg_log_dup_t)
+
+std::ostream& operator<<(std::ostream& out, const pg_log_dup_t& e);
+
+/**
+ * pg_log_t - incremental log of recent pg changes.
+ *
+ * serves as a recovery queue for recent changes.
+ */
+struct pg_log_t {
+ /*
+ * head - newest entry (update|delete)
+ * tail - entry previous to oldest (update|delete) for which we have
+ * complete negative information.
+ * i.e. we can infer pg contents for any store whose last_update >= tail.
+ */
+ eversion_t head; // newest entry
+ eversion_t tail; // version prior to oldest
+
+protected:
+ // We can roll back rollback-able entries with version > can_rollback_to
+ eversion_t can_rollback_to;
+
+ // always <= can_rollback_to, indicates how far stashed rollback
+ // data can be found
+ eversion_t rollback_info_trimmed_to;
+
+public:
+ // the actual log
+ mempool::osd_pglog::list<pg_log_entry_t> log;
+
+ // entries just for dup op detection ordered oldest to newest
+ mempool::osd_pglog::list<pg_log_dup_t> dups;
+
+ pg_log_t() = default;
+ pg_log_t(const eversion_t &last_update,
+ const eversion_t &log_tail,
+ const eversion_t &can_rollback_to,
+ const eversion_t &rollback_info_trimmed_to,
+ mempool::osd_pglog::list<pg_log_entry_t> &&entries,
+ mempool::osd_pglog::list<pg_log_dup_t> &&dup_entries)
+ : head(last_update), tail(log_tail), can_rollback_to(can_rollback_to),
+ rollback_info_trimmed_to(rollback_info_trimmed_to),
+ log(std::move(entries)), dups(std::move(dup_entries)) {}
+ pg_log_t(const eversion_t &last_update,
+ const eversion_t &log_tail,
+ const eversion_t &can_rollback_to,
+ const eversion_t &rollback_info_trimmed_to,
+ const std::list<pg_log_entry_t> &entries,
+ const std::list<pg_log_dup_t> &dup_entries)
+ : head(last_update), tail(log_tail), can_rollback_to(can_rollback_to),
+ rollback_info_trimmed_to(rollback_info_trimmed_to) {
+ for (auto &&entry: entries) {
+ log.push_back(entry);
+ }
+ for (auto &&entry: dup_entries) {
+ dups.push_back(entry);
+ }
+ }
+
+ void clear() {
+ eversion_t z;
+ rollback_info_trimmed_to = can_rollback_to = head = tail = z;
+ log.clear();
+ dups.clear();
+ }
+
+ eversion_t get_rollback_info_trimmed_to() const {
+ return rollback_info_trimmed_to;
+ }
+ eversion_t get_can_rollback_to() const {
+ return can_rollback_to;
+ }
+
+
+ pg_log_t split_out_child(pg_t child_pgid, unsigned split_bits) {
+ mempool::osd_pglog::list<pg_log_entry_t> oldlog, childlog;
+ oldlog.swap(log);
+
+ eversion_t old_tail;
+ unsigned mask = ~((~0)<<split_bits);
+ for (auto i = oldlog.begin();
+ i != oldlog.end();
+ ) {
+ if ((i->soid.get_hash() & mask) == child_pgid.m_seed) {
+ childlog.push_back(*i);
+ } else {
+ log.push_back(*i);
+ }
+ oldlog.erase(i++);
+ }
+
+ // osd_reqid is unique, so it doesn't matter if there are extra
+ // dup entries in each pg. To avoid storing oid with the dup
+ // entries, just copy the whole list.
+ auto childdups(dups);
+
+ return pg_log_t(
+ head,
+ tail,
+ can_rollback_to,
+ rollback_info_trimmed_to,
+ std::move(childlog),
+ std::move(childdups));
+ }
+
+ mempool::osd_pglog::list<pg_log_entry_t> rewind_from_head(eversion_t newhead) {
+ ceph_assert(newhead >= tail);
+
+ mempool::osd_pglog::list<pg_log_entry_t>::iterator p = log.end();
+ mempool::osd_pglog::list<pg_log_entry_t> divergent;
+ while (true) {
+ if (p == log.begin()) {
+ // yikes, the whole thing is divergent!
+ using std::swap;
+ swap(divergent, log);
+ break;
+ }
+ --p;
+ if (p->version.version <= newhead.version) {
+ /*
+ * look at eversion.version here. we want to avoid a situation like:
+ * our log: 100'10 (0'0) m 10000004d3a.00000000/head by client4225.1:18529
+ * new log: 122'10 (0'0) m 10000004d3a.00000000/head by client4225.1:18529
+ * lower_bound = 100'9
+ * i.e., same request, different version. If the eversion.version is > the
+ * lower_bound, we know it is divergent.
+ */
+ ++p;
+ divergent.splice(divergent.begin(), log, p, log.end());
+ break;
+ }
+ ceph_assert(p->version > newhead);
+ }
+ head = newhead;
+
+ if (can_rollback_to > newhead)
+ can_rollback_to = newhead;
+
+ if (rollback_info_trimmed_to > newhead)
+ rollback_info_trimmed_to = newhead;
+
+ return divergent;
+ }
+
+ void merge_from(const std::vector<pg_log_t*>& slogs, eversion_t last_update) {
+ log.clear();
+
+ // sort and merge dups
+ std::multimap<eversion_t,pg_log_dup_t> sorted;
+ for (auto& d : dups) {
+ sorted.emplace(d.version, d);
+ }
+ for (auto l : slogs) {
+ for (auto& d : l->dups) {
+ sorted.emplace(d.version, d);
+ }
+ }
+ dups.clear();
+ for (auto& i : sorted) {
+ dups.push_back(i.second);
+ }
+
+ head = last_update;
+ tail = last_update;
+ can_rollback_to = last_update;
+ rollback_info_trimmed_to = last_update;
+ }
+
+ bool empty() const {
+ return log.empty();
+ }
+
+ bool null() const {
+ return head.version == 0 && head.epoch == 0;
+ }
+
+ uint64_t approx_size() const {
+ return head.version - tail.version;
+ }
+
+ static void filter_log(spg_t import_pgid, const OSDMap &curmap,
+ const std::string &hit_set_namespace, const pg_log_t &in,
+ pg_log_t &out, pg_log_t &reject);
+
+ /**
+ * copy entries from the tail of another pg_log_t
+ *
+ * @param other pg_log_t to copy from
+ * @param from copy entries after this version
+ */
+ void copy_after(CephContext* cct, const pg_log_t &other, eversion_t from);
+
+ /**
+ * copy up to N entries
+ *
+ * @param other source log
+ * @param max max number of entries to copy
+ */
+ void copy_up_to(CephContext* cct, const pg_log_t &other, int max);
+
+ std::ostream& print(std::ostream& out) const;
+
+ void encode(ceph::buffer::list &bl) const;
+ void decode(ceph::buffer::list::const_iterator &bl, int64_t pool = -1);
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<pg_log_t*>& o);
+};
+WRITE_CLASS_ENCODER(pg_log_t)
+
+inline std::ostream& operator<<(std::ostream& out, const pg_log_t& log)
+{
+ out << "log((" << log.tail << "," << log.head << "], crt="
+ << log.get_can_rollback_to() << ")";
+ return out;
+}
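+
+// Illustrative sketch (editor's addition): how rewind_from_head() strips
+// divergent entries once a replica learns of an older authoritative head.
+// The versions and objects are placeholders.
+inline size_t example_rewind_divergent() {
+  pg_log_t log;
+  log.log.push_back(pg_log_entry_t(
+    pg_log_entry_t::MODIFY, hobject_t(), eversion_t(8, 100), eversion_t(),
+    1, osd_reqid_t(), utime_t(), 0));
+  log.log.push_back(pg_log_entry_t(
+    pg_log_entry_t::MODIFY, hobject_t(), eversion_t(9, 101),
+    eversion_t(8, 100), 2, osd_reqid_t(), utime_t(), 0));
+  log.head = eversion_t(9, 101);
+  // the authoritative head turns out to be 8'100, so 9'101 is divergent
+  auto divergent = log.rewind_from_head(eversion_t(8, 100));
+  return divergent.size();  // 1
+}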
+
+
+/**
+ * pg_missing_t - summary of missing objects.
+ *
+ * kept in memory, as a supplement to pg_log_t
+ * also used to pass missing info in messages.
+ */
+struct pg_missing_item {
+ eversion_t need, have;
+ ObjectCleanRegions clean_regions;
+ enum missing_flags_t {
+ FLAG_NONE = 0,
+ FLAG_DELETE = 1,
+ } flags;
+ pg_missing_item() : flags(FLAG_NONE) {}
+ explicit pg_missing_item(eversion_t n) : need(n), flags(FLAG_NONE) {} // have no old version
+ pg_missing_item(eversion_t n, eversion_t h, bool is_delete=false, bool old_style = false) :
+ need(n), have(h) {
+ set_delete(is_delete);
+ if (old_style)
+ clean_regions.mark_fully_dirty();
+ }
+
+ void encode(ceph::buffer::list& bl, uint64_t features) const {
+ using ceph::encode;
+ if (HAVE_FEATURE(features, SERVER_OCTOPUS)) {
+ // encoding a zeroed eversion_t to differentiate between OSD_RECOVERY_DELETES,
+ // SERVER_OCTOPUS and legacy unversioned encoding - a need value of 0'0 is not
+ // possible. This can be replaced with the legacy encoding
+ encode(eversion_t(), bl);
+ encode(eversion_t(-1, -1), bl);
+ encode(need, bl);
+ encode(have, bl);
+ encode(static_cast<uint8_t>(flags), bl);
+ encode(clean_regions, bl);
+ } else {
+ encode(eversion_t(), bl);
+ encode(need, bl);
+ encode(have, bl);
+ encode(static_cast<uint8_t>(flags), bl);
+ }
+ }
+ void decode(ceph::buffer::list::const_iterator& bl) {
+ using ceph::decode;
+ eversion_t e, l;
+ decode(e, bl);
+ decode(l, bl);
+ if(l == eversion_t(-1, -1)) {
+ // support all
+ decode(need, bl);
+ decode(have, bl);
+ uint8_t f;
+ decode(f, bl);
+ flags = static_cast<missing_flags_t>(f);
+ decode(clean_regions, bl);
+ } else {
+ // support OSD_RECOVERY_DELETES
+ need = l;
+ decode(have, bl);
+ uint8_t f;
+ decode(f, bl);
+ flags = static_cast<missing_flags_t>(f);
+ clean_regions.mark_fully_dirty();
+ }
+ }
+
+ void set_delete(bool is_delete) {
+ flags = is_delete ? FLAG_DELETE : FLAG_NONE;
+ }
+
+ bool is_delete() const {
+ return (flags & FLAG_DELETE) == FLAG_DELETE;
+ }
+
+ std::string flag_str() const {
+ if (flags == FLAG_NONE) {
+ return "none";
+ } else {
+ return "delete";
+ }
+ }
+
+ void dump(ceph::Formatter *f) const {
+ f->dump_stream("need") << need;
+ f->dump_stream("have") << have;
+ f->dump_stream("flags") << flag_str();
+ f->dump_stream("clean_regions") << clean_regions;
+ }
+ static void generate_test_instances(std::list<pg_missing_item*>& o) {
+ o.push_back(new pg_missing_item);
+ o.push_back(new pg_missing_item);
+ o.back()->need = eversion_t(1, 2);
+ o.back()->have = eversion_t(1, 1);
+ o.push_back(new pg_missing_item);
+ o.back()->need = eversion_t(3, 5);
+ o.back()->have = eversion_t(3, 4);
+ o.back()->clean_regions.mark_data_region_dirty(4096, 8192);
+ o.back()->clean_regions.mark_omap_dirty();
+ o.back()->flags = FLAG_DELETE;
+ }
+ bool operator==(const pg_missing_item &rhs) const {
+ return need == rhs.need && have == rhs.have && flags == rhs.flags;
+ }
+ bool operator!=(const pg_missing_item &rhs) const {
+ return !(*this == rhs);
+ }
+};
+WRITE_CLASS_ENCODER_FEATURES(pg_missing_item)
+std::ostream& operator<<(std::ostream& out, const pg_missing_item &item);
+
+class pg_missing_const_i {
+public:
+ virtual const std::map<hobject_t, pg_missing_item> &
+ get_items() const = 0;
+ virtual const std::map<version_t, hobject_t> &get_rmissing() const = 0;
+ virtual bool get_may_include_deletes() const = 0;
+ virtual unsigned int num_missing() const = 0;
+ virtual bool have_missing() const = 0;
+ virtual bool is_missing(const hobject_t& oid, pg_missing_item *out = nullptr) const = 0;
+ virtual bool is_missing(const hobject_t& oid, eversion_t v) const = 0;
+ virtual ~pg_missing_const_i() {}
+};
+
+
+template <bool Track>
+class ChangeTracker {
+public:
+ void changed(const hobject_t &obj) {}
+ template <typename F>
+ void get_changed(F &&f) const {}
+ void flush() {}
+ bool is_clean() const {
+ return true;
+ }
+};
+template <>
+class ChangeTracker<true> {
+ std::set<hobject_t> _changed;
+public:
+ void changed(const hobject_t &obj) {
+ _changed.insert(obj);
+ }
+ template <typename F>
+ void get_changed(F &&f) const {
+ for (auto const &i: _changed) {
+ f(i);
+ }
+ }
+ void flush() {
+ _changed.clear();
+ }
+ bool is_clean() const {
+ return _changed.empty();
+ }
+};
+
+template <bool TrackChanges>
+class pg_missing_set : public pg_missing_const_i {
+ using item = pg_missing_item;
+ std::map<hobject_t, item> missing; // oid -> (need v, have v)
+ std::map<version_t, hobject_t> rmissing; // v -> oid
+ ChangeTracker<TrackChanges> tracker;
+
+public:
+ pg_missing_set() = default;
+
+ template <typename missing_type>
+ pg_missing_set(const missing_type &m) {
+ missing = m.get_items();
+ rmissing = m.get_rmissing();
+ may_include_deletes = m.get_may_include_deletes();
+ for (auto &&i: missing)
+ tracker.changed(i.first);
+ }
+
+ bool may_include_deletes = false;
+
+ const std::map<hobject_t, item> &get_items() const override {
+ return missing;
+ }
+ const std::map<version_t, hobject_t> &get_rmissing() const override {
+ return rmissing;
+ }
+ bool get_may_include_deletes() const override {
+ return may_include_deletes;
+ }
+ unsigned int num_missing() const override {
+ return missing.size();
+ }
+ bool have_missing() const override {
+ return !missing.empty();
+ }
+ void merge(const pg_log_entry_t& e) {
+ auto miter = missing.find(e.soid);
+ if (miter != missing.end() && miter->second.have != eversion_t() && e.version > miter->second.have)
+ miter->second.clean_regions.merge(e.clean_regions);
+ }
+ bool is_missing(const hobject_t& oid, pg_missing_item *out = nullptr) const override {
+ auto iter = missing.find(oid);
+ if (iter == missing.end())
+ return false;
+ if (out)
+ *out = iter->second;
+ return true;
+ }
+ bool is_missing(const hobject_t& oid, eversion_t v) const override {
+ std::map<hobject_t, item>::const_iterator m =
+ missing.find(oid);
+ if (m == missing.end())
+ return false;
+ const item &item(m->second);
+ if (item.need > v)
+ return false;
+ return true;
+ }
+ eversion_t get_oldest_need() const {
+ if (missing.empty()) {
+ return eversion_t();
+ }
+ auto it = missing.find(rmissing.begin()->second);
+ ceph_assert(it != missing.end());
+ return it->second.need;
+ }
+
+ void claim(pg_missing_set&& o) {
+ static_assert(!TrackChanges, "Can't use claim with TrackChanges");
+ missing = std::move(o.missing);
+ rmissing = std::move(o.rmissing);
+ }
+
+ /*
+ * this needs to be called in log order as we extend the log. it
+ * assumes missing is accurate up through the previous log entry.
+ */
+ void add_next_event(const pg_log_entry_t& e) {
+ std::map<hobject_t, item>::iterator missing_it;
+ missing_it = missing.find(e.soid);
+ bool is_missing_divergent_item = missing_it != missing.end();
+ if (e.prior_version == eversion_t() || e.is_clone()) {
+ // new object.
+ if (is_missing_divergent_item) { // use iterator
+ rmissing.erase(missing_it->second.need.version);
+ // .have = nil
+ missing_it->second = item(e.version, eversion_t(), e.is_delete());
+ missing_it->second.clean_regions.mark_fully_dirty();
+ } else {
+ // create new element in missing map
+ // .have = nil
+ missing[e.soid] = item(e.version, eversion_t(), e.is_delete());
+ missing[e.soid].clean_regions.mark_fully_dirty();
+ }
+ } else if (is_missing_divergent_item) {
+ // already missing (prior).
+ rmissing.erase((missing_it->second).need.version);
+ missing_it->second.need = e.version; // leave .have unchanged.
+ missing_it->second.set_delete(e.is_delete());
+ if (e.is_lost_revert())
+ missing_it->second.clean_regions.mark_fully_dirty();
+ else
+ missing_it->second.clean_regions.merge(e.clean_regions);
+ } else {
+ // not missing, we must have prior_version (if any)
+ ceph_assert(!is_missing_divergent_item);
+ missing[e.soid] = item(e.version, e.prior_version, e.is_delete());
+ if (e.is_lost_revert())
+ missing[e.soid].clean_regions.mark_fully_dirty();
+ else
+ missing[e.soid].clean_regions = e.clean_regions;
+ }
+ rmissing[e.version.version] = e.soid;
+ tracker.changed(e.soid);
+ }
+
+ void revise_need(hobject_t oid, eversion_t need, bool is_delete) {
+ auto p = missing.find(oid);
+ if (p != missing.end()) {
+ rmissing.erase((p->second).need.version);
+ p->second.need = need; // do not adjust .have
+ p->second.set_delete(is_delete);
+ p->second.clean_regions.mark_fully_dirty();
+ } else {
+ missing[oid] = item(need, eversion_t(), is_delete);
+ missing[oid].clean_regions.mark_fully_dirty();
+ }
+ rmissing[need.version] = oid;
+
+ tracker.changed(oid);
+ }
+
+ void revise_have(hobject_t oid, eversion_t have) {
+ auto p = missing.find(oid);
+ if (p != missing.end()) {
+ tracker.changed(oid);
+ (p->second).have = have;
+ }
+ }
+
+ void mark_fully_dirty(const hobject_t& oid) {
+ auto p = missing.find(oid);
+ if (p != missing.end()) {
+ tracker.changed(oid);
+ (p->second).clean_regions.mark_fully_dirty();
+ }
+ }
+
+ void add(const hobject_t& oid, eversion_t need, eversion_t have,
+ bool is_delete) {
+ missing[oid] = item(need, have, is_delete, true);
+ rmissing[need.version] = oid;
+ tracker.changed(oid);
+ }
+
+ void add(const hobject_t& oid, pg_missing_item&& item) {
+ rmissing[item.need.version] = oid;
+ missing.insert({oid, std::move(item)});
+ tracker.changed(oid);
+ }
+
+ void rm(const hobject_t& oid, eversion_t v) {
+ std::map<hobject_t, item>::iterator p = missing.find(oid);
+ if (p != missing.end() && p->second.need <= v)
+ rm(p);
+ }
+
+ void rm(std::map<hobject_t, item>::const_iterator m) {
+ tracker.changed(m->first);
+ rmissing.erase(m->second.need.version);
+ missing.erase(m);
+ }
+
+ void got(const hobject_t& oid, eversion_t v) {
+ std::map<hobject_t, item>::iterator p = missing.find(oid);
+ ceph_assert(p != missing.end());
+ ceph_assert(p->second.need <= v || p->second.is_delete());
+ got(p);
+ }
+
+ void got(std::map<hobject_t, item>::const_iterator m) {
+ tracker.changed(m->first);
+ rmissing.erase(m->second.need.version);
+ missing.erase(m);
+ }
+
+ void split_into(
+ pg_t child_pgid,
+ unsigned split_bits,
+ pg_missing_set *omissing) {
+ omissing->may_include_deletes = may_include_deletes;
+ unsigned mask = ~((~0)<<split_bits);
+ for (std::map<hobject_t, item>::iterator i = missing.begin();
+ i != missing.end();
+ ) {
+ if ((i->first.get_hash() & mask) == child_pgid.m_seed) {
+ omissing->add(i->first, i->second.need, i->second.have,
+ i->second.is_delete());
+ rm(i++);
+ } else {
+ ++i;
+ }
+ }
+ }
+
+ void clear() {
+ for (auto const &i: missing)
+ tracker.changed(i.first);
+ missing.clear();
+ rmissing.clear();
+ }
+
+ void encode(ceph::buffer::list &bl, uint64_t features) const {
+ ENCODE_START(5, 2, bl)
+ encode(missing, bl, features);
+ encode(may_include_deletes, bl);
+ ENCODE_FINISH(bl);
+ }
+ void decode(ceph::buffer::list::const_iterator &bl, int64_t pool = -1) {
+ for (auto const &i: missing)
+ tracker.changed(i.first);
+ DECODE_START_LEGACY_COMPAT_LEN(5, 2, 2, bl);
+ decode(missing, bl);
+ if (struct_v >= 4) {
+ decode(may_include_deletes, bl);
+ }
+ DECODE_FINISH(bl);
+
+ if (struct_v < 3) {
+ // Handle hobject_t upgrade
+ std::map<hobject_t, item> tmp;
+ for (std::map<hobject_t, item>::iterator i =
+ missing.begin();
+ i != missing.end();
+ ) {
+ if (!i->first.is_max() && i->first.pool == -1) {
+ hobject_t to_insert(i->first);
+ to_insert.pool = pool;
+ tmp[to_insert] = i->second;
+ missing.erase(i++);
+ } else {
+ ++i;
+ }
+ }
+ missing.insert(tmp.begin(), tmp.end());
+ }
+
+ for (std::map<hobject_t,item>::iterator it =
+ missing.begin();
+ it != missing.end();
+ ++it)
+ rmissing[it->second.need.version] = it->first;
+ for (auto const &i: missing)
+ tracker.changed(i.first);
+ }
+ void dump(ceph::Formatter *f) const {
+ f->open_array_section("missing");
+ for (std::map<hobject_t,item>::const_iterator p =
+ missing.begin(); p != missing.end(); ++p) {
+ f->open_object_section("item");
+ f->dump_stream("object") << p->first;
+ p->second.dump(f);
+ f->close_section();
+ }
+ f->close_section();
+ f->dump_bool("may_include_deletes", may_include_deletes);
+ }
+ template <typename F>
+ void filter_objects(F &&f) {
+ for (auto i = missing.begin(); i != missing.end();) {
+ if (f(i->first)) {
+ rm(i++);
+ } else {
+ ++i;
+ }
+ }
+ }
+ static void generate_test_instances(std::list<pg_missing_set*>& o) {
+ o.push_back(new pg_missing_set);
+ o.back()->may_include_deletes = true;
+ o.push_back(new pg_missing_set);
+ o.back()->add(
+ hobject_t(object_t("foo"), "foo", 123, 456, 0, ""),
+ eversion_t(5, 6), eversion_t(5, 1), false);
+ o.back()->may_include_deletes = true;
+ o.push_back(new pg_missing_set);
+ o.back()->add(
+ hobject_t(object_t("foo"), "foo", 123, 456, 0, ""),
+ eversion_t(5, 6), eversion_t(5, 1), true);
+ o.back()->may_include_deletes = true;
+ }
+ template <typename F>
+ void get_changed(F &&f) const {
+ tracker.get_changed(f);
+ }
+ void flush() {
+ tracker.flush();
+ }
+ bool is_clean() const {
+ return tracker.is_clean();
+ }
+ template <typename missing_t>
+ bool debug_verify_from_init(
+ const missing_t &init_missing,
+ std::ostream *oss) const {
+ if (!TrackChanges)
+ return true;
+ auto check_missing(init_missing.get_items());
+ tracker.get_changed([&](const hobject_t &hoid) {
+ check_missing.erase(hoid);
+ if (missing.count(hoid)) {
+ check_missing.insert(*(missing.find(hoid)));
+ }
+ });
+ bool ok = true;
+ if (check_missing.size() != missing.size()) {
+ if (oss) {
+ *oss << "Size mismatch, check: " << check_missing.size()
+ << ", actual: " << missing.size() << "\n";
+ }
+ ok = false;
+ }
+ for (auto &i: missing) {
+ if (!check_missing.count(i.first)) {
+ if (oss)
+ *oss << "check_missing missing " << i.first << "\n";
+ ok = false;
+ } else if (check_missing[i.first] != i.second) {
+ if (oss)
+ *oss << "check_missing missing item mismatch on " << i.first
+ << ", check: " << check_missing[i.first]
+ << ", actual: " << i.second << "\n";
+ ok = false;
+ }
+ }
+ if (oss && !ok) {
+ *oss << "check_missing: " << check_missing << "\n";
+ std::set<hobject_t> changed;
+ tracker.get_changed([&](const hobject_t &hoid) { changed.insert(hoid); });
+ *oss << "changed: " << changed << "\n";
+ }
+ return ok;
+ }
+};
+template <bool TrackChanges>
+void encode(
+ const pg_missing_set<TrackChanges> &c, ceph::buffer::list &bl, uint64_t features=0) {
+ ENCODE_DUMP_PRE();
+ c.encode(bl, features);
+ ENCODE_DUMP_POST(cl);
+}
+template <bool TrackChanges>
+void decode(pg_missing_set<TrackChanges> &c, ceph::buffer::list::const_iterator &p) {
+ c.decode(p);
+}
+template <bool TrackChanges>
+std::ostream& operator<<(std::ostream& out, const pg_missing_set<TrackChanges> &missing)
+{
+ out << "missing(" << missing.num_missing()
+ << " may_include_deletes = " << missing.may_include_deletes;
+ //if (missing.num_lost()) out << ", " << missing.num_lost() << " lost";
+ out << ")";
+ return out;
+}
+
+using pg_missing_t = pg_missing_set<false>;
+using pg_missing_tracker_t = pg_missing_set<true>;
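+
+// Illustrative sketch (editor's addition): feeding log entries to a
+// pg_missing_t in log order, then clearing the object once recovery
+// delivers it. The versions are placeholders.
+inline bool example_missing_tracking(const hobject_t& oid) {
+  pg_missing_t missing;
+  pg_log_entry_t e(pg_log_entry_t::MODIFY, oid, eversion_t(12, 7),
+                   eversion_t(12, 6), 3, osd_reqid_t(), utime_t(), 0);
+  missing.add_next_event(e);              // oid now needs 12'7, has 12'6
+  const bool was_missing = missing.is_missing(oid);
+  missing.got(oid, eversion_t(12, 7));    // recovery delivered version 12'7
+  return was_missing && !missing.have_missing();  // true
+}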
+
+
+
+
+/**
+ * pg list objects response format
+ *
+ */
+
+template<typename T>
+struct pg_nls_response_template {
+ collection_list_handle_t handle;
+ std::vector<T> entries;
+
+ void encode(ceph::buffer::list& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(handle, bl);
+ __u32 n = (__u32)entries.size();
+ encode(n, bl);
+ for (auto i = entries.begin(); i != entries.end(); ++i) {
+ encode(i->nspace, bl);
+ encode(i->oid, bl);
+ encode(i->locator, bl);
+ }
+ ENCODE_FINISH(bl);
+ }
+ void decode(ceph::buffer::list::const_iterator& bl) {
+ DECODE_START(1, bl);
+ decode(handle, bl);
+ __u32 n;
+ decode(n, bl);
+ entries.clear();
+ while (n--) {
+ T i;
+ decode(i.nspace, bl);
+ decode(i.oid, bl);
+ decode(i.locator, bl);
+ entries.push_back(i);
+ }
+ DECODE_FINISH(bl);
+ }
+ void dump(ceph::Formatter *f) const {
+ f->dump_stream("handle") << handle;
+ f->open_array_section("entries");
+ for (auto p = entries.begin(); p != entries.end(); ++p) {
+ f->open_object_section("object");
+ f->dump_string("namespace", p->nspace);
+ f->dump_string("object", p->oid);
+ f->dump_string("key", p->locator);
+ f->close_section();
+ }
+ f->close_section();
+ }
+ static void generate_test_instances(std::list<pg_nls_response_template<T>*>& o) {
+ o.push_back(new pg_nls_response_template<T>);
+ o.push_back(new pg_nls_response_template<T>);
+ o.back()->handle = hobject_t(object_t("hi"), "key", 1, 2, -1, "");
+ o.back()->entries.push_back(librados::ListObjectImpl("", "one", ""));
+ o.back()->entries.push_back(librados::ListObjectImpl("", "two", "twokey"));
+ o.back()->entries.push_back(librados::ListObjectImpl("", "three", ""));
+ o.push_back(new pg_nls_response_template<T>);
+ o.back()->handle = hobject_t(object_t("hi"), "key", 3, 4, -1, "");
+ o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1one", ""));
+ o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1two", "n1twokey"));
+ o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1three", ""));
+ o.push_back(new pg_nls_response_template<T>);
+ o.back()->handle = hobject_t(object_t("hi"), "key", 5, 6, -1, "");
+ o.back()->entries.push_back(librados::ListObjectImpl("", "one", ""));
+ o.back()->entries.push_back(librados::ListObjectImpl("", "two", "twokey"));
+ o.back()->entries.push_back(librados::ListObjectImpl("", "three", ""));
+ o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1one", ""));
+ o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1two", "n1twokey"));
+ o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1three", ""));
+ }
+};
+
+using pg_nls_response_t = pg_nls_response_template<librados::ListObjectImpl>;
+
+WRITE_CLASS_ENCODER(pg_nls_response_t)
+
+// For backwards compatibility with older OSD requests
+struct pg_ls_response_t {
+ collection_list_handle_t handle;
+ std::list<std::pair<object_t, std::string> > entries;
+
+ void encode(ceph::buffer::list& bl) const {
+ using ceph::encode;
+ __u8 v = 1;
+ encode(v, bl);
+ encode(handle, bl);
+ encode(entries, bl);
+ }
+ void decode(ceph::buffer::list::const_iterator& bl) {
+ using ceph::decode;
+ __u8 v;
+ decode(v, bl);
+ ceph_assert(v == 1);
+ decode(handle, bl);
+ decode(entries, bl);
+ }
+ void dump(ceph::Formatter *f) const {
+ f->dump_stream("handle") << handle;
+ f->open_array_section("entries");
+ for (std::list<std::pair<object_t, std::string> >::const_iterator p = entries.begin(); p != entries.end(); ++p) {
+ f->open_object_section("object");
+ f->dump_stream("object") << p->first;
+ f->dump_string("key", p->second);
+ f->close_section();
+ }
+ f->close_section();
+ }
+ static void generate_test_instances(std::list<pg_ls_response_t*>& o) {
+ o.push_back(new pg_ls_response_t);
+ o.push_back(new pg_ls_response_t);
+ o.back()->handle = hobject_t(object_t("hi"), "key", 1, 2, -1, "");
+ o.back()->entries.push_back(std::make_pair(object_t("one"), std::string()));
+ o.back()->entries.push_back(std::make_pair(object_t("two"), std::string("twokey")));
+ }
+};
+
+WRITE_CLASS_ENCODER(pg_ls_response_t)
+
+/**
+ * object_copy_cursor_t
+ */
+struct object_copy_cursor_t {
+ uint64_t data_offset;
+ std::string omap_offset;
+ bool attr_complete;
+ bool data_complete;
+ bool omap_complete;
+
+ object_copy_cursor_t()
+ : data_offset(0),
+ attr_complete(false),
+ data_complete(false),
+ omap_complete(false)
+ {}
+
+ bool is_initial() const {
+ return !attr_complete && data_offset == 0 && omap_offset.empty();
+ }
+ bool is_complete() const {
+ return attr_complete && data_complete && omap_complete;
+ }
+
+ static void generate_test_instances(std::list<object_copy_cursor_t*>& o);
+ void encode(ceph::buffer::list& bl) const;
+ void decode(ceph::buffer::list::const_iterator &bl);
+ void dump(ceph::Formatter *f) const;
+};
+WRITE_CLASS_ENCODER(object_copy_cursor_t)
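+
+// Illustrative sketch (editor's addition): the shape of a cursor-driven
+// copy loop. 'do_one_copy_get' stands in for whatever callback issues a
+// copy-get and advances the cursor; it is not a real API.
+template <typename F>
+inline void example_copy_loop(F do_one_copy_get) {
+  object_copy_cursor_t cursor;            // is_initial() == true
+  while (!cursor.is_complete()) {
+    do_one_copy_get(cursor);              // attrs, then data, then omap
+  }
+}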
+
+/**
+ * object_copy_data_t
+ *
+ * Return data from a copy request. The semantics are a little strange
+ * as a result of the encoding's heritage.
+ *
+ * In particular, the sender unconditionally fills in the cursor (from what
+ * it receives and sends), the size, and the mtime, but is responsible for
+ * figuring out whether it should put any data in the attrs, data, or
+ * omap members (corresponding to xattrs, object data, and the omap entries)
+ * based on external data (the client includes a max amount to return with
+ * the copy request). The client then looks into the attrs, data, and/or omap
+ * based on the contents of the cursor.
+ */
+struct object_copy_data_t {
+ enum {
+ FLAG_DATA_DIGEST = 1<<0,
+ FLAG_OMAP_DIGEST = 1<<1,
+ };
+ object_copy_cursor_t cursor;
+ uint64_t size;
+ utime_t mtime;
+ uint32_t data_digest, omap_digest;
+ uint32_t flags;
+ std::map<std::string, ceph::buffer::list> attrs;
+ ceph::buffer::list data;
+ ceph::buffer::list omap_header;
+ ceph::buffer::list omap_data;
+
+ /// which snaps we are defined for (if a snap and not the head)
+ std::vector<snapid_t> snaps;
+ /// latest snap seq for the object (if head)
+ snapid_t snap_seq;
+
+ /// recent reqids on this object
+ mempool::osd_pglog::vector<std::pair<osd_reqid_t, version_t> > reqids;
+
+ /// map reqids by index to error return code (if any)
+ mempool::osd_pglog::map<uint32_t, int> reqid_return_codes;
+
+ uint64_t truncate_seq;
+ uint64_t truncate_size;
+
+public:
+ object_copy_data_t() :
+ size((uint64_t)-1), data_digest(-1),
+ omap_digest(-1), flags(0),
+ truncate_seq(0),
+ truncate_size(0) {}
+
+ static void generate_test_instances(std::list<object_copy_data_t*>& o);
+ void encode(ceph::buffer::list& bl, uint64_t features) const;
+ void decode(ceph::buffer::list::const_iterator& bl);
+ void dump(ceph::Formatter *f) const;
+};
+WRITE_CLASS_ENCODER_FEATURES(object_copy_data_t)
+
+/**
+ * pg creation info
+ */
+struct pg_create_t {
+ epoch_t created; // epoch pg created
+ pg_t parent; // split from parent (if != pg_t())
+ __s32 split_bits;
+
+ pg_create_t()
+ : created(0), split_bits(0) {}
+ pg_create_t(unsigned c, pg_t p, int s)
+ : created(c), parent(p), split_bits(s) {}
+
+ void encode(ceph::buffer::list &bl) const;
+ void decode(ceph::buffer::list::const_iterator &bl);
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<pg_create_t*>& o);
+};
+WRITE_CLASS_ENCODER(pg_create_t)
+
+// -----------------------------------------
+
+class ObjectExtent {
+ /**
+ * ObjectExtents are used for specifying IO behavior against RADOS
+ * objects when one is using the ObjectCacher.
+ *
+ * To use this in a real system, *every member* must be filled
+ * out correctly. In particular, make sure to initialize the
+ * oloc correctly, as its default values are deliberate poison
+ * and will cause internal ObjectCacher asserts.
+ *
+ * Similarly, your buffer_extents vector *must* specify a total
+ * size equal to your length. If the buffer_extents inadvertently
+ * contain less space than the length member specifies, you
+ * will get unintelligible asserts deep in the ObjectCacher.
+ *
+ * If you are trying to do testing and don't care about actual
+ * RADOS function, the simplest thing to do is to initialize
+ * the ObjectExtent (truncate_size can be 0), create a single entry
+ * in buffer_extents matching the length, and set oloc.pool to 0.
+ */
+ public:
+ object_t oid; // object id
+ uint64_t objectno;
+ uint64_t offset; // in object
+ uint64_t length; // in object
+ uint64_t truncate_size; // in object
+
+ object_locator_t oloc; // object locator (pool etc)
+
+ std::vector<std::pair<uint64_t,uint64_t> > buffer_extents; // off -> len. extents in buffer being mapped (may be fragmented bc of striping!)
+
+ ObjectExtent() : objectno(0), offset(0), length(0), truncate_size(0) {}
+ ObjectExtent(object_t o, uint64_t ono, uint64_t off, uint64_t l, uint64_t ts) :
+ oid(o), objectno(ono), offset(off), length(l), truncate_size(ts) { }
+};
+
+inline std::ostream& operator<<(std::ostream& out, const ObjectExtent &ex)
+{
+ return out << "extent("
+ << ex.oid << " (" << ex.objectno << ") in " << ex.oloc
+ << " " << ex.offset << "~" << ex.length
+ << " -> " << ex.buffer_extents
+ << ")";
+}
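+
+// Illustrative sketch (editor's addition): the minimal test-only
+// initialization prescribed by the ObjectExtent comment above. The object
+// name and length are placeholders.
+inline ObjectExtent example_test_extent(uint64_t len) {
+  ObjectExtent ex(object_t("test-object"), 0 /* objectno */,
+                  0 /* offset */, len, 0 /* truncate_size */);
+  ex.oloc.pool = 0;                       // avoid the poison default
+  ex.buffer_extents.emplace_back(0, len); // must cover exactly 'len' bytes
+  return ex;
+}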
+
+
+// ---------------------------------------
+
+class OSDSuperblock {
+public:
+ uuid_d cluster_fsid, osd_fsid;
+ int32_t whoami = -1; // my role in this fs.
+ epoch_t current_epoch = 0; // most recent epoch
+ epoch_t oldest_map = 0, newest_map = 0; // oldest/newest maps we have.
+ double weight = 0.0;
+
+ CompatSet compat_features;
+
+ // last interval over which i mounted and was then active
+ epoch_t mounted = 0; // last epoch i mounted
+ epoch_t clean_thru = 0; // epoch i was active and clean thru
+
+ epoch_t purged_snaps_last = 0;
+ utime_t last_purged_snaps_scrub;
+
+ void encode(ceph::buffer::list &bl) const;
+ void decode(ceph::buffer::list::const_iterator &bl);
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<OSDSuperblock*>& o);
+};
+WRITE_CLASS_ENCODER(OSDSuperblock)
+
+inline std::ostream& operator<<(std::ostream& out, const OSDSuperblock& sb)
+{
+ return out << "sb(" << sb.cluster_fsid
+ << " osd." << sb.whoami
+ << " " << sb.osd_fsid
+ << " e" << sb.current_epoch
+ << " [" << sb.oldest_map << "," << sb.newest_map << "]"
+ << " lci=[" << sb.mounted << "," << sb.clean_thru << "]"
+ << ")";
+}
+
+
+// -------
+
+
+
+
+
+
+/*
+ * attached to object head. describes most recent snap context, and
+ * set of existing clones.
+ */
+struct SnapSet {
+ snapid_t seq;
+ // NOTE: this is for pre-octopus compatibility only! remove in Q release
+ std::vector<snapid_t> snaps; // descending
+ std::vector<snapid_t> clones; // ascending
+ std::map<snapid_t, interval_set<uint64_t> > clone_overlap; // overlap w/ next newest
+ std::map<snapid_t, uint64_t> clone_size;
+ std::map<snapid_t, std::vector<snapid_t>> clone_snaps; // descending
+
+ SnapSet() : seq(0) {}
+ explicit SnapSet(ceph::buffer::list& bl) {
+ auto p = std::cbegin(bl);
+ decode(p);
+ }
+
+ /// populate SnapSet from a librados::snap_set_t
+ void from_snap_set(const librados::snap_set_t& ss, bool legacy);
+
+ /// get space accounted to clone
+ uint64_t get_clone_bytes(snapid_t clone) const;
+
+ void encode(ceph::buffer::list& bl) const;
+ void decode(ceph::buffer::list::const_iterator& bl);
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<SnapSet*>& o);
+
+ SnapContext get_ssc_as_of(snapid_t as_of) const {
+ SnapContext out;
+ out.seq = as_of;
+ for (auto p = clone_snaps.rbegin();
+ p != clone_snaps.rend();
+ ++p) {
+ for (auto snap : p->second) {
+ if (snap <= as_of) {
+ out.snaps.push_back(snap);
+ }
+ }
+ }
+ return out;
+ }
+
+
+ SnapSet get_filtered(const pg_pool_t &pinfo) const;
+ void filter(const pg_pool_t &pinfo);
+};
+WRITE_CLASS_ENCODER(SnapSet)
+
+std::ostream& operator<<(std::ostream& out, const SnapSet& cs);
+
+
+
+#define OI_ATTR "_"
+#define SS_ATTR "snapset"
+
+struct watch_info_t {
+ uint64_t cookie;
+ uint32_t timeout_seconds;
+ entity_addr_t addr;
+
+ watch_info_t() : cookie(0), timeout_seconds(0) { }
+ watch_info_t(uint64_t c, uint32_t t, const entity_addr_t& a) : cookie(c), timeout_seconds(t), addr(a) {}
+
+ void encode(ceph::buffer::list& bl, uint64_t features) const;
+ void decode(ceph::buffer::list::const_iterator& bl);
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<watch_info_t*>& o);
+};
+WRITE_CLASS_ENCODER_FEATURES(watch_info_t)
+
+static inline bool operator==(const watch_info_t& l, const watch_info_t& r) {
+ return l.cookie == r.cookie && l.timeout_seconds == r.timeout_seconds
+ && l.addr == r.addr;
+}
+
+static inline std::ostream& operator<<(std::ostream& out, const watch_info_t& w) {
+ return out << "watch(cookie " << w.cookie << " " << w.timeout_seconds << "s"
+ << " " << w.addr << ")";
+}
+
+struct notify_info_t {
+ uint64_t cookie;
+ uint64_t notify_id;
+ uint32_t timeout;
+ ceph::buffer::list bl;
+};
+
+static inline std::ostream& operator<<(std::ostream& out, const notify_info_t& n) {
+ return out << "notify(cookie " << n.cookie
+ << " notify" << n.notify_id
+ << " " << n.timeout << "s)";
+}
+
+class object_ref_delta_t {
+ std::map<hobject_t, int> ref_delta;
+
+public:
+ object_ref_delta_t() = default;
+ object_ref_delta_t(const object_ref_delta_t &) = default;
+ object_ref_delta_t(object_ref_delta_t &&) = default;
+
+ object_ref_delta_t(decltype(ref_delta) &&ref_delta)
+ : ref_delta(std::move(ref_delta)) {}
+ object_ref_delta_t(const decltype(ref_delta) &ref_delta)
+ : ref_delta(ref_delta) {}
+
+ object_ref_delta_t &operator=(const object_ref_delta_t &) = default;
+ object_ref_delta_t &operator=(object_ref_delta_t &&) = default;
+
+ void dec_ref(const hobject_t &hoid, unsigned num=1) {
+ mut_ref(hoid, -num);
+ }
+ void inc_ref(const hobject_t &hoid, unsigned num=1) {
+ mut_ref(hoid, num);
+ }
+ void mut_ref(const hobject_t &hoid, int num) {
+ [[maybe_unused]] auto [iter, _] = ref_delta.try_emplace(hoid, 0);
+ iter->second += num;
+ if (iter->second == 0)
+ ref_delta.erase(iter);
+ }
+
+ auto begin() const { return ref_delta.begin(); }
+ auto end() const { return ref_delta.end(); }
+ auto find(hobject_t &key) const { return ref_delta.find(key); }
+
+ bool operator==(const object_ref_delta_t &rhs) const {
+ return ref_delta == rhs.ref_delta;
+ }
+ bool operator!=(const object_ref_delta_t &rhs) const {
+ return !(*this == rhs);
+ }
+ bool is_empty() {
+ return ref_delta.empty();
+ }
+ uint64_t size() {
+ return ref_delta.size();
+ }
+ friend std::ostream& operator<<(std::ostream& out, const object_ref_delta_t & ci);
+};
+
+struct chunk_info_t {
+ typedef enum {
+ FLAG_DIRTY = 1,
+ FLAG_MISSING = 2,
+ FLAG_HAS_REFERENCE = 4,
+ FLAG_HAS_FINGERPRINT = 8,
+ } cflag_t;
+ uint32_t offset;
+ uint32_t length;
+ hobject_t oid;
+ cflag_t flags; // FLAG_*
+
+ chunk_info_t() : offset(0), length(0), flags((cflag_t)0) { }
+ chunk_info_t(uint32_t offset, uint32_t length, hobject_t oid) :
+ offset(offset), length(length), oid(oid), flags((cflag_t)0) { }
+
+ static std::string get_flag_string(uint64_t flags) {
+ std::string r;
+ if (flags & FLAG_DIRTY) {
+ r += "|dirty";
+ }
+ if (flags & FLAG_MISSING) {
+ r += "|missing";
+ }
+ if (flags & FLAG_HAS_REFERENCE) {
+ r += "|has_reference";
+ }
+ if (flags & FLAG_HAS_FINGERPRINT) {
+ r += "|has_fingerprint";
+ }
+ if (r.length())
+ return r.substr(1);
+ return r;
+ }
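+  // Illustrative only: get_flag_string(FLAG_DIRTY | FLAG_MISSING) returns
+  // "dirty|missing"; the leading '|' is stripped by the substr(1) above.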
+ bool test_flag(cflag_t f) const {
+ return (flags & f) == f;
+ }
+ void set_flag(cflag_t f) {
+ flags = (cflag_t)(flags | f);
+ }
+ void set_flags(cflag_t f) {
+ flags = f;
+ }
+ void clear_flag(cflag_t f) {
+ flags = (cflag_t)(flags & ~f);
+ }
+ void clear_flags() {
+ flags = (cflag_t)0;
+ }
+ bool is_dirty() const {
+ return test_flag(FLAG_DIRTY);
+ }
+ bool is_missing() const {
+ return test_flag(FLAG_MISSING);
+ }
+ bool has_reference() const {
+ return test_flag(FLAG_HAS_REFERENCE);
+ }
+ bool has_fingerprint() const {
+ return test_flag(FLAG_HAS_FINGERPRINT);
+ }
+ void encode(ceph::buffer::list &bl) const;
+ void decode(ceph::buffer::list::const_iterator &bl);
+ void dump(ceph::Formatter *f) const;
+ friend std::ostream& operator<<(std::ostream& out, const chunk_info_t& ci);
+ bool operator==(const chunk_info_t& cit) const;
+ bool operator!=(const chunk_info_t& cit) const {
+ return !(cit == *this);
+ }
+};
+WRITE_CLASS_ENCODER(chunk_info_t)
+std::ostream& operator<<(std::ostream& out, const chunk_info_t& ci);
+
+struct object_info_t;
+struct object_manifest_t {
+ enum {
+ TYPE_NONE = 0,
+ TYPE_REDIRECT = 1,
+ TYPE_CHUNKED = 2,
+ };
+ uint8_t type; // redirect, chunked, ...
+ hobject_t redirect_target;
+ std::map<uint64_t, chunk_info_t> chunk_map;
+
+ object_manifest_t() : type(0) { }
+ object_manifest_t(uint8_t type, const hobject_t& redirect_target)
+ : type(type), redirect_target(redirect_target) { }
+
+ bool is_empty() const {
+ return type == TYPE_NONE;
+ }
+ bool is_redirect() const {
+ return type == TYPE_REDIRECT;
+ }
+ bool is_chunked() const {
+ return type == TYPE_CHUNKED;
+ }
+ static std::string_view get_type_name(uint8_t m) {
+ switch (m) {
+ case TYPE_NONE: return "none";
+ case TYPE_REDIRECT: return "redirect";
+ case TYPE_CHUNKED: return "chunked";
+ default: return "unknown";
+ }
+ }
+ std::string_view get_type_name() const {
+ return get_type_name(type);
+ }
+ void clear() {
+ type = 0;
+ redirect_target = hobject_t();
+ chunk_map.clear();
+ }
+
+ /**
+ * calc_refs_to_inc_on_set
+ *
+ * Takes a manifest and returns the set of refs to
+ * increment upon set-chunk
+ *
+   * g and l may each be nullptr if the corresponding adjacent clone
+   * does not exist (e.g. when there are no clones at all).
+   * *this contains the set of new references to set.
+ *
+ */
+ void calc_refs_to_inc_on_set(
+ const object_manifest_t* g, ///< [in] manifest for clone > *this
+ const object_manifest_t* l, ///< [in] manifest for clone < *this
+    object_ref_delta_t &delta ///< [out] set of refs to increment
+ ) const;
+
+ /**
+ * calc_refs_to_drop_on_modify
+ *
+ * Takes a manifest and returns the set of refs to
+ * drop upon modification
+ *
+   * l should be nullptr if there are no clones, or if the
+   * corresponding previous clone does not exist.
+ *
+ */
+ void calc_refs_to_drop_on_modify(
+ const object_manifest_t* l, ///< [in] manifest for previous clone
+ const ObjectCleanRegions& clean_regions, ///< [in] clean regions
+ object_ref_delta_t &delta ///< [out] set of refs to drop
+ ) const;
+
+ /**
+ * calc_refs_to_drop_on_removal
+ *
+ * Takes the two adjacent manifests and returns the set of refs to
+ * drop upon removal of the clone containing *this.
+ *
+ * g should be nullptr if *this is on HEAD, l should be nullptr if
+ * *this is on the oldest clone (or head if there are no clones).
+ */
+ void calc_refs_to_drop_on_removal(
+ const object_manifest_t* g, ///< [in] manifest for clone > *this
+ const object_manifest_t* l, ///< [in] manifest for clone < *this
+ object_ref_delta_t &delta ///< [out] set of refs to drop
+ ) const;
+
+ static void generate_test_instances(std::list<object_manifest_t*>& o);
+ void encode(ceph::buffer::list &bl) const;
+ void decode(ceph::buffer::list::const_iterator &bl);
+ void dump(ceph::Formatter *f) const;
+ friend std::ostream& operator<<(std::ostream& out, const object_info_t& oi);
+};
+WRITE_CLASS_ENCODER(object_manifest_t)
+std::ostream& operator<<(std::ostream& out, const object_manifest_t& oi);
+
+struct object_info_t {
+ hobject_t soid;
+ eversion_t version, prior_version;
+ version_t user_version;
+ osd_reqid_t last_reqid;
+
+ uint64_t size;
+ utime_t mtime;
+ utime_t local_mtime; // local mtime
+
+ // note: these are currently encoded into a total 16 bits; see
+ // encode()/decode() for the weirdness.
+ typedef enum {
+ FLAG_LOST = 1<<0,
+ FLAG_WHITEOUT = 1<<1, // object logically does not exist
+ FLAG_DIRTY = 1<<2, // object has been modified since last flushed or undirtied
+ FLAG_OMAP = 1<<3, // has (or may have) some/any omap data
+ FLAG_DATA_DIGEST = 1<<4, // has data crc
+ FLAG_OMAP_DIGEST = 1<<5, // has omap crc
+ FLAG_CACHE_PIN = 1<<6, // pin the object in cache tier
+ FLAG_MANIFEST = 1<<7, // has manifest
+ FLAG_USES_TMAP = 1<<8, // deprecated; no longer used
+ FLAG_REDIRECT_HAS_REFERENCE = 1<<9, // has reference
+ } flag_t;
+
+ flag_t flags;
+
+ static std::string get_flag_string(flag_t flags) {
+ std::string s;
+ std::vector<std::string> sv = get_flag_vector(flags);
+ for (auto ss : sv) {
+ s += std::string("|") + ss;
+ }
+ if (s.length())
+ return s.substr(1);
+ return s;
+ }
+ static std::vector<std::string> get_flag_vector(flag_t flags) {
+ std::vector<std::string> sv;
+ if (flags & FLAG_LOST)
+ sv.insert(sv.end(), "lost");
+ if (flags & FLAG_WHITEOUT)
+ sv.insert(sv.end(), "whiteout");
+ if (flags & FLAG_DIRTY)
+ sv.insert(sv.end(), "dirty");
+ if (flags & FLAG_USES_TMAP)
+ sv.insert(sv.end(), "uses_tmap");
+ if (flags & FLAG_OMAP)
+ sv.insert(sv.end(), "omap");
+ if (flags & FLAG_DATA_DIGEST)
+ sv.insert(sv.end(), "data_digest");
+ if (flags & FLAG_OMAP_DIGEST)
+ sv.insert(sv.end(), "omap_digest");
+ if (flags & FLAG_CACHE_PIN)
+ sv.insert(sv.end(), "cache_pin");
+ if (flags & FLAG_MANIFEST)
+ sv.insert(sv.end(), "manifest");
+ if (flags & FLAG_REDIRECT_HAS_REFERENCE)
+ sv.insert(sv.end(), "redirect_has_reference");
+ return sv;
+ }
+ std::string get_flag_string() const {
+ return get_flag_string(flags);
+ }
+
+ uint64_t truncate_seq, truncate_size;
+
+ std::map<std::pair<uint64_t, entity_name_t>, watch_info_t> watchers;
+
+ // opportunistic checksums; may or may not be present
+ __u32 data_digest; ///< data crc32c
+ __u32 omap_digest; ///< omap crc32c
+
+ // alloc hint attribute
+ uint64_t expected_object_size, expected_write_size;
+ uint32_t alloc_hint_flags;
+
+ struct object_manifest_t manifest;
+
+ void copy_user_bits(const object_info_t& other);
+
+ bool test_flag(flag_t f) const {
+ return (flags & f) == f;
+ }
+ void set_flag(flag_t f) {
+ flags = (flag_t)(flags | f);
+ }
+ void clear_flag(flag_t f) {
+ flags = (flag_t)(flags & ~f);
+ }
+ bool is_lost() const {
+ return test_flag(FLAG_LOST);
+ }
+ bool is_whiteout() const {
+ return test_flag(FLAG_WHITEOUT);
+ }
+ bool is_dirty() const {
+ return test_flag(FLAG_DIRTY);
+ }
+ bool is_omap() const {
+ return test_flag(FLAG_OMAP);
+ }
+ bool is_data_digest() const {
+ return test_flag(FLAG_DATA_DIGEST);
+ }
+ bool is_omap_digest() const {
+ return test_flag(FLAG_OMAP_DIGEST);
+ }
+ bool is_cache_pinned() const {
+ return test_flag(FLAG_CACHE_PIN);
+ }
+ bool has_manifest() const {
+ return test_flag(FLAG_MANIFEST);
+ }
+ void set_data_digest(__u32 d) {
+ set_flag(FLAG_DATA_DIGEST);
+ data_digest = d;
+ }
+ void set_omap_digest(__u32 d) {
+ set_flag(FLAG_OMAP_DIGEST);
+ omap_digest = d;
+ }
+ void clear_data_digest() {
+ clear_flag(FLAG_DATA_DIGEST);
+ data_digest = -1;
+ }
+ void clear_omap_digest() {
+ clear_flag(FLAG_OMAP_DIGEST);
+ omap_digest = -1;
+ }
+ void new_object() {
+ clear_data_digest();
+ clear_omap_digest();
+ }
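+
+  // Illustrative only (hypothetical 'oi' of type object_info_t):
+  //   oi.set_data_digest(0x12345678);  // sets FLAG_DATA_DIGEST and stores the crc
+  //   oi.clear_data_digest();          // clears the flag and resets the digest
+  //                                    // to -1, the "unknown" sentinel used by
+  //                                    // the constructors below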
+
+ void encode(ceph::buffer::list& bl, uint64_t features) const;
+ void decode(ceph::buffer::list::const_iterator& bl);
+ void decode(const ceph::buffer::list& bl) {
+ auto p = std::cbegin(bl);
+ decode(p);
+ }
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<object_info_t*>& o);
+
+ explicit object_info_t()
+ : user_version(0), size(0), flags((flag_t)0),
+ truncate_seq(0), truncate_size(0),
+ data_digest(-1), omap_digest(-1),
+ expected_object_size(0), expected_write_size(0),
+ alloc_hint_flags(0)
+ {}
+
+ explicit object_info_t(const hobject_t& s)
+ : soid(s),
+ user_version(0), size(0), flags((flag_t)0),
+ truncate_seq(0), truncate_size(0),
+ data_digest(-1), omap_digest(-1),
+ expected_object_size(0), expected_write_size(0),
+ alloc_hint_flags(0)
+ {}
+
+ explicit object_info_t(ceph::buffer::list& bl) {
+ decode(bl);
+ }
+};
+WRITE_CLASS_ENCODER_FEATURES(object_info_t)
+
+std::ostream& operator<<(std::ostream& out, const object_info_t& oi);
+
+
+
+// Object recovery
+struct ObjectRecoveryInfo {
+ hobject_t soid;
+ eversion_t version;
+ uint64_t size;
+ object_info_t oi;
+ SnapSet ss; // only populated if soid is_snap()
+ interval_set<uint64_t> copy_subset;
+ std::map<hobject_t, interval_set<uint64_t>> clone_subset;
+ bool object_exist;
+
+ ObjectRecoveryInfo() : size(0), object_exist(true) { }
+
+ static void generate_test_instances(std::list<ObjectRecoveryInfo*>& o);
+ void encode(ceph::buffer::list &bl, uint64_t features) const;
+ void decode(ceph::buffer::list::const_iterator &bl, int64_t pool = -1);
+ std::ostream &print(std::ostream &out) const;
+ void dump(ceph::Formatter *f) const;
+};
+WRITE_CLASS_ENCODER_FEATURES(ObjectRecoveryInfo)
+std::ostream& operator<<(std::ostream& out, const ObjectRecoveryInfo &inf);
+
+struct ObjectRecoveryProgress {
+ uint64_t data_recovered_to;
+ std::string omap_recovered_to;
+ bool first;
+ bool data_complete;
+ bool omap_complete;
+ bool error = false;
+
+ ObjectRecoveryProgress()
+ : data_recovered_to(0),
+ first(true),
+ data_complete(false), omap_complete(false) { }
+
+ bool is_complete(const ObjectRecoveryInfo& info) const {
+ return (data_recovered_to >= (
+ info.copy_subset.empty() ?
+ 0 : info.copy_subset.range_end())) &&
+ omap_complete;
+ }
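+
+  // Illustrative only: if info.copy_subset covers [0, 4194304) then
+  // range_end() == 4194304, and the data side is complete once
+  // data_recovered_to >= 4194304; an empty copy_subset means there is no
+  // data to copy, so completion reduces to omap_complete.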
+
+ static void generate_test_instances(std::list<ObjectRecoveryProgress*>& o);
+ void encode(ceph::buffer::list &bl) const;
+ void decode(ceph::buffer::list::const_iterator &bl);
+ std::ostream &print(std::ostream &out) const;
+ void dump(ceph::Formatter *f) const;
+};
+WRITE_CLASS_ENCODER(ObjectRecoveryProgress)
+std::ostream& operator<<(std::ostream& out, const ObjectRecoveryProgress &prog);
+
+struct PushReplyOp {
+ hobject_t soid;
+
+ static void generate_test_instances(std::list<PushReplyOp*>& o);
+ void encode(ceph::buffer::list &bl) const;
+ void decode(ceph::buffer::list::const_iterator &bl);
+ std::ostream &print(std::ostream &out) const;
+ void dump(ceph::Formatter *f) const;
+
+ uint64_t cost(CephContext *cct) const;
+};
+WRITE_CLASS_ENCODER(PushReplyOp)
+std::ostream& operator<<(std::ostream& out, const PushReplyOp &op);
+
+struct PullOp {
+ hobject_t soid;
+
+ ObjectRecoveryInfo recovery_info;
+ ObjectRecoveryProgress recovery_progress;
+
+ static void generate_test_instances(std::list<PullOp*>& o);
+ void encode(ceph::buffer::list &bl, uint64_t features) const;
+ void decode(ceph::buffer::list::const_iterator &bl);
+ std::ostream &print(std::ostream &out) const;
+ void dump(ceph::Formatter *f) const;
+
+ uint64_t cost(CephContext *cct) const;
+};
+WRITE_CLASS_ENCODER_FEATURES(PullOp)
+std::ostream& operator<<(std::ostream& out, const PullOp &op);
+
+struct PushOp {
+ hobject_t soid;
+ eversion_t version;
+ ceph::buffer::list data;
+ interval_set<uint64_t> data_included;
+ ceph::buffer::list omap_header;
+ std::map<std::string, ceph::buffer::list> omap_entries;
+ std::map<std::string, ceph::buffer::list> attrset;
+
+ ObjectRecoveryInfo recovery_info;
+ ObjectRecoveryProgress before_progress;
+ ObjectRecoveryProgress after_progress;
+
+ static void generate_test_instances(std::list<PushOp*>& o);
+ void encode(ceph::buffer::list &bl, uint64_t features) const;
+ void decode(ceph::buffer::list::const_iterator &bl);
+ std::ostream &print(std::ostream &out) const;
+ void dump(ceph::Formatter *f) const;
+
+ uint64_t cost(CephContext *cct) const;
+};
+WRITE_CLASS_ENCODER_FEATURES(PushOp)
+std::ostream& operator<<(std::ostream& out, const PushOp &op);
+
+enum class scrub_level_t : bool { shallow = false, deep = true };
+enum class scrub_type_t : bool { not_repair = false, do_repair = true };
+
+/*
+ * summarize pg contents for purposes of a scrub
+ */
+struct ScrubMap {
+ struct object {
+ std::map<std::string, ceph::buffer::ptr> attrs;
+ uint64_t size;
+ __u32 omap_digest; ///< omap crc32c
+ __u32 digest; ///< data crc32c
+ bool negative:1;
+ bool digest_present:1;
+ bool omap_digest_present:1;
+ bool read_error:1;
+ bool stat_error:1;
+ bool ec_hash_mismatch:1;
+ bool ec_size_mismatch:1;
+ bool large_omap_object_found:1;
+ uint64_t large_omap_object_key_count = 0;
+ uint64_t large_omap_object_value_size = 0;
+ uint64_t object_omap_bytes = 0;
+ uint64_t object_omap_keys = 0;
+
+ object() :
+ // Init invalid size so it won't match if we get a stat EIO error
+ size(-1), omap_digest(0), digest(0),
+ negative(false), digest_present(false), omap_digest_present(false),
+ read_error(false), stat_error(false), ec_hash_mismatch(false),
+ ec_size_mismatch(false), large_omap_object_found(false) {}
+
+ void encode(ceph::buffer::list& bl) const;
+ void decode(ceph::buffer::list::const_iterator& bl);
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<object*>& o);
+ };
+ WRITE_CLASS_ENCODER(object)
+
+ std::map<hobject_t,object> objects;
+ eversion_t valid_through;
+ eversion_t incr_since;
+ bool has_large_omap_object_errors:1;
+ bool has_omap_keys:1;
+
+ void merge_incr(const ScrubMap &l);
+ void clear_from(const hobject_t& start) {
+ objects.erase(objects.lower_bound(start), objects.end());
+ }
+ void insert(const ScrubMap &r) {
+ objects.insert(r.objects.begin(), r.objects.end());
+ }
+ void swap(ScrubMap &r) {
+ using std::swap;
+ swap(objects, r.objects);
+ swap(valid_through, r.valid_through);
+ swap(incr_since, r.incr_since);
+ }
+
+ void encode(ceph::buffer::list& bl) const;
+ void decode(ceph::buffer::list::const_iterator& bl, int64_t pool=-1);
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<ScrubMap*>& o);
+};
+WRITE_CLASS_ENCODER(ScrubMap::object)
+WRITE_CLASS_ENCODER(ScrubMap)
+
+struct ScrubMapBuilder {
+ bool deep = false;
+ std::vector<hobject_t> ls;
+ size_t pos = 0;
+ int64_t data_pos = 0;
+ std::string omap_pos;
+ int ret = 0;
+  ceph::buffer::hash data_hash, omap_hash; ///< accumulating hash value
+ uint64_t omap_keys = 0;
+ uint64_t omap_bytes = 0;
+
+ bool empty() {
+ return ls.empty();
+ }
+ bool done() {
+ return pos >= ls.size();
+ }
+ void reset() {
+ *this = ScrubMapBuilder();
+ }
+
+ bool data_done() {
+ return data_pos < 0;
+ }
+
+ void next_object() {
+ ++pos;
+ data_pos = 0;
+ omap_pos.clear();
+ omap_keys = 0;
+ omap_bytes = 0;
+ }
+
+ friend std::ostream& operator<<(std::ostream& out, const ScrubMapBuilder& pos) {
+ out << "(" << pos.pos << "/" << pos.ls.size();
+ if (pos.pos < pos.ls.size()) {
+ out << " " << pos.ls[pos.pos];
+ }
+ if (pos.data_pos < 0) {
+ out << " byte " << pos.data_pos;
+ }
+ if (!pos.omap_pos.empty()) {
+ out << " key " << pos.omap_pos;
+ }
+ if (pos.deep) {
+ out << " deep";
+ }
+ if (pos.ret) {
+ out << " ret " << pos.ret;
+ }
+ return out << ")";
+ }
+};
+
+struct watch_item_t {
+ entity_name_t name;
+ uint64_t cookie;
+ uint32_t timeout_seconds;
+ entity_addr_t addr;
+
+ watch_item_t() : cookie(0), timeout_seconds(0) { }
+ watch_item_t(entity_name_t name, uint64_t cookie, uint32_t timeout,
+ const entity_addr_t& addr)
+ : name(name), cookie(cookie), timeout_seconds(timeout),
+ addr(addr) { }
+
+ void encode(ceph::buffer::list &bl, uint64_t features) const {
+ ENCODE_START(2, 1, bl);
+ encode(name, bl);
+ encode(cookie, bl);
+ encode(timeout_seconds, bl);
+ encode(addr, bl, features);
+ ENCODE_FINISH(bl);
+ }
+ void decode(ceph::buffer::list::const_iterator &bl) {
+ DECODE_START(2, bl);
+ decode(name, bl);
+ decode(cookie, bl);
+ decode(timeout_seconds, bl);
+ if (struct_v >= 2) {
+ decode(addr, bl);
+ }
+ DECODE_FINISH(bl);
+ }
+ void dump(ceph::Formatter *f) const {
+ f->dump_stream("watcher") << name;
+ f->dump_int("cookie", cookie);
+ f->dump_int("timeout", timeout_seconds);
+ f->open_object_section("addr");
+ addr.dump(f);
+ f->close_section();
+ }
+ static void generate_test_instances(std::list<watch_item_t*>& o) {
+ entity_addr_t ea;
+ ea.set_type(entity_addr_t::TYPE_LEGACY);
+ ea.set_nonce(1000);
+ ea.set_family(AF_INET);
+ ea.set_in4_quad(0, 127);
+ ea.set_in4_quad(1, 0);
+ ea.set_in4_quad(2, 0);
+ ea.set_in4_quad(3, 1);
+ ea.set_port(1024);
+ o.push_back(new watch_item_t(entity_name_t(entity_name_t::TYPE_CLIENT, 1), 10, 30, ea));
+ ea.set_nonce(1001);
+ ea.set_in4_quad(3, 2);
+ ea.set_port(1025);
+ o.push_back(new watch_item_t(entity_name_t(entity_name_t::TYPE_CLIENT, 2), 20, 60, ea));
+ }
+};
+WRITE_CLASS_ENCODER_FEATURES(watch_item_t)
+
+struct obj_watch_item_t {
+ hobject_t obj;
+ watch_item_t wi;
+};
+
+/**
+ * obj list watch response format
+ *
+ */
+struct obj_list_watch_response_t {
+ std::list<watch_item_t> entries;
+
+ void encode(ceph::buffer::list& bl, uint64_t features) const {
+ ENCODE_START(1, 1, bl);
+ encode(entries, bl, features);
+ ENCODE_FINISH(bl);
+ }
+ void decode(ceph::buffer::list::const_iterator& bl) {
+ DECODE_START(1, bl);
+ decode(entries, bl);
+ DECODE_FINISH(bl);
+ }
+ void dump(ceph::Formatter *f) const {
+ f->open_array_section("entries");
+ for (std::list<watch_item_t>::const_iterator p = entries.begin(); p != entries.end(); ++p) {
+ f->open_object_section("watch");
+ p->dump(f);
+ f->close_section();
+ }
+ f->close_section();
+ }
+ static void generate_test_instances(std::list<obj_list_watch_response_t*>& o) {
+ entity_addr_t ea;
+ o.push_back(new obj_list_watch_response_t);
+ o.push_back(new obj_list_watch_response_t);
+ std::list<watch_item_t*> test_watchers;
+ watch_item_t::generate_test_instances(test_watchers);
+ for (auto &e : test_watchers) {
+ o.back()->entries.push_back(*e);
+ delete e;
+ }
+ }
+};
+WRITE_CLASS_ENCODER_FEATURES(obj_list_watch_response_t)
+
+struct clone_info {
+ snapid_t cloneid;
+ std::vector<snapid_t> snaps; // ascending
+ std::vector< std::pair<uint64_t,uint64_t> > overlap;
+ uint64_t size;
+
+ clone_info() : cloneid(CEPH_NOSNAP), size(0) {}
+
+ void encode(ceph::buffer::list& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(cloneid, bl);
+ encode(snaps, bl);
+ encode(overlap, bl);
+ encode(size, bl);
+ ENCODE_FINISH(bl);
+ }
+ void decode(ceph::buffer::list::const_iterator& bl) {
+ DECODE_START(1, bl);
+ decode(cloneid, bl);
+ decode(snaps, bl);
+ decode(overlap, bl);
+ decode(size, bl);
+ DECODE_FINISH(bl);
+ }
+ void dump(ceph::Formatter *f) const {
+ if (cloneid == CEPH_NOSNAP)
+ f->dump_string("cloneid", "HEAD");
+ else
+ f->dump_unsigned("cloneid", cloneid.val);
+ f->open_array_section("snapshots");
+ for (std::vector<snapid_t>::const_iterator p = snaps.begin(); p != snaps.end(); ++p) {
+ f->open_object_section("snap");
+ f->dump_unsigned("id", p->val);
+ f->close_section();
+ }
+ f->close_section();
+ f->open_array_section("overlaps");
+ for (std::vector< std::pair<uint64_t,uint64_t> >::const_iterator q = overlap.begin();
+ q != overlap.end(); ++q) {
+ f->open_object_section("overlap");
+ f->dump_unsigned("offset", q->first);
+ f->dump_unsigned("length", q->second);
+ f->close_section();
+ }
+ f->close_section();
+ f->dump_unsigned("size", size);
+ }
+ static void generate_test_instances(std::list<clone_info*>& o) {
+ o.push_back(new clone_info);
+ o.push_back(new clone_info);
+ o.back()->cloneid = 1;
+ o.back()->snaps.push_back(1);
+ o.back()->overlap.push_back(std::pair<uint64_t,uint64_t>(0,4096));
+ o.back()->overlap.push_back(std::pair<uint64_t,uint64_t>(8192,4096));
+ o.back()->size = 16384;
+ o.push_back(new clone_info);
+ o.back()->cloneid = CEPH_NOSNAP;
+ o.back()->size = 32768;
+ }
+};
+WRITE_CLASS_ENCODER(clone_info)
+
+/**
+ * obj list snaps response format
+ *
+ */
+struct obj_list_snap_response_t {
+ std::vector<clone_info> clones; // ascending
+ snapid_t seq;
+
+ void encode(ceph::buffer::list& bl) const {
+ ENCODE_START(2, 1, bl);
+ encode(clones, bl);
+ encode(seq, bl);
+ ENCODE_FINISH(bl);
+ }
+ void decode(ceph::buffer::list::const_iterator& bl) {
+ DECODE_START(2, bl);
+ decode(clones, bl);
+ if (struct_v >= 2)
+ decode(seq, bl);
+ else
+ seq = CEPH_NOSNAP;
+ DECODE_FINISH(bl);
+ }
+ void dump(ceph::Formatter *f) const {
+ f->open_array_section("clones");
+ for (std::vector<clone_info>::const_iterator p = clones.begin(); p != clones.end(); ++p) {
+ f->open_object_section("clone");
+ p->dump(f);
+ f->close_section();
+ }
+ f->dump_unsigned("seq", seq);
+ f->close_section();
+ }
+ static void generate_test_instances(std::list<obj_list_snap_response_t*>& o) {
+ o.push_back(new obj_list_snap_response_t);
+ o.push_back(new obj_list_snap_response_t);
+ clone_info cl;
+ cl.cloneid = 1;
+ cl.snaps.push_back(1);
+ cl.overlap.push_back(std::pair<uint64_t,uint64_t>(0,4096));
+ cl.overlap.push_back(std::pair<uint64_t,uint64_t>(8192,4096));
+ cl.size = 16384;
+ o.back()->clones.push_back(cl);
+ cl.cloneid = CEPH_NOSNAP;
+ cl.snaps.clear();
+ cl.overlap.clear();
+ cl.size = 32768;
+ o.back()->clones.push_back(cl);
+ o.back()->seq = 123;
+ }
+};
+
+WRITE_CLASS_ENCODER(obj_list_snap_response_t)
+
+// PromoteCounter
+
+struct PromoteCounter {
+ std::atomic<unsigned long long> attempts{0};
+ std::atomic<unsigned long long> objects{0};
+ std::atomic<unsigned long long> bytes{0};
+
+ void attempt() {
+ attempts++;
+ }
+
+ void finish(uint64_t size) {
+ objects++;
+ bytes += size;
+ }
+
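+  // Illustrative only: sample_and_attenuate() reports the current totals and
+  // then halves them, so repeated sampling with no new activity decays the
+  // counters geometrically (e.g. attempts 8 -> 4 -> 2 -> 1 -> 0).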
+ void sample_and_attenuate(uint64_t *a, uint64_t *o, uint64_t *b) {
+ *a = attempts;
+ *o = objects;
+ *b = bytes;
+ attempts = *a / 2;
+ objects = *o / 2;
+ bytes = *b / 2;
+ }
+};
+
+struct pool_pg_num_history_t {
+ /// last epoch updated
+ epoch_t epoch = 0;
+ /// poolid -> epoch -> pg_num
+ std::map<int64_t, std::map<epoch_t,uint32_t>> pg_nums;
+ /// pair(epoch, poolid)
+ std::set<std::pair<epoch_t,int64_t>> deleted_pools;
+
+ void log_pg_num_change(epoch_t epoch, int64_t pool, uint32_t pg_num) {
+ pg_nums[pool][epoch] = pg_num;
+ }
+ void log_pool_delete(epoch_t epoch, int64_t pool) {
+ deleted_pools.insert(std::make_pair(epoch, pool));
+ }
+
+ /// prune history based on oldest osdmap epoch in the cluster
+ void prune(epoch_t oldest_epoch) {
+ auto i = deleted_pools.begin();
+ while (i != deleted_pools.end()) {
+ if (i->first >= oldest_epoch) {
+ break;
+ }
+ pg_nums.erase(i->second);
+ i = deleted_pools.erase(i);
+ }
+ for (auto& j : pg_nums) {
+ auto k = j.second.lower_bound(oldest_epoch);
+ // keep this and the entry before it (just to be paranoid)
+ if (k != j.second.begin()) {
+ --k;
+ j.second.erase(j.second.begin(), k);
+ }
+ }
+ }
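+
+  // Illustrative only: with pg_num changes recorded for a pool at epochs
+  // {10, 20, 30}, prune(25) keeps the epoch-20 and epoch-30 entries (the
+  // last change before the oldest in-use epoch is retained on purpose) and
+  // drops the epoch-10 entry.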
+
+ void encode(ceph::buffer::list& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(epoch, bl);
+ encode(pg_nums, bl);
+ encode(deleted_pools, bl);
+ ENCODE_FINISH(bl);
+ }
+ void decode(ceph::buffer::list::const_iterator& p) {
+ DECODE_START(1, p);
+ decode(epoch, p);
+ decode(pg_nums, p);
+ decode(deleted_pools, p);
+ DECODE_FINISH(p);
+ }
+ void dump(ceph::Formatter *f) const {
+ f->dump_unsigned("epoch", epoch);
+ f->open_object_section("pools");
+ for (auto& i : pg_nums) {
+ f->open_object_section("pool");
+ f->dump_unsigned("pool_id", i.first);
+ f->open_array_section("changes");
+ for (auto& j : i.second) {
+ f->open_object_section("change");
+ f->dump_unsigned("epoch", j.first);
+ f->dump_unsigned("pg_num", j.second);
+ f->close_section();
+ }
+ f->close_section();
+ f->close_section();
+ }
+ f->close_section();
+ f->open_array_section("deleted_pools");
+ for (auto& i : deleted_pools) {
+ f->open_object_section("deletion");
+ f->dump_unsigned("pool_id", i.second);
+ f->dump_unsigned("epoch", i.first);
+ f->close_section();
+ }
+ f->close_section();
+ }
+ static void generate_test_instances(std::list<pool_pg_num_history_t*>& ls) {
+ ls.push_back(new pool_pg_num_history_t);
+ }
+ friend std::ostream& operator<<(std::ostream& out, const pool_pg_num_history_t& h) {
+ return out << "pg_num_history(e" << h.epoch
+ << " pg_nums " << h.pg_nums
+ << " deleted_pools " << h.deleted_pools
+ << ")";
+ }
+};
+WRITE_CLASS_ENCODER(pool_pg_num_history_t)
+
+// prefix pgmeta_oid keys with _ so that PGLog::read_log_and_missing() can
+// easily skip them
+static const std::string_view infover_key = "_infover";
+static const std::string_view info_key = "_info";
+static const std::string_view biginfo_key = "_biginfo";
+static const std::string_view epoch_key = "_epoch";
+static const std::string_view fastinfo_key = "_fastinfo";
+
+static const __u8 pg_latest_struct_v = 10;
+// v10 is the new past_intervals encoding
+// v9 was fastinfo_key addition
+// v8 was the move to a per-pg pgmeta object
+// v7 was SnapMapper addition in 86658392516d5175b2756659ef7ffaaf95b0f8ad
+// (first appeared in cuttlefish).
+static const __u8 pg_compat_struct_v = 10;
+
+int prepare_info_keymap(
+ CephContext* cct,
+ std::map<std::string,ceph::buffer::list> *km,
+ std::string *key_to_remove,
+ epoch_t epoch,
+ pg_info_t &info,
+ pg_info_t &last_written_info,
+ PastIntervals &past_intervals,
+ bool dirty_big_info,
+ bool dirty_epoch,
+ bool try_fast_info,
+ PerfCounters *logger = nullptr,
+ DoutPrefixProvider *dpp = nullptr);
+
+namespace ceph::os {
+ class Transaction;
+};
+
+void create_pg_collection(
+ ceph::os::Transaction& t, spg_t pgid, int bits);
+
+void init_pg_ondisk(
+ ceph::os::Transaction& t, spg_t pgid, const pg_pool_t *pool);
+
+// omap specific stats
+struct omap_stat_t {
+ int large_omap_objects;
+ int64_t omap_bytes;
+ int64_t omap_keys;
+};
+
+// filter for pg listings
+class PGLSFilter {
+ CephContext* cct;
+protected:
+ std::string xattr;
+public:
+ PGLSFilter();
+ virtual ~PGLSFilter();
+ virtual bool filter(const hobject_t &obj,
+ const ceph::buffer::list& xattr_data) const = 0;
+
+ /**
+ * Arguments passed from the RADOS client. Implementations must
+ * handle any encoding errors, and return an appropriate error code,
+ * or 0 on valid input.
+ */
+ virtual int init(ceph::buffer::list::const_iterator &params) = 0;
+
+ /**
+ * xattr key, or empty string. If non-empty, this xattr will be fetched
+ * and the value passed into ::filter
+ */
+ virtual const std::string& get_xattr() const { return xattr; }
+
+ /**
+ * If true, objects without the named xattr (if xattr name is not empty)
+ * will be rejected without calling ::filter
+ */
+ virtual bool reject_empty_xattr() const { return true; }
+};
+
+class PGLSPlainFilter : public PGLSFilter {
+ std::string val;
+public:
+ int init(ceph::buffer::list::const_iterator &params) override;
+ ~PGLSPlainFilter() override {}
+ bool filter(const hobject_t& obj,
+ const ceph::buffer::list& xattr_data) const override;
+};
+
+// alias name for this structure:
+using missing_map_t = std::map<hobject_t,
+ std::pair<std::optional<uint32_t>,
+ std::optional<uint32_t>>>;
+
+#endif
diff --git a/src/osd/pg_scrubber.cc b/src/osd/pg_scrubber.cc
new file mode 100644
index 000000000..20ab0a1aa
--- /dev/null
+++ b/src/osd/pg_scrubber.cc
@@ -0,0 +1,2384 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=2 sw=2 smarttab
+
+#include "./pg_scrubber.h" // the '.' notation used to affect clang-format order
+
+#include <iostream>
+#include <vector>
+
+#include "debug.h"
+
+#include "common/errno.h"
+#include "messages/MOSDOp.h"
+#include "messages/MOSDRepScrub.h"
+#include "messages/MOSDRepScrubMap.h"
+#include "messages/MOSDScrub.h"
+#include "messages/MOSDScrubReserve.h"
+
+#include "OSD.h"
+#include "ScrubStore.h"
+#include "scrub_machine.h"
+
+using namespace Scrub;
+using namespace std::chrono;
+using namespace std::chrono_literals;
+
+#define dout_context (m_osds->cct)
+#define dout_subsys ceph_subsys_osd
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, this)
+
+template <class T>
+static ostream& _prefix(std::ostream* _dout, T* t)
+{
+ return t->gen_prefix(*_dout);
+}
+
+ostream& operator<<(ostream& out, const scrub_flags_t& sf)
+{
+ if (sf.auto_repair)
+ out << " AUTO_REPAIR";
+ if (sf.check_repair)
+ out << " CHECK_REPAIR";
+ if (sf.deep_scrub_on_error)
+ out << " DEEP_SCRUB_ON_ERROR";
+ if (sf.required)
+ out << " REQ_SCRUB";
+
+ return out;
+}
+
+ostream& operator<<(ostream& out, const requested_scrub_t& sf)
+{
+ if (sf.must_repair)
+ out << " MUST_REPAIR";
+ if (sf.auto_repair)
+ out << " planned AUTO_REPAIR";
+ if (sf.check_repair)
+ out << " planned CHECK_REPAIR";
+ if (sf.deep_scrub_on_error)
+ out << " planned DEEP_SCRUB_ON_ERROR";
+ if (sf.must_deep_scrub)
+ out << " MUST_DEEP_SCRUB";
+ if (sf.must_scrub)
+ out << " MUST_SCRUB";
+ if (sf.time_for_deep)
+ out << " TIME_FOR_DEEP";
+ if (sf.need_auto)
+ out << " NEED_AUTO";
+ if (sf.req_scrub)
+ out << " planned REQ_SCRUB";
+
+ return out;
+}
+
+/*
+ * if the incoming message is from a previous interval, it must mean
+ * PrimaryLogPG::on_change() was called when that interval ended. We can safely discard
+ * the stale message.
+ */
+bool PgScrubber::check_interval(epoch_t epoch_to_verify)
+{
+ return epoch_to_verify >= m_pg->get_same_interval_since();
+}
+
+bool PgScrubber::is_message_relevant(epoch_t epoch_to_verify)
+{
+ if (!m_active) {
+ // not scrubbing. We can assume that the scrub was already terminated, and we
+ // can silently discard the incoming event.
+ return false;
+ }
+
+ // is this a message from before we started this scrub?
+ if (epoch_to_verify < m_epoch_start) {
+ return false;
+ }
+
+ // has a new interval started?
+ if (!check_interval(epoch_to_verify)) {
+ // if this is a new interval, on_change() has already terminated that
+ // old scrub.
+ return false;
+ }
+
+ ceph_assert(is_primary());
+
+ // were we instructed to abort?
+ return verify_against_abort(epoch_to_verify);
+}
+
+bool PgScrubber::verify_against_abort(epoch_t epoch_to_verify)
+{
+ if (!should_abort()) {
+ return true;
+ }
+
+ dout(10) << __func__ << " aborting. incoming epoch: " << epoch_to_verify
+ << " vs last-aborted: " << m_last_aborted << dendl;
+
+ // if we were not aware of the abort before - kill the scrub.
+ if (epoch_to_verify >= m_last_aborted) {
+ scrub_clear_state();
+ m_last_aborted = std::max(epoch_to_verify, m_epoch_start);
+ }
+ return false;
+}
+
+bool PgScrubber::should_abort() const
+{
+ if (m_flags.required) {
+ return false; // not stopping 'required' scrubs for configuration changes
+ }
+
+ if (m_is_deep) {
+ if (get_osdmap()->test_flag(CEPH_OSDMAP_NODEEP_SCRUB) ||
+ m_pg->pool.info.has_flag(pg_pool_t::FLAG_NODEEP_SCRUB)) {
+ dout(10) << "nodeep_scrub set, aborting" << dendl;
+ return true;
+ }
+ } else if (get_osdmap()->test_flag(CEPH_OSDMAP_NOSCRUB) ||
+ m_pg->pool.info.has_flag(pg_pool_t::FLAG_NOSCRUB)) {
+ dout(10) << "noscrub set, aborting" << dendl;
+ return true;
+ }
+
+ return false;
+}
+
+// initiating state-machine events --------------------------------
+
+/*
+ * a note re the checks performed before sending scrub-initiating messages:
+ *
+ * For those ('StartScrub', 'AfterRepairScrub') scrub-initiation messages that
+ * possibly were in the queue while the PG changed state and became unavailable for
+ * scrubbing:
+ *
+ * The check_interval() catches all major changes to the PG. As for the other conditions
+ * we may check (see is_message_relevant() above):
+ *
+ * - we are not 'active' yet, so must not check against is_active(), and:
+ *
+ * - the 'abort' flags were just verified (when the triggering message was queued). As
+ *   those are only modified at human speeds, they need not be queried again.
+ *
+ * Some of the considerations above are also relevant to the replica-side initiation
+ * ('StartReplica' & 'StartReplicaNoWait').
+ */
+
+void PgScrubber::initiate_regular_scrub(epoch_t epoch_queued)
+{
+ dout(15) << __func__ << " epoch: " << epoch_queued << dendl;
+ // we may have lost our Primary status while the message languished in the queue
+ if (check_interval(epoch_queued)) {
+ dout(10) << "scrubber event -->> StartScrub epoch: " << epoch_queued << dendl;
+ reset_epoch(epoch_queued);
+ m_fsm->process_event(StartScrub{});
+ dout(10) << "scrubber event --<< StartScrub" << dendl;
+ } else {
+ // and just in case snap trimming was blocked by the aborted scrub
+ m_pg->snap_trimmer_scrub_complete();
+ clear_queued_or_active();
+ }
+}
+
+void PgScrubber::initiate_scrub_after_repair(epoch_t epoch_queued)
+{
+ dout(15) << __func__ << " epoch: " << epoch_queued << dendl;
+ // we may have lost our Primary status while the message languished in the queue
+ if (check_interval(epoch_queued)) {
+ dout(10) << "scrubber event -->> AfterRepairScrub epoch: " << epoch_queued << dendl;
+ reset_epoch(epoch_queued);
+ m_fsm->process_event(AfterRepairScrub{});
+ dout(10) << "scrubber event --<< AfterRepairScrub" << dendl;
+ } else {
+ m_pg->snap_trimmer_scrub_complete();
+ clear_queued_or_active();
+ }
+}
+void PgScrubber::send_scrub_unblock(epoch_t epoch_queued)
+{
+ dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
+ if (is_message_relevant(epoch_queued)) {
+ m_fsm->process_event(Unblocked{});
+ }
+ dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+void PgScrubber::send_scrub_resched(epoch_t epoch_queued)
+{
+ dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
+ if (is_message_relevant(epoch_queued)) {
+ m_fsm->process_event(InternalSchedScrub{});
+ }
+ dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+void PgScrubber::send_start_replica(epoch_t epoch_queued, Scrub::act_token_t token)
+{
+ dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued
+ << " token: " << token << dendl;
+ if (is_primary()) {
+ // shouldn't happen. Ignore
+ dout(1) << "got a replica scrub request while Primary!" << dendl;
+ return;
+ }
+
+ if (check_interval(epoch_queued) && is_token_current(token)) {
+ // save us some time by not waiting for updates if there are none
+ // to wait for. Affects the transition from NotActive into either
+ // ReplicaWaitUpdates or ActiveReplica.
+ if (pending_active_pushes())
+ m_fsm->process_event(StartReplica{});
+ else
+ m_fsm->process_event(StartReplicaNoWait{});
+ }
+ dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+void PgScrubber::send_sched_replica(epoch_t epoch_queued, Scrub::act_token_t token)
+{
+ dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued
+ << " token: " << token << dendl;
+ if (check_interval(epoch_queued) && is_token_current(token)) {
+ m_fsm->process_event(SchedReplica{}); // retest for map availability
+ }
+ dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+void PgScrubber::active_pushes_notification(epoch_t epoch_queued)
+{
+ // note: Primary only
+ dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
+ if (is_message_relevant(epoch_queued)) {
+ m_fsm->process_event(ActivePushesUpd{});
+ }
+ dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+void PgScrubber::update_applied_notification(epoch_t epoch_queued)
+{
+ // note: Primary only
+ dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
+ if (is_message_relevant(epoch_queued)) {
+ m_fsm->process_event(UpdatesApplied{});
+ }
+ dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+void PgScrubber::digest_update_notification(epoch_t epoch_queued)
+{
+ // note: Primary only
+ dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
+ if (is_message_relevant(epoch_queued)) {
+ m_fsm->process_event(DigestUpdate{});
+ }
+ dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+void PgScrubber::send_local_map_done(epoch_t epoch_queued)
+{
+ dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
+ if (is_message_relevant(epoch_queued)) {
+ m_fsm->process_event(Scrub::IntLocalMapDone{});
+ }
+ dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+void PgScrubber::send_replica_maps_ready(epoch_t epoch_queued)
+{
+ dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
+ if (is_message_relevant(epoch_queued)) {
+ m_fsm->process_event(GotReplicas{});
+ }
+ dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+void PgScrubber::send_replica_pushes_upd(epoch_t epoch_queued)
+{
+ dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
+ if (check_interval(epoch_queued)) {
+ m_fsm->process_event(ReplicaPushesUpd{});
+ }
+ dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+void PgScrubber::send_remotes_reserved(epoch_t epoch_queued)
+{
+ dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
+ // note: scrub is not active yet
+ if (check_interval(epoch_queued)) {
+ m_fsm->process_event(RemotesReserved{});
+ }
+ dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+void PgScrubber::send_reservation_failure(epoch_t epoch_queued)
+{
+ dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
+ if (check_interval(epoch_queued)) { // do not check for 'active'!
+ m_fsm->process_event(ReservationFailure{});
+ }
+ dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+void PgScrubber::send_full_reset(epoch_t epoch_queued)
+{
+ dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
+
+ m_fsm->process_event(Scrub::FullReset{});
+
+ dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+void PgScrubber::send_chunk_free(epoch_t epoch_queued)
+{
+ dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
+ if (check_interval(epoch_queued)) {
+ m_fsm->process_event(Scrub::SelectedChunkFree{});
+ }
+ dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+void PgScrubber::send_chunk_busy(epoch_t epoch_queued)
+{
+ dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
+ if (check_interval(epoch_queued)) {
+ m_fsm->process_event(Scrub::ChunkIsBusy{});
+ }
+ dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+void PgScrubber::send_get_next_chunk(epoch_t epoch_queued)
+{
+ dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
+ if (is_message_relevant(epoch_queued)) {
+ m_fsm->process_event(Scrub::NextChunk{});
+ }
+ dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+void PgScrubber::send_scrub_is_finished(epoch_t epoch_queued)
+{
+ dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
+
+ // can't check for "active"
+
+ m_fsm->process_event(Scrub::ScrubFinished{});
+
+ dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+void PgScrubber::send_maps_compared(epoch_t epoch_queued)
+{
+ dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
+
+ m_fsm->process_event(Scrub::MapsCompared{});
+
+ dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+// -----------------
+
+bool PgScrubber::is_reserving() const
+{
+ return m_fsm->is_reserving();
+}
+
+void PgScrubber::reset_epoch(epoch_t epoch_queued)
+{
+ dout(10) << __func__ << " state deep? " << state_test(PG_STATE_DEEP_SCRUB) << dendl;
+ m_fsm->assert_not_active();
+
+ m_epoch_start = epoch_queued;
+ m_needs_sleep = true;
+ m_is_deep = state_test(PG_STATE_DEEP_SCRUB);
+ update_op_mode_text();
+}
+
+unsigned int PgScrubber::scrub_requeue_priority(Scrub::scrub_prio_t with_priority) const
+{
+ unsigned int qu_priority = m_flags.priority;
+
+ if (with_priority == Scrub::scrub_prio_t::high_priority) {
+ qu_priority =
+ std::max(qu_priority, (unsigned int)m_pg->get_cct()->_conf->osd_client_op_priority);
+ }
+ return qu_priority;
+}
+
+unsigned int PgScrubber::scrub_requeue_priority(Scrub::scrub_prio_t with_priority,
+ unsigned int suggested_priority) const
+{
+ if (with_priority == Scrub::scrub_prio_t::high_priority) {
+ suggested_priority = std::max(suggested_priority,
+ (unsigned int)m_pg->cct->_conf->osd_client_op_priority);
+ }
+ return suggested_priority;
+}
+
+// ///////////////////////////////////////////////////////////////////// //
+// scrub-op registration handling
+
+bool PgScrubber::is_scrub_registered() const
+{
+ return !m_scrub_reg_stamp.is_zero();
+}
+
+void PgScrubber::reg_next_scrub(const requested_scrub_t& request_flags)
+{
+ if (!is_primary()) {
+ // normal. No warning is required.
+ return;
+ }
+
+ dout(10) << __func__ << " planned: must? " << request_flags.must_scrub << " need-auto? "
+ << request_flags.need_auto << " stamp: " << m_pg->info.history.last_scrub_stamp
+ << dendl;
+
+ ceph_assert(!is_scrub_registered());
+
+ utime_t reg_stamp;
+ bool must = false;
+
+ if (request_flags.must_scrub || request_flags.need_auto) {
+ // Set the smallest time that isn't utime_t()
+ reg_stamp = PgScrubber::scrub_must_stamp();
+ must = true;
+ } else if (m_pg->info.stats.stats_invalid &&
+ m_pg->cct->_conf->osd_scrub_invalid_stats) {
+ reg_stamp = ceph_clock_now();
+ must = true;
+ } else {
+ reg_stamp = m_pg->info.history.last_scrub_stamp;
+ }
+
+ dout(15) << __func__ << " pg(" << m_pg_id << ") must: " << must
+ << " required:" << m_flags.required << " flags: " << request_flags
+ << " stamp: " << reg_stamp << dendl;
+
+ const double scrub_min_interval =
+ m_pg->pool.info.opts.value_or(pool_opts_t::SCRUB_MIN_INTERVAL, 0.0);
+ const double scrub_max_interval =
+ m_pg->pool.info.opts.value_or(pool_opts_t::SCRUB_MAX_INTERVAL, 0.0);
+
+ // note the sched_time, so we can locate this scrub, and remove it later
+ m_scrub_reg_stamp = m_osds->reg_pg_scrub(m_pg->info.pgid, reg_stamp, scrub_min_interval,
+ scrub_max_interval, must);
+ dout(15) << __func__ << " pg(" << m_pg_id << ") register next scrub, scrub time "
+ << m_scrub_reg_stamp << ", must = " << (int)must << dendl;
+}
+
+void PgScrubber::unreg_next_scrub()
+{
+ if (is_scrub_registered()) {
+ dout(15) << __func__ << " existing-" << m_scrub_reg_stamp << dendl;
+ m_osds->unreg_pg_scrub(m_pg->info.pgid, m_scrub_reg_stamp);
+ m_scrub_reg_stamp = utime_t{};
+ }
+}
+
+void PgScrubber::scrub_requested(scrub_level_t scrub_level,
+ scrub_type_t scrub_type,
+ requested_scrub_t& req_flags)
+{
+ dout(10) << __func__ << (scrub_level == scrub_level_t::deep ? " deep " : " shallow ")
+ << (scrub_type == scrub_type_t::do_repair ? " repair-scrub " : " not-repair ")
+ << " prev stamp: " << m_scrub_reg_stamp << " " << is_scrub_registered()
+ << dendl;
+
+ unreg_next_scrub();
+
+ req_flags.must_scrub = true;
+ req_flags.must_deep_scrub =
+ (scrub_level == scrub_level_t::deep) || (scrub_type == scrub_type_t::do_repair);
+ req_flags.must_repair = (scrub_type == scrub_type_t::do_repair);
+ // User might intervene, so clear this
+ req_flags.need_auto = false;
+ req_flags.req_scrub = true;
+
+ dout(20) << __func__ << " pg(" << m_pg_id << ") planned:" << req_flags << dendl;
+
+ reg_next_scrub(req_flags);
+}
+
+void PgScrubber::request_rescrubbing(requested_scrub_t& req_flags)
+{
+ dout(10) << __func__ << " existing-" << m_scrub_reg_stamp << ". was registered? "
+ << is_scrub_registered() << dendl;
+
+ unreg_next_scrub();
+ req_flags.need_auto = true;
+ reg_next_scrub(req_flags);
+}
+
+bool PgScrubber::reserve_local()
+{
+ // try to create the reservation object (which translates into asking the
+  // OSD for the local scrub resource). If that fails, undo it immediately
+
+ m_local_osd_resource.emplace(m_osds);
+ if (m_local_osd_resource->is_reserved()) {
+ dout(15) << __func__ << ": local resources reserved" << dendl;
+ return true;
+ }
+
+ dout(10) << __func__ << ": failed to reserve local scrub resources" << dendl;
+ m_local_osd_resource.reset();
+ return false;
+}
+
+// ----------------------------------------------------------------------------
+
+bool PgScrubber::has_pg_marked_new_updates() const
+{
+ auto last_applied = m_pg->recovery_state.get_last_update_applied();
+ dout(10) << __func__ << " recovery last: " << last_applied
+ << " vs. scrub's: " << m_subset_last_update << dendl;
+
+ return last_applied >= m_subset_last_update;
+}
+
+void PgScrubber::set_subset_last_update(eversion_t e)
+{
+ m_subset_last_update = e;
+ dout(15) << __func__ << " last-update: " << e << dendl;
+}
+
+void PgScrubber::on_applied_when_primary(const eversion_t& applied_version)
+{
+ // we are only interested in updates if we are the Primary, and in state
+ // WaitLastUpdate
+ if (m_fsm->is_accepting_updates() && (applied_version >= m_subset_last_update)) {
+ m_osds->queue_scrub_applied_update(m_pg, m_pg->is_scrub_blocking_ops());
+ dout(15) << __func__ << " update: " << applied_version
+ << " vs. required: " << m_subset_last_update << dendl;
+ }
+}
+
+/*
+ * The selected range is set directly into 'm_start' and 'm_end'
+ * setting:
+ * - m_subset_last_update
+ * - m_max_end
+ * - end
+ * - start
+ */
+bool PgScrubber::select_range()
+{
+ m_primary_scrubmap = ScrubMap{};
+ m_received_maps.clear();
+
+ /* get the start and end of our scrub chunk
+ *
+ * Our scrub chunk has an important restriction we're going to need to
+ * respect. We can't let head be start or end.
+ * Using a half-open interval means that if end == head,
+ * we'd scrub/lock head and the clone right next to head in different
+ * chunks which would allow us to miss clones created between
+ * scrubbing that chunk and scrubbing the chunk including head.
+ * This isn't true for any of the other clones since clones can
+ * only be created "just to the left of" head. There is one exception
+ * to this: promotion of clones which always happens to the left of the
+ * left-most clone, but promote_object checks the scrubber in that
+ * case, so it should be ok. Also, it's ok to "miss" clones at the
+ * left end of the range if we are a tier because they may legitimately
+ * not exist (see _scrub).
+ */
+ int min_idx = std::max<int64_t>(
+ 3, m_pg->get_cct()->_conf->osd_scrub_chunk_min / preemption_data.chunk_divisor());
+
+ int max_idx = std::max<int64_t>(min_idx, m_pg->get_cct()->_conf->osd_scrub_chunk_max /
+ preemption_data.chunk_divisor());
+
+ dout(10) << __func__ << " Min: " << min_idx << " Max: " << max_idx
+ << " Div: " << preemption_data.chunk_divisor() << dendl;
+
+ hobject_t start = m_start;
+ hobject_t candidate_end;
+ std::vector<hobject_t> objects;
+ int ret = m_pg->get_pgbackend()->objects_list_partial(start, min_idx, max_idx, &objects,
+ &candidate_end);
+ ceph_assert(ret >= 0);
+
+ if (!objects.empty()) {
+
+ hobject_t back = objects.back();
+ while (candidate_end.is_head() && candidate_end == back.get_head()) {
+ candidate_end = back;
+ objects.pop_back();
+ if (objects.empty()) {
+ ceph_assert(0 ==
+		    "Somehow we got more than 2 objects which "
+		    "have the same head but are not clones");
+ }
+ back = objects.back();
+ }
+
+ if (candidate_end.is_head()) {
+ ceph_assert(candidate_end != back.get_head());
+ candidate_end = candidate_end.get_object_boundary();
+ }
+
+ } else {
+ ceph_assert(candidate_end.is_max());
+ }
+
+ // is that range free for us? if not - we will be rescheduled later by whoever
+ // triggered us this time
+
+ if (!m_pg->_range_available_for_scrub(m_start, candidate_end)) {
+ // we'll be requeued by whatever made us unavailable for scrub
+ dout(10) << __func__ << ": scrub blocked somewhere in range "
+ << "[" << m_start << ", " << candidate_end << ")" << dendl;
+ return false;
+ }
+
+ m_end = candidate_end;
+ if (m_end > m_max_end)
+ m_max_end = m_end;
+
+ dout(15) << __func__ << " range selected: " << m_start << " //// " << m_end << " //// "
+ << m_max_end << dendl;
+ return true;
+}
+
+void PgScrubber::select_range_n_notify()
+{
+ if (select_range()) {
+ // the next chunk to handle is not blocked
+ dout(20) << __func__ << ": selection OK" << dendl;
+ m_osds->queue_scrub_chunk_free(m_pg, Scrub::scrub_prio_t::low_priority);
+
+ } else {
+    // we will wait for the object range to become available for scrubbing
+ dout(10) << __func__ << ": selected chunk is busy" << dendl;
+ m_osds->queue_scrub_chunk_busy(m_pg, Scrub::scrub_prio_t::low_priority);
+ }
+}
+
+bool PgScrubber::write_blocked_by_scrub(const hobject_t& soid)
+{
+ if (soid < m_start || soid >= m_end) {
+ return false;
+ }
+
+ dout(20) << __func__ << " " << soid << " can preempt? "
+ << preemption_data.is_preemptable() << " already preempted? "
+ << preemption_data.was_preempted() << dendl;
+
+ if (preemption_data.was_preempted()) {
+    // otherwise, write requests arriving while 'already preempted' is set
+    // but 'preemptable' is not would not be allowed to continue, and would
+    // not be requeued in time.
+ return false;
+ }
+
+ if (preemption_data.is_preemptable()) {
+
+ dout(10) << __func__ << " " << soid << " preempted" << dendl;
+
+ // signal the preemption
+ preemption_data.do_preempt();
+ m_end = m_start; // free the range we were scrubbing
+
+ return false;
+ }
+ return true;
+}
+
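+// Illustrative only: with m_start == A and m_max_end == C (A < C), a request
+// over [B, D] where A <= B < C intersects the range being scrubbed, while a
+// request starting at C or beyond does not -- the scrubbed interval is
+// half-open on the right.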
+bool PgScrubber::range_intersects_scrub(const hobject_t& start, const hobject_t& end)
+{
+ // does [start, end] intersect [scrubber.start, scrubber.m_max_end)
+ return (start < m_max_end && end >= m_start);
+}
+
+/**
+ * if we are required to sleep:
+ *  arrange a callback some time later, and
+ *  make sure a stale callback can be identified.
+ * Otherwise: requeue right away (i.e. reschedule through the OSD queue).
+ */
+void PgScrubber::add_delayed_scheduling()
+{
+ m_end = m_start; // not blocking any range now
+
+ milliseconds sleep_time{0ms};
+ if (m_needs_sleep) {
+ double scrub_sleep = 1000.0 * m_osds->osd->scrub_sleep_time(m_flags.required);
+ sleep_time = milliseconds{long(scrub_sleep)};
+ }
+ dout(15) << __func__ << " sleep: " << sleep_time.count() << "ms. needed? "
+ << m_needs_sleep << dendl;
+
+ if (sleep_time.count()) {
+ // schedule a transition for some 'sleep_time' ms in the future
+
+ m_needs_sleep = false;
+ m_sleep_started_at = ceph_clock_now();
+
+ // the following log line is used by osd-scrub-test.sh
+ dout(20) << __func__ << " scrub state is PendingTimer, sleeping" << dendl;
+
+ // the 'delayer' for crimson is different. Will be factored out.
+
+ spg_t pgid = m_pg->get_pgid();
+ auto callbk = new LambdaContext([osds = m_osds, pgid,
+ scrbr = this]([[maybe_unused]] int r) mutable {
+ PGRef pg = osds->osd->lookup_lock_pg(pgid);
+ if (!pg) {
+ lgeneric_subdout(g_ceph_context, osd, 10)
+ << "scrub_requeue_callback: Could not find "
+ << "PG " << pgid << " can't complete scrub requeue after sleep" << dendl;
+ return;
+ }
+ scrbr->m_needs_sleep = true;
+ lgeneric_dout(scrbr->get_pg_cct(), 7)
+ << "scrub_requeue_callback: slept for "
+ << ceph_clock_now() - scrbr->m_sleep_started_at << ", re-queuing scrub" << dendl;
+
+ scrbr->m_sleep_started_at = utime_t{};
+ osds->queue_for_scrub_resched(&(*pg), Scrub::scrub_prio_t::low_priority);
+ pg->unlock();
+ });
+
+ std::lock_guard l(m_osds->sleep_lock);
+ m_osds->sleep_timer.add_event_after(sleep_time.count() / 1000.0f, callbk);
+
+ } else {
+ // just a requeue
+ m_osds->queue_for_scrub_resched(m_pg, Scrub::scrub_prio_t::high_priority);
+ }
+}
+
+eversion_t PgScrubber::search_log_for_updates() const
+{
+ auto& projected = m_pg->projected_log.log;
+ auto pi = find_if(
+ projected.crbegin(), projected.crend(),
+ [this](const auto& e) -> bool { return e.soid >= m_start && e.soid < m_end; });
+
+ if (pi != projected.crend())
+ return pi->version;
+
+ // there was no relevant update entry in the log
+
+ auto& log = m_pg->recovery_state.get_pg_log().get_log().log;
+ auto p = find_if(log.crbegin(), log.crend(), [this](const auto& e) -> bool {
+ return e.soid >= m_start && e.soid < m_end;
+ });
+
+ if (p == log.crend())
+ return eversion_t{};
+ else
+ return p->version;
+}
+
+void PgScrubber::get_replicas_maps(bool replica_can_preempt)
+{
+ dout(10) << __func__ << " started in epoch/interval: " << m_epoch_start << "/"
+ << m_interval_start
+ << " pg same_interval_since: " << m_pg->info.history.same_interval_since
+ << dendl;
+
+ m_primary_scrubmap_pos.reset();
+
+ // ask replicas to scan and send maps
+ for (const auto& i : m_pg->get_actingset()) {
+
+ if (i == m_pg_whoami)
+ continue;
+
+ m_maps_status.mark_replica_map_request(i);
+ _request_scrub_map(i, m_subset_last_update, m_start, m_end, m_is_deep,
+ replica_can_preempt);
+ }
+
+ dout(10) << __func__ << " awaiting" << m_maps_status << dendl;
+}
+
+bool PgScrubber::was_epoch_changed() const
+{
+ // for crimson we have m_pg->get_info().history.same_interval_since
+ dout(10) << __func__ << " epoch_start: " << m_interval_start
+ << " from pg: " << m_pg->get_history().same_interval_since << dendl;
+
+ return m_interval_start < m_pg->get_history().same_interval_since;
+}
+
+void PgScrubber::mark_local_map_ready()
+{
+ m_maps_status.mark_local_map_ready();
+}
+
+bool PgScrubber::are_all_maps_available() const
+{
+ return m_maps_status.are_all_maps_available();
+}
+
+std::string PgScrubber::dump_awaited_maps() const
+{
+ return m_maps_status.dump();
+}
+
+void PgScrubber::update_op_mode_text()
+{
+ auto visible_repair = state_test(PG_STATE_REPAIR);
+ m_mode_desc = (visible_repair ? "repair"sv : (m_is_deep ? "deep-scrub"sv : "scrub"sv));
+
+ dout(10) << __func__ << ": repair: visible: " << (visible_repair ? "true" : "false")
+ << ", internal: " << (m_is_repair ? "true" : "false")
+ << ". Displayed: " << m_mode_desc << dendl;
+}
+
+void PgScrubber::_request_scrub_map(pg_shard_t replica,
+ eversion_t version,
+ hobject_t start,
+ hobject_t end,
+ bool deep,
+ bool allow_preemption)
+{
+ ceph_assert(replica != m_pg_whoami);
+ dout(10) << __func__ << " scrubmap from osd." << replica
+ << (deep ? " deep" : " shallow") << dendl;
+
+ auto repscrubop =
+ new MOSDRepScrub(spg_t(m_pg->info.pgid.pgid, replica.shard), version,
+ get_osdmap_epoch(), m_pg->get_last_peering_reset(), start, end, deep,
+ allow_preemption, m_flags.priority, m_pg->ops_blocked_by_scrub());
+
+ // default priority. We want the replica-scrub processed prior to any recovery
+ // or client io messages (we are holding a lock!)
+ m_osds->send_message_osd_cluster(replica.osd, repscrubop, get_osdmap_epoch());
+}
+
+void PgScrubber::cleanup_store(ObjectStore::Transaction* t)
+{
+ if (!m_store)
+ return;
+
+ struct OnComplete : Context {
+ std::unique_ptr<Scrub::Store> store;
+ explicit OnComplete(std::unique_ptr<Scrub::Store>&& store) : store(std::move(store))
+ {}
+ void finish(int) override {}
+ };
+ m_store->cleanup(t);
+ t->register_on_complete(new OnComplete(std::move(m_store)));
+ ceph_assert(!m_store);
+}
+
+void PgScrubber::on_init()
+{
+ // going upwards from 'inactive'
+ ceph_assert(!is_scrub_active());
+
+ preemption_data.reset();
+ m_pg->publish_stats_to_osd();
+ m_interval_start = m_pg->get_history().same_interval_since;
+
+ dout(10) << __func__ << " start same_interval:" << m_interval_start << dendl;
+
+ // create a new store
+ {
+ ObjectStore::Transaction t;
+ cleanup_store(&t);
+ m_store.reset(
+ Scrub::Store::create(m_pg->osd->store, &t, m_pg->info.pgid, m_pg->coll));
+ m_pg->osd->store->queue_transaction(m_pg->ch, std::move(t), nullptr);
+ }
+
+ m_start = m_pg->info.pgid.pgid.get_hobj_start();
+ m_active = true;
+}
+
+void PgScrubber::on_replica_init()
+{
+ m_active = true;
+}
+
+void PgScrubber::_scan_snaps(ScrubMap& smap)
+{
+ hobject_t head;
+ SnapSet snapset;
+
+ // Test qa/standalone/scrub/osd-scrub-snaps.sh greps for the strings
+ // in this function
+ dout(15) << "_scan_snaps starts" << dendl;
+
+ for (auto i = smap.objects.rbegin(); i != smap.objects.rend(); ++i) {
+
+ const hobject_t& hoid = i->first;
+ ScrubMap::object& o = i->second;
+
+ dout(20) << __func__ << " " << hoid << dendl;
+
+ ceph_assert(!hoid.is_snapdir());
+ if (hoid.is_head()) {
+ // parse the SnapSet
+ bufferlist bl;
+ if (o.attrs.find(SS_ATTR) == o.attrs.end()) {
+ continue;
+ }
+ bl.push_back(o.attrs[SS_ATTR]);
+ auto p = bl.cbegin();
+ try {
+ decode(snapset, p);
+ } catch (...) {
+ continue;
+ }
+ head = hoid.get_head();
+ continue;
+ }
+
+ if (hoid.snap < CEPH_MAXSNAP) {
+ // check and if necessary fix snap_mapper
+ if (hoid.get_head() != head) {
+ derr << __func__ << " no head for " << hoid << " (have " << head << ")" << dendl;
+ continue;
+ }
+ set<snapid_t> obj_snaps;
+ auto p = snapset.clone_snaps.find(hoid.snap);
+ if (p == snapset.clone_snaps.end()) {
+ derr << __func__ << " no clone_snaps for " << hoid << " in " << snapset << dendl;
+ continue;
+ }
+ obj_snaps.insert(p->second.begin(), p->second.end());
+ set<snapid_t> cur_snaps;
+ int r = m_pg->snap_mapper.get_snaps(hoid, &cur_snaps);
+ if (r != 0 && r != -ENOENT) {
+ derr << __func__ << ": get_snaps returned " << cpp_strerror(r) << dendl;
+ ceph_abort();
+ }
+ if (r == -ENOENT || cur_snaps != obj_snaps) {
+ ObjectStore::Transaction t;
+ OSDriver::OSTransaction _t(m_pg->osdriver.get_transaction(&t));
+ if (r == 0) {
+ r = m_pg->snap_mapper.remove_oid(hoid, &_t);
+ if (r != 0) {
+ derr << __func__ << ": remove_oid returned " << cpp_strerror(r) << dendl;
+ ceph_abort();
+ }
+ m_pg->osd->clog->error()
+ << "osd." << m_pg->osd->whoami << " found snap mapper error on pg "
+ << m_pg->info.pgid << " oid " << hoid << " snaps in mapper: " << cur_snaps
+ << ", oi: " << obj_snaps << "...repaired";
+ } else {
+ m_pg->osd->clog->error()
+ << "osd." << m_pg->osd->whoami << " found snap mapper error on pg "
+ << m_pg->info.pgid << " oid " << hoid << " snaps missing in mapper"
+ << ", should be: " << obj_snaps << " was " << cur_snaps << " r " << r
+ << "...repaired";
+ }
+ m_pg->snap_mapper.add_oid(hoid, obj_snaps, &_t);
+
+ // wait for repair to apply to avoid confusing other bits of the system.
+ {
+ dout(15) << __func__ << " wait on repair!" << dendl;
+
+ ceph::condition_variable my_cond;
+ ceph::mutex my_lock = ceph::make_mutex("PG::_scan_snaps my_lock");
+ int e = 0;
+ bool done;
+
+ t.register_on_applied_sync(new C_SafeCond(my_lock, my_cond, &done, &e));
+
+ e = m_pg->osd->store->queue_transaction(m_pg->ch, std::move(t));
+ if (e != 0) {
+ derr << __func__ << ": queue_transaction got " << cpp_strerror(e) << dendl;
+ } else {
+ std::unique_lock l{my_lock};
+ my_cond.wait(l, [&done] { return done; });
+ }
+ }
+ }
+ }
+ }
+}
+
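+// build the Primary's own scrub map for the current chunk. -EINPROGRESS means the
+// backend still has work to do; a reschedule event is queued so that we are called
+// again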
+int PgScrubber::build_primary_map_chunk()
+{
+ epoch_t map_building_since = m_pg->get_osdmap_epoch();
+ dout(20) << __func__ << ": initiated at epoch " << map_building_since << dendl;
+
+ auto ret = build_scrub_map_chunk(m_primary_scrubmap, m_primary_scrubmap_pos, m_start,
+ m_end, m_is_deep);
+
+ if (ret == -EINPROGRESS) {
+ // reschedule another round of asking the backend to collect the scrub data
+ m_osds->queue_for_scrub_resched(m_pg, Scrub::scrub_prio_t::low_priority);
+ }
+ return ret;
+}
+
+int PgScrubber::build_replica_map_chunk()
+{
+ dout(10) << __func__ << " interval start: " << m_interval_start
+ << " current token: " << m_current_token << " epoch: " << m_epoch_start
+ << " deep: " << m_is_deep << dendl;
+
+ auto ret = build_scrub_map_chunk(replica_scrubmap, replica_scrubmap_pos, m_start, m_end,
+ m_is_deep);
+
+ switch (ret) {
+
+ case -EINPROGRESS:
+ // must wait for the backend to finish. No external event source.
+ // (note: previous version used low priority here. Now switched to using the
+ // priority of the original message)
+ m_osds->queue_for_rep_scrub_resched(m_pg, m_replica_request_priority,
+ m_flags.priority, m_current_token);
+ break;
+
+ case 0: {
+ // finished!
+ m_cleaned_meta_map.clear_from(m_start);
+ m_cleaned_meta_map.insert(replica_scrubmap);
+ auto for_meta_scrub = clean_meta_map();
+ _scan_snaps(for_meta_scrub);
+
+ // the local map has been created. Send it to the primary.
+ // Note: once the message reaches the Primary, it may ask us for another
+      // chunk - and we had better be done with the current scrub. Thus, the preparation
+      // of the reply message is separate, and we clear the scrub state before actually
+ // sending it.
+
+ auto reply = prep_replica_map_msg(PreemptionNoted::no_preemption);
+ replica_handling_done();
+ dout(15) << __func__ << " chunk map sent " << dendl;
+ send_replica_map(reply);
+ } break;
+
+ default:
+ // negative retval: build_scrub_map_chunk() signalled an error
+ // Pre-Pacific code ignored this option, treating it as a success.
+ // \todo Add an error flag in the returning message.
+ dout(1) << "Error! Aborting. ActiveReplica::react(SchedReplica) Ret: " << ret
+ << dendl;
+ replica_handling_done();
+ // only in debug mode for now:
+ assert(false && "backend error");
+ break;
+ };
+
+ return ret;
+}
+
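+// the chunk-scanning routine shared by the Primary and the replicas. Three phases:
+// 'start' - list the objects in the [start, end) range; 'scan' - have the backend
+// examine the listed objects (possibly over multiple calls, signalled by
+// -EINPROGRESS); 'finish' - fix object-info oids and return the completed map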
+int PgScrubber::build_scrub_map_chunk(
+ ScrubMap& map, ScrubMapBuilder& pos, hobject_t start, hobject_t end, bool deep)
+{
+ dout(10) << __func__ << " [" << start << "," << end << ") "
+ << " pos " << pos << " Deep: " << deep << dendl;
+
+ // start
+ while (pos.empty()) {
+
+ pos.deep = deep;
+ map.valid_through = m_pg->info.last_update;
+
+ // objects
+ vector<ghobject_t> rollback_obs;
+ pos.ret =
+ m_pg->get_pgbackend()->objects_list_range(start, end, &pos.ls, &rollback_obs);
+ dout(10) << __func__ << " while pos empty " << pos.ret << dendl;
+ if (pos.ret < 0) {
+ dout(5) << "objects_list_range error: " << pos.ret << dendl;
+ return pos.ret;
+ }
+ dout(10) << __func__ << " pos.ls.empty()? " << (pos.ls.empty() ? "+" : "-") << dendl;
+ if (pos.ls.empty()) {
+ break;
+ }
+ m_pg->_scan_rollback_obs(rollback_obs);
+ pos.pos = 0;
+ return -EINPROGRESS;
+ }
+
+ // scan objects
+ while (!pos.done()) {
+
+ int r = m_pg->get_pgbackend()->be_scan_list(map, pos);
+ if (r == -EINPROGRESS) {
+ dout(20) << __func__ << " in progress" << dendl;
+ return r;
+ }
+ }
+
+ // finish
+ dout(20) << __func__ << " finishing" << dendl;
+ ceph_assert(pos.done());
+ m_pg->_repair_oinfo_oid(map);
+
+ dout(20) << __func__ << " done, got " << map.objects.size() << " items" << dendl;
+ return 0;
+}
+
+/*
+ * Process:
+ * Build a map of objects suitable for snapshot validation.
+ * m_cleaned_meta_map holds the left-over partial items that must be completed
+ * before they can be processed.
+ *
+ * Snapshots in maps precede the head object, which is why we are scanning backwards.
+ */
+ScrubMap PgScrubber::clean_meta_map()
+{
+ ScrubMap for_meta_scrub;
+
+ if (m_end.is_max() || m_cleaned_meta_map.objects.empty()) {
+ m_cleaned_meta_map.swap(for_meta_scrub);
+ } else {
+ auto iter = m_cleaned_meta_map.objects.end();
+ --iter; // not empty, see 'if' clause
+ auto begin = m_cleaned_meta_map.objects.begin();
+ if (iter->first.has_snapset()) {
+ ++iter;
+ } else {
+ while (iter != begin) {
+ auto next = iter--;
+ if (next->first.get_head() != iter->first.get_head()) {
+ ++iter;
+ break;
+ }
+ }
+ }
+ for_meta_scrub.objects.insert(begin, iter);
+ m_cleaned_meta_map.objects.erase(begin, iter);
+ }
+
+ return for_meta_scrub;
+}
+
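+// complete all callbacks registered via add_callback(). The list is swapped into a
+// local copy first, so that callbacks registered while we are running are kept for
+// the next invocation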
+void PgScrubber::run_callbacks()
+{
+ std::list<Context*> to_run;
+ to_run.swap(m_callbacks);
+
+ for (auto& tr : to_run) {
+ tr->complete(0);
+ }
+}
+
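+// compare the collected maps, then prepare for the next chunk: advance m_start,
+// complete the pending callbacks, requeue operations that were waiting for the
+// scrub, and queue the 'maps compared' event for the FSM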
+void PgScrubber::maps_compare_n_cleanup()
+{
+ scrub_compare_maps();
+ m_start = m_end;
+ run_callbacks();
+ requeue_waiting();
+ m_osds->queue_scrub_maps_compared(m_pg, Scrub::scrub_prio_t::low_priority);
+}
+
+Scrub::preemption_t& PgScrubber::get_preemptor()
+{
+ return preemption_data;
+}
+
+/*
+ * Process note: called for the arriving "give me your map, replica!" request. Unlike
+ * the original implementation, we do not requeue the Op waiting for
+ * updates. Instead - we trigger the FSM.
+ */
+void PgScrubber::replica_scrub_op(OpRequestRef op)
+{
+ op->mark_started();
+ auto msg = op->get_req<MOSDRepScrub>();
+ dout(10) << __func__ << " pg:" << m_pg->pg_id << " Msg: map_epoch:" << msg->map_epoch
+ << " min_epoch:" << msg->min_epoch << " deep?" << msg->deep << dendl;
+
+ // are we still processing a previous scrub-map request without noticing that the
+  // interval changed? We won't detect that here, but rather at the reservation stage.
+
+ if (msg->map_epoch < m_pg->info.history.same_interval_since) {
+ dout(10) << "replica_scrub_op discarding old replica_scrub from " << msg->map_epoch
+ << " < " << m_pg->info.history.same_interval_since << dendl;
+
+ // is there a general sync issue? are we holding a stale reservation?
+ // not checking now - assuming we will actively react to interval change.
+
+ return;
+ }
+
+ if (is_queued_or_active()) {
+    // this is a bug!
+    // Somehow, we have received a new scrub request from our Primary before
+    // having finished with the previous one. Did we go through an interval
+    // change without resetting the FSM? Possible responses:
+ // - crashing (the original assert_not_active() implemented that one), or
+ // - trying to recover:
+ // - (logging enough information to debug this scenario)
+ // - reset the FSM.
+ m_osds->clog->warn()
+ << __func__
+ << ": error: a second scrub-op received while handling the previous one";
+
+ scrub_clear_state();
+ m_osds->clog->warn() << __func__
+ << ": after a reset. Now handling the new OP";
+ }
+ // make sure the FSM is at NotActive
+ m_fsm->assert_not_active();
+
+ replica_scrubmap = ScrubMap{};
+ replica_scrubmap_pos = ScrubMapBuilder{};
+
+ m_replica_min_epoch = msg->min_epoch;
+ m_start = msg->start;
+ m_end = msg->end;
+ m_max_end = msg->end;
+ m_is_deep = msg->deep;
+ m_interval_start = m_pg->info.history.same_interval_since;
+ m_replica_request_priority = msg->high_priority ? Scrub::scrub_prio_t::high_priority
+ : Scrub::scrub_prio_t::low_priority;
+ m_flags.priority = msg->priority ? msg->priority : m_pg->get_scrub_priority();
+
+ preemption_data.reset();
+ preemption_data.force_preemptability(msg->allow_preemption);
+
+ replica_scrubmap_pos.reset();
+
+ set_queued_or_active();
+ m_osds->queue_for_rep_scrub(m_pg, m_replica_request_priority,
+ m_flags.priority, m_current_token);
+}
+
+void PgScrubber::set_op_parameters(requested_scrub_t& request)
+{
+ dout(10) << __func__ << " input: " << request << dendl;
+
+ // write down the epoch of starting a new scrub. Will be used
+ // to discard stale messages from previous aborted scrubs.
+ m_epoch_start = m_pg->get_osdmap_epoch();
+
+ m_flags.check_repair = request.check_repair;
+ m_flags.auto_repair = request.auto_repair || request.need_auto;
+ m_flags.required = request.req_scrub || request.must_scrub;
+
+ m_flags.priority = (request.must_scrub || request.need_auto)
+ ? get_pg_cct()->_conf->osd_requested_scrub_priority
+ : m_pg->get_scrub_priority();
+
+ state_set(PG_STATE_SCRUBBING);
+
+ // will we be deep-scrubbing?
+ if (request.must_deep_scrub || request.need_auto || request.time_for_deep) {
+ state_set(PG_STATE_DEEP_SCRUB);
+ }
+
+ // m_is_repair is set for either 'must_repair' or 'repair-on-the-go' (i.e.
+ // deep-scrub with the auto_repair configuration flag set). m_is_repair value
+ // determines the scrubber behavior.
+ // PG_STATE_REPAIR, on the other hand, is only used for status reports (inc. the
+ // PG status as appearing in the logs).
+ m_is_repair = request.must_repair || m_flags.auto_repair;
+ if (request.must_repair) {
+ state_set(PG_STATE_REPAIR);
+ // not calling update_op_mode_text() yet, as m_is_deep not set yet
+ }
+
+  // the publishing here seems to be required for test synchronization
+ m_pg->publish_stats_to_osd();
+ m_flags.deep_scrub_on_error = request.deep_scrub_on_error;
+}
+
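+// all maps (ours and the replicas') are available: build the union ('master') set of
+// the scrubbed objects, run the omap checks, and - when there are replicas - compare
+// the maps to identify inconsistencies and authoritative copies. The cleaned meta-map
+// is then handed over to the snapshot-metadata scrub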
+void PgScrubber::scrub_compare_maps()
+{
+ dout(10) << __func__ << " has maps, analyzing" << dendl;
+
+ // construct authoritative scrub map for type-specific scrubbing
+ m_cleaned_meta_map.insert(m_primary_scrubmap);
+ map<hobject_t, pair<std::optional<uint32_t>, std::optional<uint32_t>>> missing_digest;
+
+ map<pg_shard_t, ScrubMap*> maps;
+ maps[m_pg_whoami] = &m_primary_scrubmap;
+
+ for (const auto& i : m_pg->get_actingset()) {
+ if (i == m_pg_whoami)
+ continue;
+ dout(2) << __func__ << " replica " << i << " has "
+ << m_received_maps[i].objects.size() << " items" << dendl;
+ maps[i] = &m_received_maps[i];
+ }
+
+ set<hobject_t> master_set;
+
+ // Construct master set
+ for (const auto& map : maps) {
+ for (const auto& i : map.second->objects) {
+ master_set.insert(i.first);
+ }
+ }
+
+ stringstream ss;
+ m_pg->get_pgbackend()->be_omap_checks(maps, master_set, m_omap_stats, ss);
+
+ if (!ss.str().empty()) {
+ m_osds->clog->warn(ss);
+ }
+
+ if (m_pg->recovery_state.get_actingset().size() > 1) {
+
+ dout(10) << __func__ << " comparing replica scrub maps" << dendl;
+
+ // Map from object with errors to good peer
+ map<hobject_t, list<pg_shard_t>> authoritative;
+
+ dout(2) << __func__ << ": primary (" << m_pg->get_primary() << ") has "
+ << m_primary_scrubmap.objects.size() << " items" << dendl;
+
+ ss.str("");
+ ss.clear();
+
+ m_pg->get_pgbackend()->be_compare_scrubmaps(
+ maps, master_set, m_is_repair, m_missing, m_inconsistent,
+ authoritative, missing_digest, m_shallow_errors, m_deep_errors, m_store.get(),
+ m_pg->info.pgid, m_pg->recovery_state.get_acting(), ss);
+
+ if (!ss.str().empty()) {
+ m_osds->clog->error(ss);
+ }
+
+ for (auto& i : authoritative) {
+ list<pair<ScrubMap::object, pg_shard_t>> good_peers;
+ for (list<pg_shard_t>::const_iterator j = i.second.begin(); j != i.second.end();
+ ++j) {
+ good_peers.emplace_back(maps[*j]->objects[i.first], *j);
+ }
+ m_authoritative.emplace(i.first, good_peers);
+ }
+
+ for (auto i = authoritative.begin(); i != authoritative.end(); ++i) {
+ m_cleaned_meta_map.objects.erase(i->first);
+ m_cleaned_meta_map.objects.insert(
+ *(maps[i->second.back()]->objects.find(i->first)));
+ }
+ }
+
+ auto for_meta_scrub = clean_meta_map();
+
+ // ok, do the pg-type specific scrubbing
+
+ // (Validates consistency of the object info and snap sets)
+ scrub_snapshot_metadata(for_meta_scrub, missing_digest);
+
+  // called here on the primary; a non-primary could have used an authoritative map instead
+ _scan_snaps(for_meta_scrub);
+
+ if (!m_store->empty()) {
+
+ if (m_is_repair) {
+ dout(10) << __func__ << ": discarding scrub results" << dendl;
+ m_store->flush(nullptr);
+ } else {
+ dout(10) << __func__ << ": updating scrub object" << dendl;
+ ObjectStore::Transaction t;
+ m_store->flush(&t);
+ m_pg->osd->store->queue_transaction(m_pg->ch, std::move(t), nullptr);
+ }
+ }
+}
+
+ScrubMachineListener::MsgAndEpoch PgScrubber::prep_replica_map_msg(
+ PreemptionNoted was_preempted)
+{
+ dout(10) << __func__ << " min epoch:" << m_replica_min_epoch << dendl;
+
+ auto reply =
+ make_message<MOSDRepScrubMap>(spg_t(m_pg->info.pgid.pgid, m_pg->get_primary().shard),
+ m_replica_min_epoch, m_pg_whoami);
+
+ reply->preempted = (was_preempted == PreemptionNoted::preempted);
+ ::encode(replica_scrubmap, reply->get_data());
+
+ return ScrubMachineListener::MsgAndEpoch{reply, m_replica_min_epoch};
+}
+
+void PgScrubber::send_replica_map(const MsgAndEpoch& preprepared)
+{
+ m_pg->send_cluster_message(m_pg->get_primary().osd, preprepared.m_msg,
+ preprepared.m_epoch, false);
+}
+
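+// tell the Primary that building our chunk map was preempted. Note that the replica
+// map (possibly empty) is still encoded into the reply message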
+void PgScrubber::send_preempted_replica()
+{
+ auto reply =
+ make_message<MOSDRepScrubMap>(spg_t{m_pg->info.pgid.pgid, m_pg->get_primary().shard},
+ m_replica_min_epoch, m_pg_whoami);
+
+ reply->preempted = true;
+ ::encode(replica_scrubmap, reply->get_data()); // must not skip this
+ m_pg->send_cluster_message(m_pg->get_primary().osd, reply, m_replica_min_epoch, false);
+}
+
+/*
+ * - if the replica lets us know it was interrupted, we mark the chunk as interrupted.
+ * The state-machine will react to that when all replica maps are received.
+ * - when all maps are received, we signal the FSM with the GotReplicas event (see
+ * scrub_send_replmaps_ready()). Note that due to the no-reentrancy limitations of the
+ * FSM, we do not 'process' the event directly. Instead - it is queued for the OSD to
+ * handle.
+ */
+void PgScrubber::map_from_replica(OpRequestRef op)
+{
+ auto m = op->get_req<MOSDRepScrubMap>();
+ dout(15) << __func__ << " " << *m << dendl;
+
+ if (m->map_epoch < m_pg->info.history.same_interval_since) {
+ dout(10) << __func__ << " discarding old from " << m->map_epoch << " < "
+ << m_pg->info.history.same_interval_since << dendl;
+ return;
+ }
+
+ auto p = const_cast<bufferlist&>(m->get_data()).cbegin();
+
+ m_received_maps[m->from].decode(p, m_pg->info.pgid.pool());
+ dout(15) << "map version is " << m_received_maps[m->from].valid_through << dendl;
+
+ auto [is_ok, err_txt] = m_maps_status.mark_arriving_map(m->from);
+ if (!is_ok) {
+ // previously an unexpected map was triggering an assert. Now, as scrubs can be
+ // aborted at any time, the chances of this happening have increased, and aborting is
+ // not justified
+ dout(1) << __func__ << err_txt << " from OSD " << m->from << dendl;
+ return;
+ }
+
+ if (m->preempted) {
+ dout(10) << __func__ << " replica was preempted, setting flag" << dendl;
+ preemption_data.do_preempt();
+ }
+
+ if (m_maps_status.are_all_maps_available()) {
+ dout(15) << __func__ << " all repl-maps available" << dendl;
+ m_osds->queue_scrub_got_repl_maps(m_pg, m_pg->is_scrub_blocking_ops());
+ }
+}
+
+void PgScrubber::handle_scrub_reserve_request(OpRequestRef op)
+{
+ dout(10) << __func__ << " " << *op->get_req() << dendl;
+ op->mark_started();
+ auto request_ep = op->get_req<MOSDScrubReserve>()->get_map_epoch();
+
+ /*
+ * if we are currently holding a reservation, then:
+ * either (1) we, the scrubber, did not yet notice an interval change. The remembered
+ * reservation epoch is from before our interval, and we can silently discard the
+ * reservation (no message is required).
+ * or:
+ * (2) the interval hasn't changed, but the same Primary that (we think) holds the
+ * lock just sent us a new request. Note that we know it's the same Primary, as
+ * otherwise the interval would have changed.
+ * Ostensibly we can discard & redo the reservation. But then we
+ * will be temporarily releasing the OSD resource - and might not be able to grab it
+ * again. Thus, we simply treat this as a successful new request
+ * (but mark the fact that if there is a previous request from the primary to
+ * scrub a specific chunk - that request is now defunct).
+ */
+
+ if (m_remote_osd_resource.has_value() && m_remote_osd_resource->is_stale()) {
+ // we are holding a stale reservation from a past epoch
+ m_remote_osd_resource.reset();
+ dout(10) << __func__ << " cleared existing stale reservation" << dendl;
+ }
+
+ if (request_ep < m_pg->get_same_interval_since()) {
+ // will not ack stale requests
+ return;
+ }
+
+ bool granted{false};
+ if (m_remote_osd_resource.has_value()) {
+
+ dout(10) << __func__ << " already reserved." << dendl;
+
+ /*
+ * it might well be that we did not yet finish handling the latest scrub-op from
+ * our primary. This happens, for example, if 'noscrub' was set via a command, then
+ * reset. The primary in this scenario will remain in the same interval, but we do need
+ * to reset our internal state (otherwise - the first renewed 'give me your scrub map'
+ * from the primary will see us in active state, crashing the OSD).
+ */
+ advance_token();
+ granted = true;
+
+ } else if (m_pg->cct->_conf->osd_scrub_during_recovery ||
+ !m_osds->is_recovery_active()) {
+ m_remote_osd_resource.emplace(this, m_pg, m_osds, request_ep);
+ // OSD resources allocated?
+ granted = m_remote_osd_resource->is_reserved();
+ if (!granted) {
+ // just forget it
+ m_remote_osd_resource.reset();
+ dout(20) << __func__ << ": failed to reserve remotely" << dendl;
+ }
+ }
+
+ dout(10) << __func__ << " reserved? " << (granted ? "yes" : "no") << dendl;
+
+ Message* reply = new MOSDScrubReserve(
+ spg_t(m_pg->info.pgid.pgid, m_pg->get_primary().shard), request_ep,
+ granted ? MOSDScrubReserve::GRANT : MOSDScrubReserve::REJECT, m_pg_whoami);
+
+ m_osds->send_message_osd_cluster(reply, op->get_req()->get_connection());
+}
+
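+// a replica has granted our scrub-resources reservation request. Forwarded to the
+// ReplicaReservations object - unless the reservation process is no longer active,
+// in which case the grant is only logged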
+void PgScrubber::handle_scrub_reserve_grant(OpRequestRef op, pg_shard_t from)
+{
+ dout(10) << __func__ << " " << *op->get_req() << dendl;
+ op->mark_started();
+
+ if (m_reservations.has_value()) {
+ m_reservations->handle_reserve_grant(op, from);
+ } else {
+ dout(20) << __func__ << ": late/unsolicited reservation grant from osd "
+ << from << " (" << op << ")" << dendl;
+ }
+}
+
+void PgScrubber::handle_scrub_reserve_reject(OpRequestRef op, pg_shard_t from)
+{
+ dout(10) << __func__ << " " << *op->get_req() << dendl;
+ op->mark_started();
+
+ if (m_reservations.has_value()) {
+ // there is an active reservation process. No action is required otherwise.
+ m_reservations->handle_reserve_reject(op, from);
+ }
+}
+
+void PgScrubber::handle_scrub_reserve_release(OpRequestRef op)
+{
+ dout(10) << __func__ << " " << *op->get_req() << dendl;
+ op->mark_started();
+
+ /*
+ * this specific scrub session has terminated. All incoming events carrying the old
+ * tag will be discarded.
+ */
+ advance_token();
+ m_remote_osd_resource.reset();
+}
+
+void PgScrubber::discard_replica_reservations()
+{
+ dout(10) << __func__ << dendl;
+ if (m_reservations.has_value()) {
+ m_reservations->discard_all();
+ }
+}
+
+void PgScrubber::clear_scrub_reservations()
+{
+ dout(10) << __func__ << dendl;
+ m_reservations.reset(); // the remote reservations
+ m_local_osd_resource.reset(); // the local reservation
+ m_remote_osd_resource.reset(); // we as replica reserved for a Primary
+}
+
+void PgScrubber::message_all_replicas(int32_t opcode, std::string_view op_text)
+{
+ ceph_assert(m_pg->recovery_state.get_backfill_targets().empty());
+
+ std::vector<pair<int, Message*>> messages;
+ messages.reserve(m_pg->get_actingset().size());
+
+ epoch_t epch = get_osdmap_epoch();
+
+ for (auto& p : m_pg->get_actingset()) {
+
+ if (p == m_pg_whoami)
+ continue;
+
+ dout(10) << "scrub requesting " << op_text << " from osd." << p << " Epoch: " << epch
+ << dendl;
+ Message* m = new MOSDScrubReserve(spg_t(m_pg->info.pgid.pgid, p.shard), epch, opcode,
+ m_pg_whoami);
+ messages.push_back(std::make_pair(p.osd, m));
+ }
+
+ if (!messages.empty()) {
+ m_osds->send_message_osd_cluster(messages, epch);
+ }
+}
+
+void PgScrubber::unreserve_replicas()
+{
+ dout(10) << __func__ << dendl;
+ m_reservations.reset();
+}
+
+void PgScrubber::set_queued_or_active()
+{
+ m_queued_or_active = true;
+}
+
+void PgScrubber::clear_queued_or_active()
+{
+ m_queued_or_active = false;
+}
+
+bool PgScrubber::is_queued_or_active() const
+{
+ return m_queued_or_active;
+}
+
+[[nodiscard]] bool PgScrubber::scrub_process_inconsistent()
+{
+ dout(10) << __func__ << ": checking authoritative (mode="
+ << m_mode_desc << ", auth remaining #: " << m_authoritative.size()
+ << ")" << dendl;
+
+ // authoritative only store objects which are missing or inconsistent.
+ if (!m_authoritative.empty()) {
+
+ stringstream ss;
+ ss << m_pg->info.pgid << " " << m_mode_desc << " " << m_missing.size() << " missing, "
+ << m_inconsistent.size() << " inconsistent objects";
+ dout(2) << ss.str() << dendl;
+ m_osds->clog->error(ss);
+
+ if (m_is_repair) {
+ state_clear(PG_STATE_CLEAN);
+ // we know we have a problem, so it's OK to set the user-visible flag
+ // even if we only reached here via auto-repair
+ state_set(PG_STATE_REPAIR);
+ update_op_mode_text();
+
+ for (const auto& [hobj, shrd_list] : m_authoritative) {
+
+ auto missing_entry = m_missing.find(hobj);
+
+ if (missing_entry != m_missing.end()) {
+ m_pg->repair_object(hobj, shrd_list, missing_entry->second);
+ m_fixed_count += missing_entry->second.size();
+ }
+
+ if (m_inconsistent.count(hobj)) {
+ m_pg->repair_object(hobj, shrd_list, m_inconsistent[hobj]);
+ m_fixed_count += m_inconsistent[hobj].size();
+ }
+ }
+ }
+ }
+ return (!m_authoritative.empty() && m_is_repair);
+}
+
+/*
+ * note: only called for the Primary.
+ */
+void PgScrubber::scrub_finish()
+{
+ dout(10) << __func__ << " before flags: " << m_flags
+ << ". repair state: " << (state_test(PG_STATE_REPAIR) ? "repair" : "no-repair")
+ << ". deep_scrub_on_error: " << m_flags.deep_scrub_on_error << dendl;
+
+ ceph_assert(m_pg->is_locked());
+ ceph_assert(is_queued_or_active());
+
+ m_pg->m_planned_scrub = requested_scrub_t{};
+
+  // if the repair request came from auto-repair, and the number of errors exceeds
+  // the configured limit, we cancel the auto-repair
+ if (m_is_repair && m_flags.auto_repair &&
+ m_authoritative.size() > m_pg->cct->_conf->osd_scrub_auto_repair_num_errors) {
+
+ dout(10) << __func__ << " undoing the repair" << dendl;
+ state_clear(PG_STATE_REPAIR); // not expected to be set, anyway
+ m_is_repair = false;
+ update_op_mode_text();
+ }
+
+ bool do_auto_scrub = false;
+
+ // if a regular scrub had errors within the limit, do a deep scrub to auto repair
+ if (m_flags.deep_scrub_on_error && !m_authoritative.empty() &&
+ m_authoritative.size() <= m_pg->cct->_conf->osd_scrub_auto_repair_num_errors) {
+ ceph_assert(!m_is_deep);
+ do_auto_scrub = true;
+ dout(15) << __func__ << " Try to auto repair after scrub errors" << dendl;
+ }
+
+ m_flags.deep_scrub_on_error = false;
+
+ // type-specific finish (can tally more errors)
+ _scrub_finish();
+
+ bool has_error = scrub_process_inconsistent();
+
+ {
+ stringstream oss;
+ oss << m_pg->info.pgid.pgid << " " << m_mode_desc << " ";
+ int total_errors = m_shallow_errors + m_deep_errors;
+ if (total_errors)
+ oss << total_errors << " errors";
+ else
+ oss << "ok";
+ if (!m_is_deep && m_pg->info.stats.stats.sum.num_deep_scrub_errors)
+ oss << " ( " << m_pg->info.stats.stats.sum.num_deep_scrub_errors
+ << " remaining deep scrub error details lost)";
+ if (m_is_repair)
+ oss << ", " << m_fixed_count << " fixed";
+ if (total_errors)
+ m_osds->clog->error(oss);
+ else
+ m_osds->clog->debug(oss);
+ }
+
+ // Since we don't know which errors were fixed, we can only clear them
+ // when every one has been fixed.
+ if (m_is_repair) {
+ if (m_fixed_count == m_shallow_errors + m_deep_errors) {
+
+ ceph_assert(m_is_deep);
+ m_shallow_errors = 0;
+ m_deep_errors = 0;
+ dout(20) << __func__ << " All may be fixed" << dendl;
+
+ } else if (has_error) {
+
+ // Deep scrub in order to get corrected error counts
+ m_pg->scrub_after_recovery = true;
+ m_pg->m_planned_scrub.req_scrub =
+ m_pg->m_planned_scrub.req_scrub || m_flags.required;
+
+ dout(20) << __func__ << " Current 'required': " << m_flags.required
+ << " Planned 'req_scrub': " << m_pg->m_planned_scrub.req_scrub << dendl;
+
+ } else if (m_shallow_errors || m_deep_errors) {
+
+ // We have errors but nothing can be fixed, so there is no repair
+ // possible.
+ state_set(PG_STATE_FAILED_REPAIR);
+ dout(10) << __func__ << " " << (m_shallow_errors + m_deep_errors)
+ << " error(s) present with no repair possible" << dendl;
+ }
+ }
+
+ {
+ // finish up
+ ObjectStore::Transaction t;
+ m_pg->recovery_state.update_stats(
+ [this](auto& history, auto& stats) {
+ dout(10) << "m_pg->recovery_state.update_stats()" << dendl;
+ utime_t now = ceph_clock_now();
+ history.last_scrub = m_pg->recovery_state.get_info().last_update;
+ history.last_scrub_stamp = now;
+ if (m_is_deep) {
+ history.last_deep_scrub = m_pg->recovery_state.get_info().last_update;
+ history.last_deep_scrub_stamp = now;
+ }
+
+ if (m_is_deep) {
+ if ((m_shallow_errors == 0) && (m_deep_errors == 0))
+ history.last_clean_scrub_stamp = now;
+ stats.stats.sum.num_shallow_scrub_errors = m_shallow_errors;
+ stats.stats.sum.num_deep_scrub_errors = m_deep_errors;
+ stats.stats.sum.num_large_omap_objects = m_omap_stats.large_omap_objects;
+ stats.stats.sum.num_omap_bytes = m_omap_stats.omap_bytes;
+ stats.stats.sum.num_omap_keys = m_omap_stats.omap_keys;
+ dout(25) << "scrub_finish shard " << m_pg_whoami
+ << " num_omap_bytes = " << stats.stats.sum.num_omap_bytes
+ << " num_omap_keys = " << stats.stats.sum.num_omap_keys << dendl;
+ } else {
+ stats.stats.sum.num_shallow_scrub_errors = m_shallow_errors;
+ // XXX: last_clean_scrub_stamp doesn't mean the pg is not inconsistent
+ // because of deep-scrub errors
+ if (m_shallow_errors == 0)
+ history.last_clean_scrub_stamp = now;
+ }
+ stats.stats.sum.num_scrub_errors = stats.stats.sum.num_shallow_scrub_errors +
+ stats.stats.sum.num_deep_scrub_errors;
+ if (m_flags.check_repair) {
+ m_flags.check_repair = false;
+ if (m_pg->info.stats.stats.sum.num_scrub_errors) {
+ state_set(PG_STATE_FAILED_REPAIR);
+ dout(10) << "scrub_finish " << m_pg->info.stats.stats.sum.num_scrub_errors
+ << " error(s) still present after re-scrub" << dendl;
+ }
+ }
+ return true;
+ },
+ &t);
+ int tr = m_osds->store->queue_transaction(m_pg->ch, std::move(t), nullptr);
+ ceph_assert(tr == 0);
+ }
+
+ if (has_error) {
+ m_pg->queue_peering_event(PGPeeringEventRef(std::make_shared<PGPeeringEvent>(
+ get_osdmap_epoch(), get_osdmap_epoch(), PeeringState::DoRecovery())));
+ } else {
+ m_is_repair = false;
+ state_clear(PG_STATE_REPAIR);
+ update_op_mode_text();
+ }
+
+ cleanup_on_finish();
+ if (do_auto_scrub) {
+ request_rescrubbing(m_pg->m_planned_scrub);
+ }
+
+ if (m_pg->is_active() && m_pg->is_primary()) {
+ m_pg->recovery_state.share_pg_info();
+ }
+
+ // we may have blocked the snap trimmer
+ m_pg->snap_trimmer_scrub_complete();
+}
+
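+// called as digest updates are applied. Once no more updates are pending: if this
+// was the last chunk (m_end is 'max') - queue the 'scrub finished' event; otherwise
+// reset the preemption data and ask for the next chunk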
+void PgScrubber::on_digest_updates()
+{
+ dout(10) << __func__ << " #pending: " << num_digest_updates_pending
+ << (m_end.is_max() ? " <last chunk>" : " <mid chunk>")
+ << (is_queued_or_active() ? "" : " ** not marked as scrubbing **")
+ << dendl;
+
+ if (num_digest_updates_pending > 0) {
+ // do nothing for now. We will be called again when new updates arrive
+ return;
+ }
+
+ // got all updates, and finished with this chunk. Any more?
+ if (m_end.is_max()) {
+ m_osds->queue_scrub_is_finished(m_pg);
+ } else {
+ // go get a new chunk (via "requeue")
+ preemption_data.reset();
+ m_osds->queue_scrub_next_chunk(m_pg, m_pg->is_scrub_blocking_ops());
+ }
+}
+
+
+/*
+ * note that the flags-set fetched from the PG (m_pg->m_planned_scrub)
+ * is cleared once scrubbing starts; some of the values dumped here are
+ * thus transitory.
+ */
+void PgScrubber::dump(ceph::Formatter* f) const
+{
+ f->open_object_section("scrubber");
+ f->dump_stream("epoch_start") << m_interval_start;
+ f->dump_bool("active", m_active);
+ if (m_active) {
+ f->dump_stream("start") << m_start;
+ f->dump_stream("end") << m_end;
+ f->dump_stream("m_max_end") << m_max_end;
+ f->dump_stream("subset_last_update") << m_subset_last_update;
+ f->dump_bool("deep", m_is_deep);
+ f->dump_bool("must_scrub", (m_pg->m_planned_scrub.must_scrub || m_flags.required));
+ f->dump_bool("must_deep_scrub", m_pg->m_planned_scrub.must_deep_scrub);
+ f->dump_bool("must_repair", m_pg->m_planned_scrub.must_repair);
+ f->dump_bool("need_auto", m_pg->m_planned_scrub.need_auto);
+ f->dump_bool("req_scrub", m_flags.required);
+ f->dump_bool("time_for_deep", m_pg->m_planned_scrub.time_for_deep);
+ f->dump_bool("auto_repair", m_flags.auto_repair);
+ f->dump_bool("check_repair", m_flags.check_repair);
+ f->dump_bool("deep_scrub_on_error", m_flags.deep_scrub_on_error);
+ f->dump_stream("scrub_reg_stamp") << m_scrub_reg_stamp; // utime_t
+ f->dump_unsigned("priority", m_flags.priority);
+ f->dump_int("shallow_errors", m_shallow_errors);
+ f->dump_int("deep_errors", m_deep_errors);
+ f->dump_int("fixed", m_fixed_count);
+ {
+ f->open_array_section("waiting_on_whom");
+ for (const auto& p : m_maps_status.get_awaited()) {
+ f->dump_stream("shard") << p;
+ }
+ f->close_section();
+ }
+ }
+ f->close_section();
+}
+
+
+void PgScrubber::handle_query_state(ceph::Formatter* f)
+{
+ dout(10) << __func__ << dendl;
+
+ f->open_object_section("scrub");
+ f->dump_stream("scrubber.epoch_start") << m_interval_start;
+ f->dump_bool("scrubber.active", m_active);
+ f->dump_stream("scrubber.start") << m_start;
+ f->dump_stream("scrubber.end") << m_end;
+ f->dump_stream("scrubber.m_max_end") << m_max_end;
+ f->dump_stream("scrubber.m_subset_last_update") << m_subset_last_update;
+ f->dump_bool("scrubber.deep", m_is_deep);
+ {
+ f->open_array_section("scrubber.waiting_on_whom");
+ for (const auto& p : m_maps_status.get_awaited()) {
+ f->dump_stream("shard") << p;
+ }
+ f->close_section();
+ }
+
+ f->dump_string("comment", "DEPRECATED - may be removed in the next release");
+
+ f->close_section();
+}
+
+PgScrubber::~PgScrubber() = default;
+
+PgScrubber::PgScrubber(PG* pg)
+ : m_pg{pg}
+ , m_pg_id{pg->pg_id}
+ , m_osds{m_pg->osd}
+ , m_pg_whoami{pg->pg_whoami}
+ , preemption_data{pg}
+{
+ m_fsm = std::make_unique<ScrubMachine>(m_pg, this);
+ m_fsm->initiate();
+}
+
+void PgScrubber::scrub_begin()
+{
+ stringstream ss;
+ ss << m_pg->info.pgid.pgid << " " << m_mode_desc << " starts";
+ dout(2) << ss.str() << dendl;
+ m_osds->clog->debug(ss);
+}
+
+void PgScrubber::reserve_replicas()
+{
+ dout(10) << __func__ << dendl;
+ m_reservations.emplace(m_pg, m_pg_whoami);
+}
+
+void PgScrubber::cleanup_on_finish()
+{
+ dout(10) << __func__ << dendl;
+ ceph_assert(m_pg->is_locked());
+
+ state_clear(PG_STATE_SCRUBBING);
+ state_clear(PG_STATE_DEEP_SCRUB);
+ m_pg->publish_stats_to_osd();
+
+ clear_scrub_reservations();
+ m_pg->publish_stats_to_osd();
+
+ requeue_waiting();
+
+ reset_internal_state();
+ m_flags = scrub_flags_t{};
+
+ // type-specific state clear
+ _scrub_clear_state();
+}
+
+// uses process_event(), so must be invoked externally
+void PgScrubber::scrub_clear_state()
+{
+ dout(10) << __func__ << dendl;
+
+ clear_pgscrub_state();
+ m_fsm->process_event(FullReset{});
+}
+
+/*
+ * note: does not access the state-machine
+ */
+void PgScrubber::clear_pgscrub_state()
+{
+ dout(10) << __func__ << dendl;
+ ceph_assert(m_pg->is_locked());
+
+ state_clear(PG_STATE_SCRUBBING);
+ state_clear(PG_STATE_DEEP_SCRUB);
+
+ state_clear(PG_STATE_REPAIR);
+
+ clear_scrub_reservations();
+ m_pg->publish_stats_to_osd();
+
+ requeue_waiting();
+
+ reset_internal_state();
+ m_flags = scrub_flags_t{};
+
+ // type-specific state clear
+ _scrub_clear_state();
+}
+
+void PgScrubber::replica_handling_done()
+{
+ dout(10) << __func__ << dendl;
+
+ state_clear(PG_STATE_SCRUBBING);
+ state_clear(PG_STATE_DEEP_SCRUB);
+
+ reset_internal_state();
+
+ m_pg->publish_stats_to_osd();
+}
+
+/*
+ * note: performs run_callbacks()
+ * note: reservations-related variables are not reset here
+ */
+void PgScrubber::reset_internal_state()
+{
+ dout(10) << __func__ << dendl;
+
+ preemption_data.reset();
+ m_maps_status.reset();
+ m_received_maps.clear();
+
+ m_start = hobject_t{};
+ m_end = hobject_t{};
+ m_max_end = hobject_t{};
+ m_subset_last_update = eversion_t{};
+ m_shallow_errors = 0;
+ m_deep_errors = 0;
+ m_fixed_count = 0;
+ m_omap_stats = (const struct omap_stat_t){0};
+
+ run_callbacks();
+
+ m_inconsistent.clear();
+ m_missing.clear();
+ m_authoritative.clear();
+ num_digest_updates_pending = 0;
+ m_primary_scrubmap = ScrubMap{};
+ m_primary_scrubmap_pos.reset();
+ replica_scrubmap = ScrubMap{};
+ replica_scrubmap_pos.reset();
+ m_cleaned_meta_map = ScrubMap{};
+ m_needs_sleep = true;
+ m_sleep_started_at = utime_t{};
+
+ m_active = false;
+ clear_queued_or_active();
+}
+
+// note that only applicable to the Replica:
+void PgScrubber::advance_token()
+{
+ dout(10) << __func__ << " was: " << m_current_token << dendl;
+ m_current_token++;
+
+ // when advance_token() is called, it is assumed that no scrubbing takes place.
+ // We will, though, verify that. And if we are actually still handling a stale request -
+ // both our internal state and the FSM state will be cleared.
+ replica_handling_done();
+ m_fsm->process_event(FullReset{});
+}
+
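+// a received token of zero is always considered current; any other value must match
+// the token last issued by advance_token()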
+bool PgScrubber::is_token_current(Scrub::act_token_t received_token)
+{
+ if (received_token == 0 || received_token == m_current_token) {
+ return true;
+ }
+  dout(5) << __func__ << " obsolete token (" << received_token
+          << " vs current " << m_current_token << ")" << dendl;
+
+ return false;
+}
+
+const OSDMapRef& PgScrubber::get_osdmap() const
+{
+ return m_pg->get_osdmap();
+}
+
+ostream& operator<<(ostream& out, const PgScrubber& scrubber)
+{
+ return out << scrubber.m_flags;
+}
+
+std::ostream& PgScrubber::gen_prefix(std::ostream& out) const
+{
+ const auto fsm_state = m_fsm ? m_fsm->current_states_desc() : "- :";
+ if (m_pg) {
+ return m_pg->gen_prefix(out) << "scrubber " << fsm_state << ": ";
+ } else {
+ return out << " scrubber [~] " << fsm_state << ": ";
+ }
+}
+
+ostream& PgScrubber::show(ostream& out) const
+{
+ return out << " [ " << m_pg_id << ": " << m_flags << " ] ";
+}
+
+// ///////////////////// preemption_data_t //////////////////////////////////
+
+PgScrubber::preemption_data_t::preemption_data_t(PG* pg) : m_pg{pg}
+{
+ m_left = static_cast<int>(
+ m_pg->get_cct()->_conf.get_val<uint64_t>("osd_scrub_max_preemptions"));
+}
+
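+// re-arm the preemption state: clear the 'preemptable' & 'preempted' flags and
+// reload the preemption budget from 'osd_scrub_max_preemptions'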
+void PgScrubber::preemption_data_t::reset()
+{
+ std::lock_guard<std::mutex> lk{m_preemption_lock};
+
+ m_preemptable = false;
+ m_preempted = false;
+ m_left =
+ static_cast<int>(m_pg->cct->_conf.get_val<uint64_t>("osd_scrub_max_preemptions"));
+ m_size_divisor = 1;
+}
+
+
+// ///////////////////// ReplicaReservations //////////////////////////////////
+namespace Scrub {
+
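+// send a RELEASE message to a single peer, freeing the scrub resources it has
+// reserved on our behalf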
+void ReplicaReservations::release_replica(pg_shard_t peer, epoch_t epoch)
+{
+ auto m = new MOSDScrubReserve(spg_t(m_pg->info.pgid.pgid, peer.shard), epoch,
+ MOSDScrubReserve::RELEASE, m_pg->pg_whoami);
+ m_osds->send_message_osd_cluster(peer.osd, m, epoch);
+}
+
+ReplicaReservations::ReplicaReservations(PG* pg, pg_shard_t whoami)
+ : m_pg{pg}
+ , m_acting_set{pg->get_actingset()}
+ , m_osds{m_pg->osd}
+ , m_pending{static_cast<int>(m_acting_set.size()) - 1}
+{
+ epoch_t epoch = m_pg->get_osdmap_epoch();
+
+ {
+ std::stringstream prefix;
+ prefix << "osd." << m_osds->whoami << " ep: " << epoch
+ << " scrubber::ReplicaReservations pg[" << pg->pg_id << "]: ";
+ m_log_msg_prefix = prefix.str();
+ }
+
+ // handle the special case of no replicas
+ if (m_pending <= 0) {
+ // just signal the scrub state-machine to continue
+ send_all_done();
+
+ } else {
+
+ for (auto p : m_acting_set) {
+ if (p == whoami)
+ continue;
+ auto m = new MOSDScrubReserve(spg_t(m_pg->info.pgid.pgid, p.shard), epoch,
+ MOSDScrubReserve::REQUEST, m_pg->pg_whoami);
+ m_osds->send_message_osd_cluster(p.osd, m, epoch);
+ m_waited_for_peers.push_back(p);
+ dout(10) << __func__ << ": reserve " << p.osd << dendl;
+ }
+ }
+}
+
+void ReplicaReservations::send_all_done()
+{
+ m_osds->queue_for_scrub_granted(m_pg, scrub_prio_t::low_priority);
+}
+
+void ReplicaReservations::send_reject()
+{
+ m_osds->queue_for_scrub_denied(m_pg, scrub_prio_t::low_priority);
+}
+
+void ReplicaReservations::discard_all()
+{
+ dout(10) << __func__ << ": " << m_reserved_peers << dendl;
+
+ m_had_rejections = true; // preventing late-coming responses from triggering events
+ m_reserved_peers.clear();
+ m_waited_for_peers.clear();
+}
+
+ReplicaReservations::~ReplicaReservations()
+{
+ m_had_rejections = true; // preventing late-coming responses from triggering events
+
+  // send un-reserve messages to all reserved replicas. We do not wait for an answer
+  // (there wouldn't be one). Other incoming messages will be discarded on the way, by
+  // our owner.
+ epoch_t epoch = m_pg->get_osdmap_epoch();
+
+ for (auto& p : m_reserved_peers) {
+ release_replica(p, epoch);
+ }
+ m_reserved_peers.clear();
+
+ // note: the release will follow on the heels of the request. When tried otherwise,
+ // grants that followed a reject arrived after the whole scrub machine-state was
+ // reset, causing leaked reservations.
+ for (auto& p : m_waited_for_peers) {
+ release_replica(p, epoch);
+ }
+ m_waited_for_peers.clear();
+}
+
+/**
+ * @ATTN we would not reach here if the ReplicaReservation object managed by the
+ * scrubber was reset.
+ */
+void ReplicaReservations::handle_reserve_grant(OpRequestRef op, pg_shard_t from)
+{
+ dout(10) << __func__ << ": granted by " << from << dendl;
+ op->mark_started();
+
+ {
+    // reduce the number of extra release messages. Not a must, but the log is cleaner
+ auto w = find(m_waited_for_peers.begin(), m_waited_for_peers.end(), from);
+ if (w != m_waited_for_peers.end())
+ m_waited_for_peers.erase(w);
+ }
+
+ // are we forced to reject the reservation?
+ if (m_had_rejections) {
+
+ dout(10) << __func__ << ": rejecting late-coming reservation from "
+ << from << dendl;
+ release_replica(from, m_pg->get_osdmap_epoch());
+
+ } else if (std::find(m_reserved_peers.begin(), m_reserved_peers.end(), from) !=
+ m_reserved_peers.end()) {
+
+ dout(10) << __func__ << ": already had osd." << from << " reserved" << dendl;
+
+ } else {
+
+ dout(10) << __func__ << ": osd." << from << " scrub reserve = success"
+ << dendl;
+ m_reserved_peers.push_back(from);
+ if (--m_pending == 0) {
+ send_all_done();
+ }
+ }
+}
+
+void ReplicaReservations::handle_reserve_reject(OpRequestRef op, pg_shard_t from)
+{
+ dout(10) << __func__ << ": rejected by " << from << dendl;
+ dout(15) << __func__ << ": " << *op->get_req() << dendl;
+ op->mark_started();
+
+ {
+    // reduce the number of extra release messages. Not a must, but the log is cleaner
+ auto w = find(m_waited_for_peers.begin(), m_waited_for_peers.end(), from);
+ if (w != m_waited_for_peers.end())
+ m_waited_for_peers.erase(w);
+ }
+
+ if (m_had_rejections) {
+
+ // our failure was already handled when the first rejection arrived
+ dout(15) << __func__ << ": ignoring late-coming rejection from "
+ << from << dendl;
+
+ } else if (std::find(m_reserved_peers.begin(), m_reserved_peers.end(), from) !=
+ m_reserved_peers.end()) {
+
+ dout(10) << __func__ << ": already had osd." << from << " reserved" << dendl;
+
+ } else {
+
+ dout(10) << __func__ << ": osd." << from << " scrub reserve = fail" << dendl;
+ m_had_rejections = true; // preventing any additional notifications
+ send_reject();
+ }
+}
+
+std::ostream& ReplicaReservations::gen_prefix(std::ostream& out) const
+{
+ return out << m_log_msg_prefix;
+}
+
+// ///////////////////// LocalReservation //////////////////////////////////
+
+// note: no dout()s in LocalReservation functions. Client logs interactions.
+LocalReservation::LocalReservation(OSDService* osds)
+ : m_osds{osds}
+{
+ if (m_osds->inc_scrubs_local()) {
+ // the failure is signalled by not having m_holding_local_reservation set
+ m_holding_local_reservation = true;
+ }
+}
+
+LocalReservation::~LocalReservation()
+{
+ if (m_holding_local_reservation) {
+ m_holding_local_reservation = false;
+ m_osds->dec_scrubs_local();
+ }
+}
+
+// ///////////////////// ReservedByRemotePrimary ///////////////////////////////
+
+ReservedByRemotePrimary::ReservedByRemotePrimary(const PgScrubber* scrubber,
+ PG* pg,
+ OSDService* osds,
+ epoch_t epoch)
+ : m_scrubber{scrubber}
+ , m_pg{pg}
+ , m_osds{osds}
+ , m_reserved_at{epoch}
+{
+ if (!m_osds->inc_scrubs_remote()) {
+ dout(10) << __func__ << ": failed to reserve at Primary request" << dendl;
+ // the failure is signalled by not having m_reserved_by_remote_primary set
+ return;
+ }
+
+ dout(20) << __func__ << ": scrub resources reserved at Primary request" << dendl;
+ m_reserved_by_remote_primary = true;
+}
+
+bool ReservedByRemotePrimary::is_stale() const
+{
+ return m_reserved_at < m_pg->get_same_interval_since();
+}
+
+ReservedByRemotePrimary::~ReservedByRemotePrimary()
+{
+ if (m_reserved_by_remote_primary) {
+ m_reserved_by_remote_primary = false;
+ m_osds->dec_scrubs_remote();
+ }
+}
+
+std::ostream& ReservedByRemotePrimary::gen_prefix(std::ostream& out) const
+{
+ return m_scrubber->gen_prefix(out);
+}
+
+// ///////////////////// MapsCollectionStatus ////////////////////////////////
+
+auto MapsCollectionStatus::mark_arriving_map(pg_shard_t from)
+ -> std::tuple<bool, std::string_view>
+{
+ auto fe = std::find(m_maps_awaited_for.begin(), m_maps_awaited_for.end(), from);
+ if (fe != m_maps_awaited_for.end()) {
+ // we are indeed waiting for a map from this replica
+ m_maps_awaited_for.erase(fe);
+ return std::tuple{true, ""sv};
+ } else {
+ return std::tuple{false, " unsolicited scrub-map"sv};
+ }
+}
+
+void MapsCollectionStatus::reset()
+{
+ *this = MapsCollectionStatus{};
+}
+
+std::string MapsCollectionStatus::dump() const
+{
+ std::string all;
+ for (const auto& rp : m_maps_awaited_for) {
+ all.append(rp.get_osd() + " "s);
+ }
+ return all;
+}
+
+ostream& operator<<(ostream& out, const MapsCollectionStatus& sf)
+{
+ out << " [ ";
+ for (const auto& rp : sf.m_maps_awaited_for) {
+ out << rp.get_osd() << " ";
+ }
+ if (!sf.m_local_map_ready) {
+ out << " local ";
+ }
+ return out << " ] ";
+}
+
+} // namespace Scrub
diff --git a/src/osd/pg_scrubber.h b/src/osd/pg_scrubber.h
new file mode 100644
index 000000000..392a4a588
--- /dev/null
+++ b/src/osd/pg_scrubber.h
@@ -0,0 +1,821 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <cassert>
+#include <chrono>
+#include <memory>
+#include <mutex>
+#include <optional>
+#include <string>
+#include <string_view>
+#include <vector>
+
+#include "PG.h"
+#include "ScrubStore.h"
+#include "scrub_machine_lstnr.h"
+#include "scrubber_common.h"
+
+class Callback;
+
+namespace Scrub {
+class ScrubMachine;
+struct BuildMap;
+
+/**
+ * Reserving/freeing scrub resources at the replicas.
+ *
+ * When constructed - sends reservation requests to the acting_set.
+ * A rejection triggers a "couldn't acquire the replicas' scrub resources" event.
+ * All previous requests, whether already granted or not, are explicitly released.
+ *
+ * A note re performance: I've measured a few container alternatives for
+ * m_reserved_peers, with its specific usage pattern. Std::set is extremely slow, as
+ * expected. flat_set is only slightly better. Surprisingly - std::vector (with no
+ * sorting) is better than boost::small_vec. And for std::vector: no need to pre-reserve.
+ */
+class ReplicaReservations {
+ using OrigSet = decltype(std::declval<PG>().get_actingset());
+
+ PG* m_pg;
+ OrigSet m_acting_set;
+ OSDService* m_osds;
+ std::vector<pg_shard_t> m_waited_for_peers;
+ std::vector<pg_shard_t> m_reserved_peers;
+ bool m_had_rejections{false};
+ int m_pending{-1};
+
+ void release_replica(pg_shard_t peer, epoch_t epoch);
+
+ void send_all_done(); ///< all reservations are granted
+
+ /// notify the scrubber that we have failed to reserve replicas' resources
+ void send_reject();
+
+ public:
+ std::string m_log_msg_prefix;
+
+ /**
+ * quietly discard all knowledge about existing reservations. No messages
+ * are sent to peers.
+   * To be used upon interval change, as we know that the running scrub is no longer
+ * relevant, and that the replicas had reset the reservations on their side.
+ */
+ void discard_all();
+
+ ReplicaReservations(PG* pg, pg_shard_t whoami);
+
+ ~ReplicaReservations();
+
+ void handle_reserve_grant(OpRequestRef op, pg_shard_t from);
+
+ void handle_reserve_reject(OpRequestRef op, pg_shard_t from);
+
+ std::ostream& gen_prefix(std::ostream& out) const;
+};
+
+/**
+ * wraps the local OSD scrub resource reservation in an RAII wrapper
+ */
+class LocalReservation {
+ OSDService* m_osds;
+ bool m_holding_local_reservation{false};
+
+ public:
+ LocalReservation(OSDService* osds);
+ ~LocalReservation();
+ bool is_reserved() const { return m_holding_local_reservation; }
+};
+
+/**
+ * wraps the OSD resource we are using when reserved as a replica by a scrubbing master.
+ */
+class ReservedByRemotePrimary {
+ const PgScrubber* m_scrubber; ///< we will be using its gen_prefix()
+ PG* m_pg;
+ OSDService* m_osds;
+ bool m_reserved_by_remote_primary{false};
+ const epoch_t m_reserved_at;
+
+ public:
+ ReservedByRemotePrimary(const PgScrubber* scrubber, PG* pg, OSDService* osds, epoch_t epoch);
+ ~ReservedByRemotePrimary();
+ [[nodiscard]] bool is_reserved() const { return m_reserved_by_remote_primary; }
+
+ /// compare the remembered reserved-at epoch to the current interval
+ [[nodiscard]] bool is_stale() const;
+
+ std::ostream& gen_prefix(std::ostream& out) const;
+};
+
+/**
+ * Once all replicas' scrub maps are received, we go on to compare the maps. That is -
+ * unless we have not yet completed building our own scrub map. MapsCollectionStatus
+ * combines the status of waiting for both the local map and the replicas, without
+ * resorting to adding dummy entries into a list.
+ */
+class MapsCollectionStatus {
+
+ bool m_local_map_ready{false};
+ std::vector<pg_shard_t> m_maps_awaited_for;
+
+ public:
+ [[nodiscard]] bool are_all_maps_available() const
+ {
+ return m_local_map_ready && m_maps_awaited_for.empty();
+ }
+
+ void mark_local_map_ready() { m_local_map_ready = true; }
+
+ void mark_replica_map_request(pg_shard_t from_whom)
+ {
+ m_maps_awaited_for.push_back(from_whom);
+ }
+
+ /// @returns true if indeed waiting for this one. Otherwise: an error string
+ auto mark_arriving_map(pg_shard_t from) -> std::tuple<bool, std::string_view>;
+
+ std::vector<pg_shard_t> get_awaited() const { return m_maps_awaited_for; }
+
+ void reset();
+
+ std::string dump() const;
+
+ friend ostream& operator<<(ostream& out, const MapsCollectionStatus& sf);
+};
+
+} // namespace Scrub
+
+
+/**
+ * the scrub operation flags. Primary only.
+ * Set at scrub start. Checked in multiple locations - mostly
+ * at finish.
+ */
+struct scrub_flags_t {
+
+ unsigned int priority{0};
+
+ /**
+ * set by queue_scrub() if either planned_scrub.auto_repair or
+ * need_auto were set.
+ * Tested at scrub end.
+ */
+ bool auto_repair{false};
+
+ /// this flag indicates that we are scrubbing post repair to verify everything is fixed
+ bool check_repair{false};
+
+ /// checked at the end of the scrub, to possibly initiate a deep-scrub
+ bool deep_scrub_on_error{false};
+
+ /**
+ * scrub must not be aborted.
+ * Set for explicitly requested scrubs, and for scrubs originated by the pairing
+ * process with the 'repair' flag set (in the RequestScrub event).
+ */
+ bool required{false};
+};
+
+ostream& operator<<(ostream& out, const scrub_flags_t& sf);
+
+
+/**
+ * The part of PG-scrubbing code that isn't state-machine wiring.
+ *
+ * Why the separation? I wish to move to a different FSM implementation. Thus I
+ * am forced to strongly decouple the state-machine implementation details from
+ * the actual scrubbing code.
+ */
+class PgScrubber : public ScrubPgIF, public ScrubMachineListener {
+
+ public:
+ explicit PgScrubber(PG* pg);
+
+ // ------------------ the I/F exposed to the PG (ScrubPgIF) -------------
+
+  /// are we waiting for resource reservation grants from our replicas?
+ [[nodiscard]] bool is_reserving() const final;
+
+ void initiate_regular_scrub(epoch_t epoch_queued) final;
+
+ void initiate_scrub_after_repair(epoch_t epoch_queued) final;
+
+ void send_scrub_resched(epoch_t epoch_queued) final;
+
+ void active_pushes_notification(epoch_t epoch_queued) final;
+
+ void update_applied_notification(epoch_t epoch_queued) final;
+
+ void send_scrub_unblock(epoch_t epoch_queued) final;
+
+ void digest_update_notification(epoch_t epoch_queued) final;
+
+ void send_replica_maps_ready(epoch_t epoch_queued) final;
+
+ void send_start_replica(epoch_t epoch_queued, Scrub::act_token_t token) final;
+
+ void send_sched_replica(epoch_t epoch_queued, Scrub::act_token_t token) final;
+
+ void send_replica_pushes_upd(epoch_t epoch_queued) final;
+ /**
+ * The PG has updated its 'applied version'. It might be that we are waiting for this
+ * information: after selecting a range of objects to scrub, we've marked the latest
+ * version of these objects in m_subset_last_update. We will not start the map building
+ * before we know that the PG has reached this version.
+ */
+ void on_applied_when_primary(const eversion_t& applied_version) final;
+
+ void send_full_reset(epoch_t epoch_queued) final;
+
+ void send_chunk_free(epoch_t epoch_queued) final;
+
+ void send_chunk_busy(epoch_t epoch_queued) final;
+
+ void send_local_map_done(epoch_t epoch_queued) final;
+
+ void send_maps_compared(epoch_t epoch_queued) final;
+
+ void send_get_next_chunk(epoch_t epoch_queued) final;
+
+ void send_scrub_is_finished(epoch_t epoch_queued) final;
+
+ /**
+ * we allow some number of preemptions of the scrub, which mean we do
+ * not block. Then we start to block. Once we start blocking, we do
+ * not stop until the scrub range is completed.
+ */
+ bool write_blocked_by_scrub(const hobject_t& soid) final;
+
+ /// true if the given range intersects the scrub interval in any way
+ bool range_intersects_scrub(const hobject_t& start, const hobject_t& end) final;
+
+ /**
+ * we are a replica being asked by the Primary to reserve OSD resources for
+ * scrubbing
+ */
+ void handle_scrub_reserve_request(OpRequestRef op) final;
+
+ void handle_scrub_reserve_grant(OpRequestRef op, pg_shard_t from) final;
+ void handle_scrub_reserve_reject(OpRequestRef op, pg_shard_t from) final;
+ void handle_scrub_reserve_release(OpRequestRef op) final;
+ void discard_replica_reservations() final;
+ void clear_scrub_reservations() final; // PG::clear... fwds to here
+ void unreserve_replicas() final;
+
+ // managing scrub op registration
+
+ void reg_next_scrub(const requested_scrub_t& request_flags) final;
+
+ void unreg_next_scrub() final;
+
+ void scrub_requested(scrub_level_t scrub_level,
+ scrub_type_t scrub_type,
+ requested_scrub_t& req_flags) final;
+
+ /**
+ * Reserve local scrub resources (managed by the OSD)
+ *
+ * Fails if OSD's local-scrubs budget was exhausted
+ * \returns were local resources reserved?
+ */
+ bool reserve_local() final;
+
+ void handle_query_state(ceph::Formatter* f) final;
+
+ void dump(ceph::Formatter* f) const override;
+
+ // used if we are a replica
+
+ void replica_scrub_op(OpRequestRef op) final;
+
+ /// the op priority, taken from the primary's request message
+ Scrub::scrub_prio_t replica_op_priority() const final
+ {
+ return m_replica_request_priority;
+ };
+
+ unsigned int scrub_requeue_priority(Scrub::scrub_prio_t with_priority,
+ unsigned int suggested_priority) const final;
+ /// the version that refers to m_flags.priority
+ unsigned int scrub_requeue_priority(Scrub::scrub_prio_t with_priority) const final;
+
+ void add_callback(Context* context) final { m_callbacks.push_back(context); }
+
+ [[nodiscard]] bool are_callbacks_pending() const final // used for an assert in PG.cc
+ {
+ return !m_callbacks.empty();
+ }
+
+ /// handle a message carrying a replica map
+ void map_from_replica(OpRequestRef op) final;
+
+ void scrub_clear_state() final;
+
+ bool is_queued_or_active() const final;
+
+ /**
+ * add to scrub statistics, but only if the soid is below the scrub start
+ */
+ virtual void stats_of_handled_objects(const object_stat_sum_t& delta_stats,
+ const hobject_t& soid) override
+ {
+ ceph_assert(false);
+ }
+
+ /**
+ * finalize the parameters of the initiated scrubbing session:
+ *
+ * The "current scrub" flags (m_flags) are set from the 'planned_scrub' flag-set;
+ * PG_STATE_SCRUBBING, and possibly PG_STATE_DEEP_SCRUB & PG_STATE_REPAIR are set.
+ */
+ void set_op_parameters(requested_scrub_t& request) final;
+
+ void cleanup_store(ObjectStore::Transaction* t) final;
+
+ bool get_store_errors(const scrub_ls_arg_t& arg,
+ scrub_ls_result_t& res_inout) const override
+ {
+ return false;
+ }
+
+ // -------------------------------------------------------------------------------------------
+ // the I/F used by the state-machine (i.e. the implementation of ScrubMachineListener)
+
+ [[nodiscard]] bool is_primary() const final { return m_pg->recovery_state.is_primary(); }
+
+ void select_range_n_notify() final;
+
+ /// walk the log to find the latest update that affects our chunk
+ eversion_t search_log_for_updates() const final;
+
+ eversion_t get_last_update_applied() const final
+ {
+ return m_pg->recovery_state.get_last_update_applied();
+ }
+
+ int pending_active_pushes() const final { return m_pg->active_pushes; }
+
+ void on_init() final;
+ void on_replica_init() final;
+ void replica_handling_done() final;
+
+ /// the version of 'scrub_clear_state()' that does not try to invoke FSM services
+ /// (thus can be called from FSM reactions)
+ void clear_pgscrub_state() final;
+
+ /*
+ * Send an 'InternalSchedScrub' FSM event either immediately, or - if 'm_need_sleep'
+ * is asserted - after a configuration-dependent timeout.
+ */
+ void add_delayed_scheduling() final;
+
+ void get_replicas_maps(bool replica_can_preempt) final;
+
+ void on_digest_updates() final;
+
+ void scrub_begin() final;
+
+ void scrub_finish() final;
+
+ ScrubMachineListener::MsgAndEpoch
+ prep_replica_map_msg(Scrub::PreemptionNoted was_preempted) final;
+
+ void send_replica_map(const ScrubMachineListener::MsgAndEpoch& preprepared) final;
+
+ void send_preempted_replica() final;
+
+ void send_remotes_reserved(epoch_t epoch_queued) final;
+ void send_reservation_failure(epoch_t epoch_queued) final;
+
+ /**
+ * does the PG have newer updates than what we (the scrubber) know?
+ */
+ [[nodiscard]] bool has_pg_marked_new_updates() const final;
+
+ void set_subset_last_update(eversion_t e) final;
+
+ void maps_compare_n_cleanup() final;
+
+ Scrub::preemption_t& get_preemptor() final;
+
+ int build_primary_map_chunk() final;
+
+ int build_replica_map_chunk() final;
+
+ void reserve_replicas() final;
+
+ [[nodiscard]] bool was_epoch_changed() const final;
+
+ void set_queued_or_active() final;
+ void clear_queued_or_active() final;
+
+ void mark_local_map_ready() final;
+
+ [[nodiscard]] bool are_all_maps_available() const final;
+
+ std::string dump_awaited_maps() const final;
+
+ std::ostream& gen_prefix(std::ostream& out) const final;
+
+ protected:
+ bool state_test(uint64_t m) const { return m_pg->state_test(m); }
+ void state_set(uint64_t m) { m_pg->state_set(m); }
+ void state_clear(uint64_t m) { m_pg->state_clear(m); }
+
+ [[nodiscard]] bool is_scrub_registered() const;
+
+ virtual void _scrub_clear_state() {}
+
+ utime_t m_scrub_reg_stamp; ///< stamp we registered for
+
+ ostream& show(ostream& out) const override;
+
+ public:
+ // -------------------------------------------------------------------------------------------
+
+ friend ostream& operator<<(ostream& out, const PgScrubber& scrubber);
+
+ static utime_t scrub_must_stamp() { return utime_t(1, 1); }
+
+ virtual ~PgScrubber(); // must be defined separately, in the .cc file
+
+ [[nodiscard]] bool is_scrub_active() const final { return m_active; }
+
+ private:
+ void reset_internal_state();
+
+ /**
+ * the current scrubbing operation is done. We should mark that fact, so that
+ * all events related to the previous operation can be discarded.
+ */
+ void advance_token();
+
+ bool is_token_current(Scrub::act_token_t received_token);
+
+ void requeue_waiting() const { m_pg->requeue_ops(m_pg->waiting_for_scrub); }
+
+ void _scan_snaps(ScrubMap& smap);
+
+ ScrubMap clean_meta_map();
+
+ /**
+   * record some parameters of the initiated scrub:
+ * - the epoch when started;
+ * - the depth of the scrub requested (from the PG_STATE variable)
+ */
+ void reset_epoch(epoch_t epoch_queued);
+
+ void run_callbacks();
+
+ // ----- methods used to verify the relevance of incoming events:
+
+ /**
+ * is the incoming event still relevant, and should be processed?
+ *
+ * It isn't if:
+ * - (1) we are no longer 'actively scrubbing'; or
+ * - (2) the message is from an epoch prior to when we started the current scrub
+ * session; or
+ * - (3) the message epoch is from a previous interval; or
+ * - (4) the 'abort' configuration flags were set.
+ *
+   * For (1) & (2) - the incoming message is discarded, w/o further action.
+ *
+ * For (3): (see check_interval() for a full description) if we have not reacted yet
+ * to this specific new interval, we do now:
+ * - replica reservations are silently discarded (we count on the replicas to notice
+ * the interval change and un-reserve themselves);
+ * - the scrubbing is halted.
+ *
+ * For (4): the message will be discarded, but also:
+ * if this is the first time we've noticed the 'abort' request, we perform the abort.
+ *
+ * \returns should the incoming event be processed?
+ */
+ bool is_message_relevant(epoch_t epoch_to_verify);
+
+ /**
+ * check the 'no scrub' configuration options.
+ */
+ [[nodiscard]] bool should_abort() const;
+
+ /**
+ * Check the 'no scrub' configuration flags.
+ *
+ * Reset everything if the abort was not handled before.
+   * @returns false if the message was discarded due to the abort flag.
+ */
+ [[nodiscard]] bool verify_against_abort(epoch_t epoch_to_verify);
+
+ [[nodiscard]] bool check_interval(epoch_t epoch_to_verify);
+
+ epoch_t m_last_aborted{}; // last time we've noticed a request to abort
+
+ /**
+ * return true if any inconsistency/missing is repaired, false otherwise
+ */
+ [[nodiscard]] bool scrub_process_inconsistent();
+
+ void scrub_compare_maps();
+
+ bool m_needs_sleep{true}; ///< should we sleep before being rescheduled? always
+ ///< 'true', unless we just got out of a sleep period
+
+ utime_t m_sleep_started_at;
+
+
+ // 'optional', as 'ReplicaReservations' & 'LocalReservation' are 'RAII-designed'
+ // to guarantee un-reserving when deleted.
+ std::optional<Scrub::ReplicaReservations> m_reservations;
+ std::optional<Scrub::LocalReservation> m_local_osd_resource;
+
+ /// the 'remote' resource we, as a replica, grant our Primary when it is scrubbing
+ std::optional<Scrub::ReservedByRemotePrimary> m_remote_osd_resource;
+
+ void cleanup_on_finish(); // scrub_clear_state() as called for a Primary when
+ // Active->NotActive
+
+ protected:
+ PG* const m_pg;
+
+ /**
+   * the derived-class-specific scrub-finishing touches:
+ */
+ virtual void _scrub_finish() {}
+
+ /**
+ * Validate consistency of the object info and snap sets.
+ */
+ virtual void scrub_snapshot_metadata(ScrubMap& map, const missing_map_t& missing_digest)
+ {}
+
+ // common code used by build_primary_map_chunk() and build_replica_map_chunk():
+ int build_scrub_map_chunk(ScrubMap& map, // primary or replica?
+ ScrubMapBuilder& pos,
+ hobject_t start,
+ hobject_t end,
+ bool deep);
+
+ std::unique_ptr<Scrub::ScrubMachine> m_fsm;
+ const spg_t m_pg_id; ///< a local copy of m_pg->pg_id
+ OSDService* const m_osds;
+ const pg_shard_t m_pg_whoami; ///< a local copy of m_pg->pg_whoami;
+
+ epoch_t m_interval_start{0}; ///< interval's 'from' of when scrubbing was first scheduled
+  /*
+   * the exact epoch when the scrubbing actually started ('started' here meaning that
+   * the no-scrub configuration checks were cleared). Incoming events are verified
+   * against this epoch, and stale events are discarded.
+   */
+ epoch_t m_epoch_start{0}; ///< the actual epoch when scrubbing started
+
+ /**
+ * (replica) a tag identifying a specific scrub "session". Incremented whenever the
+ * Primary releases the replica scrub resources.
+ * When the scrub session is terminated (even if the interval remains unchanged, as
+ * might happen following an asok no-scrub command), stale scrub-resched messages
+ * triggered by the backend will be discarded.
+ */
+ Scrub::act_token_t m_current_token{1};
+
+ scrub_flags_t m_flags;
+
+ bool m_active{false};
+
+ /**
+   * a flag designed to prevent the initiation of a second scrub on a PG for which
+   * scrubbing is already underway.
+ *
+ * set once scrubbing was initiated (i.e. - even before the FSM event that
+ * will trigger a state-change out of Inactive was handled), and only reset
+ * once the FSM is back in Inactive.
+ * In other words - its ON period encompasses:
+ * - the time period covered today by 'queued', and
+ * - the time when m_active is set, and
+ * - all the time from scrub_finish() calling update_stats() till the
+ * FSM handles the 'finished' event
+ *
+ * Compared with 'm_active', this flag is asserted earlier and remains ON for longer.
+ */
+ bool m_queued_or_active{false};
+
+ eversion_t m_subset_last_update{};
+
+ std::unique_ptr<Scrub::Store> m_store;
+
+ int num_digest_updates_pending{0};
+ hobject_t m_start, m_end; ///< note: half-closed: [start,end)
+
+ /// Returns reference to current osdmap
+ const OSDMapRef& get_osdmap() const;
+
+ /// Returns epoch of current osdmap
+ epoch_t get_osdmap_epoch() const { return get_osdmap()->get_epoch(); }
+
+ CephContext* get_pg_cct() const { return m_pg->cct; }
+
+ // collected statistics
+ int m_shallow_errors{0};
+ int m_deep_errors{0};
+ int m_fixed_count{0};
+
+ /// Maps from objects with errors to missing peers
+ HobjToShardSetMapping m_missing;
+
+ protected:
+ /**
+ * 'm_is_deep' - is the running scrub a deep one?
+ *
+ * Note that most of the code directly checks PG_STATE_DEEP_SCRUB, which is
+ * primary-only (and is set earlier - when scheduling the scrub). 'm_is_deep' is
+ * meaningful both for the primary and the replicas, and is used as a parameter when
+ * building the scrub maps.
+ */
+ bool m_is_deep{false};
+
+ /**
+ * If set: affects the backend & scrubber-backend functions called after all
+ * scrub maps are available.
+ *
+ * Replaces code that directly checks PG_STATE_REPAIR (which was meant to be
+ * a "user facing" status display only).
+ */
+ bool m_is_repair{false};
+
+ /**
+ * User-readable summary of the scrubber's current mode of operation. Used for
+ * both osd.*.log and the cluster log.
+   * One of:
+   *    "repair",
+   *    "deep-scrub",
+   *    "scrub"
+   *
+   * Note: based on PG_STATE_REPAIR, and not on m_is_repair. I.e. an auto_repair scrub
+   * will show as "deep-scrub" and not as "repair" (until the first error
+   * is detected).
+ */
+ std::string_view m_mode_desc;
+
+ void update_op_mode_text();
+
+private:
+
+ /**
+ * initiate a deep-scrub after the current scrub ended with errors.
+ */
+ void request_rescrubbing(requested_scrub_t& req_flags);
+
+  /*
+   * Select a range of objects to scrub.
+   *
+   * The selection is done by:
+   * - setting a tentative range, based on the configuration and the chunk divisor;
+   * - requesting a partial list of elements from the backend;
+   * - handling some head/clones issues.
+   *
+   * The selected range is set directly into 'm_start' and 'm_end'.
+   */
+ bool select_range();
+
+ std::list<Context*> m_callbacks;
+
+ /**
+ * send a replica (un)reservation request to the acting set
+ *
+ * @param opcode - one of MOSDScrubReserve::REQUEST
+ * or MOSDScrubReserve::RELEASE
+ */
+ void message_all_replicas(int32_t opcode, std::string_view op_text);
+
+ hobject_t m_max_end; ///< Largest end that may have been sent to replicas
+ ScrubMap m_primary_scrubmap;
+ ScrubMapBuilder m_primary_scrubmap_pos;
+
+ std::map<pg_shard_t, ScrubMap> m_received_maps;
+
+  /// Cleaned map, pending snap metadata scrub
+ ScrubMap m_cleaned_meta_map;
+
+ void _request_scrub_map(pg_shard_t replica,
+ eversion_t version,
+ hobject_t start,
+ hobject_t end,
+ bool deep,
+ bool allow_preemption);
+
+
+ Scrub::MapsCollectionStatus m_maps_status;
+
+ omap_stat_t m_omap_stats = (const struct omap_stat_t){0};
+
+ /// Maps from objects with errors to inconsistent peers
+ HobjToShardSetMapping m_inconsistent;
+
+  /// Maps from objects with errors to good peers
+ std::map<hobject_t, std::list<std::pair<ScrubMap::object, pg_shard_t>>> m_authoritative;
+
+ // ------------ members used if we are a replica
+
+ epoch_t m_replica_min_epoch; ///< the min epoch needed to handle this message
+
+ ScrubMapBuilder replica_scrubmap_pos;
+ ScrubMap replica_scrubmap;
+
+ /**
+ * we mark the request priority as it arrived. It influences the queuing priority
+ * when we wait for local updates
+ */
+ Scrub::scrub_prio_t m_replica_request_priority;
+
+ /**
+ * the 'preemption' "state-machine".
+ * Note: I was considering an orthogonal sub-machine implementation, but as
+ * the state diagram is extremely simple, the added complexity wasn't justified.
+ */
+ class preemption_data_t : public Scrub::preemption_t {
+ public:
+ preemption_data_t(PG* pg); // the PG access is used for conf access (and logs)
+
+ [[nodiscard]] bool is_preemptable() const final { return m_preemptable; }
+
+ bool do_preempt() final
+ {
+ if (m_preempted || !m_preemptable)
+ return false;
+
+ std::lock_guard<std::mutex> lk{m_preemption_lock};
+ if (!m_preemptable)
+ return false;
+
+ m_preempted = true;
+ return true;
+ }
+
+ /// same as 'do_preempt()' but w/o checks (as once a replica
+ /// was preempted, we cannot continue)
+ void replica_preempted() { m_preempted = true; }
+
+ void enable_preemption()
+ {
+ std::lock_guard<std::mutex> lk{m_preemption_lock};
+ if (are_preemptions_left() && !m_preempted) {
+ m_preemptable = true;
+ }
+ }
+
+ /// used by a replica to set preemptability state according to the Primary's request
+ void force_preemptability(bool is_allowed)
+ {
+ // note: no need to lock for a replica
+ m_preempted = false;
+ m_preemptable = is_allowed;
+ }
+
+ bool disable_and_test() final
+ {
+ std::lock_guard<std::mutex> lk{m_preemption_lock};
+ m_preemptable = false;
+ return m_preempted;
+ }
+
+ [[nodiscard]] bool was_preempted() const { return m_preempted; }
+
+ [[nodiscard]] size_t chunk_divisor() const { return m_size_divisor; }
+
+ void reset();
+
+ void adjust_parameters() final
+ {
+ std::lock_guard<std::mutex> lk{m_preemption_lock};
+
+ if (m_preempted) {
+ m_preempted = false;
+ m_preemptable = adjust_left();
+ } else {
+ m_preemptable = are_preemptions_left();
+ }
+ }
+
+ private:
+ PG* m_pg;
+ mutable std::mutex m_preemption_lock;
+ bool m_preemptable{false};
+ bool m_preempted{false};
+ int m_left;
+ size_t m_size_divisor{1};
+ bool are_preemptions_left() const { return m_left > 0; }
+
+ bool adjust_left()
+ {
+ if (m_left > 0) {
+ --m_left;
+ m_size_divisor *= 2;
+ }
+ return m_left > 0;
+ }
+ };
+
+ preemption_data_t preemption_data;
+};
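The preemption bookkeeping above is compact enough to be mirrored in a standalone sketch. The following is illustrative only: the names are hypothetical, a hard-coded budget stands in for the configurable preemption limit, and the PG/locking context is stripped away; it only demonstrates the do_preempt() / disable_and_test() / adjust_parameters() hand-off.

// A stripped-down, standalone rendering of the preemption counter logic above
// (hypothetical names; 'budget' stands in for the configurable preemption limit).
#include <cassert>
#include <mutex>

class preemption_sketch {
 public:
  explicit preemption_sketch(int budget) : m_left{budget} {}

  // A higher-priority op asks the in-progress chunk to yield.
  bool do_preempt() {
    std::lock_guard lk{m_lock};
    if (m_preempted || !m_preemptable)
      return false;
    m_preempted = true;
    return true;
  }

  // The scrubber, about to block on a chunk, stops accepting preemptions
  // and learns whether one already arrived.
  bool disable_and_test() {
    std::lock_guard lk{m_lock};
    m_preemptable = false;
    return m_preempted;
  }

  // Called between chunks: consume one unit of the budget if we were
  // preempted, and shrink the next chunk (grow the divisor).
  void adjust_parameters() {
    std::lock_guard lk{m_lock};
    if (m_preempted) {
      m_preempted = false;
      if (m_left > 0) { --m_left; m_divisor *= 2; }
    }
    m_preemptable = m_left > 0;
  }

  int divisor() const { return m_divisor; }

 private:
  std::mutex m_lock;
  bool m_preemptable{true};
  bool m_preempted{false};
  int m_left;
  int m_divisor{1};
};

int main() {
  preemption_sketch p{2};
  assert(p.do_preempt());        // first preemption accepted
  assert(p.disable_and_test());  // scrubber notices it before blocking
  p.adjust_parameters();         // budget 2 -> 1, divisor 1 -> 2
  assert(p.divisor() == 2);
}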
diff --git a/src/osd/recovery_types.cc b/src/osd/recovery_types.cc
new file mode 100644
index 000000000..3dd49a82d
--- /dev/null
+++ b/src/osd/recovery_types.cc
@@ -0,0 +1,16 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "recovery_types.h"
+
+ostream& operator<<(ostream& out, const BackfillInterval& bi)
+{
+ out << "BackfillInfo(" << bi.begin << "-" << bi.end
+ << " " << bi.objects.size() << " objects";
+ if (!bi.objects.empty())
+ out << " " << bi.objects;
+ out << ")";
+ return out;
+}
+
+
diff --git a/src/osd/recovery_types.h b/src/osd/recovery_types.h
new file mode 100644
index 000000000..73a621882
--- /dev/null
+++ b/src/osd/recovery_types.h
@@ -0,0 +1,95 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <map>
+
+#include "osd_types.h"
+
+/**
+ * BackfillInterval
+ *
+ * Represents the objects in a range [begin, end)
+ *
+ * Possible states:
+ * 1) begin == end == hobject_t() indicates that the interval is unpopulated
+ * 2) Else, objects contains all objects in [begin, end)
+ */
+struct BackfillInterval {
+ // info about a backfill interval on a peer
+ eversion_t version; /// version at which the scan occurred
+ std::map<hobject_t,eversion_t> objects;
+ hobject_t begin;
+ hobject_t end;
+
+ /// clear content
+ void clear() {
+ *this = BackfillInterval();
+ }
+
+  /// clear the objects map only
+ void clear_objects() {
+ objects.clear();
+ }
+
+ /// reinstantiate with a new start+end position and sort order
+ void reset(hobject_t start) {
+ clear();
+ begin = end = start;
+ }
+
+ /// true if there are no objects in this interval
+ bool empty() const {
+ return objects.empty();
+ }
+
+ /// true if interval extends to the end of the range
+ bool extends_to_end() const {
+ return end.is_max();
+ }
+
+ /// removes items <= soid and adjusts begin to the first object
+ void trim_to(const hobject_t &soid) {
+ trim();
+ while (!objects.empty() &&
+ objects.begin()->first <= soid) {
+ pop_front();
+ }
+ }
+
+ /// Adjusts begin to the first object
+ void trim() {
+ if (!objects.empty())
+ begin = objects.begin()->first;
+ else
+ begin = end;
+ }
+
+ /// drop first entry, and adjust @begin accordingly
+ void pop_front() {
+ ceph_assert(!objects.empty());
+ objects.erase(objects.begin());
+ trim();
+ }
+
+ /// dump
+ void dump(ceph::Formatter *f) const {
+ f->dump_stream("begin") << begin;
+ f->dump_stream("end") << end;
+ f->open_array_section("objects");
+ for (std::map<hobject_t, eversion_t>::const_iterator i =
+ objects.begin();
+ i != objects.end();
+ ++i) {
+ f->open_object_section("object");
+ f->dump_stream("object") << i->first;
+ f->dump_stream("version") << i->second;
+ f->close_section();
+ }
+ f->close_section();
+ }
+};
+
+std::ostream &operator<<(std::ostream &out, const BackfillInterval &bi);
+
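As a quick check of the trim_to()/pop_front() interplay documented above, here is a minimal self-contained sketch; std::string and int stand in for hobject_t and eversion_t, so this is illustrative rather than the real types.

// Standalone illustration of the BackfillInterval trimming semantics.
#include <cassert>
#include <map>
#include <string>

struct interval_sketch {
  std::map<std::string, int> objects;
  std::string begin, end;

  // re-anchor 'begin' to the first remaining object (or 'end' when empty)
  void trim() {
    begin = objects.empty() ? end : objects.begin()->first;
  }
  void pop_front() {
    assert(!objects.empty());
    objects.erase(objects.begin());
    trim();
  }
  // drop everything <= soid, then re-anchor 'begin'
  void trim_to(const std::string& soid) {
    trim();
    while (!objects.empty() && objects.begin()->first <= soid)
      pop_front();
  }
};

int main() {
  interval_sketch bi;
  bi.objects = {{"a", 1}, {"c", 2}, {"e", 3}};
  bi.begin = "a";
  bi.end = "zzz";

  bi.trim_to("c");              // "a" and "c" are dropped
  assert(bi.objects.size() == 1);
  assert(bi.begin == "e");      // begin re-anchored to the first remaining object

  bi.trim_to("e");              // interval drained
  assert(bi.objects.empty());
  assert(bi.begin == bi.end);   // trim() falls back to 'end' when empty
}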
diff --git a/src/osd/scheduler/OpScheduler.cc b/src/osd/scheduler/OpScheduler.cc
new file mode 100644
index 000000000..3ce6fdb55
--- /dev/null
+++ b/src/osd/scheduler/OpScheduler.cc
@@ -0,0 +1,56 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2019 Red Hat Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <ostream>
+
+#include "osd/scheduler/OpScheduler.h"
+
+#include "common/WeightedPriorityQueue.h"
+#include "osd/scheduler/mClockScheduler.h"
+
+namespace ceph::osd::scheduler {
+
+OpSchedulerRef make_scheduler(
+ CephContext *cct, uint32_t num_shards, bool is_rotational)
+{
+ const std::string *type = &cct->_conf->osd_op_queue;
+ if (*type == "debug_random") {
+ static const std::string index_lookup[] = { "mclock_scheduler",
+ "wpq" };
+ srand(time(NULL));
+ unsigned which = rand() % (sizeof(index_lookup) / sizeof(index_lookup[0]));
+ type = &index_lookup[which];
+ }
+
+ if (*type == "wpq" ) {
+ // default is 'wpq'
+ return std::make_unique<
+ ClassedOpQueueScheduler<WeightedPriorityQueue<OpSchedulerItem, client>>>(
+ cct,
+ cct->_conf->osd_op_pq_max_tokens_per_priority,
+ cct->_conf->osd_op_pq_min_cost
+ );
+ } else if (*type == "mclock_scheduler") {
+ return std::make_unique<mClockScheduler>(cct, num_shards, is_rotational);
+ } else {
+ ceph_assert("Invalid choice of wq" == 0);
+ }
+}
+
+std::ostream &operator<<(std::ostream &lhs, const OpScheduler &rhs) {
+ rhs.print(lhs);
+ return lhs;
+}
+
+}
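A hedged, self-contained sketch of the selection pattern used by make_scheduler(): a configuration string picks the backend, and "debug_random" picks one of the known backends at random. MiniWpq and MiniMclock are placeholders, not the real queue implementations.

// Toy factory mirroring the config-driven scheduler selection.
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <memory>
#include <string>

struct Sched { virtual ~Sched() = default; virtual const char* name() const = 0; };
struct MiniWpq : Sched { const char* name() const override { return "wpq"; } };
struct MiniMclock : Sched { const char* name() const override { return "mclock_scheduler"; } };

std::unique_ptr<Sched> make_sched(std::string type) {
  if (type == "debug_random") {
    static const std::string lookup[] = {"mclock_scheduler", "wpq"};
    std::srand(static_cast<unsigned>(std::time(nullptr)));
    type = lookup[std::rand() % 2];
  }
  if (type == "wpq")
    return std::make_unique<MiniWpq>();
  if (type == "mclock_scheduler")
    return std::make_unique<MiniMclock>();
  return nullptr;  // the real code asserts on an unknown queue type
}

int main() {
  std::cout << make_sched("wpq")->name() << "\n";
  std::cout << make_sched("debug_random")->name() << "\n";
}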
diff --git a/src/osd/scheduler/OpScheduler.h b/src/osd/scheduler/OpScheduler.h
new file mode 100644
index 000000000..6e2bb5abd
--- /dev/null
+++ b/src/osd/scheduler/OpScheduler.h
@@ -0,0 +1,147 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2019 Red Hat Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include <ostream>
+#include <variant>
+
+#include "common/ceph_context.h"
+#include "osd/scheduler/OpSchedulerItem.h"
+
+namespace ceph::osd::scheduler {
+
+using client = uint64_t;
+using WorkItem = std::variant<std::monostate, OpSchedulerItem, double>;
+
+/**
+ * Base interface for classes responsible for choosing
+ * op processing order in the OSD.
+ */
+class OpScheduler {
+public:
+ // Enqueue op for scheduling
+ virtual void enqueue(OpSchedulerItem &&item) = 0;
+
+ // Enqueue op for processing as though it were enqueued prior
+ // to other items already scheduled.
+ virtual void enqueue_front(OpSchedulerItem &&item) = 0;
+
+ // Returns true iff there are no ops scheduled
+ virtual bool empty() const = 0;
+
+ // Return next op to be processed
+ virtual WorkItem dequeue() = 0;
+
+ // Dump formatted representation for the queue
+ virtual void dump(ceph::Formatter &f) const = 0;
+
+ // Print human readable brief description with relevant parameters
+ virtual void print(std::ostream &out) const = 0;
+
+ // Apply config changes to the scheduler (if any)
+ virtual void update_configuration() = 0;
+
+ // Destructor
+ virtual ~OpScheduler() {};
+};
+
+std::ostream &operator<<(std::ostream &lhs, const OpScheduler &);
+using OpSchedulerRef = std::unique_ptr<OpScheduler>;
+
+OpSchedulerRef make_scheduler(
+ CephContext *cct, uint32_t num_shards, bool is_rotational);
+
+/**
+ * Implements OpScheduler in terms of OpQueue
+ *
+ * Templated on queue type to avoid dynamic dispatch, T should implement
+ * OpQueue<OpSchedulerItem, client>. This adapter is mainly responsible for
+ * the boilerplate priority cutoff/strict concept which is needed for
+ * OpQueue based implementations.
+ */
+template <typename T>
+class ClassedOpQueueScheduler final : public OpScheduler {
+ unsigned cutoff;
+ T queue;
+
+ static unsigned int get_io_prio_cut(CephContext *cct) {
+ if (cct->_conf->osd_op_queue_cut_off == "debug_random") {
+ srand(time(NULL));
+ return (rand() % 2 < 1) ? CEPH_MSG_PRIO_HIGH : CEPH_MSG_PRIO_LOW;
+ } else if (cct->_conf->osd_op_queue_cut_off == "high") {
+ return CEPH_MSG_PRIO_HIGH;
+ } else {
+ // default / catch-all is 'low'
+ return CEPH_MSG_PRIO_LOW;
+ }
+ }
+public:
+ template <typename... Args>
+ ClassedOpQueueScheduler(CephContext *cct, Args&&... args) :
+ cutoff(get_io_prio_cut(cct)),
+ queue(std::forward<Args>(args)...)
+ {}
+
+ void enqueue(OpSchedulerItem &&item) final {
+ unsigned priority = item.get_priority();
+ unsigned cost = item.get_cost();
+
+ if (priority >= cutoff)
+ queue.enqueue_strict(
+ item.get_owner(), priority, std::move(item));
+ else
+ queue.enqueue(
+ item.get_owner(), priority, cost, std::move(item));
+ }
+
+ void enqueue_front(OpSchedulerItem &&item) final {
+ unsigned priority = item.get_priority();
+ unsigned cost = item.get_cost();
+ if (priority >= cutoff)
+ queue.enqueue_strict_front(
+ item.get_owner(),
+ priority, std::move(item));
+ else
+ queue.enqueue_front(
+ item.get_owner(),
+ priority, cost, std::move(item));
+ }
+
+ bool empty() const final {
+ return queue.empty();
+ }
+
+ WorkItem dequeue() final {
+ return queue.dequeue();
+ }
+
+ void dump(ceph::Formatter &f) const final {
+ return queue.dump(&f);
+ }
+
+ void print(std::ostream &out) const final {
+ out << "ClassedOpQueueScheduler(queue=";
+ queue.print(out);
+ out << ", cutoff=" << cutoff << ")";
+ }
+
+ void update_configuration() final {
+ // no-op
+ }
+
+ ~ClassedOpQueueScheduler() final {};
+};
+
+}
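The "priority cutoff/strict" concept that ClassedOpQueueScheduler adapts can be shown with a toy two-queue sketch. This is simplified and self-contained; the real adapter forwards to the OpQueue's enqueue_strict()/enqueue() variants rather than keeping its own deques.

// Ops at or above the cutoff bypass the fair queue and are served strictly first.
#include <cassert>
#include <deque>
#include <optional>

struct MiniCutoffQueue {
  unsigned cutoff;
  std::deque<int> strict;  // ops with priority >= cutoff
  std::deque<int> normal;  // everything else (cost-based in the real queue)

  void enqueue(unsigned priority, int op) {
    if (priority >= cutoff)
      strict.push_back(op);
    else
      normal.push_back(op);
  }

  std::optional<int> dequeue() {
    auto& q = !strict.empty() ? strict : normal;
    if (q.empty())
      return std::nullopt;
    int op = q.front();
    q.pop_front();
    return op;
  }
};

int main() {
  MiniCutoffQueue q{/*cutoff=*/196};   // stand-in for a CEPH_MSG_PRIO_HIGH cutoff
  q.enqueue(63, 1);                    // ordinary client op
  q.enqueue(255, 2);                   // high-priority op
  assert(*q.dequeue() == 2);           // strict queue drains first
  assert(*q.dequeue() == 1);
}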
diff --git a/src/osd/scheduler/OpSchedulerItem.cc b/src/osd/scheduler/OpSchedulerItem.cc
new file mode 100644
index 000000000..27db1dfa3
--- /dev/null
+++ b/src/osd/scheduler/OpSchedulerItem.cc
@@ -0,0 +1,259 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 Red Hat Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "osd/scheduler/OpSchedulerItem.h"
+#include "osd/OSD.h"
+#ifdef HAVE_JAEGER
+#include "common/tracer.h"
+#endif
+
+namespace ceph::osd::scheduler {
+
+void PGOpItem::run(
+ OSD *osd,
+ OSDShard *sdata,
+ PGRef& pg,
+ ThreadPool::TPHandle &handle)
+{
+#ifdef HAVE_JAEGER
+ auto PGOpItem_span = jaeger_tracing::child_span("PGOpItem::run", op->osd_parent_span);
+#endif
+ osd->dequeue_op(pg, op, handle);
+ pg->unlock();
+}
+
+void PGPeeringItem::run(
+ OSD *osd,
+ OSDShard *sdata,
+ PGRef& pg,
+ ThreadPool::TPHandle &handle)
+{
+ osd->dequeue_peering_evt(sdata, pg.get(), evt, handle);
+}
+
+void PGSnapTrim::run(
+ OSD *osd,
+ OSDShard *sdata,
+ PGRef& pg,
+ ThreadPool::TPHandle &handle)
+{
+ pg->snap_trimmer(epoch_queued);
+ pg->unlock();
+}
+
+void PGScrub::run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle)
+{
+ pg->scrub(epoch_queued, handle);
+ pg->unlock();
+}
+
+void PGScrubAfterRepair::run(OSD* osd,
+ OSDShard* sdata,
+ PGRef& pg,
+ ThreadPool::TPHandle& handle)
+{
+ pg->recovery_scrub(epoch_queued, handle);
+ pg->unlock();
+}
+
+void PGScrubResched::run(OSD* osd,
+ OSDShard* sdata,
+ PGRef& pg,
+ ThreadPool::TPHandle& handle)
+{
+ pg->scrub_send_scrub_resched(epoch_queued, handle);
+ pg->unlock();
+}
+
+void PGScrubResourcesOK::run(OSD* osd,
+ OSDShard* sdata,
+ PGRef& pg,
+ ThreadPool::TPHandle& handle)
+{
+ pg->scrub_send_resources_granted(epoch_queued, handle);
+ pg->unlock();
+}
+
+void PGScrubDenied::run(OSD* osd,
+ OSDShard* sdata,
+ PGRef& pg,
+ ThreadPool::TPHandle& handle)
+{
+ pg->scrub_send_resources_denied(epoch_queued, handle);
+ pg->unlock();
+}
+
+void PGScrubPushesUpdate::run(OSD* osd,
+ OSDShard* sdata,
+ PGRef& pg,
+ ThreadPool::TPHandle& handle)
+{
+ pg->scrub_send_pushes_update(epoch_queued, handle);
+ pg->unlock();
+}
+
+void PGScrubAppliedUpdate::run(OSD* osd,
+ OSDShard* sdata,
+ PGRef& pg,
+ ThreadPool::TPHandle& handle)
+{
+ pg->scrub_send_applied_update(epoch_queued, handle);
+ pg->unlock();
+}
+
+void PGScrubUnblocked::run(OSD* osd,
+ OSDShard* sdata,
+ PGRef& pg,
+ ThreadPool::TPHandle& handle)
+{
+ pg->scrub_send_unblocking(epoch_queued, handle);
+ pg->unlock();
+}
+
+void PGScrubDigestUpdate::run(OSD* osd,
+ OSDShard* sdata,
+ PGRef& pg,
+ ThreadPool::TPHandle& handle)
+{
+ pg->scrub_send_digest_update(epoch_queued, handle);
+ pg->unlock();
+}
+
+void PGScrubGotLocalMap::run(OSD* osd,
+ OSDShard* sdata,
+ PGRef& pg,
+ ThreadPool::TPHandle& handle)
+{
+ pg->scrub_send_local_map_ready(epoch_queued, handle);
+ pg->unlock();
+}
+
+void PGScrubGotReplMaps::run(OSD* osd,
+ OSDShard* sdata,
+ PGRef& pg,
+ ThreadPool::TPHandle& handle)
+{
+ pg->scrub_send_replmaps_ready(epoch_queued, handle);
+ pg->unlock();
+}
+
+void PGScrubMapsCompared::run(OSD* osd,
+ OSDShard* sdata,
+ PGRef& pg,
+ ThreadPool::TPHandle& handle)
+{
+ pg->scrub_send_maps_compared(epoch_queued, handle);
+ pg->unlock();
+}
+
+void PGRepScrub::run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle)
+{
+ pg->replica_scrub(epoch_queued, activation_index, handle);
+ pg->unlock();
+}
+
+void PGRepScrubResched::run(OSD* osd,
+ OSDShard* sdata,
+ PGRef& pg,
+ ThreadPool::TPHandle& handle)
+{
+ pg->replica_scrub_resched(epoch_queued, activation_index, handle);
+ pg->unlock();
+}
+
+void PGScrubReplicaPushes::run([[maybe_unused]] OSD* osd,
+ OSDShard* sdata,
+ PGRef& pg,
+ ThreadPool::TPHandle& handle)
+{
+ pg->scrub_send_replica_pushes(epoch_queued, handle);
+ pg->unlock();
+}
+
+void PGScrubScrubFinished::run([[maybe_unused]] OSD* osd,
+ OSDShard* sdata,
+ PGRef& pg,
+ ThreadPool::TPHandle& handle)
+{
+ pg->scrub_send_scrub_is_finished(epoch_queued, handle);
+ pg->unlock();
+}
+
+void PGScrubGetNextChunk::run([[maybe_unused]] OSD* osd,
+ OSDShard* sdata,
+ PGRef& pg,
+ ThreadPool::TPHandle& handle)
+{
+ pg->scrub_send_get_next_chunk(epoch_queued, handle);
+ pg->unlock();
+}
+
+void PGScrubChunkIsBusy::run([[maybe_unused]] OSD* osd,
+ OSDShard* sdata,
+ PGRef& pg,
+ ThreadPool::TPHandle& handle)
+{
+ pg->scrub_send_chunk_busy(epoch_queued, handle);
+ pg->unlock();
+}
+
+void PGScrubChunkIsFree::run([[maybe_unused]] OSD* osd,
+ OSDShard* sdata,
+ PGRef& pg,
+ ThreadPool::TPHandle& handle)
+{
+ pg->scrub_send_chunk_free(epoch_queued, handle);
+ pg->unlock();
+}
+
+void PGRecovery::run(
+ OSD *osd,
+ OSDShard *sdata,
+ PGRef& pg,
+ ThreadPool::TPHandle &handle)
+{
+ osd->do_recovery(pg.get(), epoch_queued, reserved_pushes, handle);
+ pg->unlock();
+}
+
+void PGRecoveryContext::run(
+ OSD *osd,
+ OSDShard *sdata,
+ PGRef& pg,
+ ThreadPool::TPHandle &handle)
+{
+ c.release()->complete(handle);
+ pg->unlock();
+}
+
+void PGDelete::run(
+ OSD *osd,
+ OSDShard *sdata,
+ PGRef& pg,
+ ThreadPool::TPHandle &handle)
+{
+ osd->dequeue_delete(sdata, pg.get(), epoch_queued, handle);
+}
+
+void PGRecoveryMsg::run(
+ OSD *osd,
+ OSDShard *sdata,
+ PGRef& pg,
+ ThreadPool::TPHandle &handle)
+{
+ osd->dequeue_op(pg, op, handle);
+ pg->unlock();
+}
+
+}
diff --git a/src/osd/scheduler/OpSchedulerItem.h b/src/osd/scheduler/OpSchedulerItem.h
new file mode 100644
index 000000000..7ba59838e
--- /dev/null
+++ b/src/osd/scheduler/OpSchedulerItem.h
@@ -0,0 +1,629 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 Red Hat Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include <ostream>
+
+#include "include/types.h"
+#include "include/utime.h"
+#include "osd/OpRequest.h"
+#include "osd/PG.h"
+#include "osd/PGPeeringEvent.h"
+#include "messages/MOSDOp.h"
+
+
+class OSD;
+class OSDShard;
+
+namespace ceph::osd::scheduler {
+
+enum class op_scheduler_class : uint8_t {
+ background_recovery = 0,
+ background_best_effort,
+ immediate,
+ client,
+};
+
+class OpSchedulerItem {
+public:
+ class OrderLocker {
+ public:
+ using Ref = std::unique_ptr<OrderLocker>;
+ virtual void lock() = 0;
+ virtual void unlock() = 0;
+ virtual ~OrderLocker() {}
+ };
+
+ // Abstraction for operations queueable in the op queue
+ class OpQueueable {
+ public:
+ enum class op_type_t {
+ client_op,
+ peering_event,
+ bg_snaptrim,
+ bg_recovery,
+ bg_scrub,
+ bg_pg_delete
+ };
+ using Ref = std::unique_ptr<OpQueueable>;
+
+ /// Items with the same queue token will end up in the same shard
+ virtual uint32_t get_queue_token() const = 0;
+
+ /* Items will be dequeued and locked atomically w.r.t. other items with the
+ * same ordering token */
+ virtual const spg_t& get_ordering_token() const = 0;
+ virtual OrderLocker::Ref get_order_locker(PGRef pg) = 0;
+ virtual op_type_t get_op_type() const = 0;
+ virtual std::optional<OpRequestRef> maybe_get_op() const {
+ return std::nullopt;
+ }
+
+ virtual uint64_t get_reserved_pushes() const {
+ return 0;
+ }
+
+ virtual bool is_peering() const {
+ return false;
+ }
+ virtual bool peering_requires_pg() const {
+ ceph_abort();
+ }
+ virtual const PGCreateInfo *creates_pg() const {
+ return nullptr;
+ }
+
+ virtual std::ostream &print(std::ostream &rhs) const = 0;
+
+ virtual void run(OSD *osd, OSDShard *sdata, PGRef& pg, ThreadPool::TPHandle &handle) = 0;
+ virtual op_scheduler_class get_scheduler_class() const = 0;
+
+ virtual ~OpQueueable() {}
+ friend std::ostream& operator<<(std::ostream& out, const OpQueueable& q) {
+ return q.print(out);
+ }
+
+ };
+
+private:
+ OpQueueable::Ref qitem;
+ int cost;
+ unsigned priority;
+ utime_t start_time;
+ uint64_t owner; ///< global id (e.g., client.XXX)
+ epoch_t map_epoch; ///< an epoch we expect the PG to exist in
+
+public:
+ OpSchedulerItem(
+ OpQueueable::Ref &&item,
+ int cost,
+ unsigned priority,
+ utime_t start_time,
+ uint64_t owner,
+ epoch_t e)
+ : qitem(std::move(item)),
+ cost(cost),
+ priority(priority),
+ start_time(start_time),
+ owner(owner),
+ map_epoch(e)
+ {}
+ OpSchedulerItem(OpSchedulerItem &&) = default;
+ OpSchedulerItem(const OpSchedulerItem &) = delete;
+ OpSchedulerItem &operator=(OpSchedulerItem &&) = default;
+ OpSchedulerItem &operator=(const OpSchedulerItem &) = delete;
+
+ OrderLocker::Ref get_order_locker(PGRef pg) {
+ return qitem->get_order_locker(pg);
+ }
+ uint32_t get_queue_token() const {
+ return qitem->get_queue_token();
+ }
+ const spg_t& get_ordering_token() const {
+ return qitem->get_ordering_token();
+ }
+ using op_type_t = OpQueueable::op_type_t;
+ OpQueueable::op_type_t get_op_type() const {
+ return qitem->get_op_type();
+ }
+ std::optional<OpRequestRef> maybe_get_op() const {
+ return qitem->maybe_get_op();
+ }
+ uint64_t get_reserved_pushes() const {
+ return qitem->get_reserved_pushes();
+ }
+ void run(OSD *osd, OSDShard *sdata,PGRef& pg, ThreadPool::TPHandle &handle) {
+ qitem->run(osd, sdata, pg, handle);
+ }
+ unsigned get_priority() const { return priority; }
+ int get_cost() const { return cost; }
+ utime_t get_start_time() const { return start_time; }
+ uint64_t get_owner() const { return owner; }
+ epoch_t get_map_epoch() const { return map_epoch; }
+
+ bool is_peering() const {
+ return qitem->is_peering();
+ }
+
+ const PGCreateInfo *creates_pg() const {
+ return qitem->creates_pg();
+ }
+
+ bool peering_requires_pg() const {
+ return qitem->peering_requires_pg();
+ }
+
+ op_scheduler_class get_scheduler_class() const {
+ return qitem->get_scheduler_class();
+ }
+
+ friend std::ostream& operator<<(std::ostream& out, const OpSchedulerItem& item) {
+ out << "OpSchedulerItem("
+ << item.get_ordering_token() << " " << *item.qitem
+ << " prio " << item.get_priority()
+ << " cost " << item.get_cost()
+ << " e" << item.get_map_epoch();
+ if (item.get_reserved_pushes()) {
+ out << " reserved_pushes " << item.get_reserved_pushes();
+ }
+ return out << ")";
+ }
+}; // class OpSchedulerItem
+
+/// Implements boilerplate for operations queued for the pg lock
+class PGOpQueueable : public OpSchedulerItem::OpQueueable {
+ spg_t pgid;
+protected:
+ const spg_t& get_pgid() const {
+ return pgid;
+ }
+public:
+ explicit PGOpQueueable(spg_t pg) : pgid(pg) {}
+ uint32_t get_queue_token() const final {
+ return get_pgid().ps();
+ }
+
+ const spg_t& get_ordering_token() const final {
+ return get_pgid();
+ }
+
+ OpSchedulerItem::OrderLocker::Ref get_order_locker(PGRef pg) final {
+ class Locker : public OpSchedulerItem::OrderLocker {
+ PGRef pg;
+ public:
+ explicit Locker(PGRef pg) : pg(pg) {}
+ void lock() final {
+ pg->lock();
+ }
+ void unlock() final {
+ pg->unlock();
+ }
+ };
+ return OpSchedulerItem::OrderLocker::Ref(
+ new Locker(pg));
+ }
+};
+
+class PGOpItem : public PGOpQueueable {
+ OpRequestRef op;
+
+ const MOSDOp *maybe_get_mosd_op() const {
+ auto req = op->get_req();
+ if (req->get_type() == CEPH_MSG_OSD_OP) {
+ return op->get_req<MOSDOp>();
+ } else {
+ return nullptr;
+ }
+ }
+
+public:
+ PGOpItem(spg_t pg, OpRequestRef op) : PGOpQueueable(pg), op(std::move(op)) {}
+ op_type_t get_op_type() const final {
+
+ return op_type_t::client_op;
+ }
+
+ std::ostream &print(std::ostream &rhs) const final {
+ return rhs << "PGOpItem(op=" << *(op->get_req()) << ")";
+ }
+
+ std::optional<OpRequestRef> maybe_get_op() const final {
+ return op;
+ }
+
+ op_scheduler_class get_scheduler_class() const final {
+ auto type = op->get_req()->get_type();
+ if (type == CEPH_MSG_OSD_OP ||
+ type == CEPH_MSG_OSD_BACKOFF) {
+ return op_scheduler_class::client;
+ } else {
+ return op_scheduler_class::immediate;
+ }
+ }
+
+ void run(OSD *osd, OSDShard *sdata, PGRef& pg, ThreadPool::TPHandle &handle) final;
+};
+
+class PGPeeringItem : public PGOpQueueable {
+ PGPeeringEventRef evt;
+public:
+ PGPeeringItem(spg_t pg, PGPeeringEventRef e) : PGOpQueueable(pg), evt(e) {}
+ op_type_t get_op_type() const final {
+ return op_type_t::peering_event;
+ }
+ std::ostream &print(std::ostream &rhs) const final {
+ return rhs << "PGPeeringEvent(" << evt->get_desc() << ")";
+ }
+ void run(OSD *osd, OSDShard *sdata, PGRef& pg, ThreadPool::TPHandle &handle) final;
+ bool is_peering() const override {
+ return true;
+ }
+ bool peering_requires_pg() const override {
+ return evt->requires_pg;
+ }
+ const PGCreateInfo *creates_pg() const override {
+ return evt->create_info.get();
+ }
+ op_scheduler_class get_scheduler_class() const final {
+ return op_scheduler_class::immediate;
+ }
+};
+
+class PGSnapTrim : public PGOpQueueable {
+ epoch_t epoch_queued;
+public:
+ PGSnapTrim(
+ spg_t pg,
+ epoch_t epoch_queued)
+ : PGOpQueueable(pg), epoch_queued(epoch_queued) {}
+ op_type_t get_op_type() const final {
+ return op_type_t::bg_snaptrim;
+ }
+ std::ostream &print(std::ostream &rhs) const final {
+ return rhs << "PGSnapTrim(pgid=" << get_pgid()
+ << " epoch_queued=" << epoch_queued
+ << ")";
+ }
+ void run(
+ OSD *osd, OSDShard *sdata, PGRef& pg, ThreadPool::TPHandle &handle) final;
+ op_scheduler_class get_scheduler_class() const final {
+ return op_scheduler_class::background_best_effort;
+ }
+};
+
+class PGScrub : public PGOpQueueable {
+ epoch_t epoch_queued;
+public:
+ PGScrub(
+ spg_t pg,
+ epoch_t epoch_queued)
+ : PGOpQueueable(pg), epoch_queued(epoch_queued) {}
+ op_type_t get_op_type() const final {
+ return op_type_t::bg_scrub;
+ }
+ std::ostream &print(std::ostream &rhs) const final {
+ return rhs << "PGScrub(pgid=" << get_pgid()
+ << "epoch_queued=" << epoch_queued
+ << ")";
+ }
+ void run(
+ OSD *osd, OSDShard *sdata, PGRef& pg, ThreadPool::TPHandle &handle) final;
+ op_scheduler_class get_scheduler_class() const final {
+ return op_scheduler_class::background_best_effort;
+ }
+};
+
+class PGScrubItem : public PGOpQueueable {
+ protected:
+ epoch_t epoch_queued;
+ Scrub::act_token_t activation_index;
+ std::string_view message_name;
+ PGScrubItem(spg_t pg, epoch_t epoch_queued, std::string_view derivative_name)
+ : PGOpQueueable{pg}
+ , epoch_queued{epoch_queued}
+ , activation_index{0}
+ , message_name{derivative_name}
+ {}
+ PGScrubItem(spg_t pg,
+ epoch_t epoch_queued,
+ Scrub::act_token_t op_index,
+ std::string_view derivative_name)
+ : PGOpQueueable{pg}
+ , epoch_queued{epoch_queued}
+ , activation_index{op_index}
+ , message_name{derivative_name}
+ {}
+ op_type_t get_op_type() const final { return op_type_t::bg_scrub; }
+ std::ostream& print(std::ostream& rhs) const final
+ {
+ return rhs << message_name << "(pgid=" << get_pgid()
+ << "epoch_queued=" << epoch_queued
+ << " scrub-token=" << activation_index << ")";
+ }
+ void run(OSD* osd,
+ OSDShard* sdata,
+ PGRef& pg,
+ ThreadPool::TPHandle& handle) override = 0;
+ op_scheduler_class get_scheduler_class() const final
+ {
+ return op_scheduler_class::background_best_effort;
+ }
+};
+
+class PGScrubResched : public PGScrubItem {
+ public:
+ PGScrubResched(spg_t pg, epoch_t epoch_queued)
+ : PGScrubItem{pg, epoch_queued, "PGScrubResched"}
+ {}
+ void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final;
+};
+
+/**
+ * all replicas have granted our scrub resources request
+ */
+class PGScrubResourcesOK : public PGScrubItem {
+ public:
+ PGScrubResourcesOK(spg_t pg, epoch_t epoch_queued)
+ : PGScrubItem{pg, epoch_queued, "PGScrubResourcesOK"}
+ {}
+ void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final;
+};
+
+/**
+ * scrub resources requests denied by replica(s)
+ */
+class PGScrubDenied : public PGScrubItem {
+ public:
+ PGScrubDenied(spg_t pg, epoch_t epoch_queued)
+ : PGScrubItem{pg, epoch_queued, "PGScrubDenied"}
+ {}
+ void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final;
+};
+
+/**
+ * called when a repair process completes, to initiate scrubbing. No local/remote
+ * resources are allocated.
+ */
+class PGScrubAfterRepair : public PGScrubItem {
+ public:
+ PGScrubAfterRepair(spg_t pg, epoch_t epoch_queued)
+ : PGScrubItem{pg, epoch_queued, "PGScrubAfterRepair"}
+ {}
+ void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final;
+};
+
+class PGScrubPushesUpdate : public PGScrubItem {
+ public:
+ PGScrubPushesUpdate(spg_t pg, epoch_t epoch_queued)
+ : PGScrubItem{pg, epoch_queued, "PGScrubPushesUpdate"}
+ {}
+ void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final;
+};
+
+class PGScrubAppliedUpdate : public PGScrubItem {
+ public:
+ PGScrubAppliedUpdate(spg_t pg, epoch_t epoch_queued)
+ : PGScrubItem{pg, epoch_queued, "PGScrubAppliedUpdate"}
+ {}
+ void run(OSD* osd,
+ OSDShard* sdata,
+ PGRef& pg,
+ [[maybe_unused]] ThreadPool::TPHandle& handle) final;
+};
+
+class PGScrubUnblocked : public PGScrubItem {
+ public:
+ PGScrubUnblocked(spg_t pg, epoch_t epoch_queued)
+ : PGScrubItem{pg, epoch_queued, "PGScrubUnblocked"}
+ {}
+ void run(OSD* osd,
+ OSDShard* sdata,
+ PGRef& pg,
+ [[maybe_unused]] ThreadPool::TPHandle& handle) final;
+};
+
+class PGScrubDigestUpdate : public PGScrubItem {
+ public:
+ PGScrubDigestUpdate(spg_t pg, epoch_t epoch_queued)
+ : PGScrubItem{pg, epoch_queued, "PGScrubDigestUpdate"}
+ {}
+ void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final;
+};
+
+class PGScrubGotLocalMap : public PGScrubItem {
+ public:
+ PGScrubGotLocalMap(spg_t pg, epoch_t epoch_queued)
+ : PGScrubItem{pg, epoch_queued, "PGScrubGotLocalMap"}
+ {}
+ void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final;
+};
+
+class PGScrubGotReplMaps : public PGScrubItem {
+ public:
+ PGScrubGotReplMaps(spg_t pg, epoch_t epoch_queued)
+ : PGScrubItem{pg, epoch_queued, "PGScrubGotReplMaps"}
+ {}
+ void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final;
+};
+
+class PGScrubMapsCompared : public PGScrubItem {
+ public:
+ PGScrubMapsCompared(spg_t pg, epoch_t epoch_queued)
+ : PGScrubItem{pg, epoch_queued, "PGScrubMapsCompared"}
+ {}
+ void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final;
+};
+
+class PGRepScrub : public PGScrubItem {
+ public:
+ PGRepScrub(spg_t pg, epoch_t epoch_queued, Scrub::act_token_t op_token)
+ : PGScrubItem{pg, epoch_queued, op_token, "PGRepScrub"}
+ {}
+ void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final;
+};
+
+class PGRepScrubResched : public PGScrubItem {
+ public:
+ PGRepScrubResched(spg_t pg, epoch_t epoch_queued, Scrub::act_token_t op_token)
+ : PGScrubItem{pg, epoch_queued, op_token, "PGRepScrubResched"}
+ {}
+ void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final;
+};
+
+class PGScrubReplicaPushes : public PGScrubItem {
+ public:
+ PGScrubReplicaPushes(spg_t pg, epoch_t epoch_queued)
+ : PGScrubItem{pg, epoch_queued, "PGScrubReplicaPushes"}
+ {}
+ void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final;
+};
+
+class PGScrubScrubFinished : public PGScrubItem {
+ public:
+ PGScrubScrubFinished(spg_t pg, epoch_t epoch_queued)
+ : PGScrubItem{pg, epoch_queued, "PGScrubScrubFinished"}
+ {}
+ void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final;
+};
+
+class PGScrubGetNextChunk : public PGScrubItem {
+ public:
+ PGScrubGetNextChunk(spg_t pg, epoch_t epoch_queued)
+ : PGScrubItem{pg, epoch_queued, "PGScrubGetNextChunk"}
+ {}
+ void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final;
+};
+
+class PGScrubChunkIsBusy : public PGScrubItem {
+ public:
+ PGScrubChunkIsBusy(spg_t pg, epoch_t epoch_queued)
+ : PGScrubItem{pg, epoch_queued, "PGScrubChunkIsBusy"}
+ {}
+ void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final;
+};
+
+class PGScrubChunkIsFree : public PGScrubItem {
+ public:
+ PGScrubChunkIsFree(spg_t pg, epoch_t epoch_queued)
+ : PGScrubItem{pg, epoch_queued, "PGScrubChunkIsFree"}
+ {}
+ void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final;
+};
+
+class PGRecovery : public PGOpQueueable {
+ epoch_t epoch_queued;
+ uint64_t reserved_pushes;
+public:
+ PGRecovery(
+ spg_t pg,
+ epoch_t epoch_queued,
+ uint64_t reserved_pushes)
+ : PGOpQueueable(pg),
+ epoch_queued(epoch_queued),
+ reserved_pushes(reserved_pushes) {}
+ op_type_t get_op_type() const final {
+ return op_type_t::bg_recovery;
+ }
+ std::ostream &print(std::ostream &rhs) const final {
+ return rhs << "PGRecovery(pgid=" << get_pgid()
+ << " epoch_queued=" << epoch_queued
+ << " reserved_pushes=" << reserved_pushes
+ << ")";
+ }
+ uint64_t get_reserved_pushes() const final {
+ return reserved_pushes;
+ }
+ void run(
+ OSD *osd, OSDShard *sdata, PGRef& pg, ThreadPool::TPHandle &handle) final;
+ op_scheduler_class get_scheduler_class() const final {
+ return op_scheduler_class::background_recovery;
+ }
+};
+
+class PGRecoveryContext : public PGOpQueueable {
+ std::unique_ptr<GenContext<ThreadPool::TPHandle&>> c;
+ epoch_t epoch;
+public:
+ PGRecoveryContext(spg_t pgid,
+ GenContext<ThreadPool::TPHandle&> *c, epoch_t epoch)
+ : PGOpQueueable(pgid),
+ c(c), epoch(epoch) {}
+ op_type_t get_op_type() const final {
+ return op_type_t::bg_recovery;
+ }
+ std::ostream &print(std::ostream &rhs) const final {
+ return rhs << "PGRecoveryContext(pgid=" << get_pgid()
+ << " c=" << c.get() << " epoch=" << epoch
+ << ")";
+ }
+ void run(
+ OSD *osd, OSDShard *sdata, PGRef& pg, ThreadPool::TPHandle &handle) final;
+ op_scheduler_class get_scheduler_class() const final {
+ return op_scheduler_class::background_recovery;
+ }
+};
+
+class PGDelete : public PGOpQueueable {
+ epoch_t epoch_queued;
+public:
+ PGDelete(
+ spg_t pg,
+ epoch_t epoch_queued)
+ : PGOpQueueable(pg),
+ epoch_queued(epoch_queued) {}
+ op_type_t get_op_type() const final {
+ return op_type_t::bg_pg_delete;
+ }
+ std::ostream &print(std::ostream &rhs) const final {
+ return rhs << "PGDelete(" << get_pgid()
+ << " e" << epoch_queued
+ << ")";
+ }
+ void run(
+ OSD *osd, OSDShard *sdata, PGRef& pg, ThreadPool::TPHandle &handle) final;
+ op_scheduler_class get_scheduler_class() const final {
+ return op_scheduler_class::background_best_effort;
+ }
+};
+
+class PGRecoveryMsg : public PGOpQueueable {
+ OpRequestRef op;
+
+public:
+ PGRecoveryMsg(spg_t pg, OpRequestRef op) : PGOpQueueable(pg), op(std::move(op)) {}
+ op_type_t get_op_type() const final {
+ return op_type_t::bg_recovery;
+ }
+
+ std::ostream &print(std::ostream &rhs) const final {
+ return rhs << "PGRecoveryMsg(op=" << *(op->get_req()) << ")";
+ }
+
+ std::optional<OpRequestRef> maybe_get_op() const final {
+ return op;
+ }
+
+ op_scheduler_class get_scheduler_class() const final {
+ auto priority = op->get_req()->get_priority();
+ if (priority >= CEPH_MSG_PRIO_HIGH) {
+ return op_scheduler_class::immediate;
+ }
+ return op_scheduler_class::background_recovery;
+ }
+
+ void run(OSD *osd, OSDShard *sdata, PGRef& pg, ThreadPool::TPHandle &handle) final;
+};
+
+}
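The class mapping performed by PGOpItem::get_scheduler_class() and PGRecoveryMsg::get_scheduler_class() boils down to two small decisions, sketched below with illustrative stand-in constants; the real code uses CEPH_MSG_OSD_OP, CEPH_MSG_OSD_BACKOFF and CEPH_MSG_PRIO_HIGH.

// Standalone sketch of the scheduler-class classification rules.
#include <cassert>
#include <cstdint>

enum class sched_class : uint8_t {
  background_recovery, background_best_effort, immediate, client
};

constexpr int MSG_OSD_OP = 42;        // stand-in for CEPH_MSG_OSD_OP
constexpr int MSG_OSD_BACKOFF = 61;   // stand-in for CEPH_MSG_OSD_BACKOFF
constexpr int PRIO_HIGH = 196;        // stand-in for CEPH_MSG_PRIO_HIGH

// client ops and backoffs are 'client'; anything else wrapped in PGOpItem is 'immediate'
sched_class classify_client_op(int msg_type) {
  if (msg_type == MSG_OSD_OP || msg_type == MSG_OSD_BACKOFF)
    return sched_class::client;
  return sched_class::immediate;
}

// high-priority recovery messages jump the queue; the rest stay in background_recovery
sched_class classify_recovery_msg(int priority) {
  return priority >= PRIO_HIGH ? sched_class::immediate
                               : sched_class::background_recovery;
}

int main() {
  assert(classify_client_op(MSG_OSD_OP) == sched_class::client);
  assert(classify_client_op(7) == sched_class::immediate);
  assert(classify_recovery_msg(255) == sched_class::immediate);
  assert(classify_recovery_msg(10) == sched_class::background_recovery);
}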
diff --git a/src/osd/scheduler/mClockScheduler.cc b/src/osd/scheduler/mClockScheduler.cc
new file mode 100644
index 000000000..f2f0ffc3d
--- /dev/null
+++ b/src/osd/scheduler/mClockScheduler.cc
@@ -0,0 +1,514 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 Red Hat Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#include <memory>
+#include <functional>
+
+#include "osd/scheduler/mClockScheduler.h"
+#include "common/dout.h"
+
+namespace dmc = crimson::dmclock;
+using namespace std::placeholders;
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_osd
+#undef dout_prefix
+#define dout_prefix *_dout << "mClockScheduler: "
+
+
+namespace ceph::osd::scheduler {
+
+mClockScheduler::mClockScheduler(CephContext *cct,
+ uint32_t num_shards,
+ bool is_rotational)
+ : cct(cct),
+ num_shards(num_shards),
+ is_rotational(is_rotational),
+ scheduler(
+ std::bind(&mClockScheduler::ClientRegistry::get_info,
+ &client_registry,
+ _1),
+ dmc::AtLimit::Wait,
+ cct->_conf.get_val<double>("osd_mclock_scheduler_anticipation_timeout"))
+{
+ cct->_conf.add_observer(this);
+ ceph_assert(num_shards > 0);
+ set_max_osd_capacity();
+ set_osd_mclock_cost_per_io();
+ set_osd_mclock_cost_per_byte();
+ set_mclock_profile();
+ enable_mclock_profile_settings();
+ client_registry.update_from_config(cct->_conf);
+}
+
+void mClockScheduler::ClientRegistry::update_from_config(const ConfigProxy &conf)
+{
+ default_external_client_info.update(
+ conf.get_val<uint64_t>("osd_mclock_scheduler_client_res"),
+ conf.get_val<uint64_t>("osd_mclock_scheduler_client_wgt"),
+ conf.get_val<uint64_t>("osd_mclock_scheduler_client_lim"));
+
+ internal_client_infos[
+ static_cast<size_t>(op_scheduler_class::background_recovery)].update(
+ conf.get_val<uint64_t>("osd_mclock_scheduler_background_recovery_res"),
+ conf.get_val<uint64_t>("osd_mclock_scheduler_background_recovery_wgt"),
+ conf.get_val<uint64_t>("osd_mclock_scheduler_background_recovery_lim"));
+
+ internal_client_infos[
+ static_cast<size_t>(op_scheduler_class::background_best_effort)].update(
+ conf.get_val<uint64_t>("osd_mclock_scheduler_background_best_effort_res"),
+ conf.get_val<uint64_t>("osd_mclock_scheduler_background_best_effort_wgt"),
+ conf.get_val<uint64_t>("osd_mclock_scheduler_background_best_effort_lim"));
+}
+
+const dmc::ClientInfo *mClockScheduler::ClientRegistry::get_external_client(
+ const client_profile_id_t &client) const
+{
+ auto ret = external_client_infos.find(client);
+ if (ret == external_client_infos.end())
+ return &default_external_client_info;
+ else
+ return &(ret->second);
+}
+
+const dmc::ClientInfo *mClockScheduler::ClientRegistry::get_info(
+ const scheduler_id_t &id) const {
+ switch (id.class_id) {
+ case op_scheduler_class::immediate:
+ ceph_assert(0 == "Cannot schedule immediate");
+ return (dmc::ClientInfo*)nullptr;
+ case op_scheduler_class::client:
+ return get_external_client(id.client_profile_id);
+ default:
+ ceph_assert(static_cast<size_t>(id.class_id) < internal_client_infos.size());
+ return &internal_client_infos[static_cast<size_t>(id.class_id)];
+ }
+}
+
+void mClockScheduler::set_max_osd_capacity()
+{
+ if (is_rotational) {
+ max_osd_capacity =
+ cct->_conf.get_val<double>("osd_mclock_max_capacity_iops_hdd");
+ } else {
+ max_osd_capacity =
+ cct->_conf.get_val<double>("osd_mclock_max_capacity_iops_ssd");
+ }
+ // Set per op-shard iops limit
+ max_osd_capacity /= num_shards;
+ dout(1) << __func__ << " #op shards: " << num_shards
+ << std::fixed << std::setprecision(2)
+ << " max osd capacity(iops) per shard: " << max_osd_capacity
+ << dendl;
+}
+
+void mClockScheduler::set_osd_mclock_cost_per_io()
+{
+ std::chrono::seconds sec(1);
+ if (cct->_conf.get_val<double>("osd_mclock_cost_per_io_usec")) {
+ osd_mclock_cost_per_io =
+ cct->_conf.get_val<double>("osd_mclock_cost_per_io_usec");
+ } else {
+ if (is_rotational) {
+ osd_mclock_cost_per_io =
+ cct->_conf.get_val<double>("osd_mclock_cost_per_io_usec_hdd");
+ // For HDDs, convert value to seconds
+ osd_mclock_cost_per_io /= std::chrono::microseconds(sec).count();
+ } else {
+ // For SSDs, convert value to milliseconds
+ osd_mclock_cost_per_io =
+ cct->_conf.get_val<double>("osd_mclock_cost_per_io_usec_ssd");
+ osd_mclock_cost_per_io /= std::chrono::milliseconds(sec).count();
+ }
+ }
+ dout(1) << __func__ << " osd_mclock_cost_per_io: "
+ << std::fixed << std::setprecision(7) << osd_mclock_cost_per_io
+ << dendl;
+}
+
+void mClockScheduler::set_osd_mclock_cost_per_byte()
+{
+ std::chrono::seconds sec(1);
+ if (cct->_conf.get_val<double>("osd_mclock_cost_per_byte_usec")) {
+ osd_mclock_cost_per_byte =
+ cct->_conf.get_val<double>("osd_mclock_cost_per_byte_usec");
+ } else {
+ if (is_rotational) {
+ osd_mclock_cost_per_byte =
+ cct->_conf.get_val<double>("osd_mclock_cost_per_byte_usec_hdd");
+ // For HDDs, convert value to seconds
+ osd_mclock_cost_per_byte /= std::chrono::microseconds(sec).count();
+ } else {
+ osd_mclock_cost_per_byte =
+ cct->_conf.get_val<double>("osd_mclock_cost_per_byte_usec_ssd");
+ // For SSDs, convert value to milliseconds
+ osd_mclock_cost_per_byte /= std::chrono::milliseconds(sec).count();
+ }
+ }
+ dout(1) << __func__ << " osd_mclock_cost_per_byte: "
+ << std::fixed << std::setprecision(7) << osd_mclock_cost_per_byte
+ << dendl;
+}
+
+void mClockScheduler::set_mclock_profile()
+{
+ mclock_profile = cct->_conf.get_val<std::string>("osd_mclock_profile");
+ dout(1) << __func__ << " mclock profile: " << mclock_profile << dendl;
+}
+
+std::string mClockScheduler::get_mclock_profile()
+{
+ return mclock_profile;
+}
+
+void mClockScheduler::set_balanced_profile_allocations()
+{
+ // Client Allocation:
+ // reservation: 40% | weight: 1 | limit: 100% |
+ // Background Recovery Allocation:
+ // reservation: 40% | weight: 1 | limit: 150% |
+ // Background Best Effort Allocation:
+ // reservation: 20% | weight: 2 | limit: max |
+
+ // Client
+ uint64_t client_res = static_cast<uint64_t>(
+ std::round(0.40 * max_osd_capacity));
+ uint64_t client_lim = static_cast<uint64_t>(
+ std::round(max_osd_capacity));
+ uint64_t client_wgt = default_min;
+
+ // Background Recovery
+ uint64_t rec_res = static_cast<uint64_t>(
+ std::round(0.40 * max_osd_capacity));
+ uint64_t rec_lim = static_cast<uint64_t>(
+ std::round(1.5 * max_osd_capacity));
+ uint64_t rec_wgt = default_min;
+
+ // Background Best Effort
+ uint64_t best_effort_res = static_cast<uint64_t>(
+ std::round(0.20 * max_osd_capacity));
+ uint64_t best_effort_lim = default_max;
+ uint64_t best_effort_wgt = 2;
+
+ // Set the allocations for the mclock clients
+ client_allocs[
+ static_cast<size_t>(op_scheduler_class::client)].update(
+ client_res,
+ client_wgt,
+ client_lim);
+ client_allocs[
+ static_cast<size_t>(op_scheduler_class::background_recovery)].update(
+ rec_res,
+ rec_wgt,
+ rec_lim);
+ client_allocs[
+ static_cast<size_t>(op_scheduler_class::background_best_effort)].update(
+ best_effort_res,
+ best_effort_wgt,
+ best_effort_lim);
+}
+
+void mClockScheduler::set_high_recovery_ops_profile_allocations()
+{
+ // Client Allocation:
+ // reservation: 30% | weight: 1 | limit: 80% |
+ // Background Recovery Allocation:
+ // reservation: 60% | weight: 2 | limit: 200% |
+ // Background Best Effort Allocation:
+ // reservation: 1 | weight: 2 | limit: max |
+
+ // Client
+ uint64_t client_res = static_cast<uint64_t>(
+ std::round(0.30 * max_osd_capacity));
+ uint64_t client_lim = static_cast<uint64_t>(
+ std::round(0.80 * max_osd_capacity));
+ uint64_t client_wgt = default_min;
+
+ // Background Recovery
+ uint64_t rec_res = static_cast<uint64_t>(
+ std::round(0.60 * max_osd_capacity));
+ uint64_t rec_lim = static_cast<uint64_t>(
+ std::round(2.0 * max_osd_capacity));
+ uint64_t rec_wgt = 2;
+
+ // Background Best Effort
+ uint64_t best_effort_res = default_min;
+ uint64_t best_effort_lim = default_max;
+ uint64_t best_effort_wgt = 2;
+
+ // Set the allocations for the mclock clients
+ client_allocs[
+ static_cast<size_t>(op_scheduler_class::client)].update(
+ client_res,
+ client_wgt,
+ client_lim);
+ client_allocs[
+ static_cast<size_t>(op_scheduler_class::background_recovery)].update(
+ rec_res,
+ rec_wgt,
+ rec_lim);
+ client_allocs[
+ static_cast<size_t>(op_scheduler_class::background_best_effort)].update(
+ best_effort_res,
+ best_effort_wgt,
+ best_effort_lim);
+}
+
+void mClockScheduler::set_high_client_ops_profile_allocations()
+{
+ // Client Allocation:
+ // reservation: 50% | weight: 2 | limit: max |
+ // Background Recovery Allocation:
+ // reservation: 25% | weight: 1 | limit: 100% |
+ // Background Best Effort Allocation:
+ // reservation: 25% | weight: 2 | limit: max |
+
+ // Client
+ uint64_t client_res = static_cast<uint64_t>(
+ std::round(0.50 * max_osd_capacity));
+ uint64_t client_wgt = 2;
+ uint64_t client_lim = default_max;
+
+ // Background Recovery
+ uint64_t rec_res = static_cast<uint64_t>(
+ std::round(0.25 * max_osd_capacity));
+ uint64_t rec_lim = static_cast<uint64_t>(
+ std::round(max_osd_capacity));
+ uint64_t rec_wgt = default_min;
+
+ // Background Best Effort
+ uint64_t best_effort_res = static_cast<uint64_t>(
+ std::round(0.25 * max_osd_capacity));
+ uint64_t best_effort_lim = default_max;
+ uint64_t best_effort_wgt = 2;
+
+ // Set the allocations for the mclock clients
+ client_allocs[
+ static_cast<size_t>(op_scheduler_class::client)].update(
+ client_res,
+ client_wgt,
+ client_lim);
+ client_allocs[
+ static_cast<size_t>(op_scheduler_class::background_recovery)].update(
+ rec_res,
+ rec_wgt,
+ rec_lim);
+ client_allocs[
+ static_cast<size_t>(op_scheduler_class::background_best_effort)].update(
+ best_effort_res,
+ best_effort_wgt,
+ best_effort_lim);
+}
+
+void mClockScheduler::enable_mclock_profile_settings()
+{
+ // Nothing to do for "custom" profile
+ if (mclock_profile == "custom") {
+ return;
+ }
+
+ // Set mclock and ceph config options for the chosen profile
+ if (mclock_profile == "balanced") {
+ set_balanced_profile_allocations();
+ } else if (mclock_profile == "high_recovery_ops") {
+ set_high_recovery_ops_profile_allocations();
+ } else if (mclock_profile == "high_client_ops") {
+ set_high_client_ops_profile_allocations();
+ } else {
+ ceph_assert("Invalid choice of mclock profile" == 0);
+ return;
+ }
+
+ // Set the mclock config parameters
+ set_profile_config();
+}
+
+void mClockScheduler::set_profile_config()
+{
+ ClientAllocs client = client_allocs[
+ static_cast<size_t>(op_scheduler_class::client)];
+ ClientAllocs rec = client_allocs[
+ static_cast<size_t>(op_scheduler_class::background_recovery)];
+ ClientAllocs best_effort = client_allocs[
+ static_cast<size_t>(op_scheduler_class::background_best_effort)];
+
+ // Set external client params
+ cct->_conf.set_val("osd_mclock_scheduler_client_res",
+ std::to_string(client.res));
+ cct->_conf.set_val("osd_mclock_scheduler_client_wgt",
+ std::to_string(client.wgt));
+ cct->_conf.set_val("osd_mclock_scheduler_client_lim",
+ std::to_string(client.lim));
+
+ // Set background recovery client params
+ cct->_conf.set_val("osd_mclock_scheduler_background_recovery_res",
+ std::to_string(rec.res));
+ cct->_conf.set_val("osd_mclock_scheduler_background_recovery_wgt",
+ std::to_string(rec.wgt));
+ cct->_conf.set_val("osd_mclock_scheduler_background_recovery_lim",
+ std::to_string(rec.lim));
+
+ // Set background best effort client params
+ cct->_conf.set_val("osd_mclock_scheduler_background_best_effort_res",
+ std::to_string(best_effort.res));
+ cct->_conf.set_val("osd_mclock_scheduler_background_best_effort_wgt",
+ std::to_string(best_effort.wgt));
+ cct->_conf.set_val("osd_mclock_scheduler_background_best_effort_lim",
+ std::to_string(best_effort.lim));
+}
+
+int mClockScheduler::calc_scaled_cost(int item_cost)
+{
+ // Calculate total scaled cost in secs
+ int scaled_cost =
+ std::round(osd_mclock_cost_per_io + (osd_mclock_cost_per_byte * item_cost));
+ return std::max(scaled_cost, 1);
+}
+
+void mClockScheduler::update_configuration()
+{
+ // Apply configuration change. The expectation is that
+ // at least one of the tracked mclock config option keys
+ // is modified before calling this method.
+ cct->_conf.apply_changes(nullptr);
+}
+
+void mClockScheduler::dump(ceph::Formatter &f) const
+{
+}
+
+void mClockScheduler::enqueue(OpSchedulerItem&& item)
+{
+ auto id = get_scheduler_id(item);
+
+ // TODO: move this check into OpSchedulerItem, handle backwards compat
+ if (op_scheduler_class::immediate == id.class_id) {
+ immediate.push_front(std::move(item));
+ } else {
+ int cost = calc_scaled_cost(item.get_cost());
+ // Add item to scheduler queue
+ scheduler.add_request(
+ std::move(item),
+ id,
+ cost);
+ }
+}
+
+void mClockScheduler::enqueue_front(OpSchedulerItem&& item)
+{
+ immediate.push_back(std::move(item));
+ // TODO: item may not be immediate, update mclock machinery to permit
+ // putting the item back in the queue
+}
+
+WorkItem mClockScheduler::dequeue()
+{
+ if (!immediate.empty()) {
+ WorkItem work_item{std::move(immediate.back())};
+ immediate.pop_back();
+ return work_item;
+ } else {
+ mclock_queue_t::PullReq result = scheduler.pull_request();
+ if (result.is_future()) {
+ return result.getTime();
+ } else if (result.is_none()) {
+ ceph_assert(
+ 0 == "Impossible, must have checked empty() first");
+ return {};
+ } else {
+ ceph_assert(result.is_retn());
+
+ auto &retn = result.get_retn();
+ return std::move(*retn.request);
+ }
+ }
+}
+
+const char** mClockScheduler::get_tracked_conf_keys() const
+{
+ static const char* KEYS[] = {
+ "osd_mclock_scheduler_client_res",
+ "osd_mclock_scheduler_client_wgt",
+ "osd_mclock_scheduler_client_lim",
+ "osd_mclock_scheduler_background_recovery_res",
+ "osd_mclock_scheduler_background_recovery_wgt",
+ "osd_mclock_scheduler_background_recovery_lim",
+ "osd_mclock_scheduler_background_best_effort_res",
+ "osd_mclock_scheduler_background_best_effort_wgt",
+ "osd_mclock_scheduler_background_best_effort_lim",
+ "osd_mclock_cost_per_io_usec",
+ "osd_mclock_cost_per_io_usec_hdd",
+ "osd_mclock_cost_per_io_usec_ssd",
+ "osd_mclock_cost_per_byte_usec",
+ "osd_mclock_cost_per_byte_usec_hdd",
+ "osd_mclock_cost_per_byte_usec_ssd",
+ "osd_mclock_max_capacity_iops_hdd",
+ "osd_mclock_max_capacity_iops_ssd",
+ "osd_mclock_profile",
+ NULL
+ };
+ return KEYS;
+}
+
+void mClockScheduler::handle_conf_change(
+ const ConfigProxy& conf,
+ const std::set<std::string> &changed)
+{
+ if (changed.count("osd_mclock_cost_per_io_usec") ||
+ changed.count("osd_mclock_cost_per_io_usec_hdd") ||
+ changed.count("osd_mclock_cost_per_io_usec_ssd")) {
+ set_osd_mclock_cost_per_io();
+ }
+ if (changed.count("osd_mclock_cost_per_byte_usec") ||
+ changed.count("osd_mclock_cost_per_byte_usec_hdd") ||
+ changed.count("osd_mclock_cost_per_byte_usec_ssd")) {
+ set_osd_mclock_cost_per_byte();
+ }
+ if (changed.count("osd_mclock_max_capacity_iops_hdd") ||
+ changed.count("osd_mclock_max_capacity_iops_ssd")) {
+ set_max_osd_capacity();
+ if (mclock_profile != "custom") {
+ enable_mclock_profile_settings();
+ client_registry.update_from_config(conf);
+ }
+ }
+ if (changed.count("osd_mclock_profile")) {
+ set_mclock_profile();
+ if (mclock_profile != "custom") {
+ enable_mclock_profile_settings();
+ client_registry.update_from_config(conf);
+ }
+ }
+ if (changed.count("osd_mclock_scheduler_client_res") ||
+ changed.count("osd_mclock_scheduler_client_wgt") ||
+ changed.count("osd_mclock_scheduler_client_lim") ||
+ changed.count("osd_mclock_scheduler_background_recovery_res") ||
+ changed.count("osd_mclock_scheduler_background_recovery_wgt") ||
+ changed.count("osd_mclock_scheduler_background_recovery_lim") ||
+ changed.count("osd_mclock_scheduler_background_best_effort_res") ||
+ changed.count("osd_mclock_scheduler_background_best_effort_wgt") ||
+ changed.count("osd_mclock_scheduler_background_best_effort_lim")) {
+ if (mclock_profile == "custom") {
+ client_registry.update_from_config(conf);
+ }
+ }
+}
+
+mClockScheduler::~mClockScheduler()
+{
+ cct->_conf.remove_observer(this);
+}
+
+}
diff --git a/src/osd/scheduler/mClockScheduler.h b/src/osd/scheduler/mClockScheduler.h
new file mode 100644
index 000000000..32f3851ec
--- /dev/null
+++ b/src/osd/scheduler/mClockScheduler.h
@@ -0,0 +1,204 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 Red Hat Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#pragma once
+
+#include <ostream>
+#include <map>
+#include <vector>
+
+#include "boost/variant.hpp"
+
+#include "dmclock/src/dmclock_server.h"
+
+#include "osd/scheduler/OpScheduler.h"
+#include "common/config.h"
+#include "include/cmp.h"
+#include "common/ceph_context.h"
+#include "common/mClockPriorityQueue.h"
+#include "osd/scheduler/OpSchedulerItem.h"
+
+
+namespace ceph::osd::scheduler {
+
+constexpr uint64_t default_min = 1;
+constexpr uint64_t default_max = 999999;
+
+using client_id_t = uint64_t;
+using profile_id_t = uint64_t;
+
+struct client_profile_id_t {
+ client_id_t client_id;
+ profile_id_t profile_id;
+};
+
+WRITE_EQ_OPERATORS_2(client_profile_id_t, client_id, profile_id)
+WRITE_CMP_OPERATORS_2(client_profile_id_t, client_id, profile_id)
+
+
+struct scheduler_id_t {
+ op_scheduler_class class_id;
+ client_profile_id_t client_profile_id;
+};
+
+WRITE_EQ_OPERATORS_2(scheduler_id_t, class_id, client_profile_id)
+WRITE_CMP_OPERATORS_2(scheduler_id_t, class_id, client_profile_id)
+
+/**
+ * Scheduler implementation based on mclock.
+ *
+ * TODO: explain configs
+ */
+class mClockScheduler : public OpScheduler, md_config_obs_t {
+
+ CephContext *cct;
+ const uint32_t num_shards;
+ bool is_rotational;
+ double max_osd_capacity;
+ double osd_mclock_cost_per_io;
+ double osd_mclock_cost_per_byte;
+ std::string mclock_profile = "high_client_ops";
+ struct ClientAllocs {
+ uint64_t res;
+ uint64_t wgt;
+ uint64_t lim;
+
+ ClientAllocs(uint64_t _res, uint64_t _wgt, uint64_t _lim) {
+ update(_res, _wgt, _lim);
+ }
+
+ inline void update(uint64_t _res, uint64_t _wgt, uint64_t _lim) {
+ res = _res;
+ wgt = _wgt;
+ lim = _lim;
+ }
+ };
+ std::array<
+ ClientAllocs,
+ static_cast<size_t>(op_scheduler_class::client) + 1
+ > client_allocs = {
+ // Placeholders, get replaced with configured values
+ ClientAllocs(1, 1, 1), // background_recovery
+ ClientAllocs(1, 1, 1), // background_best_effort
+ ClientAllocs(1, 1, 1), // immediate (not used)
+ ClientAllocs(1, 1, 1) // client
+ };
+ class ClientRegistry {
+ std::array<
+ crimson::dmclock::ClientInfo,
+ static_cast<size_t>(op_scheduler_class::immediate)
+ > internal_client_infos = {
+ // Placeholder, gets replaced with configured values
+ crimson::dmclock::ClientInfo(1, 1, 1),
+ crimson::dmclock::ClientInfo(1, 1, 1)
+ };
+
+ crimson::dmclock::ClientInfo default_external_client_info = {1, 1, 1};
+ std::map<client_profile_id_t,
+ crimson::dmclock::ClientInfo> external_client_infos;
+ const crimson::dmclock::ClientInfo *get_external_client(
+ const client_profile_id_t &client) const;
+ public:
+ void update_from_config(const ConfigProxy &conf);
+ const crimson::dmclock::ClientInfo *get_info(
+ const scheduler_id_t &id) const;
+ } client_registry;
+
+ using mclock_queue_t = crimson::dmclock::PullPriorityQueue<
+ scheduler_id_t,
+ OpSchedulerItem,
+ true,
+ true,
+ 2>;
+ mclock_queue_t scheduler;
+ std::list<OpSchedulerItem> immediate;
+
+ static scheduler_id_t get_scheduler_id(const OpSchedulerItem &item) {
+ return scheduler_id_t{
+ item.get_scheduler_class(),
+ client_profile_id_t{
+ item.get_owner(),
+ 0
+ }
+ };
+ }
+
+public:
+ mClockScheduler(CephContext *cct, uint32_t num_shards, bool is_rotational);
+ ~mClockScheduler() override;
+
+ // Set the max osd capacity in iops
+ void set_max_osd_capacity();
+
+ // Set the cost per io for the osd
+ void set_osd_mclock_cost_per_io();
+
+ // Set the cost per byte for the osd
+ void set_osd_mclock_cost_per_byte();
+
+ // Set the mclock profile type to enable
+ void set_mclock_profile();
+
+ // Get the active mclock profile
+ std::string get_mclock_profile();
+
+ // Set "balanced" profile allocations
+ void set_balanced_profile_allocations();
+
+ // Set "high_recovery_ops" profile allocations
+ void set_high_recovery_ops_profile_allocations();
+
+ // Set "high_client_ops" profile allocations
+ void set_high_client_ops_profile_allocations();
+
+ // Set the mclock related config params based on the profile
+ void enable_mclock_profile_settings();
+
+ // Set mclock config parameter based on allocations
+ void set_profile_config();
+
+ // Calculate scaled cost per item
+ int calc_scaled_cost(int cost);
+
+ // Enqueue op in the back of the regular queue
+ void enqueue(OpSchedulerItem &&item) final;
+
+ // Enqueue the op in the front of the regular queue
+ void enqueue_front(OpSchedulerItem &&item) final;
+
+ // Return an op to be dispatched
+ WorkItem dequeue() final;
+
+ // Returns whether the queue is empty
+ bool empty() const final {
+ return immediate.empty() && scheduler.empty();
+ }
+
+ // Formatted output of the queue
+ void dump(ceph::Formatter &f) const final;
+
+ void print(std::ostream &ostream) const final {
+ ostream << "mClockScheduler";
+ }
+
+ // Update data associated with the modified mclock config key(s)
+ void update_configuration() final;
+
+ const char** get_tracked_conf_keys() const final;
+ void handle_conf_change(const ConfigProxy& conf,
+ const std::set<std::string> &changed) final;
+};
+
+}
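Note: the three per-class values (res/wgt/lim) that the profile code writes into the
osd_mclock_scheduler_* options follow the usual mClock/dmClock semantics: 'res' is the
reserved minimum share of the OSD's capacity guaranteed to the class, 'wgt' is the
proportional weight used to divide whatever capacity is left after reservations, and
'lim' is a hard cap on the class's share. A hypothetical assignment, with made-up
numbers purely for illustration (the real values are computed by the
set_*_profile_allocations() methods from max_osd_capacity):

    #include <cstdint>

    struct allocs { uint64_t res, wgt, lim; };

    // Assumed example: clients get a guaranteed floor and a higher weight,
    // recovery a smaller floor, best-effort work only a loose cap.
    constexpr allocs example_client      {150, 2, 300};
    constexpr allocs example_recovery    { 75, 1, 300};
    constexpr allocs example_best_effort { 30, 1, 999999};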
diff --git a/src/osd/scrub_machine.cc b/src/osd/scrub_machine.cc
new file mode 100644
index 000000000..fff372081
--- /dev/null
+++ b/src/osd/scrub_machine.cc
@@ -0,0 +1,534 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "scrub_machine.h"
+
+#include <chrono>
+#include <typeinfo>
+
+#include <boost/core/demangle.hpp>
+
+#include "OSD.h"
+#include "OpRequest.h"
+#include "ScrubStore.h"
+#include "scrub_machine_lstnr.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_osd
+#undef dout_prefix
+#define dout_prefix *_dout << " scrubberFSM "
+
+using namespace std::chrono;
+using namespace std::chrono_literals;
+namespace sc = boost::statechart;
+
+#define DECLARE_LOCALS \
+ ScrubMachineListener* scrbr = context<ScrubMachine>().m_scrbr; \
+ std::ignore = scrbr; \
+ auto pg_id = context<ScrubMachine>().m_pg_id; \
+ std::ignore = pg_id;
+
+namespace Scrub {
+
+// --------- trace/debug auxiliaries -------------------------------
+
+void on_event_creation(std::string_view nm)
+{
+ dout(20) << " event: --vvvv---- " << nm << dendl;
+}
+
+void on_event_discard(std::string_view nm)
+{
+ dout(20) << " event: --^^^^---- " << nm << dendl;
+}
+
+std::string ScrubMachine::current_states_desc() const
+{
+ std::string sts{"<"};
+ for (auto si = state_begin(); si != state_end(); ++si) {
+ const auto& siw{ *si }; // prevents a warning re side-effects
+ // the '7' is the length of the "Scrub::" namespace prefix being stripped
+ sts += boost::core::demangle(typeid(siw).name()).substr(7, std::string::npos) + "/";
+ }
+ return sts + ">";
+}
+
+void ScrubMachine::assert_not_active() const
+{
+ ceph_assert(state_cast<const NotActive*>());
+}
+
+bool ScrubMachine::is_reserving() const
+{
+ return state_cast<const ReservingReplicas*>();
+}
+
+bool ScrubMachine::is_accepting_updates() const
+{
+ DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
+ ceph_assert(scrbr->is_primary());
+
+ return state_cast<const WaitLastUpdate*>();
+}
+
+// for the rest of the code in this file - we know what PG we are dealing with:
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, this->context<ScrubMachine>())
+
+template <class T>
+static ostream& _prefix(std::ostream* _dout, T& t)
+{
+ return t.gen_prefix(*_dout);
+}
+
+std::ostream& ScrubMachine::gen_prefix(std::ostream& out) const
+{
+ return m_scrbr->gen_prefix(out) << "FSM: ";
+}
+
+// ////////////// the actual actions
+
+// ----------------------- NotActive -----------------------------------------
+
+NotActive::NotActive(my_context ctx) : my_base(ctx)
+{
+ dout(10) << "-- state -->> NotActive" << dendl;
+ DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
+ scrbr->clear_queued_or_active();
+}
+
+// ----------------------- ReservingReplicas ---------------------------------
+
+ReservingReplicas::ReservingReplicas(my_context ctx) : my_base(ctx)
+{
+ dout(10) << "-- state -->> ReservingReplicas" << dendl;
+ DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
+ scrbr->scrub_begin();
+ scrbr->reserve_replicas();
+}
+
+sc::result ReservingReplicas::react(const ReservationFailure&)
+{
+ DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
+ dout(10) << "ReservingReplicas::react(const ReservationFailure&)" << dendl;
+
+ // the Scrubber must release all resources and abort the scrubbing
+ scrbr->clear_pgscrub_state();
+ return transit<NotActive>();
+}
+
+/**
+ * note: the event poster is handling the scrubber reset
+ */
+sc::result ReservingReplicas::react(const FullReset&)
+{
+ dout(10) << "ReservingReplicas::react(const FullReset&)" << dendl;
+ return transit<NotActive>();
+}
+
+// ----------------------- ActiveScrubbing -----------------------------------
+
+ActiveScrubbing::ActiveScrubbing(my_context ctx) : my_base(ctx)
+{
+ dout(10) << "-- state -->> ActiveScrubbing" << dendl;
+ DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
+ scrbr->on_init();
+}
+
+/**
+ * upon exiting the Active state
+ */
+ActiveScrubbing::~ActiveScrubbing()
+{
+ DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
+ dout(15) << __func__ << dendl;
+ scrbr->unreserve_replicas();
+ scrbr->clear_queued_or_active();
+}
+
+/*
+ * The only source of an InternalError event as of now is the BuildMap state,
+ * when encountering a backend error.
+ * We kill the scrub and reset the FSM.
+ */
+sc::result ActiveScrubbing::react(const InternalError&)
+{
+ DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
+ dout(10) << __func__ << dendl;
+ scrbr->clear_pgscrub_state();
+ return transit<NotActive>();
+}
+
+sc::result ActiveScrubbing::react(const FullReset&)
+{
+ dout(10) << "ActiveScrubbing::react(const FullReset&)" << dendl;
+ // caller takes care of clearing the scrubber & FSM states
+ return transit<NotActive>();
+}
+
+// ----------------------- RangeBlocked -----------------------------------
+
+/*
+ * Blocked. Will be released by kick_object_context_blocked() (or upon
+ * an abort)
+ */
+RangeBlocked::RangeBlocked(my_context ctx) : my_base(ctx)
+{
+ dout(10) << "-- state -->> Act/RangeBlocked" << dendl;
+}
+
+// ----------------------- PendingTimer -----------------------------------
+
+/**
+ * Sleeping till timer reactivation - or just requeuing
+ */
+PendingTimer::PendingTimer(my_context ctx) : my_base(ctx)
+{
+ dout(10) << "-- state -->> Act/PendingTimer" << dendl;
+ DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
+
+ scrbr->add_delayed_scheduling();
+}
+
+// ----------------------- NewChunk -----------------------------------
+
+/**
+ * Preconditions:
+ * - preemption data was set
+ * - epoch start was updated
+ */
+NewChunk::NewChunk(my_context ctx) : my_base(ctx)
+{
+ dout(10) << "-- state -->> Act/NewChunk" << dendl;
+ DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
+
+ scrbr->get_preemptor().adjust_parameters();
+
+ // choose range to work on
+ // select_range_n_notify() will signal either SelectedChunkFree or
+ // ChunkIsBusy. If 'busy', we transition to Blocked, and wait for the
+ // range to become available.
+ scrbr->select_range_n_notify();
+}
+
+sc::result NewChunk::react(const SelectedChunkFree&)
+{
+ DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
+ dout(10) << "NewChunk::react(const SelectedChunkFree&)" << dendl;
+
+ scrbr->set_subset_last_update(scrbr->search_log_for_updates());
+ return transit<WaitPushes>();
+}
+
+// ----------------------- WaitPushes -----------------------------------
+
+WaitPushes::WaitPushes(my_context ctx) : my_base(ctx)
+{
+ dout(10) << " -- state -->> Act/WaitPushes" << dendl;
+ post_event(ActivePushesUpd{});
+}
+
+/*
+ * Triggered externally, by the entity that had an update re pushes
+ */
+sc::result WaitPushes::react(const ActivePushesUpd&)
+{
+ DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
+ dout(10) << "WaitPushes::react(const ActivePushesUpd&) pending_active_pushes: "
+ << scrbr->pending_active_pushes() << dendl;
+
+ if (!scrbr->pending_active_pushes()) {
+ // done waiting
+ return transit<WaitLastUpdate>();
+ }
+
+ return discard_event();
+}
+
+// ----------------------- WaitLastUpdate -----------------------------------
+
+WaitLastUpdate::WaitLastUpdate(my_context ctx) : my_base(ctx)
+{
+ dout(10) << " -- state -->> Act/WaitLastUpdate" << dendl;
+ post_event(UpdatesApplied{});
+}
+
+/**
+ * Note:
+ * Updates are locally readable immediately. Thus, on the replicas we do not
+ * need to wait for the update notifications before scrubbing. For the Primary
+ * it's a bit different: on EC (and only there) rmw operations have an
+ * additional read roundtrip. That means that on the Primary we need to wait
+ * for last_update_applied (the replica side, even on EC, is still safe,
+ * since the actual transaction will already be readable by commit time).
+ */
+void WaitLastUpdate::on_new_updates(const UpdatesApplied&)
+{
+ DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
+ dout(10) << "WaitLastUpdate::on_new_updates(const UpdatesApplied&)" << dendl;
+
+ if (scrbr->has_pg_marked_new_updates()) {
+ post_event(InternalAllUpdates{});
+ } else {
+ // will be requeued by op_applied
+ dout(10) << "wait for EC read/modify/writes to queue" << dendl;
+ }
+}
+
+/*
+ * request maps from the replicas in the acting set
+ */
+sc::result WaitLastUpdate::react(const InternalAllUpdates&)
+{
+ DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
+ dout(10) << "WaitLastUpdate::react(const InternalAllUpdates&)" << dendl;
+
+ scrbr->get_replicas_maps(scrbr->get_preemptor().is_preemptable());
+ return transit<BuildMap>();
+}
+
+// ----------------------- BuildMap -----------------------------------
+
+BuildMap::BuildMap(my_context ctx) : my_base(ctx)
+{
+ dout(10) << " -- state -->> Act/BuildMap" << dendl;
+ DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
+
+ // no need to check for an epoch change, as all possible flows that brought us here have
+ // a check_interval() verification of their final event.
+
+ if (scrbr->get_preemptor().was_preempted()) {
+
+ // we were preempted, either directly or by a replica
+ dout(10) << __func__ << " preempted!!!" << dendl;
+ scrbr->mark_local_map_ready();
+ post_event(IntBmPreempted{});
+
+ } else {
+
+ auto ret = scrbr->build_primary_map_chunk();
+
+ if (ret == -EINPROGRESS) {
+ // must wait for the backend to finish. No specific event provided.
+ // build_primary_map_chunk() has already requeued us.
+ dout(20) << "waiting for the backend..." << dendl;
+
+ } else if (ret < 0) {
+
+ dout(10) << "BuildMap::BuildMap() Error! Aborting. Ret: " << ret << dendl;
+ post_event(InternalError{});
+
+ } else {
+
+ // the local map was created
+ post_event(IntLocalMapDone{});
+ }
+ }
+}
+
+sc::result BuildMap::react(const IntLocalMapDone&)
+{
+ DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
+ dout(10) << "BuildMap::react(const IntLocalMapDone&)" << dendl;
+
+ scrbr->mark_local_map_ready();
+ return transit<WaitReplicas>();
+}
+
+// ----------------------- DrainReplMaps -----------------------------------
+
+DrainReplMaps::DrainReplMaps(my_context ctx) : my_base(ctx)
+{
+ dout(10) << "-- state -->> Act/DrainReplMaps" << dendl;
+ // we may have received all maps already. Send the event that will make us check.
+ post_event(GotReplicas{});
+}
+
+sc::result DrainReplMaps::react(const GotReplicas&)
+{
+ DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
+ dout(10) << "DrainReplMaps::react(const GotReplicas&)" << dendl;
+
+ if (scrbr->are_all_maps_available()) {
+ // NewChunk will handle the preemption that brought us to this state
+ return transit<PendingTimer>();
+ }
+
+ dout(15) << "DrainReplMaps::react(const GotReplicas&): still draining incoming maps: "
+ << scrbr->dump_awaited_maps() << dendl;
+ return discard_event();
+}
+
+// ----------------------- WaitReplicas -----------------------------------
+
+WaitReplicas::WaitReplicas(my_context ctx) : my_base(ctx)
+{
+ dout(10) << "-- state -->> Act/WaitReplicas" << dendl;
+ post_event(GotReplicas{});
+}
+
+/**
+ * note: now that maps_compare_n_cleanup() is "futurized"(*), and we remain in this state
+ * for a while even after we got all our maps, we must prevent are_all_maps_available()
+ * (actually - the code after the if()) from being called more than once.
+ * This is basically a separate state, but it's too transitory and artificial to justify
+ * the cost of a separate state.
+ *
+ * (*) "futurized" - in Crimson, the call to maps_compare_n_cleanup() returns immediately
+ * after initiating the process. The actual completion of the maps comparison etc. is
+ * signalled via an event. As we share the code with "classic" OSD, here too
+ * maps_compare_n_cleanup() is responsible for signalling the completion of the
+ * processing.
+ */
+sc::result WaitReplicas::react(const GotReplicas&)
+{
+ DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
+ dout(10) << "WaitReplicas::react(const GotReplicas&)" << dendl;
+
+ if (!all_maps_already_called && scrbr->are_all_maps_available()) {
+ dout(10) << "WaitReplicas::react(const GotReplicas&) got all" << dendl;
+
+ all_maps_already_called = true;
+
+ // were we preempted?
+ if (scrbr->get_preemptor().disable_and_test()) { // a test&set
+
+
+ dout(10) << "WaitReplicas::react(const GotReplicas&) PREEMPTED!" << dendl;
+ return transit<PendingTimer>();
+
+ } else {
+
+ // maps_compare_n_cleanup() will arrange for MapsCompared event to be sent:
+ scrbr->maps_compare_n_cleanup();
+ return discard_event();
+ }
+ } else {
+ return discard_event();
+ }
+}
+
+// ----------------------- WaitDigestUpdate -----------------------------------
+
+WaitDigestUpdate::WaitDigestUpdate(my_context ctx) : my_base(ctx)
+{
+ DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
+ dout(10) << "-- state -->> Act/WaitDigestUpdate" << dendl;
+
+ // perform an initial check: maybe we already
+ // have all the updates we need:
+ // (note that DigestUpdate is usually an external event)
+ post_event(DigestUpdate{});
+}
+
+sc::result WaitDigestUpdate::react(const DigestUpdate&)
+{
+ DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
+ dout(10) << "WaitDigestUpdate::react(const DigestUpdate&)" << dendl;
+
+ // on_digest_updates() will either:
+ // - do nothing - if we are still waiting for updates, or
+ // - finish the scrubbing of the current chunk, and:
+ // - send NextChunk, or
+ // - send ScrubFinished
+
+ scrbr->on_digest_updates();
+ return discard_event();
+}
+
+sc::result WaitDigestUpdate::react(const ScrubFinished&)
+{
+ DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
+ dout(10) << "WaitDigestUpdate::react(const ScrubFinished&)" << dendl;
+ scrbr->scrub_finish();
+ return transit<NotActive>();
+}
+
+ScrubMachine::ScrubMachine(PG* pg, ScrubMachineListener* pg_scrub)
+ : m_pg_id{pg->pg_id}, m_scrbr{pg_scrub}
+{
+}
+
+ScrubMachine::~ScrubMachine() = default;
+
+// -------- for replicas -----------------------------------------------------
+
+// ----------------------- ReplicaWaitUpdates --------------------------------
+
+ReplicaWaitUpdates::ReplicaWaitUpdates(my_context ctx) : my_base(ctx)
+{
+ dout(10) << "-- state -->> ReplicaWaitUpdates" << dendl;
+ DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
+ scrbr->on_replica_init();
+}
+
+/*
+ * Triggered externally, by the entity that had an update re pushes
+ */
+sc::result ReplicaWaitUpdates::react(const ReplicaPushesUpd&)
+{
+ DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
+ dout(10) << "ReplicaWaitUpdates::react(const ReplicaPushesUpd&): "
+ << scrbr->pending_active_pushes() << dendl;
+
+ if (scrbr->pending_active_pushes() == 0) {
+
+ // done waiting
+ return transit<ActiveReplica>();
+ }
+
+ return discard_event();
+}
+
+/**
+ * the event poster is handling the scrubber reset
+ */
+sc::result ReplicaWaitUpdates::react(const FullReset&)
+{
+ dout(10) << "ReplicaWaitUpdates::react(const FullReset&)" << dendl;
+ return transit<NotActive>();
+}
+
+// ----------------------- ActiveReplica -----------------------------------
+
+ActiveReplica::ActiveReplica(my_context ctx) : my_base(ctx)
+{
+ DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
+ dout(10) << "-- state -->> ActiveReplica" << dendl;
+ scrbr->on_replica_init(); // as we might have skipped ReplicaWaitUpdates
+ post_event(SchedReplica{});
+}
+
+sc::result ActiveReplica::react(const SchedReplica&)
+{
+ DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
+ dout(10) << "ActiveReplica::react(const SchedReplica&). is_preemptable? "
+ << scrbr->get_preemptor().is_preemptable() << dendl;
+
+ if (scrbr->get_preemptor().was_preempted()) {
+ dout(10) << "replica scrub job preempted" << dendl;
+
+ scrbr->send_preempted_replica();
+ scrbr->replica_handling_done();
+ return transit<NotActive>();
+ }
+
+ // start or check progress of build_replica_map_chunk()
+ auto ret_init = scrbr->build_replica_map_chunk();
+ if (ret_init != -EINPROGRESS) {
+ return transit<NotActive>();
+ }
+
+ return discard_event();
+}
+
+/**
+ * the event poster is handling the scrubber reset
+ */
+sc::result ActiveReplica::react(const FullReset&)
+{
+ dout(10) << "ActiveReplica::react(const FullReset&)" << dendl;
+ return transit<NotActive>();
+}
+
+} // namespace Scrub
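Note: the file above is built entirely on boost::statechart. For readers unfamiliar with
that framework, here is a minimal self-contained example of the same transition pattern
(states are entered through their constructors, events are pushed through
process_event()); the state and event names below are illustrative only and are not part
of the scrubber FSM:

    #include <iostream>
    #include <boost/statechart/event.hpp>
    #include <boost/statechart/simple_state.hpp>
    #include <boost/statechart/state_machine.hpp>
    #include <boost/statechart/transition.hpp>

    namespace sc = boost::statechart;

    struct StartScrub : sc::event<StartScrub> {};
    struct FullReset  : sc::event<FullReset> {};

    struct Idle;                                        // initial state
    struct Machine : sc::state_machine<Machine, Idle> {};

    struct Scrubbing;

    struct Idle : sc::simple_state<Idle, Machine> {
      using reactions = sc::transition<StartScrub, Scrubbing>;
      Idle() { std::cout << "-> Idle\n"; }              // entry action
    };

    struct Scrubbing : sc::simple_state<Scrubbing, Machine> {
      using reactions = sc::transition<FullReset, Idle>;
      Scrubbing() { std::cout << "-> Scrubbing\n"; }
    };

    int main() {
      Machine m;
      m.initiate();                    // enters Idle
      m.process_event(StartScrub{});   // Idle -> Scrubbing
      m.process_event(FullReset{});    // Scrubbing -> Idle
    }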
diff --git a/src/osd/scrub_machine.h b/src/osd/scrub_machine.h
new file mode 100644
index 000000000..7f88a675a
--- /dev/null
+++ b/src/osd/scrub_machine.h
@@ -0,0 +1,344 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#pragma once
+
+#include <string>
+
+#include <boost/statechart/custom_reaction.hpp>
+#include <boost/statechart/deferral.hpp>
+#include <boost/statechart/event.hpp>
+#include <boost/statechart/event_base.hpp>
+#include <boost/statechart/in_state_reaction.hpp>
+#include <boost/statechart/simple_state.hpp>
+#include <boost/statechart/state.hpp>
+#include <boost/statechart/state_machine.hpp>
+#include <boost/statechart/transition.hpp>
+
+#include "common/version.h"
+#include "include/Context.h"
+
+#include "scrub_machine_lstnr.h"
+#include "scrubber_common.h"
+
+using namespace std::string_literals;
+
+class PG; // holding a pointer to that one - just for testing
+class PgScrubber;
+namespace Scrub {
+
+namespace sc = ::boost::statechart;
+namespace mpl = ::boost::mpl;
+
+//
+// EVENTS
+//
+
+void on_event_creation(std::string_view nm);
+void on_event_discard(std::string_view nm);
+
+#define MEV(E) \
+ struct E : sc::event<E> { \
+ inline static int actv{0}; \
+ E() \
+ { \
+ if (!actv++) \
+ on_event_creation(#E); \
+ } \
+ ~E() \
+ { \
+ if (!--actv) \
+ on_event_discard(#E); \
+ } \
+ void print(std::ostream* out) const { *out << #E; } \
+ std::string_view print() const { return #E; } \
+ };
+
+MEV(RemotesReserved) ///< all replicas have granted our reserve request
+
+MEV(ReservationFailure) ///< a reservation request has failed
+
+MEV(StartScrub) ///< initiate a new scrubbing session (relevant if we are a Primary)
+
+MEV(AfterRepairScrub) ///< initiate a new scrubbing session. Only triggered at Recovery
+ ///< completion.
+
+MEV(Unblocked) ///< triggered when the PG unblocked an object that was marked for
+ ///< scrubbing. Via the PGScrubUnblocked op
+
+MEV(InternalSchedScrub)
+
+MEV(SelectedChunkFree)
+
+MEV(ChunkIsBusy)
+
+MEV(ActivePushesUpd) ///< Update to active_pushes. 'active_pushes' represents recovery
+ ///< that is in-flight to the local ObjectStore
+MEV(UpdatesApplied) ///< (Primary only) all updates are committed
+
+MEV(InternalAllUpdates) ///< the internal counterpart of UpdatesApplied
+
+MEV(GotReplicas) ///< got a map from a replica
+
+MEV(IntBmPreempted) ///< internal - BuildMap preempted. Required, as detected within the
+ ///< ctor
+
+MEV(InternalError)
+
+MEV(IntLocalMapDone)
+
+MEV(DigestUpdate) ///< external. called upon success of a MODIFY op. See
+ ///< scrub_snapshot_metadata()
+
+MEV(MapsCompared) ///< (Crimson) maps_compare_n_cleanup() transactions are done
+
+MEV(StartReplica) ///< initiating replica scrub.
+
+MEV(StartReplicaNoWait) ///< 'start replica' when there are no pending updates
+
+MEV(SchedReplica)
+
+MEV(ReplicaPushesUpd) ///< Update to active_pushes. 'active_pushes' represents recovery
+ ///< that is in-flight to the local ObjectStore
+
+MEV(FullReset) ///< guarantee that the FSM is in the quiescent state (i.e. NotActive)
+
+MEV(NextChunk) ///< finished handling this chunk. Go get the next one
+
+MEV(ScrubFinished) ///< all chunks handled
+
+
+struct NotActive; ///< the quiescent state. No active scrubbing.
+struct ReservingReplicas; ///< securing scrub resources from replicas' OSDs
+struct ActiveScrubbing; ///< the active state for a Primary. A sub-machine.
+struct ReplicaWaitUpdates; ///< an active state for a replica. Waiting for all active
+ ///< operations to finish.
+struct ActiveReplica; ///< an active state for a replica.
+
+
+class ScrubMachine : public sc::state_machine<ScrubMachine, NotActive> {
+ public:
+ friend class PgScrubber;
+
+ public:
+ explicit ScrubMachine(PG* pg, ScrubMachineListener* pg_scrub);
+ ~ScrubMachine();
+
+ spg_t m_pg_id;
+ ScrubMachineListener* m_scrbr;
+ std::ostream& gen_prefix(std::ostream& out) const;
+
+ std::string current_states_desc() const;
+ void assert_not_active() const;
+ [[nodiscard]] bool is_reserving() const;
+ [[nodiscard]] bool is_accepting_updates() const;
+};
+
+/**
+ * The Scrubber's base (quiescent) state.
+ * Scrubbing is triggered by one of the following events:
+ * - (standard scenario for a Primary): 'StartScrub'. Initiates the OSDs resources
+ * reservation process. Will be issued by PG::scrub(), following a
+ * queued "PGScrub" op.
+ * - a special end-of-recovery Primary scrub event ('AfterRepairScrub') that is
+ * not required to reserve resources.
+ * - (for a replica) 'StartReplica' or 'StartReplicaNoWait', triggered by an incoming
+ * MOSDRepScrub message.
+ *
+ * note (20.8.21): originally, AfterRepairScrub was triggering a scrub without waiting
+ * for replica resources to be acquired. But once replicas started using the
+ * resource-request to identify and tag the scrub session, this bypass cannot be
+ * supported anymore.
+ */
+struct NotActive : sc::state<NotActive, ScrubMachine> {
+ explicit NotActive(my_context ctx);
+
+ using reactions = mpl::list<sc::transition<StartScrub, ReservingReplicas>,
+ // a scrubbing that was initiated at recovery completion,
+ // and requires no resource reservations:
+ sc::transition<AfterRepairScrub, ReservingReplicas>,
+ sc::transition<StartReplica, ReplicaWaitUpdates>,
+ sc::transition<StartReplicaNoWait, ActiveReplica>>;
+};
+
+struct ReservingReplicas : sc::state<ReservingReplicas, ScrubMachine> {
+
+ explicit ReservingReplicas(my_context ctx);
+ using reactions = mpl::list<sc::custom_reaction<FullReset>,
+ // all replicas granted our resources request
+ sc::transition<RemotesReserved, ActiveScrubbing>,
+ sc::custom_reaction<ReservationFailure>>;
+
+ sc::result react(const FullReset&);
+
+ /// at least one replica denied us the scrub resources we've requested
+ sc::result react(const ReservationFailure&);
+};
+
+
+// the "active" sub-states
+
+struct RangeBlocked; ///< the objects range is blocked
+struct PendingTimer; ///< either delaying the scrub by some time and requeuing, or just
+ ///< requeuing
+struct NewChunk; ///< select a chunk to scrub, and verify its availability
+struct WaitPushes;
+struct WaitLastUpdate;
+struct BuildMap;
+struct DrainReplMaps; ///< a problem during BuildMap. Wait for all replicas to report,
+ ///< then restart.
+struct WaitReplicas; ///< wait for all replicas to report
+struct WaitDigestUpdate;
+
+struct ActiveScrubbing : sc::state<ActiveScrubbing, ScrubMachine, PendingTimer> {
+
+ explicit ActiveScrubbing(my_context ctx);
+ ~ActiveScrubbing();
+
+ using reactions = mpl::list<
+ sc::custom_reaction<InternalError>,
+ sc::custom_reaction<FullReset>>;
+
+ sc::result react(const FullReset&);
+ sc::result react(const InternalError&);
+};
+
+struct RangeBlocked : sc::state<RangeBlocked, ActiveScrubbing> {
+ explicit RangeBlocked(my_context ctx);
+ using reactions = mpl::list<sc::transition<Unblocked, PendingTimer>>;
+};
+
+struct PendingTimer : sc::state<PendingTimer, ActiveScrubbing> {
+
+ explicit PendingTimer(my_context ctx);
+
+ using reactions = mpl::list<sc::transition<InternalSchedScrub, NewChunk>>;
+};
+
+struct NewChunk : sc::state<NewChunk, ActiveScrubbing> {
+
+ explicit NewChunk(my_context ctx);
+
+ using reactions = mpl::list<sc::transition<ChunkIsBusy, RangeBlocked>,
+ sc::custom_reaction<SelectedChunkFree>>;
+
+ sc::result react(const SelectedChunkFree&);
+};
+
+/**
+ * initiate the update process for this chunk
+ *
+ * Wait for 'active_pushes' to clear.
+ * 'active_pushes' represents recovery that is in-flight to the local ObjectStore, hence
+ * scrub waits until the correct data is readable (in-flight data to the ObjectStore is
+ * not readable until written to disk, termed 'applied' here).
+ */
+struct WaitPushes : sc::state<WaitPushes, ActiveScrubbing> {
+
+ explicit WaitPushes(my_context ctx);
+
+ using reactions = mpl::list<sc::custom_reaction<ActivePushesUpd>>;
+
+ sc::result react(const ActivePushesUpd&);
+};
+
+struct WaitLastUpdate : sc::state<WaitLastUpdate, ActiveScrubbing> {
+
+ explicit WaitLastUpdate(my_context ctx);
+
+ void on_new_updates(const UpdatesApplied&);
+
+ using reactions = mpl::list<sc::custom_reaction<InternalAllUpdates>,
+ sc::in_state_reaction<UpdatesApplied,
+ WaitLastUpdate,
+ &WaitLastUpdate::on_new_updates>>;
+
+ sc::result react(const InternalAllUpdates&);
+};
+
+struct BuildMap : sc::state<BuildMap, ActiveScrubbing> {
+ explicit BuildMap(my_context ctx);
+
+ // possible error scenarios:
+ // - an error reported by the backend will trigger an 'InternalError' event,
+ // handled by our parent state;
+ // - if preempted, we switch to DrainReplMaps, where we will wait for all
+ // replicas to send their maps before acknowledging the preemption;
+ // - an interval change will be handled by the relevant 'send-event' functions,
+ // and will be translated into a 'FullReset' event.
+ using reactions =
+ mpl::list<sc::transition<IntBmPreempted, DrainReplMaps>,
+ sc::transition<InternalSchedScrub, BuildMap>, // looping, waiting
+ // for the backend to
+ // finish
+ sc::custom_reaction<IntLocalMapDone>>;
+
+ sc::result react(const IntLocalMapDone&);
+};
+
+/*
+ * "drain" scrub-maps responses from replicas
+ */
+struct DrainReplMaps : sc::state<DrainReplMaps, ActiveScrubbing> {
+ explicit DrainReplMaps(my_context ctx);
+
+ using reactions =
+ mpl::list<sc::custom_reaction<GotReplicas> // all replicas are accounted for
+ >;
+
+ sc::result react(const GotReplicas&);
+};
+
+struct WaitReplicas : sc::state<WaitReplicas, ActiveScrubbing> {
+ explicit WaitReplicas(my_context ctx);
+
+ using reactions =
+ mpl::list<sc::custom_reaction<GotReplicas>, // all replicas are accounted for
+ sc::transition<MapsCompared, WaitDigestUpdate>,
+ sc::deferral<DigestUpdate> // might arrive before we've reached WDU
+ >;
+
+ sc::result react(const GotReplicas&);
+
+ bool all_maps_already_called{false}; // see comment in react code
+};
+
+struct WaitDigestUpdate : sc::state<WaitDigestUpdate, ActiveScrubbing> {
+ explicit WaitDigestUpdate(my_context ctx);
+
+ using reactions = mpl::list<sc::custom_reaction<DigestUpdate>,
+ sc::custom_reaction<ScrubFinished>,
+ sc::transition<NextChunk, PendingTimer>>;
+ sc::result react(const DigestUpdate&);
+ sc::result react(const ScrubFinished&);
+};
+
+// ----------------------------- the "replica active" states -----------------------
+
+/*
+ * Waiting for 'active_pushes' to complete
+ *
+ * When in this state:
+ * - the details of the Primary's request were internalized by PgScrubber;
+ * - 'active' scrubbing is set
+ */
+struct ReplicaWaitUpdates : sc::state<ReplicaWaitUpdates, ScrubMachine> {
+ explicit ReplicaWaitUpdates(my_context ctx);
+ using reactions =
+ mpl::list<sc::custom_reaction<ReplicaPushesUpd>, sc::custom_reaction<FullReset>>;
+
+ sc::result react(const ReplicaPushesUpd&);
+ sc::result react(const FullReset&);
+};
+
+
+struct ActiveReplica : sc::state<ActiveReplica, ScrubMachine> {
+ explicit ActiveReplica(my_context ctx);
+ using reactions = mpl::list<sc::custom_reaction<SchedReplica>,
+ sc::custom_reaction<FullReset>,
+ sc::transition<ScrubFinished, NotActive>>;
+
+ sc::result react(const SchedReplica&);
+ sc::result react(const FullReset&);
+};
+
+} // namespace Scrub
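Note: every event used by the machine is declared through the MEV() macro above.
Expanded for a single event (DigestUpdate, taken as the example), the macro produces the
struct below; the static 'actv' counter ensures that only the first construction and the
last destruction of each event type are logged:

    struct DigestUpdate : sc::event<DigestUpdate> {
      inline static int actv{0};
      DigestUpdate()
      {
        if (!actv++)
          on_event_creation("DigestUpdate");
      }
      ~DigestUpdate()
      {
        if (!--actv)
          on_event_discard("DigestUpdate");
      }
      void print(std::ostream* out) const { *out << "DigestUpdate"; }
      std::string_view print() const { return "DigestUpdate"; }
    };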
diff --git a/src/osd/scrub_machine_lstnr.h b/src/osd/scrub_machine_lstnr.h
new file mode 100644
index 000000000..8d9622b9b
--- /dev/null
+++ b/src/osd/scrub_machine_lstnr.h
@@ -0,0 +1,164 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+/**
+ * \file the PgScrubber interface used by the scrub FSM
+ */
+#include "common/version.h"
+#include "include/Context.h"
+
+#include "osd_types.h"
+
+namespace Scrub {
+
+enum class PreemptionNoted { no_preemption, preempted };
+
+/// the interface exposed by the PgScrubber into its internal
+/// preemption_data object
+struct preemption_t {
+
+ virtual ~preemption_t() = default;
+
+ [[nodiscard]] virtual bool is_preemptable() const = 0;
+
+ [[nodiscard]] virtual bool was_preempted() const = 0;
+
+ virtual void adjust_parameters() = 0;
+
+ /**
+ * Try to preempt the scrub.
+ * 'true' (i.e. - preempted) if:
+ * preemptable && not already preempted
+ */
+ virtual bool do_preempt() = 0;
+
+ /**
+ * disables preemptions.
+ * Returns 'true' if we were already preempted
+ */
+ virtual bool disable_and_test() = 0;
+};
+
+} // namespace Scrub
+
+struct ScrubMachineListener {
+
+ struct MsgAndEpoch {
+ MessageRef m_msg;
+ epoch_t m_epoch;
+ };
+
+ virtual ~ScrubMachineListener() = default;
+
+ virtual void select_range_n_notify() = 0;
+
+ [[nodiscard]] virtual bool is_primary() const = 0;
+
+ /// walk the log to find the latest update that affects our chunk
+ virtual eversion_t search_log_for_updates() const = 0;
+
+ virtual eversion_t get_last_update_applied() const = 0;
+
+ virtual int pending_active_pushes() const = 0;
+
+ virtual int build_primary_map_chunk() = 0;
+
+ virtual int build_replica_map_chunk() = 0;
+
+ virtual void on_init() = 0;
+
+ virtual void on_replica_init() = 0;
+
+ virtual void replica_handling_done() = 0;
+
+ /// the version of 'scrub_clear_state()' that does not try to invoke FSM services
+ /// (thus can be called from FSM reactions)
+ virtual void clear_pgscrub_state() = 0;
+
+ /*
+ * Send an 'InternalSchedScrub' FSM event either immediately, or - if 'm_need_sleep'
+ * is asserted - after a configuration-dependent timeout.
+ */
+ virtual void add_delayed_scheduling() = 0;
+
+ /**
+ * Ask all replicas for their scrub maps for the current chunk.
+ */
+ virtual void get_replicas_maps(bool replica_can_preempt) = 0;
+
+ virtual void on_digest_updates() = 0;
+
+ virtual void scrub_begin() = 0;
+
+ /// the part that actually finalizes a scrub
+ virtual void scrub_finish() = 0;
+
+ /**
+ * Prepare a MOSDRepScrubMap message carrying the requested scrub map
+ * @param was_preempted - were we preempted?
+ * @return the message, and the current value of 'm_replica_min_epoch' (which is
+ * used when sending the message, but will be overwritten before that).
+ */
+ [[nodiscard]] virtual MsgAndEpoch prep_replica_map_msg(
+ Scrub::PreemptionNoted was_preempted) = 0;
+
+ /**
+ * Send to the primary the pre-prepared message containing the requested map
+ */
+ virtual void send_replica_map(const MsgAndEpoch& preprepared) = 0;
+
+ /**
+ * Let the primary know that we were preempted while trying to build the
+ * requested map.
+ */
+ virtual void send_preempted_replica() = 0;
+
+ [[nodiscard]] virtual bool has_pg_marked_new_updates() const = 0;
+
+ virtual void set_subset_last_update(eversion_t e) = 0;
+
+ [[nodiscard]] virtual bool was_epoch_changed() const = 0;
+
+ virtual Scrub::preemption_t& get_preemptor() = 0;
+
+ /**
+ * a "technical" collection of the steps performed once all
+ * rep maps are available:
+ * - the maps are compared
+ * - the scrub region markers (start_ & end_) are advanced
+ * - callbacks and ops that were pending are allowed to run
+ */
+ virtual void maps_compare_n_cleanup() = 0;
+
+ /**
+ * order the PgScrubber to initiate the process of reserving replicas' scrub
+ * resources.
+ */
+ virtual void reserve_replicas() = 0;
+
+ virtual void unreserve_replicas() = 0;
+
+ /**
+ * Manipulate the 'I am being scrubbed now' Scrubber's flag
+ */
+ virtual void set_queued_or_active() = 0;
+ virtual void clear_queued_or_active() = 0;
+
+ /**
+ * the FSM interface into the "are we waiting for maps, either our own or from
+ * replicas" state.
+ * The FSM can only:
+ * - mark the local map as available, and
+ * - query status
+ */
+ virtual void mark_local_map_ready() = 0;
+
+ [[nodiscard]] virtual bool are_all_maps_available() const = 0;
+
+ /// a log/debug interface
+ virtual std::string dump_awaited_maps() const = 0;
+
+ /// exposed to be used by the scrub_machine logger
+ virtual std::ostream& gen_prefix(std::ostream& out) const = 0;
+};
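Note: the Scrub::preemption_t contract above is small enough to illustrate with a
stand-alone stub, e.g. for unit tests. The sketch below is an assumed implementation
(the member names and the include path are made up for illustration); the PgScrubber's
real preemption_data object remains the authoritative one:

    #include <atomic>
    #include "scrub_machine_lstnr.h"   // assumed include path for preemption_t

    struct StubPreemption : Scrub::preemption_t {
      std::atomic<bool> preemptable{true};
      std::atomic<bool> preempted{false};

      bool is_preemptable() const override { return preemptable; }
      bool was_preempted() const override { return preempted; }
      void adjust_parameters() override {}   // nothing to tune in this stub

      // preempt only if allowed and not already preempted
      bool do_preempt() override {
        return preemptable && !preempted.exchange(true);
      }

      // disable further preemptions, reporting whether one already happened
      bool disable_and_test() override {
        preemptable = false;
        return preempted;
      }
    };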
diff --git a/src/osd/scrubber_common.h b/src/osd/scrubber_common.h
new file mode 100644
index 000000000..65014b594
--- /dev/null
+++ b/src/osd/scrubber_common.h
@@ -0,0 +1,299 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#pragma once
+
+#include "common/scrub_types.h"
+#include "include/types.h"
+#include "os/ObjectStore.h"
+
+#include "OpRequest.h"
+
+namespace ceph {
+class Formatter;
+}
+
+namespace Scrub {
+
+/// high/low OP priority
+enum class scrub_prio_t : bool { low_priority = false, high_priority = true };
+
+/// Identifies a specific scrub activation within an interval,
+/// see ScrubPgIF::m_current_token
+using act_token_t = uint32_t;
+
+} // namespace Scrub
+
+
+/**
+ * Flags affecting the scheduling and behaviour of the *next* scrub.
+ *
+ * We hold two of these flag collections: one for the next scrub, and one
+ * frozen at initiation (i.e. in pg::queue_scrub()).
+ */
+struct requested_scrub_t {
+
+ // flags to indicate explicitly requested scrubs (by admin):
+ // bool must_scrub, must_deep_scrub, must_repair, need_auto;
+
+ /**
+ * 'must_scrub' is set by an admin command (or by need_auto).
+ * Affects the priority of the scrubbing, and the sleep periods
+ * during the scrub.
+ */
+ bool must_scrub{false};
+
+ /**
+ * scrub must not be aborted.
+ * Set for explicitly requested scrubs, and for scrubs originated by the pairing
+ * process with the 'repair' flag set (in the RequestScrub event).
+ *
+ * Will be copied into the 'required' scrub flag upon scrub start.
+ */
+ bool req_scrub{false};
+
+ /**
+ * Set from:
+ * - scrub_requested() with need_auto param set, which only happens in
+ * - scrub_finish() - if deep_scrub_on_error is set, and we have errors
+ *
+ * If set, will prevent the OSD from casually postponing our scrub. When scrubbing
+ * starts, will cause must_scrub, must_deep_scrub and auto_repair to be set.
+ */
+ bool need_auto{false};
+
+ /**
+ * Set for scrub-after-recovery just before we initiate the recovery deep scrub,
+   * or if scrub_requested() was called with either need_auto or repair.
+ * Affects PG_STATE_DEEP_SCRUB.
+ */
+ bool must_deep_scrub{false};
+
+ /**
+   * (An intermediary flag used by pg::sched_scrub() the first time a planned
+   * scrub has all its resources). Determines whether the next
+   * repair/scrub will be 'deep'.
+   *
+   * Note: 'dumped' by PgScrubber::dump() and such. In reality, being a
+   * temporary that is set and reset by the same operation, it will never
+   * appear externally to be set.
+ */
+ bool time_for_deep{false};
+
+ bool deep_scrub_on_error{false};
+
+ /**
+ * If set, we should see must_deep_scrub and must_repair set, too
+ *
+ * - 'must_repair' is checked by the OSD when scheduling the scrubs.
+ * - also checked & cleared at pg::queue_scrub()
+ */
+ bool must_repair{false};
+
+ /*
+   * the value of auto_repair is determined in sched_scrub() (once per scrub; the previous
+   * value is not remembered). Set if
+ * - allowed by configuration and backend, and
+ * - must_scrub is not set (i.e. - this is a periodic scrub),
+ * - time_for_deep was just set
+ */
+ bool auto_repair{false};
+
+ /**
+ * indicating that we are scrubbing post repair to verify everything is fixed.
+ * Otherwise - PG_STATE_FAILED_REPAIR will be asserted.
+ */
+ bool check_repair{false};
+};
+
+ostream& operator<<(ostream& out, const requested_scrub_t& sf);
+
+/**
+ * The interface used by the PG when requesting scrub-related info or services
+ */
+struct ScrubPgIF {
+
+ virtual ~ScrubPgIF() = default;
+
+ friend ostream& operator<<(ostream& out, const ScrubPgIF& s) { return s.show(out); }
+
+ virtual ostream& show(ostream& out) const = 0;
+
+ // --------------- triggering state-machine events:
+
+ virtual void initiate_regular_scrub(epoch_t epoch_queued) = 0;
+
+ virtual void initiate_scrub_after_repair(epoch_t epoch_queued) = 0;
+
+ virtual void send_scrub_resched(epoch_t epoch_queued) = 0;
+
+ virtual void active_pushes_notification(epoch_t epoch_queued) = 0;
+
+ virtual void update_applied_notification(epoch_t epoch_queued) = 0;
+
+ virtual void digest_update_notification(epoch_t epoch_queued) = 0;
+
+ virtual void send_scrub_unblock(epoch_t epoch_queued) = 0;
+
+ virtual void send_replica_maps_ready(epoch_t epoch_queued) = 0;
+
+ virtual void send_replica_pushes_upd(epoch_t epoch_queued) = 0;
+
+ virtual void send_start_replica(epoch_t epoch_queued, Scrub::act_token_t token) = 0;
+
+ virtual void send_sched_replica(epoch_t epoch_queued, Scrub::act_token_t token) = 0;
+
+ virtual void on_applied_when_primary(const eversion_t &applied_version) = 0;
+
+ virtual void send_full_reset(epoch_t epoch_queued) = 0;
+
+ virtual void send_chunk_free(epoch_t epoch_queued) = 0;
+
+ virtual void send_chunk_busy(epoch_t epoch_queued) = 0;
+
+ virtual void send_local_map_done(epoch_t epoch_queued) = 0;
+
+ virtual void send_get_next_chunk(epoch_t epoch_queued) = 0;
+
+ virtual void send_scrub_is_finished(epoch_t epoch_queued) = 0;
+
+ virtual void send_maps_compared(epoch_t epoch_queued) = 0;
+
+ // --------------------------------------------------
+
+ [[nodiscard]] virtual bool are_callbacks_pending()
+ const = 0; // currently only used for an assert
+
+ /**
+ * the scrubber is marked 'active':
+ * - for the primary: when all replica OSDs grant us the requested resources
+ * - for replicas: upon receiving the scrub request from the primary
+ */
+ [[nodiscard]] virtual bool is_scrub_active() const = 0;
+
+ /**
+ * 'true' until after the FSM processes the 'scrub-finished' event,
+ * and scrubbing is completely cleaned-up.
+ *
+ * In other words - holds longer than is_scrub_active(), thus preventing
+ * a rescrubbing of the same PG while the previous scrub has not fully
+ * terminated.
+ */
+ [[nodiscard]] virtual bool is_queued_or_active() const = 0;
+
+ /**
+ * Manipulate the 'scrubbing request has been queued, or - we are
+ * actually scrubbing' Scrubber's flag
+ */
+ virtual void set_queued_or_active() = 0;
+ virtual void clear_queued_or_active() = 0;
+
+  /// are we waiting for resource reservation grants from our replicas?
+ [[nodiscard]] virtual bool is_reserving() const = 0;
+
+ /// handle a message carrying a replica map
+ virtual void map_from_replica(OpRequestRef op) = 0;
+
+ virtual void replica_scrub_op(OpRequestRef op) = 0;
+
+ virtual void set_op_parameters(requested_scrub_t&) = 0;
+
+ virtual void scrub_clear_state() = 0;
+
+ virtual void handle_query_state(ceph::Formatter* f) = 0;
+
+ virtual void dump(ceph::Formatter* f) const = 0;
+
+ /**
+ * Return true if soid is currently being scrubbed and pending IOs should block.
+ * May have a side effect of preempting an in-progress scrub -- will return false
+ * in that case.
+ *
+ * @param soid object to check for ongoing scrub
+ * @return boolean whether a request on soid should block until scrub completion
+ */
+ virtual bool write_blocked_by_scrub(const hobject_t& soid) = 0;
+
+ /// Returns whether any objects in the range [begin, end] are being scrubbed
+ virtual bool range_intersects_scrub(const hobject_t& start, const hobject_t& end) = 0;
+
+ /// the op priority, taken from the primary's request message
+ virtual Scrub::scrub_prio_t replica_op_priority() const = 0;
+
+ /// the priority of the on-going scrub (used when requeuing events)
+ virtual unsigned int scrub_requeue_priority(
+ Scrub::scrub_prio_t with_priority) const = 0;
+ virtual unsigned int scrub_requeue_priority(Scrub::scrub_prio_t with_priority,
+ unsigned int suggested_priority) const = 0;
+
+ virtual void add_callback(Context* context) = 0;
+
+ /// add to scrub statistics, but only if the soid is below the scrub start
+ virtual void stats_of_handled_objects(const object_stat_sum_t& delta_stats,
+ const hobject_t& soid) = 0;
+
+ /**
+ * the version of 'scrub_clear_state()' that does not try to invoke FSM services
+ * (thus can be called from FSM reactions)
+ */
+ virtual void clear_pgscrub_state() = 0;
+
+ /**
+ * triggers the 'RemotesReserved' (all replicas granted scrub resources)
+ * state-machine event
+ */
+ virtual void send_remotes_reserved(epoch_t epoch_queued) = 0;
+
+ /**
+ * triggers the 'ReservationFailure' (at least one replica denied us the requested
+ * resources) state-machine event
+ */
+ virtual void send_reservation_failure(epoch_t epoch_queued) = 0;
+
+ virtual void cleanup_store(ObjectStore::Transaction* t) = 0;
+
+ virtual bool get_store_errors(const scrub_ls_arg_t& arg,
+ scrub_ls_result_t& res_inout) const = 0;
+
+ // --------------- reservations -----------------------------------
+
+ /**
+ * message all replicas with a request to "unreserve" scrub
+ */
+ virtual void unreserve_replicas() = 0;
+
+ /**
+ * "forget" all replica reservations. No messages are sent to the
+   * previously-reserved replicas.
+ *
+ * Used upon interval change. The replicas' state is guaranteed to
+ * be reset separately by the interval-change event.
+ */
+ virtual void discard_replica_reservations() = 0;
+
+ /**
+ * clear both local and OSD-managed resource reservation flags
+ */
+ virtual void clear_scrub_reservations() = 0;
+
+ /**
+ * Reserve local scrub resources (managed by the OSD)
+ *
+ * Fails if OSD's local-scrubs budget was exhausted
+ * \returns were local resources reserved?
+ */
+ virtual bool reserve_local() = 0;
+
+ // on the replica:
+ virtual void handle_scrub_reserve_request(OpRequestRef op) = 0;
+ virtual void handle_scrub_reserve_release(OpRequestRef op) = 0;
+
+ // and on the primary:
+ virtual void handle_scrub_reserve_grant(OpRequestRef op, pg_shard_t from) = 0;
+ virtual void handle_scrub_reserve_reject(OpRequestRef op, pg_shard_t from) = 0;
+
+ virtual void reg_next_scrub(const requested_scrub_t& request_flags) = 0;
+ virtual void unreg_next_scrub() = 0;
+ virtual void scrub_requested(scrub_level_t scrub_level,
+ scrub_type_t scrub_type,
+ requested_scrub_t& req_flags) = 0;
+};
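Note: the flags documented in requested_scrub_t combine in fairly fixed patterns. As one
illustrative sketch (assumed, based only on the flag comments above) of how an
operator-requested repair might be populated; the authoritative flag handling lives
behind the scrub_requested()/set_op_parameters() entry points declared in ScrubPgIF:

    // Illustrative only - not how the patch itself builds the request.
    requested_scrub_t make_operator_repair_request() {
      requested_scrub_t req;
      req.must_scrub      = true;   // explicitly requested (admin command)
      req.req_scrub       = true;   // must not be casually aborted
      req.must_deep_scrub = true;   // a requested repair implies a deep scrub
      req.must_repair     = true;   // checked by the OSD when scheduling
      return req;
    }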